def loadJSON(self, path):
    """
    Generic function for loading JSON from a path, handling local file systems and S3 or GS

    Parameters
    ----------
    path : str
        Path to a file, can be on a local file system or an S3 or GS bucket

    Returns
    -------
    The deserialized JSON data (e.g. a dict or list)
    """
    import json
    from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError
    from thunder.utils.serializable import _decode_dict

    reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials)
    try:
        buffer = reader.read(path)
    except FileNotFoundError:
        raise Exception("Cannot find file %s" % path)
    return json.loads(buffer, object_hook=_decode_dict)
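# Illustrative sketch, not part of the original source: loadJSON above delegates per-object
# decoding to json.loads' object_hook. _decode_dict's implementation is not shown in this
# excerpt; the stand-in hook below converts unicode keys and values to plain byte strings,
# a typical use of object_hook under Python 2.
import json

def _ascii_dict(d):
    # hypothetical stand-in for _decode_dict
    out = {}
    for k, v in d.iteritems():
        if isinstance(k, unicode):
            k = k.encode('utf-8')
        if isinstance(v, unicode):
            v = v.encode('utf-8')
        out[k] = v
    return out

print(json.loads('{"name": "sigma", "value": 2.0}', object_hook=_ascii_dict))
# keys and string values come back as plain str rather than unicode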
def loadConf(self, dataPath, confFilename='conf.json'):
    """Returns a dict loaded from a json file.

    Looks for a file named `confFilename` in the same directory as `dataPath`.

    Returns {} if file not found
    """
    if not confFilename:
        return {}

    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    try:
        jsonBuf = reader.read(dataPath, filename=confFilename)
    except FileNotFoundError:
        return {}

    params = json.loads(jsonBuf)
    if 'format' in params:
        raise Exception("Numerical format of value should be specified as 'valuetype', not 'format'")
    if 'keyformat' in params:
        raise Exception("Numerical format of key should be specified as 'keytype', not 'keyformat'")
    return params
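# Illustrative sketch, not part of the original source: an example of the kind of conf.json
# loadConf expects. 'dims' and 'dtype' are the fields read by fromStack below; the error
# messages above imply numerical formats belong under 'keytype'/'valuetype' rather than
# 'format'/'keyformat'. Field names beyond those mentioned in this excerpt are assumptions.
import json

exampleConf = {
    "dims": [512, 512, 30],   # image dimensions, fastest-changing first
    "dtype": "int16",         # numpy dtype of the raw binary data
    "keytype": "int16",       # numerical format of series keys (not 'keyformat')
    "valuetype": "int16"      # numerical format of series values (not 'format')
}
print(json.dumps(exampleConf, indent=2))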
def loadParams(self, path):
    """
    Load a file with parameters from a local file system or S3.

    Assumes file is JSON with basic types (strings, integers, doubles, lists),
    in either a single dict or list of dict-likes, and each dict has at least
    a "name" field and a "value" field.

    Useful for loading generic meta data, parameters, covariates, etc.

    Parameters
    ----------
    path : str
        Path to file, can be on a local file system or an S3 bucket

    Returns
    -------
    A Params object wrapping the loaded parameters
    """
    import json
    from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError

    reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials)
    try:
        buffer = reader.read(path)
    except FileNotFoundError:
        raise Exception("Cannot find file %s" % path)
    return Params(json.loads(buffer))
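# Illustrative sketch, not part of the original source: the shape of file loadParams expects,
# per its docstring -- a single dict or a list of dicts, each with at least a "name" and a
# "value" field. The specific names and values below are made up for illustration.
import json

exampleParams = [
    {"name": "stimulusOnset", "value": [10, 40, 70]},
    {"name": "scanRate", "value": 2.5}
]
print(json.dumps(exampleParams, indent=2))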
def raiseErrorIfPathExists(path, awsCredentialsOverride=None):
    """Raises a ValueError if the passed path string is found to already exist.

    The ValueError message will suggest calling with overwrite=True; this function is expected to be
    called from the various output methods that accept an 'overwrite' keyword argument.
    """
    # check that specified output path does not already exist
    from thunder.rdds.fileio.readers import getFileReaderForPath
    reader = getFileReaderForPath(path)(awsCredentialsOverride=awsCredentialsOverride)
    existing = reader.list(path, includeDirectories=True)
    if existing:
        raise ValueError("Path %s appears to already exist. Specify a new directory, or call " % path +
                         "with overwrite=True to overwrite.")
def raiseErrorIfPathExists(path):
    """Raises a ValueError if the passed path string is found to already exist.

    The ValueError message will suggest calling with overwrite=True; this function is expected to be
    called from the various output methods that accept an 'overwrite' keyword argument.
    """
    # check that specified output path does not already exist
    from thunder.rdds.fileio.readers import getFileReaderForPath
    reader = getFileReaderForPath(path)()
    existing = reader.list(path, includeDirectories=True)
    if existing:
        raise ValueError("Path %s appears to already exist. Specify a new directory, or call " % path +
                         "with overwrite=True to overwrite.")
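# Illustrative sketch, not part of the original source: the calling pattern described in the
# docstring above. 'exportSomething' is a hypothetical output method; the only real dependency
# is raiseErrorIfPathExists as defined above.
def exportSomething(data, outputDirPath, overwrite=False):
    if not overwrite:
        raiseErrorIfPathExists(outputDirPath)  # raises ValueError if the path already exists
    # ... otherwise proceed to write data to outputDirPath ...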
def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M",
                                 newDtype='smallfloat', casting='safe',
                                 startIdx=None, stopIdx=None, recursive=False):
    import thunder.rdds.fileio.multitif as multitif
    import itertools
    from PIL import Image
    import io

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)
    blockSize = parseMemoryString(blockSize)

    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
    if not filenames:
        raise IOError("No files found for path '%s'" % dataPath)
    ntimepoints = len(filenames)

    doMinimizeReads = dataPath.lower().startswith("s3")
    # check PIL version to see whether it is actually pillow or indeed old PIL and choose
    # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array
    # for more explanation.
    isPillow = hasattr(Image, "PILLOW_VERSION")
    if isPillow:
        conversionFcn = array  # use numpy's array() function
    else:
        from thunder.utils.common import pil_to_array
        conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

    height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
    if dtype.startswith('int'):
        raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' +
                         ' please try loading as Images (shuffle=True)')
    pixelBytesize = dtypeFunc(dtype).itemsize
    if newDtype is None or str(newDtype) == '':
        newDtype = str(dtype)
    elif newDtype == 'smallfloat':
        newDtype = str(smallestFloatType(dtype))
    else:
        newDtype = str(newDtype)

    # initialize at one block per plane
    bytesPerPlane = height * width * pixelBytesize * ntimepoints
    bytesPerBlock = bytesPerPlane
    blocksPerPlane = 1
    # keep dividing while cutting our size in half still leaves us bigger than the requested size
    # should end up no more than 2x blockSize.
    while bytesPerBlock >= blockSize * 2:
        bytesPerBlock /= 2
        blocksPerPlane *= 2

    blocklenPixels = max((height * width) / blocksPerPlane, 1)  # integer division
    while blocksPerPlane * blocklenPixels < height * width:  # make sure we're reading the plane fully
        blocksPerPlane += 1

    # prevent bringing in self in closure:
    awsCredentialsOverride = self.awsCredentialsOverride

    # keys will be planeidx, blockidx:
    keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane)))

    def readBlockFromTiff(planeIdxBlockIdx):
        planeIdx, blockIdx = planeIdxBlockIdx
        blocks = []
        planeShape = None
        blockStart = None
        blockEnd = None
        for fname in filenames:
            reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
            fp = reader_.open(fname)
            try:
                if doMinimizeReads:
                    # use multitif module to generate a fake, in-memory one-page tif file
                    # the advantage of this is that it cuts way down on the many small reads
                    # that PIL/pillow will make otherwise, which would be a problem for s3
                    # or Google Storage
                    tiffParser_ = multitif.TiffParser(fp, debug=False)
                    tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                    byteBuf = io.BytesIO(tiffFilebuffer)
                    try:
                        pilImg = Image.open(byteBuf)
                        ary = conversionFcn(pilImg).T
                    finally:
                        byteBuf.close()
                    del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                else:
                    # read tif using PIL directly
                    pilImg = Image.open(fp)
                    pilImg.seek(planeIdx)
                    ary = conversionFcn(pilImg).T
                    del pilImg

                if not planeShape:
                    planeShape = ary.shape[:]
                    blockStart = blockIdx * blocklenPixels
                    blockEnd = min(blockStart + blocklenPixels, planeShape[0] * planeShape[1])
                blocks.append(ary.ravel(order='C')[blockStart:blockEnd])
                del ary
            finally:
                fp.close()

        buf = vstack(blocks).T  # dimensions are now linindex x time (images)
        del blocks
        buf = buf.astype(newDtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        linearIdx = arange(blockStart, blockEnd)  # zero-based
        seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
        # add plane index to end of keys
        if npages > 1:
            seriesKeys = [tuple(list(keys_)[::-1] + [planeIdx]) for keys_ in seriesKeys]
        else:
            seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
        return zip(seriesKeys, buf)

    # map over blocks
    rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff)

    if npages > 1:
        dims = (npages, width, height)
    else:
        dims = (width, height)

    metadata = (dims, ntimepoints, newDtype)
    return rdd, metadata
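# Illustrative sketch, not part of the original source: the block-sizing loop above halves the
# per-block byte count until it is under 2x the requested blockSize, so blocksPerPlane ends up
# a power of two and each block is no more than roughly twice blockSize (or the whole plane,
# if that is already small enough). Standalone numeric example of the same arithmetic:
def blocksForPlane(height, width, pixelBytes, ntimepoints, blockSizeBytes):
    bytesPerBlock = height * width * pixelBytes * ntimepoints
    blocksPerPlane = 1
    while bytesPerBlock >= blockSizeBytes * 2:
        bytesPerBlock /= 2
        blocksPerPlane *= 2
    return blocksPerPlane, bytesPerBlock

# e.g. a 512x512 uint16 plane across 1000 time points (~500MB) with a 150MB target
# splits into 2 blocks of ~250MB each:
print(blocksForPlane(512, 512, 2, 1000, 150 * 1024 ** 2))  # (2, 262144000)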
def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                              newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None,
                              recursive=False):
    """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)>

    Parameters
    ----------
    dataPath: string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with scheme
        (e.g. "file://", "s3n://"). If no scheme is present, will be interpreted as a path on the local
        filesystem. This path must be valid on all workers. Datafile may also refer to a single file, or to a
        range of files specified by a glob-style expression using a single wildcard character '*'.

    dims: tuple of positive int
        Dimensions of input image data, ordered with the fastest-changing dimension first.

    dtype: dtype or dtype specifier, optional, default 'int16'
        Numpy dtype of input stack data

    newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Series data must be floating-point. Input data will be cast to the
        requested `newdtype` - see numpy `astype()` method.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
        have an extension matching 'ext'. Recursive loading is currently only implemented for local
        filesystems (not s3).

    Returns
    -------
    tuple of (RDD, ntimepoints, newDtype)

    RDD: sequence of keys, values pairs (call using flatMap)
        RDD Key: tuple of int
            zero-based indices of position within original image volume
        RDD Value: numpy array of datatype
            series of values at position across loaded image volumes

    ntimepoints: int
        number of time points in returned series, determined from number of stack files found at dataPath

    newDtype: string
        string representation of numpy data type of returned blocks
    """
    dataPath = self.__normalizeDatafilePattern(dataPath, ext)
    blockSize = parseMemoryString(blockSize)
    totalDim = reduce(lambda x_, y_: x_*y_, dims)
    dtype = dtypeFunc(dtype)
    if newDtype is None or newDtype == '':
        newDtype = str(dtype)
    elif newDtype == 'smallfloat':
        newDtype = str(smallestFloatType(dtype))
    else:
        newDtype = str(newDtype)

    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
    if not filenames:
        raise IOError("No files found for path '%s'" % dataPath)

    dataSize = totalDim * len(filenames) * dtype.itemsize
    nblocks = max(dataSize / blockSize, 1)  # integer division

    if len(dims) >= 3:
        # for 3D stacks, do calculations to ensure that
        # different planes appear in distinct files
        blocksPerPlane = max(nblocks / dims[-1], 1)

        pixPerPlane = reduce(lambda x_, y_: x_*y_, dims[:-1])  # all but last dimension

        # get the greatest number of blocks in a plane (up to as many as requested) that still divide the plane
        # evenly. This will always be at least one.
        kUpdated = [x for x in range(1, blocksPerPlane+1) if not pixPerPlane % x][-1]
        nblocks = kUpdated * dims[-1]
        blockSizePerStack = (totalDim / nblocks) * dtype.itemsize
    else:
        # otherwise just round to make contents divide into nearly even blocks
        blockSizePerStack = int(math.ceil(totalDim / float(nblocks)))
        nblocks = int(math.ceil(totalDim / float(blockSizePerStack)))
        blockSizePerStack *= dtype.itemsize

    fileSize = totalDim * dtype.itemsize

    def readBlock(blockNum):
        # copy size out from closure; will modify later:
        blockSizePerStack_ = blockSizePerStack
        # get start position for this block
        position = blockNum * blockSizePerStack_

        # adjust if at end of file
        if (position + blockSizePerStack_) > fileSize:
            blockSizePerStack_ = int(fileSize - position)

        # loop over files, loading one block from each
        bufs = []
        for fname in filenames:
            buf = reader.read(fname, startOffset=position, size=blockSizePerStack_)
            bufs.append(frombuffer(buf, dtype=dtype))

        buf = vstack(bufs).T  # dimensions are now linindex x time (images)
        del bufs
        buf = buf.astype(newDtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        itemPosition = position / dtype.itemsize
        itemBlocksize = blockSizePerStack_ / dtype.itemsize
        linearIdx = arange(itemPosition, itemPosition + itemBlocksize)  # zero-based
        keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
        return zip(keys, buf)

    # map over blocks
    return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)),
            len(filenames), newDtype)
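# Illustrative sketch, not part of the original source: how readBlock turns linear item offsets
# into subscript keys. unravel_index with order='F' matches the Fortran-ordered binary stacks,
# so the fastest-changing dimension appears first in each key tuple.
from numpy import arange, unravel_index

dims = (4, 3)             # e.g. a tiny 4x3 image
linearIdx = arange(2, 6)  # a "block" covering linear positions 2..5
keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
print(keys)               # [(2, 0), (3, 0), (0, 1), (1, 1)]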
def fromStack(self, dataPath, dims=None, dtype=None, ext='stack', startIdx=None, stopIdx=None,
              recursive=False, nplanes=None, npartitions=None, confFilename='conf.json'):
    """Load an Images object stored in a directory of flat binary files

    The RDD wrapped by the returned Images object will have a number of partitions equal to the number of
    image data files read in by this method.

    Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in
    native byte order.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
        including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

    dims: tuple of positive int
        Dimensions of input image data, ordered with fastest-changing dimension first

    ext: string, optional, default "stack"
        Extension required on data files to be loaded.

    startIdx, stopIdx: nonnegative int, optional
        Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
        `dataPath` and `ext`. Interpreted according to python slice indexing conventions.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
        have an extension matching 'ext'. Recursive loading is currently only implemented for local
        filesystems (not s3).

    nplanes: positive integer, default None
        If passed, will cause a single binary stack file to be subdivided into multiple records. Every
        `nplanes` z-planes in the file will be taken as a new record, with the first nplane planes of the
        first file being record 0, the second nplane planes being record 1, etc, until the first file is
        exhausted and record ordering continues with the first nplane planes of the second file, and so on.
        With nplanes=None (the default), a single file will be considered as representing a single record.

    npartitions: positive int, optional
        If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
        partition per image file.
    """
    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    try:
        jsonBuf = reader.read(dataPath, filename=confFilename)
        params = json.loads(jsonBuf)
    except FileNotFoundError:
        params = {}

    if 'dtype' in params.keys():
        dtype = params['dtype']
    if 'dims' in params.keys():
        dims = params['dims']

    if not dims:
        raise ValueError("Image dimensions must be specified either as argument or in a conf.json file")

    if not dtype:
        dtype = 'int16'

    if nplanes is not None:
        if nplanes <= 0:
            raise ValueError("nplanes must be positive if passed, got %d" % nplanes)
        if dims[-1] % nplanes:
            raise ValueError("Last dimension of stack image '%d' must be divisible by nplanes '%d'" %
                             (dims[-1], nplanes))

    def toArray(idxAndBuf):
        idx, buf = idxAndBuf
        ary = frombuffer(buf, dtype=dtype, count=int(prod(dims))).reshape(dims, order='F')
        if nplanes is None:
            yield idx, ary
        else:
            # divide array into chunks of nplanes
            npoints = dims[-1] / nplanes  # integer division
            if dims[-1] % nplanes:
                npoints += 1
            timepoint = 0
            lastPlane = 0
            curPlane = 1
            while curPlane < ary.shape[-1]:
                if curPlane % nplanes == 0:
                    slices = [slice(None)] * (ary.ndim - 1) + [slice(lastPlane, curPlane)]
                    yield idx*npoints + timepoint, ary[slices]
                    timepoint += 1
                    lastPlane = curPlane
                curPlane += 1
            # yield remaining planes
            slices = [slice(None)] * (ary.ndim - 1) + [slice(lastPlane, ary.shape[-1])]
            yield idx*npoints + timepoint, ary[slices]

    reader = getParallelReaderForPath(dataPath)(self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
    readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                            npartitions=npartitions)
    nrecords = reader.lastNRecs if nplanes is None else None
    newDims = tuple(list(dims[:-1]) + [nplanes]) if nplanes else dims
    return Images(readerRdd.flatMap(toArray), nrecords=nrecords, dims=newDims, dtype=dtype)
def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M",
                                 newdtype='smallfloat', casting='safe', startidx=None, stopidx=None):
    import thunder.rdds.fileio.multitif as multitif
    import itertools
    from PIL import Image
    from thunder.utils.common import pil_to_array, smallest_float_type
    import io

    datapath = self.__normalizeDatafilePattern(datapath, ext)
    blockSize = parseMemoryString(blockSize)

    reader = getFileReaderForPath(datapath)()
    filenames = reader.list(datapath)
    if not filenames:
        raise IOError("No files found for path '%s'" % datapath)
    filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
    ntimepoints = len(filenames)

    minimize_reads = datapath.lower().startswith("s3")

    height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
    pixelbytesize = dtypefunc(datatype).itemsize
    if newdtype is None or str(newdtype) == '':
        newdtype = str(datatype)
    elif newdtype == 'smallfloat':
        newdtype = str(smallest_float_type(datatype))
    else:
        newdtype = str(newdtype)

    # initialize at one block per plane
    bytesperplane = height * width * pixelbytesize * ntimepoints
    bytesperblock = bytesperplane
    blocksperplane = 1
    # keep dividing while cutting our size in half still leaves us bigger than the requested size
    # should end up no more than 2x blockSize.
    while bytesperblock >= blockSize * 2:
        bytesperblock /= 2
        blocksperplane *= 2

    blocklenPixels = max((height * width) / blocksperplane, 1)  # integer division
    while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
        blocksperplane += 1

    # keys will be planeidx, blockidx:
    keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

    def readblockfromtif(pidxbidx_):
        planeidx, blockidx = pidxbidx_
        blocks = []
        planeshape = None
        blockstart = None
        blockend = None
        for fname in filenames:
            reader_ = getFileReaderForPath(fname)()
            fp = reader_.open(fname)
            try:
                if minimize_reads:
                    # use multitif module to generate a fake, in-memory one-page tif file
                    # the advantage of this is that it cuts way down on the many small reads
                    # that PIL/pillow will make otherwise, which would be a problem for s3
                    tiffparser_ = multitif.TiffParser(fp, debug=False)
                    tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx)
                    bytebuf = io.BytesIO(tiffilebuffer)
                    try:
                        pilimg = Image.open(bytebuf)
                        ary = pil_to_array(pilimg).T
                    finally:
                        bytebuf.close()
                    del tiffilebuffer, tiffparser_, pilimg, bytebuf
                else:
                    # read tif using PIL directly
                    pilimg = Image.open(fp)
                    pilimg.seek(planeidx)
                    ary = pil_to_array(pilimg).T
                    del pilimg

                if not planeshape:
                    planeshape = ary.shape[:]
                    blockstart = blockidx * blocklenPixels
                    blockend = min(blockstart + blocklenPixels, planeshape[0] * planeshape[1])
                blocks.append(ary.ravel(order='C')[blockstart:blockend])
                del ary
            finally:
                fp.close()

        buf = vstack(blocks).T  # dimensions are now linindex x time (images)
        del blocks
        buf = buf.astype(newdtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        linindx = arange(blockstart, blockend)  # zero-based
        serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C')))
        # add plane index to end of keys
        serieskeys = [tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys]
        return zip(serieskeys, buf)

    # map over blocks
    rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)

    dims = (npages, width, height)

    metadata = (dims, ntimepoints, newdtype)
    return rdd, metadata
def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M",
                                 newdtype='smallfloat', casting='safe', startidx=None, stopidx=None):
    import thunder.rdds.fileio.multitif as multitif
    import itertools
    from PIL import Image
    import io

    datapath = self.__normalizeDatafilePattern(datapath, ext)
    blockSize = parseMemoryString(blockSize)

    reader = getFileReaderForPath(datapath)()
    filenames = reader.list(datapath)
    if not filenames:
        raise IOError("No files found for path '%s'" % datapath)
    filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
    ntimepoints = len(filenames)

    minimize_reads = datapath.lower().startswith("s3")

    height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
    pixelbytesize = dtypefunc(datatype).itemsize
    if newdtype is None or str(newdtype) == '':
        newdtype = str(datatype)
    elif newdtype == 'smallfloat':
        newdtype = str(smallest_float_type(datatype))
    else:
        newdtype = str(newdtype)

    # initialize at one block per plane
    bytesperplane = height * width * pixelbytesize * ntimepoints
    bytesperblock = bytesperplane
    blocksperplane = 1
    # keep dividing while cutting our size in half still leaves us bigger than the requested size
    # should end up no more than 2x blockSize.
    while bytesperblock >= blockSize * 2:
        bytesperblock /= 2
        blocksperplane *= 2

    blocklenPixels = max((height * width) / blocksperplane, 1)  # integer division
    while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
        blocksperplane += 1

    # keys will be planeidx, blockidx:
    keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

    def readblockfromtif(pidxbidx_):
        planeidx, blockidx = pidxbidx_
        blocks = []
        planeshape = None
        blockstart = None
        blockend = None
        for fname in filenames:
            reader_ = getFileReaderForPath(fname)()
            fp = reader_.open(fname)
            try:
                if minimize_reads:
                    # use multitif module to generate a fake, in-memory one-page tif file
                    # the advantage of this is that it cuts way down on the many small reads
                    # that PIL/pillow will make otherwise, which would be a problem for s3
                    tiffparser_ = multitif.TiffParser(fp, debug=False)
                    tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx)
                    bytebuf = io.BytesIO(tiffilebuffer)
                    try:
                        pilimg = Image.open(bytebuf)
                        ary = array(pilimg).T
                    finally:
                        bytebuf.close()
                    del tiffilebuffer, tiffparser_, pilimg, bytebuf
                else:
                    # read tif using PIL directly
                    pilimg = Image.open(fp)
                    pilimg.seek(planeidx)
                    ary = array(pilimg).T
                    del pilimg

                if not planeshape:
                    planeshape = ary.shape[:]
                    blockstart = blockidx * blocklenPixels
                    blockend = min(blockstart + blocklenPixels, planeshape[0] * planeshape[1])
                blocks.append(ary.ravel(order='C')[blockstart:blockend])
                del ary
            finally:
                fp.close()

        buf = vstack(blocks).T  # dimensions are now linindex x time (images)
        del blocks
        buf = buf.astype(newdtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        linindx = arange(blockstart, blockend)  # zero-based
        serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C')))
        # add plane index to end of keys
        serieskeys = [tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys]
        return zip(serieskeys, buf)

    # map over blocks
    rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)

    dims = (npages, width, height)

    metadata = (dims, ntimepoints, newdtype)
    return rdd, metadata