def fromBinary(self, dataPath, ext='bin', confFilename='conf.json', nkeys=None, nvalues=None,
               keyType=None, valueType=None, newDtype='smallfloat', casting='safe',
               maxPartitionSize='32mb'):
    """
    Load a Series object from a directory of binary files.

    Parameters
    ----------
    dataPath : string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with
        scheme (e.g. "file://", "s3n://"). If no scheme is present, it will be interpreted as a
        path on the local filesystem. This path must be valid on all workers. The datafile may
        also refer to a single file, or to a range of files specified by a glob-style expression
        using a single wildcard character '*'.

    newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data to be floating-point.
        Input data will be cast to the requested `newDtype` if not None - see Data `astype()` method.

    casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    maxPartitionSize : str, optional, default = '32mb'
        Maximum size of partitions as a Java-style memory string (e.g. "32mb"); indirectly
        controls the number of partitions.
    """
    paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues,
                                                 keyType, valueType)
    self.__checkBinaryParametersAreSpecified(paramsObj)

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    keyDtype = dtypeFunc(paramsObj.keytype)
    valDtype = dtypeFunc(paramsObj.valuetype)

    keySize = paramsObj.nkeys * keyDtype.itemsize
    recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

    from thunder.utils.common import parseMemoryString
    if isinstance(maxPartitionSize, basestring):
        size = parseMemoryString(maxPartitionSize)
    else:
        raise ValueError("maxPartitionSize must be a string like '32mb'; got: %s"
                         % repr(maxPartitionSize))

    hadoopConf = {'recordLength': str(recordSize), 'mapred.max.split.size': str(size)}

    lines = self.sc.newAPIHadoopFile(dataPath, 'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                     'org.apache.hadoop.io.LongWritable',
                                     'org.apache.hadoop.io.BytesWritable',
                                     conf=hadoopConf)

    data = lines.map(lambda (_, v):
                     (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                      frombuffer(buffer(v, keySize), dtype=valDtype)))

    return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)
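# A minimal standalone sketch of the record parsing done in the map() above, assuming a
# hypothetical record of 2 int16 keys followed by 3 float32 values (as a conf.json with
# nkeys=2, nvalues=3, keytype='int16', valuetype='float32' would presumably describe).
# The keys are sliced off the front of each fixed-length record; the rest are values.
#
#     from numpy import array, frombuffer, dtype as dtypeFunc
#     keyDtype, valDtype = dtypeFunc('int16'), dtypeFunc('float32')
#     record = (array([7, 9], dtype=keyDtype).tostring() +
#               array([1.0, 2.0, 3.0], dtype=valDtype).tostring())
#     keySize = 2 * keyDtype.itemsize
#     key = tuple(int(x) for x in frombuffer(buffer(record, 0, keySize), dtype=keyDtype))
#     values = frombuffer(buffer(record, keySize), dtype=valDtype)
#     print(key)     # -> (7, 9)
#     print(values)  # -> [ 1.  2.  3.]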
def _calcSplitsForBlockSize(blockSize, elementSize, dims):
    import bisect
    from thunder.utils.common import parseMemoryString
    if isinstance(blockSize, basestring):
        blockSize = parseMemoryString(blockSize)

    memSeq = _BlockMemoryAsReversedSequence(_normDimsToShapeTuple(dims))
    tmpIdx = bisect.bisect_left(memSeq, blockSize / float(elementSize))
    if tmpIdx == len(memSeq):
        # handle case where requested block is bigger than the biggest image
        # we can produce; just give back the biggest block size
        tmpIdx -= 1
    return memSeq.indToSub(tmpIdx)
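# A minimal sketch of the bisect_left lookup above, using a plain ascending list in place
# of _BlockMemoryAsReversedSequence (whose entries, judging from the lookup, are candidate
# block sizes in elements, smallest first). bisect_left finds the first entry that is at
# least as large as the requested size in elements. The list values here are hypothetical.
#
#     import bisect
#     blockSizesInElements = [100, 200, 400, 800]   # ascending candidate block sizes
#     target = 500 / 4.0                            # 500 bytes of 4-byte elements = 125.0
#     idx = bisect.bisect_left(blockSizesInElements, target)
#     print("%d -> %d" % (idx, blockSizesInElements[idx]))  # -> "1 -> 200"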
def _scatterToBlocks(self, blockSize="150M", blocksPerDim=None, groupingDim=None): if not groupingDim is None: # get series from blocks defined by pivoting: gd = self.__validateOrCalcGroupingDim(groupingDim=groupingDim) blocksdata = self._toBlocksByImagePlanes(groupingDim=gd) else: # get series from blocks defined by splits if not blocksPerDim: # get splits from requested block size blockSize = parseMemoryString(blockSize) blocksPerDim = self.__calcBlocksPerDim(blockSize) blocksdata = self._toBlocksBySplits(blocksPerDim) return blocksdata
def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M", newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False): import thunder.rdds.fileio.multitif as multitif import itertools from PIL import Image import io dataPath = self.__normalizeDatafilePattern(dataPath, ext) blockSize = parseMemoryString(blockSize) reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride) filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) if not filenames: raise IOError("No files found for path '%s'" % dataPath) ntimepoints = len(filenames) doMinimizeReads = dataPath.lower().startswith("s3") # check PIL version to see whether it is actually pillow or indeed old PIL and choose # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array # for more explanation. isPillow = hasattr(Image, "PILLOW_VERSION") if isPillow: conversionFcn = array # use numpy's array() function else: from thunder.utils.common import pil_to_array conversionFcn = pil_to_array # use our modified version of matplotlib's pil_to_array height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0]) if dtype.startswith('int'): raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' + ' please try loading as Images (shuffle=True)') pixelBytesize = dtypeFunc(dtype).itemsize if newDtype is None or str(newDtype) == '': newDtype = str(dtype) elif newDtype == 'smallfloat': newDtype = str(smallestFloatType(dtype)) else: newDtype = str(newDtype) # intialize at one block per plane bytesPerPlane = height * width * pixelBytesize * ntimepoints bytesPerBlock = bytesPerPlane blocksPerPlane = 1 # keep dividing while cutting our size in half still leaves us bigger than the requested size # should end up no more than 2x blockSize. 
while bytesPerBlock >= blockSize * 2: bytesPerBlock /= 2 blocksPerPlane *= 2 blocklenPixels = max((height * width) / blocksPerPlane, 1) # integer division while blocksPerPlane * blocklenPixels < height * width: # make sure we're reading the plane fully blocksPerPlane += 1 # prevent bringing in self in closure: awsCredentialsOverride = self.awsCredentialsOverride # keys will be planeidx, blockidx: keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane))) def readBlockFromTiff(planeIdxBlockIdx): planeIdx, blockIdx = planeIdxBlockIdx blocks = [] planeShape = None blockStart = None blockEnd = None for fname in filenames: reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride) fp = reader_.open(fname) try: if doMinimizeReads: # use multitif module to generate a fake, in-memory one-page tif file # the advantage of this is that it cuts way down on the many small reads # that PIL/pillow will make otherwise, which would be a problem for s3 tiffParser_ = multitif.TiffParser(fp, debug=False) tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx) byteBuf = io.BytesIO(tiffFilebuffer) try: pilImg = Image.open(byteBuf) ary = conversionFcn(pilImg).T finally: byteBuf.close() del tiffFilebuffer, tiffParser_, pilImg, byteBuf else: # read tif using PIL directly pilImg = Image.open(fp) pilImg.seek(planeIdx) ary = conversionFcn(pilImg).T del pilImg if not planeShape: planeShape = ary.shape[:] blockStart = blockIdx * blocklenPixels blockEnd = min(blockStart+blocklenPixels, planeShape[0]*planeShape[1]) blocks.append(ary.ravel(order='C')[blockStart:blockEnd]) del ary finally: fp.close() buf = vstack(blocks).T # dimensions are now linindex x time (images) del blocks buf = buf.astype(newDtype, casting=casting, copy=False) # append subscript keys based on dimensions linearIdx = arange(blockStart, blockEnd) # zero-based seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C'))) # add plane index to end of keys if npages > 1: seriesKeys = [tuple(list(keys_)[::-1]+[planeIdx]) for keys_ in seriesKeys] else: seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys] return zip(seriesKeys, buf) # map over blocks rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff) if npages > 1: dims = (npages, width, height) else: dims = (width, height) metadata = (dims, ntimepoints, newDtype) return rdd, metadata
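# A standalone sketch of the block-sizing arithmetic above, with hypothetical numbers:
# a 1024x1024 plane of float32 across 400 timepoints against a 150M target. Halving
# stops once a block is under 2x the target, so blocks land between 1x and 2x blockSize.
#
#     height, width, pixelBytesize, ntimepoints = 1024, 1024, 4, 400
#     blockSize = 150 * 1024 * 1024                  # 150M target
#     bytesPerBlock = height * width * pixelBytesize * ntimepoints   # ~1.6 GB per plane
#     blocksPerPlane = 1
#     while bytesPerBlock >= blockSize * 2:          # halve until under 2x the target
#         bytesPerBlock /= 2
#         blocksPerPlane *= 2
#     blocklenPixels = max((height * width) / blocksPerPlane, 1)     # integer division
#     print("%d blocks of %d pixels" % (blocksPerPlane, blocklenPixels))
#     # -> "8 blocks of 131072 pixels" (each ~200 MB across all 400 timepoints)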
def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16', newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False): """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)> Parameters ---------- dataPath: string URI or local filesystem path Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://", "s3n://". If no scheme is present, will be interpreted as a path on the local filesystem. This path must be valid on all workers. Datafile may also refer to a single file, or to a range of files specified by a glob-style expression using a single wildcard character '*'. dims: tuple of positive int Dimensions of input image data, ordered with the fastest-changing dimension first. dtype: dtype or dtype specifier, optional, default 'int16' Numpy dtype of input stack data newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat' Numpy dtype of output series data. Series data must be floating-point. Input data will be cast to the requested `newdtype` - see numpy `astype()` method. casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe' Casting method to pass on to numpy's `astype()` method; see numpy documentation for details. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems (not s3). Returns --------- pair of (RDD, ntimepoints) RDD: sequence of keys, values pairs (call using flatMap) RDD Key: tuple of int zero-based indicies of position within original image volume RDD Value: numpy array of datatype series of values at position across loaded image volumes ntimepoints: int number of time points in returned series, determined from number of stack files found at dataPath newDtype: string string representation of numpy data type of returned blocks """ dataPath = self.__normalizeDatafilePattern(dataPath, ext) blockSize = parseMemoryString(blockSize) totalDim = reduce(lambda x_, y_: x_*y_, dims) dtype = dtypeFunc(dtype) if newDtype is None or newDtype == '': newDtype = str(dtype) elif newDtype == 'smallfloat': newDtype = str(smallestFloatType(dtype)) else: newDtype = str(newDtype) reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride) filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) if not filenames: raise IOError("No files found for path '%s'" % dataPath) dataSize = totalDim * len(filenames) * dtype.itemsize nblocks = max(dataSize / blockSize, 1) # integer division if len(dims) >= 3: # for 3D stacks, do calculations to ensure that # different planes appear in distinct files blocksPerPlane = max(nblocks / dims[-1], 1) pixPerPlane = reduce(lambda x_, y_: x_*y_, dims[:-1]) # all but last dimension # get the greatest number of blocks in a plane (up to as many as requested) that still divide the plane # evenly. This will always be at least one. 
kUpdated = [x for x in range(1, blocksPerPlane+1) if not pixPerPlane % x][-1] nblocks = kUpdated * dims[-1] blockSizePerStack = (totalDim / nblocks) * dtype.itemsize else: # otherwise just round to make contents divide into nearly even blocks blockSizePerStack = int(math.ceil(totalDim / float(nblocks))) nblocks = int(math.ceil(totalDim / float(blockSizePerStack))) blockSizePerStack *= dtype.itemsize fileSize = totalDim * dtype.itemsize def readBlock(blockNum): # copy size out from closure; will modify later: blockSizePerStack_ = blockSizePerStack # get start position for this block position = blockNum * blockSizePerStack_ # adjust if at end of file if (position + blockSizePerStack_) > fileSize: blockSizePerStack_ = int(fileSize - position) # loop over files, loading one block from each bufs = [] for fname in filenames: buf = reader.read(fname, startOffset=position, size=blockSizePerStack_) bufs.append(frombuffer(buf, dtype=dtype)) buf = vstack(bufs).T # dimensions are now linindex x time (images) del bufs buf = buf.astype(newDtype, casting=casting, copy=False) # append subscript keys based on dimensions itemPosition = position / dtype.itemsize itemBlocksize = blockSizePerStack_ / dtype.itemsize linearIdx = arange(itemPosition, itemPosition + itemBlocksize) # zero-based keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F'))) return zip(keys, buf) # map over blocks return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)), len(filenames), newDtype)
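# A minimal sketch of the key generation in readBlock above: unravel_index with order='F'
# maps linear offsets within the stack back to subscripts with the fastest-changing
# dimension first, matching the on-disk layout that `dims` describes. The dims here are
# hypothetical.
#
#     from numpy import arange, unravel_index
#     dims = (4, 3, 2)               # x, y, z with x fastest-changing
#     linearIdx = arange(0, 5)
#     keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
#     print(keys)  # -> [(0, 0, 0), (1, 0, 0), (2, 0, 0), (3, 0, 0), (0, 1, 0)]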
def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M", newdtype='smallfloat', casting='safe', startidx=None, stopidx=None): import thunder.rdds.fileio.multitif as multitif import itertools from PIL import Image from thunder.utils.common import pil_to_array, smallest_float_type import io datapath = self.__normalizeDatafilePattern(datapath, ext) blockSize = parseMemoryString(blockSize) reader = getFileReaderForPath(datapath)() filenames = reader.list(datapath) if not filenames: raise IOError("No files found for path '%s'" % datapath) filenames = selectByStartAndStopIndices(filenames, startidx, stopidx) ntimepoints = len(filenames) minimize_reads = datapath.lower().startswith("s3") height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif( reader, filenames[0]) pixelbytesize = dtypefunc(datatype).itemsize if newdtype is None or str(newdtype) == '': newdtype = str(datatype) elif newdtype == 'smallfloat': newdtype = str(smallest_float_type(datatype)) else: newdtype = str(newdtype) # intialize at one block per plane bytesperplane = height * width * pixelbytesize * ntimepoints bytesperblock = bytesperplane blocksperplane = 1 # keep dividing while cutting our size in half still leaves us bigger than the requested size # should end up no more than 2x blockSize. while bytesperblock >= blockSize * 2: bytesperblock /= 2 blocksperplane *= 2 blocklenPixels = max((height * width) / blocksperplane, 1) # integer division while blocksperplane * blocklenPixels < height * width: # make sure we're reading the plane fully blocksperplane += 1 # keys will be planeidx, blockidx: keys = list(itertools.product(xrange(npages), xrange(blocksperplane))) def readblockfromtif(pidxbidx_): planeidx, blockidx = pidxbidx_ blocks = [] planeshape = None blockstart = None blockend = None for fname in filenames: reader_ = getFileReaderForPath(fname)() fp = reader_.open(fname) try: if minimize_reads: # use multitif module to generate a fake, in-memory one-page tif file # the advantage of this is that it cuts way down on the many small reads # that PIL/pillow will make otherwise, which would be a problem for s3 tiffparser_ = multitif.TiffParser(fp, debug=False) tiffilebuffer = multitif.packSinglePage( tiffparser_, page_idx=planeidx) bytebuf = io.BytesIO(tiffilebuffer) try: pilimg = Image.open(bytebuf) ary = pil_to_array(pilimg).T finally: bytebuf.close() del tiffilebuffer, tiffparser_, pilimg, bytebuf else: # read tif using PIL directly pilimg = Image.open(fp) pilimg.seek(planeidx) ary = pil_to_array(pilimg).T del pilimg if not planeshape: planeshape = ary.shape[:] blockstart = blockidx * blocklenPixels blockend = min(blockstart + blocklenPixels, planeshape[0] * planeshape[1]) blocks.append(ary.ravel(order='C')[blockstart:blockend]) del ary finally: fp.close() buf = vstack( blocks).T # dimensions are now linindex x time (images) del blocks buf = buf.astype(newdtype, casting=casting, copy=False) # append subscript keys based on dimensions linindx = arange(blockstart, blockend) # zero-based serieskeys = zip( *map(tuple, unravel_index(linindx, planeshape, order='C'))) # add plane index to end of keys serieskeys = [ tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys ] return zip(serieskeys, buf) # map over blocks rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif) dims = (npages, width, height) metadata = (dims, ntimepoints, newdtype) return rdd, metadata
def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M", newdtype='smallfloat', casting='safe', startidx=None, stopidx=None): import thunder.rdds.fileio.multitif as multitif import itertools from PIL import Image import io datapath = self.__normalizeDatafilePattern(datapath, ext) blockSize = parseMemoryString(blockSize) reader = getFileReaderForPath(datapath)() filenames = reader.list(datapath) if not filenames: raise IOError("No files found for path '%s'" % datapath) filenames = selectByStartAndStopIndices(filenames, startidx, stopidx) ntimepoints = len(filenames) minimize_reads = datapath.lower().startswith("s3") height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0]) pixelbytesize = dtypefunc(datatype).itemsize if newdtype is None or str(newdtype) == '': newdtype = str(datatype) elif newdtype == 'smallfloat': newdtype = str(smallest_float_type(datatype)) else: newdtype = str(newdtype) # intialize at one block per plane bytesperplane = height * width * pixelbytesize * ntimepoints bytesperblock = bytesperplane blocksperplane = 1 # keep dividing while cutting our size in half still leaves us bigger than the requested size # should end up no more than 2x blockSize. while bytesperblock >= blockSize * 2: bytesperblock /= 2 blocksperplane *= 2 blocklenPixels = max((height * width) / blocksperplane, 1) # integer division while blocksperplane * blocklenPixels < height * width: # make sure we're reading the plane fully blocksperplane += 1 # keys will be planeidx, blockidx: keys = list(itertools.product(xrange(npages), xrange(blocksperplane))) def readblockfromtif(pidxbidx_): planeidx, blockidx = pidxbidx_ blocks = [] planeshape = None blockstart = None blockend = None for fname in filenames: reader_ = getFileReaderForPath(fname)() fp = reader_.open(fname) try: if minimize_reads: # use multitif module to generate a fake, in-memory one-page tif file # the advantage of this is that it cuts way down on the many small reads # that PIL/pillow will make otherwise, which would be a problem for s3 tiffparser_ = multitif.TiffParser(fp, debug=False) tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx) bytebuf = io.BytesIO(tiffilebuffer) try: pilimg = Image.open(bytebuf) ary = array(pilimg).T finally: bytebuf.close() del tiffilebuffer, tiffparser_, pilimg, bytebuf else: # read tif using PIL directly pilimg = Image.open(fp) pilimg.seek(planeidx) ary = array(pilimg).T del pilimg if not planeshape: planeshape = ary.shape[:] blockstart = blockidx * blocklenPixels blockend = min(blockstart+blocklenPixels, planeshape[0]*planeshape[1]) blocks.append(ary.ravel(order='C')[blockstart:blockend]) del ary finally: fp.close() buf = vstack(blocks).T # dimensions are now linindex x time (images) del blocks buf = buf.astype(newdtype, casting=casting, copy=False) # append subscript keys based on dimensions linindx = arange(blockstart, blockend) # zero-based serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C'))) # add plane index to end of keys serieskeys = [tuple(list(keys_)[::-1]+[planeidx]) for keys_ in serieskeys] return zip(serieskeys, buf) # map over blocks rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif) dims = (npages, width, height) metadata = (dims, ntimepoints, newdtype) return rdd, metadata