Example #1
    def fromBinary(self, dataPath, ext='bin', confFilename='conf.json',
                   nkeys=None, nvalues=None, keyType=None, valueType=None,
                   newDtype='smallfloat', casting='safe', maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://" or
            "s3n://"). If no scheme is present, the path will be interpreted as a path on the local filesystem and
            must be valid on all workers. dataPath may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newDtype` if it is not None; see the Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions, given as a Java-style memory string (e.g. '32mb'); indirectly controls the number of partitions.

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        from thunder.utils.common import parseMemoryString
        if isinstance(maxPartitionSize, basestring):
            size = parseMemoryString(maxPartitionSize)
        else:
            raise Exception("Invalid size specification")
        hadoopConf = {'recordLength': str(recordSize), 'mapred.max.split.size': str(size)}

        lines = self.sc.newAPIHadoopFile(dataPath, 'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                         'org.apache.hadoop.io.LongWritable',
                                         'org.apache.hadoop.io.BytesWritable',
                                         conf=hadoopConf)

        data = lines.map(lambda (_, v):
                         (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                          frombuffer(buffer(v, keySize), dtype=valDtype)))

        return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)
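
A note on the helper these examples revolve around: every size argument on this page ("32mb", "150M", ...) is funnelled through `parseMemoryString` from `thunder.utils.common`, whose implementation is not shown here. The sketch below is only an assumption about the behaviour the callers above rely on (a Java-style memory string converted to a byte count); the name `parse_memory_string_sketch` is hypothetical and is not Thunder's API.

def parse_memory_string_sketch(memStr):
    """Convert a Java-style memory string (e.g. '32mb', '150M', '64k') to bytes."""
    s = memStr.strip().lower()
    multipliers = {'k': 1024, 'm': 1024 ** 2, 'g': 1024 ** 3, 't': 1024 ** 4}
    if s.endswith('b'):
        s = s[:-1]                      # '32mb' -> '32m'
    if s and s[-1] in multipliers:
        return int(float(s[:-1]) * multipliers[s[-1]])
    return int(s)                       # plain number of bytes

print(parse_memory_string_sketch('32mb'))    # 33554432
print(parse_memory_string_sketch('150M'))    # 157286400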
Example #2
def _calcSplitsForBlockSize(blockSize, elementSize, dims):
    from thunder.utils.common import parseMemoryString
    import bisect
    if isinstance(blockSize, basestring):
        blockSize = parseMemoryString(blockSize)

    memSeq = _BlockMemoryAsReversedSequence(_normDimsToShapeTuple(dims))
    tmpIdx = bisect.bisect_left(memSeq, blockSize / float(elementSize))
    if tmpIdx == len(memSeq):
        # handle case where requested block is bigger than the biggest image
        # we can produce; just give back the biggest block size
        tmpIdx -= 1
    return memSeq.indToSub(tmpIdx)
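
`_calcSplitsForBlockSize` searches a precomputed sequence of candidate block sizes with `bisect.bisect_left`. The `_BlockMemoryAsReversedSequence` class is not shown on this page, so the snippet below reproduces only the search pattern with a plain sorted list standing in for it; the list contents and variable names are invented for illustration.

import bisect

# Stand-in for _BlockMemoryAsReversedSequence: ascending candidate block
# sizes, expressed as elements per block (numbers invented for illustration).
candidate_block_elements = [1000, 4000, 16000, 64000, 256000]

blockSize = 600000      # requested block size in bytes (illustrative)
elementSize = 8         # bytes per element, e.g. float64

# bisect_left finds the first candidate >= the element budget, mirroring
# the lookup in _calcSplitsForBlockSize above.
budget = blockSize / float(elementSize)     # 75000.0 elements
idx = bisect.bisect_left(candidate_block_elements, budget)
if idx == len(candidate_block_elements):
    # requested block is bigger than the biggest candidate; clamp to the last one
    idx -= 1

print("index %d -> %d elements per block" % (idx, candidate_block_elements[idx]))
# index 4 -> 256000 elements per block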
Example #4
    def _scatterToBlocks(self, blockSize="150M", blocksPerDim=None, groupingDim=None):
        if groupingDim is not None:
            # get series from blocks defined by pivoting:
            gd = self.__validateOrCalcGroupingDim(groupingDim=groupingDim)
            blocksdata = self._toBlocksByImagePlanes(groupingDim=gd)

        else:
            # get series from blocks defined by splits
            if not blocksPerDim:
                # get splits from requested block size
                blockSize = parseMemoryString(blockSize)
                blocksPerDim = self.__calcBlocksPerDim(blockSize)
            blocksdata = self._toBlocksBySplits(blocksPerDim)

        return blocksdata
Example #5
    def _scatterToBlocks(self,
                         blockSize="150M",
                         blocksPerDim=None,
                         groupingDim=None):
        if groupingDim is not None:
            # get series from blocks defined by pivoting:
            gd = self.__validateOrCalcGroupingDim(groupingDim=groupingDim)
            blocksdata = self._toBlocksByImagePlanes(groupingDim=gd)

        else:
            # get series from blocks defined by splits
            if not blocksPerDim:
                # get splits from requested block size
                blockSize = parseMemoryString(blockSize)
                blocksPerDim = self.__calcBlocksPerDim(blockSize)
            blocksdata = self._toBlocksBySplits(blocksPerDim)

        return blocksdata
    def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M",
                                     newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None,
                                     recursive=False):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        import io

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)
        ntimepoints = len(filenames)

        doMinimizeReads = dataPath.lower().startswith("s3")
        # check PIL version to see whether it is actually pillow or indeed old PIL and choose
        # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array
        # for more explanation.
        isPillow = hasattr(Image, "PILLOW_VERSION")
        if isPillow:
            conversionFcn = array  # use numpy's array() function
        else:
            from thunder.utils.common import pil_to_array
            conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

        height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
        if dtype.startswith('int'):
            raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' +
                             ' please try loading as Images (shuffle=True)')
        pixelBytesize = dtypeFunc(dtype).itemsize
        if newDtype is None or str(newDtype) == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        # initialize at one block per plane
        bytesPerPlane = height * width * pixelBytesize * ntimepoints
        bytesPerBlock = bytesPerPlane
        blocksPerPlane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesPerBlock >= blockSize * 2:
            bytesPerBlock /= 2
            blocksPerPlane *= 2
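        # Worked example (illustrative numbers only): with bytesPerPlane = 1000 MB
        # and the default blockSize of 150 MB, the loop halves 1000 -> 500 -> 250 MB
        # and stops because 250 MB < 2 * 150 MB, leaving blocksPerPlane = 4 and
        # blocks of 250 MB, i.e. under twice the requested size.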

        blocklenPixels = max((height * width) / blocksPerPlane, 1)  # integer division
        while blocksPerPlane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksPerPlane += 1

        # prevent bringing in self in closure:
        awsCredentialsOverride = self.awsCredentialsOverride

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane)))

        def readBlockFromTiff(planeIdxBlockIdx):
            planeIdx, blockIdx = planeIdxBlockIdx
            blocks = []
            planeShape = None
            blockStart = None
            blockEnd = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
                fp = reader_.open(fname)
                try:
                    if doMinimizeReads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffParser_ = multitif.TiffParser(fp, debug=False)
                        tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                        byteBuf = io.BytesIO(tiffFilebuffer)
                        try:
                            pilImg = Image.open(byteBuf)
                            ary = conversionFcn(pilImg).T
                        finally:
                            byteBuf.close()
                        del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                    else:
                        # read tif using PIL directly
                        pilImg = Image.open(fp)
                        pilImg.seek(planeIdx)
                        ary = conversionFcn(pilImg).T
                        del pilImg

                    if not planeShape:
                        planeShape = ary.shape[:]
                        blockStart = blockIdx * blocklenPixels
                        blockEnd = min(blockStart+blocklenPixels, planeShape[0]*planeShape[1])
                    blocks.append(ary.ravel(order='C')[blockStart:blockEnd])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linearIdx = arange(blockStart, blockEnd)  # zero-based

            seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
            # add plane index to end of keys
            if npages > 1:
                seriesKeys = [tuple(list(keys_)[::-1]+[planeIdx]) for keys_ in seriesKeys]
            else:
                seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
            return zip(seriesKeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff)
        if npages > 1:
            dims = (npages, width, height)
        else:
            dims = (width, height)

        metadata = (dims, ntimepoints, newDtype)
        return rdd, metadata
    def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                                  newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False):
        """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)>

        Parameters
        ----------

        dataPath: string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://" or
            "s3n://"). If no scheme is present, the path will be interpreted as a path on the local filesystem and
            must be valid on all workers. dataPath may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        dims: tuple of positive int
            Dimensions of input image data, ordered with the fastest-changing dimension first.

        dtype: dtype or dtype specifier, optional, default 'int16'
            Numpy dtype of input stack data

        newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Series data must be floating-point. Input data will be cast to the
            requested `newDtype`; see the numpy `astype()` method.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        Returns
        -------
        tuple of (RDD, ntimepoints, newDtype)

        RDD: sequence of (key, value) pairs
            (call using flatMap)

        RDD Key: tuple of int
            zero-based indices of position within original image volume

        RDD Value: numpy array of datatype
            series of values at position across loaded image volumes

        ntimepoints: int
            number of time points in returned series, determined from number of stack files found at dataPath

        newDtype: string
            string representation of numpy data type of returned blocks

        """
        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)
        totalDim = reduce(lambda x_, y_: x_*y_, dims)
        dtype = dtypeFunc(dtype)
        if newDtype is None or newDtype == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)

        dataSize = totalDim * len(filenames) * dtype.itemsize
        nblocks = max(dataSize / blockSize, 1)  # integer division

        if len(dims) >= 3:
            # for 3D stacks, do calculations to ensure that
            # different planes appear in distinct files
            blocksPerPlane = max(nblocks / dims[-1], 1)

            pixPerPlane = reduce(lambda x_, y_: x_*y_, dims[:-1])  # all but last dimension

            # get the greatest number of blocks in a plane (up to as many as requested) that still divide the plane
            # evenly. This will always be at least one.
            kUpdated = [x for x in range(1, blocksPerPlane+1) if not pixPerPlane % x][-1]
            nblocks = kUpdated * dims[-1]
            blockSizePerStack = (totalDim / nblocks) * dtype.itemsize
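            # Worked example (illustrative numbers only): for dims = (512, 512, 30),
            # pixPerPlane = 512 * 512 = 262144. If the size-based estimate gives
            # blocksPerPlane = 7, the largest x in 1..7 that divides 262144 evenly
            # is 4 (262144 is a power of two), so kUpdated = 4 and
            # nblocks = 4 * 30 = 120, keeping every block within a single plane.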
        else:
            # otherwise just round to make contents divide into nearly even blocks
            blockSizePerStack = int(math.ceil(totalDim / float(nblocks)))
            nblocks = int(math.ceil(totalDim / float(blockSizePerStack)))
            blockSizePerStack *= dtype.itemsize

        fileSize = totalDim * dtype.itemsize

        def readBlock(blockNum):
            # copy size out from closure; will modify later:
            blockSizePerStack_ = blockSizePerStack
            # get start position for this block
            position = blockNum * blockSizePerStack_

            # adjust if at end of file
            if (position + blockSizePerStack_) > fileSize:
                blockSizePerStack_ = int(fileSize - position)
            # loop over files, loading one block from each
            bufs = []

            for fname in filenames:
                buf = reader.read(fname, startOffset=position, size=blockSizePerStack_)
                bufs.append(frombuffer(buf, dtype=dtype))

            buf = vstack(bufs).T  # dimensions are now linindex x time (images)
            del bufs
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            itemPosition = position / dtype.itemsize
            itemBlocksize = blockSizePerStack_ / dtype.itemsize
            linearIdx = arange(itemPosition, itemPosition + itemBlocksize)  # zero-based

            keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
            return zip(keys, buf)

        # map over blocks
        return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)),
                len(filenames), newDtype)
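
Both the stack and tif loaders turn flat pixel offsets into subscript keys with numpy's `unravel_index`. The standalone check below uses toy dimensions chosen purely for illustration; it mirrors the key construction in `_getSeriesBlocksFromStack`, where order='F' makes the first (fastest-changing) dimension vary first.

from numpy import arange, unravel_index

dims = (4, 3, 2)           # toy volume, fastest-changing dimension first
linearIdx = arange(0, 5)   # first five linear positions of a block, zero-based

keys = [tuple(int(i) for i in k)
        for k in zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))]
print(keys)   # [(0, 0, 0), (1, 0, 0), (2, 0, 0), (3, 0, 0), (0, 1, 0)]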
Example #8
    def fromBinary(self,
                   dataPath,
                   ext='bin',
                   confFilename='conf.json',
                   nkeys=None,
                   nvalues=None,
                   keyType=None,
                   valueType=None,
                   newDtype='smallfloat',
                   casting='safe',
                   maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://" or
            "s3n://"). If no scheme is present, the path will be interpreted as a path on the local filesystem and
            must be valid on all workers. dataPath may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newDtype` if it is not None; see the Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions, given as a Java-style memory string (e.g. '32mb'); indirectly controls the number of partitions.

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename,
                                                     nkeys, nvalues, keyType,
                                                     valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        from thunder.utils.common import parseMemoryString
        if isinstance(maxPartitionSize, basestring):
            size = parseMemoryString(maxPartitionSize)
        else:
            raise Exception("Invalid size specification")
        hadoopConf = {
            'recordLength': str(recordSize),
            'mapred.max.split.size': str(size)
        }

        lines = self.sc.newAPIHadoopFile(
            dataPath,
            'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.BytesWritable',
            conf=hadoopConf)

        data = lines.map(lambda (_, v): (tuple(
            int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)
        ), frombuffer(buffer(v, keySize), dtype=valDtype)))

        return Series(data,
                      dtype=str(valDtype),
                      index=arange(paramsObj.nvalues)).astype(
                          newDtype, casting)
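
The lambda in `fromBinary` splits each fixed-length record into `nkeys` key values followed by `nvalues` data values. The snippet below illustrates that decoding on a single record built in memory; the key/value dtypes and numbers are made up, and bytes slicing stands in for the Python 2 `buffer()` calls used above. No Spark or Hadoop is involved.

from numpy import array, frombuffer, dtype as dtypeFunc

# Hypothetical conf: 2 int16 keys followed by 3 float32 values per record.
keyDtype, valDtype = dtypeFunc('int16'), dtypeFunc('float32')
nkeys, nvalues = 2, 3
keySize = nkeys * keyDtype.itemsize                   # 4 bytes
recordSize = keySize + nvalues * valDtype.itemsize    # 16 bytes

# Lay out one record the way a writer would put it on disk.
record = (array([7, 9], dtype=keyDtype).tobytes() +
          array([1.0, 2.0, 3.0], dtype=valDtype).tobytes())
assert len(record) == recordSize

# Decode it the same way the map lambda does.
key = tuple(int(x) for x in frombuffer(record[:keySize], dtype=keyDtype))
values = frombuffer(record[keySize:], dtype=valDtype)
print(key)      # (7, 9)
print(values)   # [1. 2. 3.]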
Example #9
    def _getSeriesBlocksFromMultiTif(self,
                                     datapath,
                                     ext="tif",
                                     blockSize="150M",
                                     newdtype='smallfloat',
                                     casting='safe',
                                     startidx=None,
                                     stopidx=None):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        from thunder.utils.common import pil_to_array, smallest_float_type
        import io

        datapath = self.__normalizeDatafilePattern(datapath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(datapath)()
        filenames = reader.list(datapath)
        if not filenames:
            raise IOError("No files found for path '%s'" % datapath)
        filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
        ntimepoints = len(filenames)

        minimize_reads = datapath.lower().startswith("s3")

        height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(
            reader, filenames[0])
        pixelbytesize = dtypefunc(datatype).itemsize
        if newdtype is None or str(newdtype) == '':
            newdtype = str(datatype)
        elif newdtype == 'smallfloat':
            newdtype = str(smallest_float_type(datatype))
        else:
            newdtype = str(newdtype)

        # initialize at one block per plane
        bytesperplane = height * width * pixelbytesize * ntimepoints
        bytesperblock = bytesperplane
        blocksperplane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesperblock >= blockSize * 2:
            bytesperblock /= 2
            blocksperplane *= 2

        blocklenPixels = max((height * width) / blocksperplane,
                             1)  # integer division
        while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksperplane += 1

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(
                            tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = pil_to_array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = pil_to_array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart + blocklenPixels,
                                       planeshape[0] * planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(
                blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(
                *map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [
                tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys
            ]
            return zip(serieskeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)
        dims = (npages, width, height)

        metadata = (dims, ntimepoints, newdtype)
        return rdd, metadata
Example #10
    def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M",
                                     newdtype='smallfloat', casting='safe', startidx=None, stopidx=None):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        import io

        datapath = self.__normalizeDatafilePattern(datapath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(datapath)()
        filenames = reader.list(datapath)
        if not filenames:
            raise IOError("No files found for path '%s'" % datapath)
        filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
        ntimepoints = len(filenames)

        minimize_reads = datapath.lower().startswith("s3")

        height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
        pixelbytesize = dtypefunc(datatype).itemsize
        if newdtype is None or str(newdtype) == '':
            newdtype = str(datatype)
        elif newdtype == 'smallfloat':
            newdtype = str(smallest_float_type(datatype))
        else:
            newdtype = str(newdtype)

        # initialize at one block per plane
        bytesperplane = height * width * pixelbytesize * ntimepoints
        bytesperblock = bytesperplane
        blocksperplane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesperblock >= blockSize * 2:
            bytesperblock /= 2
            blocksperplane *= 2

        blocklenPixels = max((height * width) / blocksperplane, 1)  # integer division
        while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksperplane += 1

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart+blocklenPixels, planeshape[0]*planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [tuple(list(keys_)[::-1]+[planeidx]) for keys_ in serieskeys]
            return zip(serieskeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)
        dims = (npages, width, height)

        metadata = (dims, ntimepoints, newdtype)
        return rdd, metadata