Example #1
    def loadJSON(self, path):
        """
        Generic function for loading JSON from a path, handling local file systems and S3 or GS

        Parameters
        ----------
        path : str
            Path to a file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        The decoded JSON data (typically a dict or list)
        """

        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError
        from thunder.utils.serializable import _decode_dict

        reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return json.loads(buffer, object_hook=_decode_dict)
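A note on the object_hook argument: json.loads calls it on every decoded JSON object, and thunder's _decode_dict presumably normalizes the keys and values it receives. The stand-alone sketch below uses a simplified decode_dict stand-in to show the mechanism; it is not the actual thunder helper.

# Minimal sketch of the object_hook mechanism used above. decode_dict is a
# simplified stand-in for thunder.utils.serializable._decode_dict.
import json

def decode_dict(d):
    # called once per decoded JSON object; return the (possibly transformed) dict
    return dict((str(k), v) for k, v in d.items())

doc = json.loads('{"name": "stim", "value": [1, 2, 3]}', object_hook=decode_dict)
print(doc)   # {'name': 'stim', 'value': [1, 2, 3]}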
Example #2
    def loadConf(self, dataPath, confFilename='conf.json'):
        """Returns a dict loaded from a json file.

        Looks for a file named `confFilename` in the same directory as `dataPath`

        Returns {} if file not found
        """
        if not confFilename:
            return {}

        reader = getFileReaderForPath(dataPath)(
            awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
        except FileNotFoundError:
            return {}

        params = json.loads(jsonBuf)

        if 'format' in params:
            raise Exception(
                "Numerical format of value should be specified as 'valuetype', not 'format'"
            )
        if 'keyformat' in params:
            raise Exception(
                "Numerical format of key should be specified as 'keytype', not 'keyformat'"
            )

        return params
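For reference, the error checks above imply that numerical formats belong under 'keytype' and 'valuetype' in conf.json. The snippet below only illustrates that naming convention; the full set of keys a conf.json may carry is not shown in this example.

# Illustrative conf.json contents consistent with the checks above; the exact
# set of supported keys depends on the loader and is not shown here.
import json

jsonBuf = '{"keytype": "int16", "valuetype": "float32"}'
params = json.loads(jsonBuf)
assert 'format' not in params and 'keyformat' not in params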
Example #3
    def loadParams(self, path):
        """
        Load a file with parameters from a local file system or S3.

        Assumes file is JSON with basic types (strings, integers, doubles, lists),
        in either a single dict or list of dict-likes, and each dict has at least
        a "name" field and a "value" field.

        Useful for loading generic meta data, parameters, covariates, etc.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 bucket

        Returns
        -------
        A Params object wrapping the dict or list of parameters
        """
        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError

        reader = getFileReaderForPath(path)(
            awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return Params(json.loads(buffer))
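The docstring above describes the expected file layout: a single dict or a list of dict-likes, each carrying at least a "name" and a "value" field. A self-contained sketch of that layout follows; Params itself is a thunder helper and is not reproduced here, so plain json.loads stands in for it.

# Sketch of the parameter-file layout described in the docstring: a list of
# dict-likes with "name" and "value" fields.
import json

text = '[{"name": "stimulus", "value": [0, 1, 0, 1]}, {"name": "rate", "value": 2.0}]'
params = json.loads(text)
names = [p["name"] for p in params]    # ['stimulus', 'rate']
values = [p["value"] for p in params]  # [[0, 1, 0, 1], 2.0]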
Example #4
    def loadJSON(self, path):
        """
        Generic function for loading JSON from a path, handling local file systems and S3 or GS

        Parameters
        ----------
        path : str
            Path to a file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        The decoded JSON data (typically a dict or list)
        """

        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError
        from thunder.utils.serializable import _decode_dict

        reader = getFileReaderForPath(path)(
            awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return json.loads(buffer, object_hook=_decode_dict)
Example #5
    def loadParams(self, path):
        """
        Load a file with parameters from a local file system or S3.

        Assumes file is JSON with basic types (strings, integers, doubles, lists),
        in either a single dict or list of dict-likes, and each dict has at least
        a "name" field and a "value" field.

        Useful for loading generic meta data, parameters, covariates, etc.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 bucket

        Returns
        -------
        A Params object wrapping the dict or list of parameters
        """
        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError

        reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return Params(json.loads(buffer))
Example #6
        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(
                            tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = pil_to_array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = pil_to_array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart + blocklenPixels,
                                       planeshape[0] * planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(
                blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(
                *map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [
                tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys
            ]
            return zip(serieskeys, buf)
Example #7
        def readBlockFromTiff(planeIdxBlockIdx):
            planeIdx, blockIdx = planeIdxBlockIdx
            blocks = []
            planeShape = None
            blockStart = None
            blockEnd = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
                fp = reader_.open(fname)
                try:
                    if doMinimizeReads:
                        # use multitif module to generate a fake, in-memory
                        # one-page tif file. the advantage of this is that it
                        # cuts way down on the many small reads that PIL/pillow
                        # will make otherwise, which would be a problem for s3
                        # or Google Storage
                        tiffParser_ = multitif.TiffParser(fp, debug=False)
                        tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                        byteBuf = io.BytesIO(tiffFilebuffer)
                        try:
                            pilImg = Image.open(byteBuf)
                            ary = conversionFcn(pilImg).T
                        finally:
                            byteBuf.close()
                        del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                    else:
                        # read tif using PIL directly
                        pilImg = Image.open(fp)
                        pilImg.seek(planeIdx)
                        ary = conversionFcn(pilImg).T
                        del pilImg

                    if not planeShape:
                        planeShape = ary.shape[:]
                        blockStart = blockIdx * blocklenPixels
                        blockEnd = min(blockStart + blocklenPixels, planeShape[0] * planeShape[1])
                    blocks.append(ary.ravel(order="C")[blockStart:blockEnd])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linearIdx = arange(blockStart, blockEnd)  # zero-based

            seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order="C")))
            # add plane index to end of keys
            if npages > 1:
                seriesKeys = [tuple(list(keys_)[::-1] + [planeIdx]) for keys_ in seriesKeys]
            else:
                seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
            return zip(seriesKeys, buf)
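The key construction at the end of readBlockFromTiff can be followed in isolation with a small numpy sketch: linear pixel offsets are unraveled into subscripts in C order, the subscripts are reversed, and the plane index is appended when the file has more than one page. The shape and indices below are made up for illustration.

# Stand-alone sketch of the seriesKeys construction above (illustrative values).
from numpy import arange, unravel_index

planeShape, planeIdx = (3, 4), 2
linearIdx = arange(5, 8)
seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
seriesKeys = [tuple(list(k)[::-1] + [planeIdx]) for k in seriesKeys]
# seriesKeys is now [(1, 1, 2), (2, 1, 2), (3, 1, 2)]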
Example #8
def raiseErrorIfPathExists(path, awsCredentialsOverride=None):
    """
    The ValueError message will suggest calling with overwrite=True; this function is expected to be
    called from the various output methods that accept an 'overwrite' keyword argument.
    """
    # check that specified output path does not already exist
    from thunder.rdds.fileio.readers import getFileReaderForPath
    reader = getFileReaderForPath(path)(awsCredentialsOverride=awsCredentialsOverride)
    existing = reader.list(path, includeDirectories=True)
    if existing:
        raise ValueError("Path %s appears to already exist. Specify a new directory, or call " % path +
                         "with overwrite=True to overwrite.")
Example #9
def raiseErrorIfPathExists(path):
    """Raises a ValueError if the passed path string is found to already exist.

    The ValueError message will suggest calling with overwrite=True; this function is expected to be
    called from the various output methods that accept an 'overwrite' keyword argument.
    """
    # check that specified output path does not already exist
    from thunder.rdds.fileio.readers import getFileReaderForPath
    reader = getFileReaderForPath(path)()
    existing = reader.list(path, includeDirectories=True)
    if existing:
        raise ValueError("Path %s appears to already exist. Specify a new directory, or call " % path +
                         "with overwrite=True to overwrite.")
Example #10
        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart+blocklenPixels, planeshape[0]*planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [tuple(list(keys_)[::-1]+[planeidx]) for keys_ in serieskeys]
            return zip(serieskeys, buf)
Example #11
    def loadConf(self, dataPath, confFilename='conf.json'):
        """Returns a dict loaded from a json file.

        Looks for a file named `confFilename` in the same directory as `dataPath`

        Returns {} if file not found
        """
        if not confFilename:
            return {}

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
        except FileNotFoundError:
            return {}

        params = json.loads(jsonBuf)

        if 'format' in params:
            raise Exception("Numerical format of value should be specified as 'valuetype', not 'format'")
        if 'keyformat' in params:
            raise Exception("Numerical format of key should be specified as 'keytype', not 'keyformat'")

        return params
Example #12
    def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M",
                                     newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None,
                                     recursive=False):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        import io

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)
        ntimepoints = len(filenames)

        doMinimizeReads = dataPath.lower().startswith("s3")
        # check PIL version to see whether it is actually pillow or indeed old PIL and choose
        # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array
        # for more explanation.
        isPillow = hasattr(Image, "PILLOW_VERSION")
        if isPillow:
            conversionFcn = array  # use numpy's array() function
        else:
            from thunder.utils.common import pil_to_array
            conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

        height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
        if dtype.startswith('int'):
            raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' +
                             ' please try loading as Images (shuffle=True)')
        pixelBytesize = dtypeFunc(dtype).itemsize
        if newDtype is None or str(newDtype) == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        # initialize at one block per plane
        bytesPerPlane = height * width * pixelBytesize * ntimepoints
        bytesPerBlock = bytesPerPlane
        blocksPerPlane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesPerBlock >= blockSize * 2:
            bytesPerBlock /= 2
            blocksPerPlane *= 2

        blocklenPixels = max((height * width) / blocksPerPlane, 1)  # integer division
        while blocksPerPlane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksPerPlane += 1

        # prevent bringing in self in closure:
        awsCredentialsOverride = self.awsCredentialsOverride

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane)))

        def readBlockFromTiff(planeIdxBlockIdx):
            planeIdx, blockIdx = planeIdxBlockIdx
            blocks = []
            planeShape = None
            blockStart = None
            blockEnd = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
                fp = reader_.open(fname)
                try:
                    if doMinimizeReads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffParser_ = multitif.TiffParser(fp, debug=False)
                        tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                        byteBuf = io.BytesIO(tiffFilebuffer)
                        try:
                            pilImg = Image.open(byteBuf)
                            ary = conversionFcn(pilImg).T
                        finally:
                            byteBuf.close()
                        del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                    else:
                        # read tif using PIL directly
                        pilImg = Image.open(fp)
                        pilImg.seek(planeIdx)
                        ary = conversionFcn(pilImg).T
                        del pilImg

                    if not planeShape:
                        planeShape = ary.shape[:]
                        blockStart = blockIdx * blocklenPixels
                        blockEnd = min(blockStart+blocklenPixels, planeShape[0]*planeShape[1])
                    blocks.append(ary.ravel(order='C')[blockStart:blockEnd])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linearIdx = arange(blockStart, blockEnd)  # zero-based

            seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
            # add plane index to end of keys
            if npages > 1:
                seriesKeys = [tuple(list(keys_)[::-1]+[planeIdx]) for keys_ in seriesKeys]
            else:
                seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
            return zip(seriesKeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff)
        if npages > 1:
            dims = (npages, width, height)
        else:
            dims = (width, height)

        metadata = (dims, ntimepoints, newDtype)
        return rdd, metadata
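The block-sizing arithmetic near the top of _getSeriesBlocksFromMultiTif can be followed with concrete, made-up numbers: the per-block byte count is halved until it drops below twice the requested blockSize, the block length in pixels is then derived, and blocksPerPlane is bumped until the plane is fully covered.

# Worked sketch of the block-sizing loop above, with illustrative numbers.
height, width, pixelBytesize, ntimepoints = 2048, 2048, 2, 100
blockSize = 150 * 1024 ** 2            # parseMemoryString("150M")

bytesPerBlock = height * width * pixelBytesize * ntimepoints   # ~800 MB
blocksPerPlane = 1
while bytesPerBlock >= blockSize * 2:
    bytesPerBlock //= 2                # the original Python 2 code uses plain /
    blocksPerPlane *= 2
# blocksPerPlane is now 4, bytesPerBlock ~200 MB

blocklenPixels = max((height * width) // blocksPerPlane, 1)
while blocksPerPlane * blocklenPixels < height * width:
    blocksPerPlane += 1
# each plane of 2048 * 2048 pixels is covered by 4 blocks of 1048576 pixels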
Example #13
    def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                                  newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False):
        """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)>

        Parameters
        ----------

        dataPath: string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://"). If no scheme is present, it will be interpreted as a path on the local filesystem. This path
            must be valid on all workers. The datafile may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        dims: tuple of positive int
            Dimensions of input image data, ordered with the fastest-changing dimension first.

        dtype: dtype or dtype specifier, optional, default 'int16'
            Numpy dtype of input stack data

        newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Series data must be floating-point. Input data will be cast to the
            requested `newDtype` - see numpy `astype()` method.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        Returns
        ---------
        pair of (RDD, ntimepoints)

        RDD: sequence of keys, values pairs
            (call using flatMap)

        RDD Key: tuple of int
            zero-based indices of position within original image volume

        RDD Value: numpy array of datatype
            series of values at position across loaded image volumes

        ntimepoints: int
            number of time points in returned series, determined from number of stack files found at dataPath

        newDtype: string
            string representation of numpy data type of returned blocks

        """
        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)
        totalDim = reduce(lambda x_, y_: x_*y_, dims)
        dtype = dtypeFunc(dtype)
        if newDtype is None or newDtype == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)

        dataSize = totalDim * len(filenames) * dtype.itemsize
        nblocks = max(dataSize / blockSize, 1)  # integer division

        if len(dims) >= 3:
            # for 3D stacks, do calculations to ensure that
            # different planes appear in distinct files
            blocksPerPlane = max(nblocks / dims[-1], 1)

            pixPerPlane = reduce(lambda x_, y_: x_*y_, dims[:-1])  # all but last dimension

            # get the greatest number of blocks in a plane (up to as many as requested) that still divide the plane
            # evenly. This will always be at least one.
            kUpdated = [x for x in range(1, blocksPerPlane+1) if not pixPerPlane % x][-1]
            nblocks = kUpdated * dims[-1]
            blockSizePerStack = (totalDim / nblocks) * dtype.itemsize
        else:
            # otherwise just round to make contents divide into nearly even blocks
            blockSizePerStack = int(math.ceil(totalDim / float(nblocks)))
            nblocks = int(math.ceil(totalDim / float(blockSizePerStack)))
            blockSizePerStack *= dtype.itemsize

        fileSize = totalDim * dtype.itemsize

        def readBlock(blockNum):
            # copy size out from closure; will modify later:
            blockSizePerStack_ = blockSizePerStack
            # get start position for this block
            position = blockNum * blockSizePerStack_

            # adjust if at end of file
            if (position + blockSizePerStack_) > fileSize:
                blockSizePerStack_ = int(fileSize - position)
            # loop over files, loading one block from each
            bufs = []

            for fname in filenames:
                buf = reader.read(fname, startOffset=position, size=blockSizePerStack_)
                bufs.append(frombuffer(buf, dtype=dtype))

            buf = vstack(bufs).T  # dimensions are now linindex x time (images)
            del bufs
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            itemPosition = position / dtype.itemsize
            itemBlocksize = blockSizePerStack_ / dtype.itemsize
            linearIdx = arange(itemPosition, itemPosition + itemBlocksize)  # zero-based

            keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
            return zip(keys, buf)

        # map over blocks
        return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)),
                len(filenames), newDtype)
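The per-block key construction in readBlock mirrors the flat binary layout: linear item offsets are unraveled into k-tuples of subscripts in Fortran (column-major) order, which matches dims being listed with the fastest-changing dimension first. A tiny numpy sketch with made-up dimensions:

# Stand-alone sketch of the key construction in readBlock (illustrative values).
from numpy import arange, unravel_index

dims = (4, 3, 2)           # fastest-changing dimension first
linearIdx = arange(5, 9)   # a block of four items starting at item offset 5
keys = list(zip(*map(tuple, unravel_index(linearIdx, dims, order='F'))))
# keys is now [(1, 1, 0), (2, 1, 0), (3, 1, 0), (0, 2, 0)]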
Example #14
    def fromStack(self, dataPath, dims=None, dtype=None, ext='stack', startIdx=None, stopIdx=None, recursive=False,
                  nplanes=None, npartitions=None, confFilename='conf.json'):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int, optional
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `dataPath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single binary stack file to be subdivided into multiple records. Every
            `nplanes` z-planes in the file will be taken as a new record, with the first `nplanes` planes of the
            first file being record 0, the next `nplanes` planes being record 1, and so on, until the first file is
            exhausted and record ordering continues with the first `nplanes` planes of the second file.
            With nplanes=None (the default), a single file will be considered as representing a single record.

        npartitions: positive int, optional.
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.
        """
        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
            params = json.loads(jsonBuf)
        except FileNotFoundError:
            params = {}

        if 'dtype' in params.keys():
            dtype = params['dtype']
        if 'dims' in params.keys():
            dims = params['dims']

        if not dims:
            raise ValueError("Image dimensions must be specified either as argument or in a conf.json file")

        if not dtype:
            dtype = 'int16'

        if nplanes is not None:
            if nplanes <= 0:
                raise ValueError("nplanes must be positive if passed, got %d" % nplanes)
            if dims[-1] % nplanes:
                raise ValueError("Last dimension of stack image '%d' must be divisible by nplanes '%d'" %
                                 (dims[-1], nplanes))

        def toArray(idxAndBuf):
            idx, buf = idxAndBuf
            ary = frombuffer(buf, dtype=dtype, count=int(prod(dims))).reshape(dims, order='F')
            if nplanes is None:
                yield idx, ary
            else:
                # divide array into chunks of nplanes
                npoints = dims[-1] / nplanes  # integer division
                if dims[-1] % nplanes:
                    npoints += 1
                timepoint = 0
                lastPlane = 0
                curPlane = 1
                while curPlane < ary.shape[-1]:
                    if curPlane % nplanes == 0:
                        slices = [slice(None)] * (ary.ndim - 1) + [slice(lastPlane, curPlane)]
                        yield idx*npoints + timepoint, ary[slices]
                        timepoint += 1
                        lastPlane = curPlane
                    curPlane += 1
                # yield remaining planes
                slices = [slice(None)] * (ary.ndim - 1) + [slice(lastPlane, ary.shape[-1])]
                yield idx*npoints + timepoint, ary[slices]

        reader = getParallelReaderForPath(dataPath)(self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
        readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                npartitions=npartitions)
        nrecords = reader.lastNRecs if nplanes is None else None
        newDims = tuple(list(dims[:-1]) + [nplanes]) if nplanes else dims
        return Images(readerRdd.flatMap(toArray), nrecords=nrecords, dims=newDims, dtype=dtype)
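fromStack first looks for dims and dtype in a conf.json alongside the data before falling back to the keyword arguments. A conf.json consistent with those lookups might look like the sketch below; the values are illustrative.

# Illustrative conf.json for fromStack; only 'dims' and 'dtype' are read above.
import json

jsonBuf = '{"dims": [512, 512, 30], "dtype": "uint16"}'
params = json.loads(jsonBuf)
# fromStack would then use dims=[512, 512, 30] and dtype='uint16' without
# either being passed explicitly.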
Example #15
    def _getSeriesBlocksFromMultiTif(self,
                                     datapath,
                                     ext="tif",
                                     blockSize="150M",
                                     newdtype='smallfloat',
                                     casting='safe',
                                     startidx=None,
                                     stopidx=None):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        from thunder.utils.common import pil_to_array, smallest_float_type
        import io

        datapath = self.__normalizeDatafilePattern(datapath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(datapath)()
        filenames = reader.list(datapath)
        if not filenames:
            raise IOError("No files found for path '%s'" % datapath)
        filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
        ntimepoints = len(filenames)

        minimize_reads = datapath.lower().startswith("s3")

        height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(
            reader, filenames[0])
        pixelbytesize = dtypefunc(datatype).itemsize
        if newdtype is None or str(newdtype) == '':
            newdtype = str(datatype)
        elif newdtype == 'smallfloat':
            newdtype = str(smallest_float_type(datatype))
        else:
            newdtype = str(newdtype)

        # initialize at one block per plane
        bytesperplane = height * width * pixelbytesize * ntimepoints
        bytesperblock = bytesperplane
        blocksperplane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesperblock >= blockSize * 2:
            bytesperblock /= 2
            blocksperplane *= 2

        blocklenPixels = max((height * width) / blocksperplane,
                             1)  # integer division
        while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksperplane += 1

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(
                            tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = pil_to_array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = pil_to_array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart + blocklenPixels,
                                       planeshape[0] * planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(
                blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(
                *map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [
                tuple(list(keys_)[::-1] + [planeidx]) for keys_ in serieskeys
            ]
            return zip(serieskeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)
        dims = (npages, width, height)

        metadata = (dims, ntimepoints, newdtype)
        return rdd, metadata
Example #16
    def fromStack(self,
                  dataPath,
                  dims=None,
                  dtype=None,
                  ext='stack',
                  startIdx=None,
                  stopIdx=None,
                  recursive=False,
                  nplanes=None,
                  npartitions=None,
                  confFilename='conf.json'):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int, optional
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `dataPath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single binary stack file to be subdivided into multiple records. Every
            `nplanes` z-planes in the file will be taken as a new record, with the first `nplanes` planes of the
            first file being record 0, the next `nplanes` planes being record 1, and so on, until the first file is
            exhausted and record ordering continues with the first `nplanes` planes of the second file.
            With nplanes=None (the default), a single file will be considered as representing a single record.

        npartitions: positive int, optional.
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.
        """
        reader = getFileReaderForPath(dataPath)(
            awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
            params = json.loads(jsonBuf)
        except FileNotFoundError:
            params = {}

        if 'dtype' in params.keys():
            dtype = params['dtype']
        if 'dims' in params.keys():
            dims = params['dims']

        if not dims:
            raise ValueError(
                "Image dimensions must be specified either as argument or in a conf.json file"
            )

        if not dtype:
            dtype = 'int16'

        if nplanes is not None:
            if nplanes <= 0:
                raise ValueError("nplanes must be positive if passed, got %d" %
                                 nplanes)
            if dims[-1] % nplanes:
                raise ValueError(
                    "Last dimension of stack image '%d' must be divisible by nplanes '%d'"
                    % (dims[-1], nplanes))

        def toArray(idxAndBuf):
            idx, buf = idxAndBuf
            ary = frombuffer(buf, dtype=dtype,
                             count=int(prod(dims))).reshape(dims, order='F')
            if nplanes is None:
                yield idx, ary
            else:
                # divide array into chunks of nplanes
                npoints = dims[-1] / nplanes  # integer division
                if dims[-1] % nplanes:
                    npoints += 1
                timepoint = 0
                lastPlane = 0
                curPlane = 1
                while curPlane < ary.shape[-1]:
                    if curPlane % nplanes == 0:
                        slices = [slice(None)] * (ary.ndim - 1) + [
                            slice(lastPlane, curPlane)
                        ]
                        yield idx * npoints + timepoint, ary[slices]
                        timepoint += 1
                        lastPlane = curPlane
                    curPlane += 1
                # yield remaining planes
                slices = [slice(None)] * (ary.ndim - 1) + [
                    slice(lastPlane, ary.shape[-1])
                ]
                yield idx * npoints + timepoint, ary[slices]

        reader = getParallelReaderForPath(dataPath)(
            self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
        readerRdd = reader.read(dataPath,
                                ext=ext,
                                startIdx=startIdx,
                                stopIdx=stopIdx,
                                recursive=recursive,
                                npartitions=npartitions)
        nrecords = reader.lastNRecs if nplanes is None else None
        newDims = tuple(list(dims[:-1]) + [nplanes]) if nplanes else dims
        return Images(readerRdd.flatMap(toArray),
                      nrecords=nrecords,
                      dims=newDims,
                      dtype=dtype)
Example #17
    def _getSeriesBlocksFromMultiTif(self, datapath, ext="tif", blockSize="150M",
                                     newdtype='smallfloat', casting='safe', startidx=None, stopidx=None):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        import io

        datapath = self.__normalizeDatafilePattern(datapath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(datapath)()
        filenames = reader.list(datapath)
        if not filenames:
            raise IOError("No files found for path '%s'" % datapath)
        filenames = selectByStartAndStopIndices(filenames, startidx, stopidx)
        ntimepoints = len(filenames)

        minimize_reads = datapath.lower().startswith("s3")

        height, width, npages, datatype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
        pixelbytesize = dtypefunc(datatype).itemsize
        if newdtype is None or str(newdtype) == '':
            newdtype = str(datatype)
        elif newdtype == 'smallfloat':
            newdtype = str(smallest_float_type(datatype))
        else:
            newdtype = str(newdtype)

        # initialize at one block per plane
        bytesperplane = height * width * pixelbytesize * ntimepoints
        bytesperblock = bytesperplane
        blocksperplane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesperblock >= blockSize * 2:
            bytesperblock /= 2
            blocksperplane *= 2

        blocklenPixels = max((height * width) / blocksperplane, 1)  # integer division
        while blocksperplane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksperplane += 1

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksperplane)))

        def readblockfromtif(pidxbidx_):
            planeidx, blockidx = pidxbidx_
            blocks = []
            planeshape = None
            blockstart = None
            blockend = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)()
                fp = reader_.open(fname)
                try:
                    if minimize_reads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffparser_ = multitif.TiffParser(fp, debug=False)
                        tiffilebuffer = multitif.packSinglePage(tiffparser_, page_idx=planeidx)
                        bytebuf = io.BytesIO(tiffilebuffer)
                        try:
                            pilimg = Image.open(bytebuf)
                            ary = array(pilimg).T
                        finally:
                            bytebuf.close()
                        del tiffilebuffer, tiffparser_, pilimg, bytebuf
                    else:
                        # read tif using PIL directly
                        pilimg = Image.open(fp)
                        pilimg.seek(planeidx)
                        ary = array(pilimg).T
                        del pilimg

                    if not planeshape:
                        planeshape = ary.shape[:]
                        blockstart = blockidx * blocklenPixels
                        blockend = min(blockstart+blocklenPixels, planeshape[0]*planeshape[1])
                    blocks.append(ary.ravel(order='C')[blockstart:blockend])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newdtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linindx = arange(blockstart, blockend)  # zero-based

            serieskeys = zip(*map(tuple, unravel_index(linindx, planeshape, order='C')))
            # add plane index to end of keys
            serieskeys = [tuple(list(keys_)[::-1]+[planeidx]) for keys_ in serieskeys]
            return zip(serieskeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readblockfromtif)
        dims = (npages, width, height)

        metadata = (dims, ntimepoints, newdtype)
        return rdd, metadata