Example #1
    def fromArrays(self, arrays, npartitions=None):
        """Load Images data from passed sequence of numpy arrays.

        Expected usage is mainly in testing - having a full dataset loaded in memory
        on the driver is likely prohibitive in the use cases for which Thunder is intended.
        """
        # if passed a single array, cast it to a sequence of length 1
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        shape = None
        dtype = None
        for ary in arrays:
            if shape is None:
                shape = ary.shape
                dtype = ary.dtype
            if not ary.shape == shape:
                raise ValueError(
                    "Arrays must all be of same shape; got both %s and %s" %
                    (str(shape), str(ary.shape)))
            if not ary.dtype == dtype:
                raise ValueError(
                    "Arrays must all be of same data type; got both %s and %s"
                    % (str(dtype), str(ary.dtype)))
        narrays = len(arrays)
        npartitions = min(narrays, npartitions) if npartitions else narrays
        return Images(self.sc.parallelize(enumerate(arrays), npartitions),
                      dims=shape,
                      dtype=str(dtype),
                      nrecords=narrays)
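
A minimal usage sketch of fromArrays, assuming a live SparkContext `sc` and an ImagesLoader as defined in the later examples; the array sizes and variable names are illustrative only.

import numpy as np

# three 64 x 64 test frames sharing one shape and dtype, as fromArrays requires
frames = [np.random.rand(64, 64).astype('float32') for _ in range(3)]

loader = ImagesLoader(sc)                      # sc: an existing SparkContext
images = loader.fromArrays(frames, npartitions=2)

# records are keyed by position in the input sequence
key, ary = images.rdd.first()                  # (0, first 64 x 64 array)
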
Example #2
    def fromPng(self, dataPath, ext='png', startIdx=None, stopIdx=None, recursive=False):
        """Load an Images object stored in a directory of png files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A datapath argument may include a single '*' wildcard character in the filename.

        ext: string, optional, default "png"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).
        """
        def readPngFromBuf(buf):
            fbuf = BytesIO(buf)
            return imread(fbuf, format='png')

        reader = getParallelReaderForPath(dataPath)(self.sc)
        readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        return Images(readerRdd.mapValues(readPngFromBuf), nimages=reader.lastNRecs)
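
For orientation, a short usage sketch assuming a local directory of equally sized png files; the path is a placeholder.

loader = ImagesLoader(sc)

# load every *.png under /data/pngs, keyed by sorted filename order
images = loader.fromPng('/data/pngs')

# load only the 10th through 19th matching files (python slice conventions)
subset = loader.fromPng('/data/pngs', startIdx=10, stopIdx=20)
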
Example #3
    def toImages(self):
        from thunder.rdds.images import Images
        timeRdd = self.rdd.flatMap(
            lambda kv: SimpleBlocks._toTimeSlicedBlocksIter(kv[0], kv[1]))
        timeSortedRdd = timeRdd.groupBy(lambda (k, _): k.temporalKey).sortByKey()
        imagesRdd = timeSortedRdd.map(SimpleBlocks._combineTimeSlicedBlocks)
        return Images(imagesRdd,
                      dims=self._dims,
                      nrecords=self._nimages,
                      dtype=self._dtype)
Example #4
    def transform(self, images):
        """
        Apply the transformation to an Images object.

        Will apply the underlying dictionary of transformations to
        the images or volumes of the Images object. The dictionary acts as a lookup
        table specifying which transformation should be applied to which record of the
        Images object based on the key. Because transformations are small,
        we broadcast the transformations rather than using a join.
        """

        from thunder.rdds.images import Images

        # broadcast the transformations
        bcTransformations = images.rdd.context.broadcast(self.transformations)

        # apply the transformations
        newrdd = images.rdd.map(lambda (k, im):
                                (k, bcTransformations.value[k].apply(im)))
        return Images(newrdd).__finalize__(images)
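
The broadcast-lookup pattern described in the docstring can be shown with plain PySpark outside of Thunder; the keys and shift values below are made up for illustration.

import numpy as np

# per-record lookup table: record key -> offset to add to that image
transformations = {0: 1.0, 1: 2.0, 2: 3.0}

rdd = sc.parallelize([(k, np.zeros((4, 4))) for k in range(3)])

# ship the small table to every executor once instead of joining two RDDs
bcTransformations = sc.broadcast(transformations)
shifted = rdd.map(lambda kv: (kv[0], kv[1] + bcTransformations.value[kv[0]]))
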
Example #5
    def transform(self, images, reference):
        """
        Apply registration to a collection of images / volumes.

        Parameters
        ----------
        images : Images
            An Images object containing the images / volumes to apply registration to

        reference : ndarray
            The reference image / volume to register against
        """

        if not (isinstance(images, Images)):
            raise Exception('Input data must be Images or a subclass')

        self._checkReference(images, reference)

        # apply filtering to reference if defined
        if hasattr(self, '_filter'):
            reference = self._applyVol(reference, self.filter)

        # broadcast the reference (a potentially very large array)
        referenceBC = images.rdd.context.broadcast(reference)

        # compute and apply transformation on an image / volume
        def register(im, ref):
            if im.ndim == 2:
                t = self.getTransform(self.filter(im), ref.value)
                return self.applyTransform(im, t)
            else:
                im.setflags(write=True)
                for z in arange(0, im.shape[2]):
                    t = self.getTransform(self.filter(im[:, :, z]),
                                          ref.value[:, :, z])
                    im[:, :, z] = self.applyTransform(im[:, :, z], t)
                return im

        # return the transformed volumes
        newRdd = images.rdd.mapValues(lambda x: register(x, referenceBC))
        return Images(newRdd).__finalize__(images)
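
The z-loop inside register amounts to estimating and applying a 2D transform independently for every plane of a volume; below is a plain numpy sketch of that idea, with trivial stand-ins for getTransform and applyTransform.

import numpy as np

def registerPlaneWise(vol, ref, getTransform, applyTransform):
    # vol and ref are x-y-z volumes; each z-plane is registered on its own
    out = vol.copy()
    for z in range(vol.shape[2]):
        t = getTransform(vol[:, :, z], ref[:, :, z])
        out[:, :, z] = applyTransform(vol[:, :, z], t)
    return out

# trivial stand-ins: estimate nothing, apply nothing
vol = np.random.rand(8, 8, 4)
registered = registerPlaneWise(vol, vol,
                               lambda im, ref: None,
                               lambda im, t: im)
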
Example #6
    def fromStack(self, dataPath, dims, dtype='int16', ext='stack', startIdx=None, stopIdx=None, recursive=False):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A datapath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).
        """
        if not dims:
            raise ValueError("Image dimensions must be specified if loading from binary stack data")

        def toArray(buf):
            return frombuffer(buf, dtype=dtype, count=int(prod(dims))).reshape(dims, order='F')

        reader = getParallelReaderForPath(dataPath)(self.sc)
        readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        return Images(readerRdd.mapValues(toArray), nimages=reader.lastNRecs, dims=dims,
                      dtype=dtype)
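
A usage sketch for this fixed-dtype variant of fromStack, assuming a directory of raw int16 volumes; the path and dimensions are placeholders.

loader = ImagesLoader(sc)

# each .stack file holds one 512 x 512 x 30 volume of native-order int16,
# stored with the fastest-changing dimension first (Fortran order)
images = loader.fromStack('/data/stacks', dims=(512, 512, 30))
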
Example #7
    def run(self, images):
        """
        Compute and implement registration on a collection of images / volumes.

        This is a lazy operation that combines the estimation of registration
        with its implementation. It returns a new Images object with transformed
        images and does not expose the registration parameters directly; see the
        'fit' method to obtain the parameters.

        Parameters
        ----------
        images : Images
            An Images object with the images / volumes to apply registration to.

        Returns
        -------
        Images object with registered images / volumes
        """

        if not (isinstance(images, Images)):
            raise Exception('Input data must be Images or a subclass')

        if len(images.dims.count) not in set([2, 3]):
            raise Exception('Number of image dimensions %s must be 2 or 3' %
                            (len(images.dims.count)))

        self.isPrepared(images)

        # broadcast the registration model
        bcReg = images.rdd.context.broadcast(self)

        def fitandtransform(im, reg):
            t = reg.value.getTransform(im)
            return t.apply(im)

        newrdd = images.rdd.mapValues(lambda im: fitandtransform(im, bcReg))

        return Images(newrdd).__finalize__(images)
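
In use, run is called on an already configured registration object; the sketch below assumes such an object `reg` exposing the run and fit methods mentioned in the docstring, plus an existing Images object `images`. Both names are hypothetical.

# reg: a configured registration object (hypothetical instance)
registered = reg.run(images)    # lazy; returns a new Images object with transformed volumes
params = reg.fit(images)        # use fit instead when the parameters themselves are needed
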
Example #8
    def fromStack(self,
                  dataPath,
                  dims=None,
                  dtype=None,
                  ext='stack',
                  startIdx=None,
                  stopIdx=None,
                  recursive=False,
                  nplanes=None,
                  npartitions=None,
                  confFilename='conf.json'):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single binary stack file to be subdivided into multiple records. Every
            `nplanes` z-planes in the file will be taken as a new record, with the first `nplanes` planes of the
            first file being record 0, the next `nplanes` planes being record 1, etc., until the first file is
            exhausted and record ordering continues with the first `nplanes` planes of the second file, and so on.
            With nplanes=None (the default), a single file will be considered as representing a single record.

        npartitions: positive int, optional.
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.
        """
        reader = getFileReaderForPath(dataPath)(
            awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
            params = json.loads(jsonBuf)
        except FileNotFoundError:
            params = {}

        if 'dtype' in params.keys():
            dtype = params['dtype']
        if 'dims' in params.keys():
            dims = params['dims']

        if not dims:
            raise ValueError(
                "Image dimensions must be specified either as argument or in a conf.json file"
            )

        if not dtype:
            dtype = 'int16'

        if nplanes is not None:
            if nplanes <= 0:
                raise ValueError("nplanes must be positive if passed, got %d" %
                                 nplanes)
            if dims[-1] % nplanes:
                raise ValueError(
                    "Last dimension of stack image '%d' must be divisible by nplanes '%d'"
                    % (dims[-1], nplanes))

        def toArray(idxAndBuf):
            idx, buf = idxAndBuf
            ary = frombuffer(buf, dtype=dtype,
                             count=int(prod(dims))).reshape(dims, order='F')
            if nplanes is None:
                yield idx, ary
            else:
                # divide array into chunks of nplanes
                npoints = dims[-1] / nplanes  # integer division
                if dims[-1] % nplanes:
                    npoints += 1
                timepoint = 0
                lastPlane = 0
                curPlane = 1
                while curPlane < ary.shape[-1]:
                    if curPlane % nplanes == 0:
                        slices = [slice(None)] * (ary.ndim - 1) + [
                            slice(lastPlane, curPlane)
                        ]
                        yield idx * npoints + timepoint, ary[slices]
                        timepoint += 1
                        lastPlane = curPlane
                    curPlane += 1
                # yield remaining planes
                slices = [slice(None)] * (ary.ndim - 1) + [
                    slice(lastPlane, ary.shape[-1])
                ]
                yield idx * npoints + timepoint, ary[slices]

        reader = getParallelReaderForPath(dataPath)(
            self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
        readerRdd = reader.read(dataPath,
                                ext=ext,
                                startIdx=startIdx,
                                stopIdx=stopIdx,
                                recursive=recursive,
                                npartitions=npartitions)
        nrecords = reader.lastNRecs if nplanes is None else None
        newDims = tuple(list(dims[:-1]) + [nplanes]) if nplanes else dims
        return Images(readerRdd.flatMap(toArray),
                      nrecords=nrecords,
                      dims=newDims,
                      dtype=dtype)
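
The conf.json read at the top of this method only needs the 'dims' and 'dtype' keys used above; a sketch of such a file and a matching call, with placeholder values:

# conf.json placed next to the .stack files:
# {
#     "dims": [512, 512, 30],
#     "dtype": "uint16"
# }

loader = ImagesLoader(sc)

# dims and dtype come from conf.json; every 3 z-planes become one record,
# so a 30-plane file yields 10 records
images = loader.fromStack('/data/stacks', nplanes=3)
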
Example #9
class ImagesLoader(object):
    """Loader object used to instantiate Images data stored in a variety of formats.
    """
    def __init__(self, sparkContext):
        """Initialize a new ImagesLoader object.

        Parameters
        ----------
        sparkcontext: SparkContext
            The pyspark SparkContext object used by the current Thunder environment.
        """
        from thunder.utils.common import AWSCredentials
        self.sc = sparkContext
        self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)

    def fromArrays(self, arrays, npartitions=None):
        """Load Images data from passed sequence of numpy arrays.

        Expected usage is mainly in testing - having a full dataset loaded in memory
        on the driver is likely prohibitive in the use cases for which Thunder is intended.
        """
        # if passed a single array, cast it to a sequence of length 1
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        shape = None
        dtype = None
        for ary in arrays:
            if shape is None:
                shape = ary.shape
                dtype = ary.dtype
            if not ary.shape == shape:
                raise ValueError(
                    "Arrays must all be of same shape; got both %s and %s" %
                    (str(shape), str(ary.shape)))
            if not ary.dtype == dtype:
                raise ValueError(
                    "Arrays must all be of same data type; got both %s and %s"
                    % (str(dtype), str(ary.dtype)))
        narrays = len(arrays)
        npartitions = min(narrays, npartitions) if npartitions else narrays
        return Images(self.sc.parallelize(enumerate(arrays), npartitions),
                      dims=shape,
                      dtype=str(dtype),
                      nrecords=narrays)

    def fromStack(self,
                  dataPath,
                  dims=None,
                  dtype=None,
                  ext='stack',
                  startIdx=None,
                  stopIdx=None,
                  recursive=False,
                  nplanes=None,
                  npartitions=None,
                  confFilename='conf.json'):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single binary stack file to be subdivided into multiple records. Every
            `nplanes` z-planes in the file will be taken as a new record, with the first `nplanes` planes of the
            first file being record 0, the next `nplanes` planes being record 1, etc., until the first file is
            exhausted and record ordering continues with the first `nplanes` planes of the second file, and so on.
            With nplanes=None (the default), a single file will be considered as representing a single record.

        npartitions: positive int, optional.
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.
        """
        reader = getFileReaderForPath(dataPath)(
            awsCredentialsOverride=self.awsCredentialsOverride)
        try:
            jsonBuf = reader.read(dataPath, filename=confFilename)
            params = json.loads(jsonBuf)
        except FileNotFoundError:
            params = {}

        if 'dtype' in params.keys():
            dtype = params['dtype']
        if 'dims' in params.keys():
            dims = params['dims']

        if not dims:
            raise ValueError(
                "Image dimensions must be specified either as argument or in a conf.json file"
            )

        if not dtype:
            dtype = 'int16'

        if nplanes is not None:
            if nplanes <= 0:
                raise ValueError("nplanes must be positive if passed, got %d" %
                                 nplanes)
            if dims[-1] % nplanes:
                raise ValueError(
                    "Last dimension of stack image '%d' must be divisible by nplanes '%d'"
                    % (dims[-1], nplanes))

        def toArray(idxAndBuf):
            idx, buf = idxAndBuf
            ary = frombuffer(buf, dtype=dtype,
                             count=int(prod(dims))).reshape(dims, order='F')
            if nplanes is None:
                yield idx, ary
            else:
                # divide array into chunks of nplanes
                npoints = dims[-1] / nplanes  # integer division
                if dims[-1] % nplanes:
                    npoints += 1
                timepoint = 0
                lastPlane = 0
                curPlane = 1
                while curPlane < ary.shape[-1]:
                    if curPlane % nplanes == 0:
                        slices = [slice(None)] * (ary.ndim - 1) + [
                            slice(lastPlane, curPlane)
                        ]
                        yield idx * npoints + timepoint, ary[slices]
                        timepoint += 1
                        lastPlane = curPlane
                    curPlane += 1
                # yield remaining planes
                slices = [slice(None)] * (ary.ndim - 1) + [
                    slice(lastPlane, ary.shape[-1])
                ]
                yield idx * npoints + timepoint, ary[slices]

        reader = getParallelReaderForPath(dataPath)(
            self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
        readerRdd = reader.read(dataPath,
                                ext=ext,
                                startIdx=startIdx,
                                stopIdx=stopIdx,
                                recursive=recursive,
                                npartitions=npartitions)
        nrecords = reader.lastNRecs if nplanes is None else None
        newDims = tuple(list(dims[:-1]) + [nplanes]) if nplanes else dims
        return Images(readerRdd.flatMap(toArray),
                      nrecords=nrecords,
                      dims=newDims,
                      dtype=dtype)

    def fromTif(self,
                dataPath,
                ext='tif',
                startIdx=None,
                stopIdx=None,
                recursive=False,
                nplanes=None,
                npartitions=None):
        """Sets up a new Images object with data to be read from one or more tif files.

        Multiple pages of a multipage tif file will by default be assumed to represent the z-axis (depth) of a
        single 3-dimensional volume, in which case a single input multipage tif file will be converted into
        a single Images record. If `nplanes` is passed, then every nplanes pages will be interpreted as a single
        3d volume (2d if nplanes==1), allowing a single tif file to contain multiple Images records.

        This method attempts to explicitly import PIL. ImportError may be thrown if 'from PIL import Image' is
        unsuccessful. (PIL/pillow is not an explicit requirement for thunder.)

        The RDD wrapped by the returned Images object will by default have a number of partitions equal to the
        number of image data files read in by this method; it may have fewer partitions if npartitions is specified.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A datapath argument may include a single '*' wildcard character in the filename.

        ext: string, optional, default "tif"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single multipage tif file to be subdivided into multiple records. Every
            `nplanes` tif pages in the file will be taken as a new record, with the first `nplanes` pages of the
            first file being record 0, the next `nplanes` pages being record 1, etc., until the first file is
            exhausted and record ordering continues with the first `nplanes` pages of the second file, and so on.
            With nplanes=None (the default), a single file will be considered as representing a single record.

        npartitions: positive int, optional.
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.
        """

        try:
            from PIL import Image
        except ImportError, e:
            Image = None
            raise ImportError(
                "fromMultipageTif requires a successful 'from PIL import Image'; "
                + "the PIL/pillow library appears to be missing or broken.", e)
        # we know that array(pilimg) works correctly for pillow == 2.3.0, and that it
        # does not work (at least not with spark) for old PIL == 1.1.7. we believe but have not confirmed
        # that array(pilimg) works correctly for every version of pillow. thus currently we check only whether
        # our PIL library is in fact pillow, and choose our conversion function accordingly
        isPillow = hasattr(Image, "PILLOW_VERSION")
        if isPillow:
            conversionFcn = array  # use numpy's array() function
        else:
            from thunder.utils.common import pil_to_array
            conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

        if nplanes is not None and nplanes <= 0:
            raise ValueError("nplanes must be positive if passed, got %d" %
                             nplanes)

        def multitifReader(idxAndBuf):
            idx, buf = idxAndBuf
            pageCount = -1
            values = []
            fbuf = BytesIO(buf)
            multipage = Image.open(fbuf)
            if multipage.mode.startswith('I') and 'S' in multipage.mode:
                # signed integer tiff file; use tifffile module to read
                import thunder.rdds.fileio.tifffile as tifffile
                fbuf.seek(0)  # reset pointer after read done by PIL
                tfh = tifffile.TiffFile(fbuf)
                # ary comes back with pages as first dimension; will need to transpose
                ary = tfh.asarray()
                pageCount = ary.shape[0]
                if nplanes is not None:
                    values = [
                        ary[i:(i + nplanes)]
                        for i in xrange(0, ary.shape[0], nplanes)
                    ]
                else:
                    values = [ary]
                tfh.close()
                # transpose Z dimension if any, leave X and Y in same order
                if ary.ndim == 3:
                    values = [val.transpose((1, 2, 0)) for val in values]
                    # squeeze out last dimension if singleton
                    values = [
                        val.squeeze(-1) if val.shape[-1] == 1 else val
                        for val in values
                    ]
            else:
                # normal case; use PIL/Pillow for anything but signed ints
                pageIdx = 0
                imgArys = []
                npagesLeft = -1 if nplanes is None else nplanes  # counts number of planes remaining in image if positive
                while True:
                    try:
                        multipage.seek(pageIdx)
                        imgArys.append(conversionFcn(multipage))
                        pageIdx += 1
                        npagesLeft -= 1
                        if npagesLeft == 0:
                            # we have just finished an image from this file
                            retAry = dstack(
                                imgArys) if len(imgArys) > 1 else imgArys[0]
                            values.append(retAry)
                            # reset counters:
                            npagesLeft = nplanes
                            imgArys = []
                    except EOFError:
                        # past last page in tif
                        break
                pageCount = pageIdx
                if imgArys:
                    retAry = dstack(
                        imgArys) if len(imgArys) > 1 else imgArys[0]
                    values.append(retAry)
            # check for inappropriate nplanes that doesn't evenly divide num pages
            if nplanes and (pageCount % nplanes):
                raise ValueError(
                    "nplanes '%d' does not evenly divide page count of multipage tif '%d'"
                    % (nplanes, pageCount))
            nvals = len(values)
            keys = [idx * nvals + timepoint for timepoint in xrange(nvals)]
            return zip(keys, values)

        reader = getParallelReaderForPath(dataPath)(
            self.sc, awsCredentialsOverride=self.awsCredentialsOverride)
        readerRdd = reader.read(dataPath,
                                ext=ext,
                                startIdx=startIdx,
                                stopIdx=stopIdx,
                                recursive=recursive,
                                npartitions=npartitions)
        nrecords = reader.lastNRecs if nplanes is None else None
        return Images(readerRdd.flatMap(multitifReader), nrecords=nrecords)
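
A usage sketch for fromTif, with placeholder paths; nplanes must evenly divide the page count of each multipage file.

loader = ImagesLoader(sc)

# one record per multipage tif, pages stacked along the third axis
volumes = loader.fromTif('/data/tifs')

# treat every 5 pages as a separate record and request 20 partitions
planes = loader.fromTif('/data/tifs', nplanes=5, npartitions=20)
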
Example #10
    def get_local_corr(self, data, neighborhood, images=False):
        rdd = self.sc.parallelize(data)
        imgs = Images(rdd) if images else Series(rdd).toImages()
        return imgs.localCorr(neighborhood=neighborhood)
Example #11
    def get_local_corr(self, data, neighborhood, images=False):
        rdd = self.sc.parallelize(data)
        imgs = Images(rdd) if images else Series(rdd).toImages()
        return imgs.localCorr(neighborhood=neighborhood)
Example #12
    def fromOCP(self, bucketName, resolution, server='ocp.me', startIdx=None, stopIdx=None,
                minBound=None, maxBound=None):
        """
        Creates a new Images object with data read from OCP.
      
        Parameters
        ----------
        bucketName: string
            Name of the token/bucket in OCP. You can use the token name you created in OCP here.
            You can also access publicly available data on OCP at this URL "http://ocp.me/ocp/ca/public_tokens/"
        
        resolution: nonnegative int
            Resolution of the data in OCP

        server: string. optional.
            Name of the server in OCP which has the corresponding token.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        minBound, maxBound: tuple of nonnegative int. optional.
            X,Y,Z bounds of the data you want to fetch from OCP. minBound contains
            the (xMin,yMin,zMin) while maxBound contains (xMax,yMax,zMax)
        """

        # Given a data-path/bucket Query JSON
        # Given bounds get a list of URI's
        import urllib2
        urlList = []
        url = 'http://{}/ocp/ca/{}/info/'.format(server, bucketName)

        try:
            f = urllib2.urlopen(url)
        except urllib2.URLError:
            raise Exception("Failed URL {}".format(url))

        import json
        projInfo = json.loads(f.read())

        # Loading Information from JSON object
        ximageSize, yimageSize = projInfo['dataset']['imagesize']['{}'.format(resolution)]
        zimageStart, zimageStop = projInfo['dataset']['slicerange']
        timageStart, timageStop = projInfo['dataset']['timerange']

        # Checking if dimensions are within bounds
        if startIdx is None:
            startIdx = timageStart
        elif startIdx < timageStart or startIdx > timageStop:
            raise Exception("startIdx out of bounds {},{}".format(timageStart, timageStop))

        if stopIdx is None:
            stopIdx = timageStop
        elif stopIdx < timageStart or stopIdx > timageStop:
            raise Exception("startIdx out of bounds {},{}".format(timageStart, timageStop))

        if minBound is None:
            minBound = (0, 0, zimageStart)
        elif minBound < (0, 0, zimageStart) or minBound > (ximageSize, yimageSize, zimageStop):
            raise Exception("minBound is incorrect {},{}".format((0, 0, zimageStart),
                                                                 (ximageSize, yimageSize, zimageStop)))

        if maxBound is None:
            maxBound = (ximageSize, yimageSize, zimageStop)
        elif maxBound < (0, 0, zimageStart) or maxBound > (ximageSize, yimageSize, zimageStop):
            raise Exception("minBound is incorrect {},{}".format((0, 0, zimageStart), (ximageSize, yimageSize,
                                                                                       zimageStop)))

        for t in range(startIdx, stopIdx, 1):
            urlList.append("http://{}/ocp/ca/{}/npz/{},{}/{}/{},{}/{},{}/{},{}/".
                           format(server, bucketName, t, t + 1, resolution, minBound[0],
                                  maxBound[0], minBound[1], maxBound[1], minBound[2], maxBound[2]))

        def read(url):
            """Fetch URL from the server"""

            try:
                npzFile = urllib2.urlopen(url)
            except urllib2.URLError:
                raise Exception("Failed URL {}.".format(url))

            imgData = npzFile.read()
        
            import zlib
            import cStringIO
            pageStr = zlib.decompress(imgData[:])
            pageObj = cStringIO.StringIO(pageStr)
            data = load(pageObj)
            # Data comes in as 4d numpy array in t,z,y,x order. Swapping axes and removing the time dimension
            # to give back a 3d numpy array in x,y,z order
            data = swapaxes(data[0, :, :, :], 0, 2)

            return data
      
        rdd = self.sc.parallelize(enumerate(urlList), len(urlList)).map(lambda (k, v): (k, read(v)))
        return Images(rdd, nrecords=len(urlList))
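
A usage sketch for fromOCP; the token name and bounds are placeholders, and public tokens can be browsed at the URL given in the docstring.

loader = ImagesLoader(sc)

# fetch a small x, y, z region at resolution 0 for the token 'myToken'
images = loader.fromOCP('myToken', resolution=0,
                        minBound=(0, 0, 0), maxBound=(512, 512, 16))
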
Example #13
class ImagesLoader(object):
    """Loader object used to instantiate Images data stored in a variety of formats.
    """
    def __init__(self, sparkContext):
        """Initialize a new ImagesLoader object.

        Parameters
        ----------
        sparkcontext: SparkContext
            The pyspark SparkContext object used by the current Thunder environment.
        """
        self.sc = sparkContext

    def fromArrays(self, arrays):
        """Load Images data from passed sequence of numpy arrays.

        Expected usage is mainly in testing - having a full dataset loaded in memory
        on the driver is likely prohibitive in the use cases for which Thunder is intended.
        """
        # if passed a single array, cast it to a sequence of length 1
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        shape = None
        dtype = None
        for ary in arrays:
            if shape is None:
                shape = ary.shape
                dtype = ary.dtype
            if not ary.shape == shape:
                raise ValueError("Arrays must all be of same shape; got both %s and %s" %
                                 (str(shape), str(ary.shape)))
            if not ary.dtype == dtype:
                raise ValueError("Arrays must all be of same data type; got both %s and %s" %
                                 (str(dtype), str(ary.dtype)))
        return Images(self.sc.parallelize(enumerate(arrays), len(arrays)),
                      dims=shape, dtype=str(dtype), nimages=len(arrays))

    def fromStack(self, dataPath, dims, dtype='int16', ext='stack', startIdx=None, stopIdx=None, recursive=False):
        """Load an Images object stored in a directory of flat binary files

        The RDD wrapped by the returned Images object will have a number of partitions equal to the number of image data
        files read in by this method.

        Currently all binary data read by this method is assumed to be formatted as signed 16 bit integers in native
        byte order.

        Parameters
        ----------

        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A datapath argument may include a single '*' wildcard character in the filename.

        dims: tuple of positive int
            Dimensions of input image data, ordered with fastest-changing dimension first

        ext: string, optional, default "stack"
            Extension required on data files to be loaded.

        startIdx, stopIdx: nonnegative int. optional.
            Indices of the first and last-plus-one data file to load, relative to the sorted filenames matching
            `datapath` and `ext`. Interpreted according to python slice indexing conventions.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at datapath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).
        """
        if not dims:
            raise ValueError("Image dimensions must be specified if loading from binary stack data")

        def toArray(buf):
            return frombuffer(buf, dtype=dtype, count=int(prod(dims))).reshape(dims, order='F')

        reader = getParallelReaderForPath(dataPath)(self.sc)
        readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        return Images(readerRdd.mapValues(toArray), nimages=reader.lastNRecs, dims=dims,
                      dtype=dtype)

    def fromTif(self, dataPath, ext='tif', startIdx=None, stopIdx=None, recursive=False):
        """Sets up a new Images object with data to be read from one or more tif files.

        The RDD underlying the returned Images will have key, value data as follows:

        key: int
            key is index of original data file, determined by lexicographic ordering of filenames
        value: numpy ndarray
            value dimensions with be x by y by num_channels*num_pages; all channels and pages in a file are
            concatenated together in the third dimension of the resulting ndarray. For pages 0, 1, etc
            of a multipage TIF of RGB images, ary[:,:,0] will be R channel of page 0 ("R0"), ary[:,:,1] will be G0,
            ... ary[:,:,3] == R1, and so on.

        This method attempts to explicitly import PIL. ImportError may be thrown if 'from PIL import Image' is
        unsuccessful. (PIL/pillow is not an explicit requirement for thunder.)
        """
        try:
            from PIL import Image
        except ImportError, e:
            Image = None
            raise ImportError("fromMultipageTif requires a successful 'from PIL import Image'; " +
                              "the PIL/pillow library appears to be missing or broken.", e)
        # we know that array(pilimg) works correctly for pillow == 2.3.0, and that it
        # does not work (at least not with spark) for old PIL == 1.1.7. we believe but have not confirmed
        # that array(pilimg) works correctly for every version of pillow. thus currently we check only whether
        # our PIL library is in fact pillow, and choose our conversion function accordingly
        isPillow = hasattr(Image, "PILLOW_VERSION")
        if isPillow:
            conversionFcn = array  # use numpy's array() function
        else:
            from thunder.utils.common import pil_to_array
            conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

        def multitifReader(buf):
            fbuf = BytesIO(buf)
            multipage = Image.open(fbuf)
            pageIdx = 0
            imgArys = []
            while True:
                try:
                    multipage.seek(pageIdx)
                    imgArys.append(conversionFcn(multipage))
                    pageIdx += 1
                except EOFError:
                    # past last page in tif
                    break
            if len(imgArys) == 1:
                return imgArys[0]
            else:
                return dstack(imgArys)

        reader = getParallelReaderForPath(dataPath)(self.sc)
        readerRdd = reader.read(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        return Images(readerRdd.mapValues(multitifReader), nimages=reader.lastNRecs)
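
To make the channel/page layout described in this fromTif docstring concrete, here is a short sketch of unpacking one record of an RGB multipage tif; the path is a placeholder.

loader = ImagesLoader(sc)
images = loader.fromTif('/data/rgb_tifs')

key, ary = images.rdd.first()
npages = ary.shape[2] // 3        # three channels per page for RGB input
page0 = ary[:, :, 0:3]            # R0, G0, B0 of the first page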