Example 1
    def reference(images, method='mean', startIdx=None, stopIdx=None):
        """
        Compute a reference image for use in registration.

        Parameters
        ----------
        images : Images
            The images to compute a reference from

        method : str, optional, default = 'mean'
            How to compute the reference

        startIdx : int, optional, default = None
            Starting index if computing a mean over a specified range

        stopIdx : int, optional, default = None
            Stopping index if computing a mean over a specified range

        """

        # TODO easy option for using the mean of the middle n images
        # TODO fix inclusive behavior to match e.g. image loading

        checkParams(method, ['mean'])

        if method == 'mean':
            if startIdx is not None and stopIdx is not None:
                idxRange = lambda x: startIdx <= x < stopIdx
                n = stopIdx - startIdx
                ref = images.filterOnKeys(idxRange)
            else:
                ref = images
                n = images.nimages
            refval = ref.sum() / (1.0 * n)
            return refval.astype(images.dtype)
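
Every example on this page funnels its string options through a checkParams helper defined elsewhere in thunder. Its implementation is not shown here; a minimal stand-in consistent with how it is called (raise on an unrecognized option) might look like this:

    def checkParams(param, options):
        # Hypothetical stand-in for thunder's checkParams helper (the real
        # implementation may differ): raise if the value is not among the
        # valid options, comparing case-insensitively since several callers
        # lower-case the value before use.
        if str(param).lower() not in [str(opt).lower() for opt in options]:
            raise ValueError("Option must be one of %s, got '%s'" % (list(options), param))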
Example 2
    def similarity(self, other, metric='distance', thresh=5):
        """
        Estimate similarity between sources in self and other.

        Will compute the fraction of sources in self that are found
        in other, based on a given distance metric and a threshold.
        The fraction is estimated as the number of sources in self
        found in other, divided by the total number of sources in self.

        Parameters
        ----------
        other : SourceModel
            The sources to compare to

        metric : str, optional, default = "distance"
            Metric to use when computing distances

        thresh : scalar, optional, default = 5
            The distance below which a source is considered found
        """

        checkParams(metric, ['distance'])

        if metric == 'distance':
            vals = self.distance(other, minDistance=thresh)
            vals[isnan(vals)] = inf
        else:
            raise Exception("Metric not recognized")

        hits = sum(vals < thresh) / float(len(self.sources))

        return hits
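
The NaN handling is easy to check in isolation: unmatched sources come back as NaN distances and are set to inf so they can never pass the `< thresh` test. A small self-contained illustration with toy numbers:

    from numpy import array, isnan, inf

    vals = array([2.0, 7.0, float('nan'), 1.0])
    vals[isnan(vals)] = inf                    # unmatched sources can never be "found"
    print(sum(vals < 5) / float(len(vals)))    # 2 of 4 distances below threshold: 0.5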
Example 3
    def distance(self, other, method='euclidean'):
        """
        Distance between the center of this source and another.

        Parameters
        ----------
        other : Source, or array-like
            Either another source, or the center coordinates of another source

        method : str, optional, default = 'euclidean'
            Distance measure to use for spatial distance between source
            centers. Options are Euclidean distance ('euclidean') and the
            L1 norm ('l1').

        """
        from numpy.linalg import norm

        checkParams(method, ['euclidean', 'l1'])

        if method == 'l1':
            order = 1
        else:
            order = 2

        if isinstance(other, Source):
            return norm(self.center - other.center, ord=order)
        elif isinstance(other, (list, ndarray)):
            return norm(self.center - asarray(other), ord=order)
        else:
            raise TypeError("other must be a Source or array-like of center coordinates")
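
The two `ord` values map directly onto numpy's vector norms; a standalone check with made-up centers:

    from numpy import asarray
    from numpy.linalg import norm

    a = asarray([0.0, 3.0])    # center of one source
    b = asarray([4.0, 0.0])    # center of another
    print(norm(a - b, ord=2))  # Euclidean distance: 5.0
    print(norm(a - b, ord=1))  # L1 distance: 7.0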
Example 4
    def detrend(self, method='linear', **kwargs):
        """
        Detrend time series data with linear or nonlinear detrending.
        Preserves the intercept so that subsequent steps can adjust the baseline.

        Parameters
        ----------
        method : str, optional, default = 'linear'
            Detrending method

        order : int, optional, default = 5
            Order of polynomial, for non-linear detrending only
        """
        checkParams(method, ['linear', 'nonlinear'])

        if method.lower() == 'linear':
            order = 1
        else:
            if 'order' in kwargs:
                order = kwargs['order']
            else:
                order = 5

        def func(y):
            x = arange(len(y))
            p = polyfit(x, y, order)
            p[-1] = 0
            yy = polyval(p, x)
            return y - yy

        return self.applyValues(func, keepIndex=True)
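
The body of `func` is plain numpy and can be exercised on a synthetic trace; zeroing the last polynomial coefficient (the intercept) is what preserves the baseline:

    from numpy import arange, polyfit, polyval

    y = 0.5 * arange(10) + 3.0   # linear trend on top of a baseline of 3
    x = arange(len(y))
    p = polyfit(x, y, 1)         # fit the trend; p[-1] is the intercept
    p[-1] = 0                    # drop the intercept from the fitted trend
    print(y - polyval(p, x))     # residual keeps the baseline, ~3 everywhere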
Example 5
    def setFilter(self, filter='median', param=2):
        """
        Set a filter to apply to images before registration.

        The filtering will be applied to both the reference and
        image to compute the transformation parameters, but the filtering
        will not be applied to the images themselves.

        Parameters
        ----------

        filter : str, optional, default = 'median'
            Which filter to use (options are 'median' and 'gaussian')

        param : int, optional, default = 2
            Parameter to provide to filtering function (e.g. size for median filter)

        See also
        --------
        Images.medianFilter : apply median filter to images
        Images.gaussianFilter : apply gaussian filter to images

        """

        checkParams(filter, ['median', 'gaussian'])

        if filter == 'median':
            from scipy.ndimage.filters import median_filter
            self._filter = lambda x: median_filter(x, param)

        if filter == 'gaussian':
            from scipy.ndimage.filters import gaussian_filter
            self._filter = lambda x: gaussian_filter(x, param)

        return self
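
The two filters being wrapped are ordinary scipy.ndimage functions, with `param` acting as the kernel size for the median filter and the standard deviation for the Gaussian; applied directly to a toy image:

    from numpy.random import rand
    from scipy.ndimage import median_filter, gaussian_filter

    img = rand(32, 32)
    print(median_filter(img, 2).shape)    # size-2 median filter
    print(gaussian_filter(img, 2).shape)  # sigma-2 gaussian filter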
Example 6
    def makeExample(self, dataset=None, **opts):
        """
        Make an example data set for testing analyses.

        Options include 'pca', 'factor', 'kmeans', 'ica', 'sources'
        See thunder.utils.datasets for detailed options.

        Parameters
        ----------
        dataset : str
            Which dataset to generate

        Returns
        -------
        data : RDD of (tuple, array) pairs
            Generated dataset

        """
        from thunder.utils.datasets import DATASET_MAKERS

        if dataset is None:
            return sorted(DATASET_MAKERS.keys())

        checkParams(dataset, DATASET_MAKERS.keys())

        return DataSets.make(self._sc, dataset, **opts)
Example 7
    def export(self,
               data,
               filename,
               format=None,
               overwrite=False,
               varname=None):
        """
        Export local array data to a variety of formats.

        Can write to a local file system or S3 (destination inferred from the filename scheme).
        S3 writing is useful for persisting arrays when working in an environment without
        accessible local storage.

        Parameters
        ----------
        data : array-like
            The data to export

        filename : str
            Output location (path/to/file.ext)

        format : str, optional, default = None
            Output format ("npy", "mat", or "txt"); if not provided, will
            try to infer from the file extension.

        overwrite : boolean, optional, default = False
            Whether to overwrite if directory or file already exists

        varname : str, optional, default = None
            Variable name for writing "mat" formatted files
        """
        from numpy import save, savetxt
        from scipy.io import savemat
        from StringIO import StringIO

        from thunder.rdds.fileio.writers import getFileWriterForPath

        path, file, format = handleFormat(filename, format)
        checkParams(format, ["npy", "mat", "txt"])
        clazz = getFileWriterForPath(filename)
        writer = clazz(path,
                       file,
                       overwrite=overwrite,
                       awsCredentialsOverride=self._credentials)

        stream = StringIO()

        if format == "mat":
            varname = os.path.splitext(file)[0] if varname is None else varname
            savemat(stream,
                    mdict={varname: data},
                    oned_as='column',
                    do_compression='true')
        if format == "npy":
            save(stream, data)
        if format == "txt":
            savetxt(stream, data)

        stream.seek(0)
        writer.writeFile(stream.buf)
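
The serialize-to-a-buffer-then-write pattern above is Python 2 specific (`StringIO`, `stream.buf`). The same idea in a self-contained, version-neutral sketch uses an in-memory bytes buffer:

    from io import BytesIO
    from numpy import arange, save

    stream = BytesIO()
    save(stream, arange(6).reshape(2, 3))  # serialize the array into the buffer
    stream.seek(0)
    payload = stream.getvalue()            # raw bytes, ready for any writer (local or S3)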
Example 8
    def __new__(cls, method="crosscorr"):

        checkParams(method, ["crosscorr"])

        if method == "crosscorr":
            return super(Register, cls).__new__(CrossCorr)
        else:
            raise Exception('Registration method not recognized')
Example 9
    def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
                   confFilename='conf.json', keyType=None, valueType=None):
        """
        Loads a Series object from data stored as text or binary files.

        Supports single files or multiple files stored on a local file system, a networked file system (mounted
        and available on all cluster nodes), Amazon S3, or HDFS.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include 'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
            '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

        nkeys: int, optional (but required if `inputFormat` is 'text')
            Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image timeseries data.)
            For text data, the number of keys must be specified in this parameter; for binary data, the number of keys
            must be specified either in this parameter or in a configuration file named by the `confFilename` argument
            if this parameter is not set.

        nvalues: int, optional (but required if `inputFormat` is 'text')
            Number of values expected to be read. For binary data, nvalues must be specified either in this parameter
            or in a configuration file named by the `confFilename` argument if this parameter is not set.

        inputFormat: {'text', 'binary'}, optional, default 'binary'
            Format of data to be read.

        minPartitions: int, optional
            Explicitly specify minimum number of Spark partitions to be generated from this data. Used only for
            text data. Default is to use minParallelism attribute of Spark context object.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and 'valuetype'.
            If a file is not found at the given path, then the base directory given in `dataPath`
            will also be checked. Parameters `nkeys` or `nvalues` that are specified as explicit arguments to this
            method will take priority over those found in the configuration file if both are present.

        Returns
        -------
        data: thunder.rdds.Series
            A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys an n-tuple
            of int, with n given by `nkeys` or the configuration passed via `confFilename`. RDD values will be a numpy
            array of length `nvalues` (or as specified in the passed configuration file).
        """
        checkParams(inputFormat, ['text', 'binary'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'text':
            data = loader.fromText(dataPath, nkeys=nkeys)
        else:
            # checkParams above guarantees inputFormat is 'binary' here
            data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                     keyType=keyType, valueType=valueType)
        return data
Example 10
    def overlap(self, other, method='support', counts=False, symmetric=True):
        """
        Compute the overlap between this source and other, in terms
        of either support or similarity of coefficients.

        Support computes the number of overlapping pixels relative
        to the union of both sources. Correlation computes the similarity
        of the weights (not defined for binary masks).

        Parameters
        ----------
        other : Source
            The source to compute overlap with.

        method : str, optional, default = 'support'
            Compare either the support of the source coefficients ('support'),
            or the correlation of coefficients within the intersection ('corr').

        counts : boolean, optional, default = False
            Whether to return raw counts when computing support; otherwise
            return a fraction.

        symmetric : boolean, optional, default = True
            Whether misses count pixels absent from either source (symmetric),
            or only this source's pixels absent from other (asymmetric).

        """
        checkParams(method, ['support', 'corr'])

        coordsSelf = aslist(self.coordinates)
        coordsOther = aslist(other.coordinates)

        intersection = [a for a in coordsSelf if a in coordsOther]
        complementLeft = [a for a in coordsSelf if a not in intersection]
        complementRight = [a for a in coordsOther if a not in intersection]
        hits = len(intersection)

        if symmetric is True:
            misses = len(complementLeft + complementRight)
        else:
            misses = len(complementLeft)

        if method == 'support':
            if counts:
                return hits, misses
            else:
                return hits/float(hits+misses)

        if method == 'corr':
            from scipy.stats import spearmanr

            if not (hasattr(self, 'values') and hasattr(other, 'values')):
                raise Exception('Sources must have values to compute correlation')
            else:
                valuesSelf = aslist(self.values)
                valuesOther = aslist(other.values)
            if len(intersection) > 0:
                # lists cannot be indexed by a list of coordinates; select the
                # values whose coordinates fall in the intersection instead
                left = [v for v, c in zip(valuesSelf, coordsSelf) if c in coordsOther]
                right = [v for v, c in zip(valuesOther, coordsOther) if c in coordsSelf]
                rho, _ = spearmanr(left, right)
            else:
                rho = 0.0
            return (rho * hits)/float(hits + misses)
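
The support computation reduces to counting shared and unshared coordinates. A worked example with two tiny sources, one shared pixel and three unshared, giving a symmetric fraction of 1/4:

    coordsSelf = [(0, 0), (0, 1)]
    coordsOther = [(0, 1), (1, 1), (2, 2)]

    hits = [c for c in coordsSelf if c in coordsOther]               # [(0, 1)]
    misses = [c for c in coordsSelf + coordsOther if c not in hits]  # 3 unshared pixels
    print(len(hits) / float(len(hits) + len(misses)))                # 0.25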
Example 11
    def __new__(cls, method, **kwargs):

        from thunder.extraction.block.methods.nmf import BlockNMF
        from thunder.extraction.feature.methods.localmax import LocalMax

        EXTRACTION_METHODS = {'nmf': BlockNMF, 'localmax': LocalMax}

        checkParams(method, EXTRACTION_METHODS.keys())
        return EXTRACTION_METHODS[method](**kwargs)
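
This `__new__` is a dict-based factory: the method string selects a class, which is then instantiated with the remaining keyword arguments. The same pattern, reduced to a self-contained sketch with placeholder classes:

    class Fast(object):
        def __init__(self, **kwargs):
            self.opts = kwargs

    class Slow(object):
        def __init__(self, **kwargs):
            self.opts = kwargs

    METHODS = {'fast': Fast, 'slow': Slow}

    def make(method, **kwargs):
        # validate the name, then dispatch to the selected class
        if method not in METHODS:
            raise ValueError("method must be one of %s" % list(METHODS))
        return METHODS[method](**kwargs)

    print(type(make('fast', tol=0.1)).__name__)   # Fast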
Example 12
    def loadExample(self, dataset=None):
        """
        Load a local example data set for testing analyses.

        Some of these data sets are extremely downsampled and should be considered
        useful only for testing the API. If called with None,
        will return list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : Data object
            The loaded dataset as a Thunder data object (e.g. Series or Images)
        """
        import atexit
        import shutil
        import tempfile
        from pkg_resources import resource_listdir, resource_filename

        DATASETS = {
            'iris': 'iris',
            'fish-series': 'fish/bin',
            'fish-images': 'fish/tif-stack'
        }

        if dataset is None:
            return DATASETS.keys()

        checkParams(dataset, DATASETS.keys())

        if 'ec2' in self._sc.master:
            tmpdir = os.path.join('/root/thunder/python/thunder/utils', 'data',
                                  DATASETS[dataset])
        else:
            tmpdir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, tmpdir)

            def copyLocal(target):
                files = resource_listdir('thunder.utils.data', target)
                for f in files:
                    path = resource_filename('thunder.utils.data',
                                             os.path.join(target, f))
                    shutil.copy(path, tmpdir)

            copyLocal(DATASETS[dataset])

        if dataset == "iris":
            return self.loadSeries(tmpdir)
        elif dataset == "fish-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "fish-images":
            return self.loadImages(tmpdir, inputFormat="tif")
Example 13
    def normalize(self, baseline='percentile', window=None, perc=20):
        """
        Normalize each time series by subtracting and dividing by a baseline.

        Baseline can be derived from a global mean or percentile,
        or a smoothed percentile estimated within a rolling window.

        Parameters
        ----------
        baseline : str, optional, default = 'percentile'
            Quantity to use as the baseline, options are 'mean', 'percentile', 'window', or 'window-fast'

        window : int, optional, default = None
            Size of window for baseline estimation, for 'window' and 'window-fast' baseline only

        perc : int, optional, default = 20
            Percentile value to use, for 'percentile', 'window', or 'window-fast' baseline only
        """
        checkParams(baseline, ['mean', 'percentile', 'window', 'window-fast'])
        method = baseline.lower()

        from warnings import warn
        if method not in ('window', 'window-fast') and window is not None:
            warn('Setting window has no effect unless baseline is "window" or "window-fast"')

        if method == 'mean':
            baseFunc = mean

        if method == 'percentile':
            baseFunc = lambda x: percentile(x, perc)

        if method == 'window':
            if window & 0x1:
                left, right = (ceil(window / 2), ceil(window / 2) + 1)
            else:
                left, right = (window / 2, window / 2)

            n = len(self.index)
            baseFunc = lambda x: asarray([
                percentile(x[max(ix - left, 0):min(ix + right + 1, n)], perc)
                for ix in arange(0, n)
            ])

        if method == 'window-fast':
            from scipy.ndimage.filters import percentile_filter
            baseFunc = lambda x: percentile_filter(
                x.astype(float64), perc, window, mode='nearest')

        def get(y):
            b = baseFunc(y)
            return (y - b) / (b + 0.1)

        return self.applyValues(get, keepIndex=True)
Example 14
    def normalize(self, baseline='percentile', window=None, perc=20, offset=0.1):
        """
        Normalize each time series by subtracting and dividing by a baseline.

        Baseline can be derived from a global mean or percentile,
        or a smoothed percentile estimated within a rolling window.

        Parameters
        ----------
        baseline : str, optional, default = 'percentile'
            Quantity to use as the baseline, options are 'mean', 'percentile', 'window', or 'window-fast'

        window : int, optional, default = None
            Size of window for baseline estimation, for 'window' and 'window-fast' baseline only

        perc : int, optional, default = 20
            Percentile value to use, for 'percentile', 'window', or 'window-fast' baseline only

        offset : float, optional, default = 0.1
            Scalar added to baseline during division
        """
        checkParams(baseline, ['mean', 'percentile', 'window', 'window-fast'])
        method = baseline.lower()
    
        from warnings import warn
        if method not in ('window', 'window-fast') and window is not None:
            warn('Setting window has no effect unless baseline is "window" or "window-fast"')

        if method == 'mean':
            baseFunc = mean

        if method == 'percentile':
            baseFunc = lambda x: percentile(x, perc)

        if method == 'window':
            if window & 0x1:
                left, right = (ceil(window/2), ceil(window/2) + 1)
            else:
                left, right = (window/2, window/2)

            n = len(self.index)
            baseFunc = lambda x: asarray([percentile(x[max(ix-left, 0):min(ix+right+1, n)], perc)
                                          for ix in arange(0, n)])

        if method == 'window-fast':
            from scipy.ndimage.filters import percentile_filter
            baseFunc = lambda x: percentile_filter(x.astype(float64), perc, window, mode='nearest')

        def get(y):
            b = baseFunc(y)
            return (y - b) / (b + offset)

        return self.applyValues(get, keepIndex=True)
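
For the simple 'percentile' baseline, `get` reduces to a couple of numpy calls; on a toy trace the transform behaves like a delta-F-over-F:

    from numpy import array, percentile

    y = array([10.0, 12.0, 11.0, 30.0, 10.0])
    b = percentile(y, 20)          # baseline estimate: 10.0
    print((y - b) / (b + 0.1))     # ~[0.0, 0.198, 0.099, 1.98, 0.0]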
Example 15
    def __new__(cls, method, **kwargs):

        from thunder.registration.methods.crosscorr import CrossCorr, PlanarCrossCorr

        REGMETHODS = {
            'crosscorr': CrossCorr,
            'planarcrosscorr': PlanarCrossCorr
        }

        checkParams(method, REGMETHODS.keys())

        # expand the options as keyword arguments, matching the extraction factory
        return REGMETHODS[method](**kwargs)
Example 16
    def similarity(self, other, metric='distance', thresh=5, minDistance=inf):
        """
        Estimate similarity to another set of sources using recall and precision.

        Will compute the number of sources in self that are also
        in other, based on a given distance metric and a threshold.
        The recall rate is the number of matches divided by the number in self,
        and the precision rate is the number of matches divided by the number in other.
        Typically self is ground truth and other is an estimate.
        The F score is defined as 2 * (recall * precision) / (recall + precision)

        Before computing metrics, all sources in self are matched to other,
        and a minimum distance can be set to control matching.

        Parameters
        ----------
        other : SourceModel
            The sources to compare to.

        metric : str, optional, default = 'distance'
            Metric to use when computing distances,
            options include 'distance' and 'overlap'

        thresh : scalar, optional, default = 5
            The distance below which a source is considered found.

        minDistance : scalar, optional, default = inf
            Minimum distance to use when matching indices.
        """
        checkParams(metric, ['distance', 'overlap'])

        if metric == 'distance':
            # when evaluating distances,
            # minimum distance should be the threshold
            if minDistance == inf:
                minDistance = thresh
            vals = self.distance(other, minDistance=minDistance)
            vals[isnan(vals)] = inf
            compare = lambda x: x < thresh
        elif metric == 'overlap':
            vals = self.overlap(other,
                                method='fraction',
                                minDistance=minDistance)
            vals[isnan(vals)] = 0
            compare = lambda x: x > thresh
        else:
            raise Exception("Metric not recognized")

        recall = sum(map(compare, vals)) / float(self.count)
        precision = sum(map(compare, vals)) / float(other.count)
        # guard against division by zero when there are no matches at all
        if recall + precision > 0:
            score = 2 * (recall * precision) / (recall + precision)
        else:
            score = 0.0

        return recall, precision, score
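
The scoring arithmetic at the end is easiest to see with concrete counts; with 8 matches against 10 ground-truth sources and 16 estimates:

    matches, nSelf, nOther = 8, 10, 16
    recall = matches / float(nSelf)        # 0.8
    precision = matches / float(nOther)    # 0.5
    score = 2 * (recall * precision) / (recall + precision)
    print(recall, precision, score)        # F score is about 0.615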
Example 17
def export(data, outputDirPath, outputFilename, outputFormat, sorting=False):
    """
    Export data to a variety of local formats.

    Can export local arrays or a Series. If passed a Series,
    it will first be packed into one or more local arrays.

    Parameters
    ----------
    data : Series, or numpy array
        The data to export

    outputDirPath : str
        Output directory

    outputFilename : str
        Output filename

    outputFormat : str
        Output format ("matlab", "npy", or "text")

    """

    from thunder.rdds.series import Series
    from scipy.io import savemat

    checkParams(outputFormat, ['matlab', 'npy', 'text'])

    if not os.path.exists(outputDirPath):
        os.makedirs(outputDirPath)

    filename = os.path.join(outputDirPath, outputFilename)

    def write(array, file, format, varname=None):
        if format == 'matlab':
            savemat(file+".mat", mdict={varname: array}, oned_as='column', do_compression='true')
        if format == 'npy':
            save(file, array)
        if format == 'text':
            savetxt(file+".txt", array, fmt="%.6f")

    if isinstance(data, Series):
        # force calculation of dimensions
        _tmp = data.dims
        if size(data.index) > 1:
            for ix in data.index:
                result = data.select(ix).pack(sorting=sorting)
                write(result, filename+"_"+str(ix), outputFormat, varname=outputFilename+"_"+str(ix))
        else:
            result = data.pack(sorting=sorting)
            write(result, filename, outputFormat, varname=outputFilename+"_"+str(data.index))
    else:
        write(data, filename, outputFormat, varname=outputFilename)
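
The inner `write` dispatch can be exercised directly on a small array; the file names here are illustrative and land in the current directory:

    from numpy import arange, save, savetxt
    from scipy.io import savemat

    arr = arange(6).reshape(2, 3)
    savemat('demo.mat', mdict={'demo': arr}, oned_as='column', do_compression=True)
    save('demo', arr)                      # numpy appends .npy -> demo.npy
    savetxt('demo.txt', arr, fmt='%.6f')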
Example 18
    def overlap(self, other, method="fraction"):
        """
        Compute the overlap between this source and other.

        Options are a symmetric measure of overlap based on the fraction
        of intersecting pixels relative to the union ('fraction'), an asymmetric
        measure of overlap that expresses detected intersecting pixels
        (relative to this source) using precision and recall rates ('rates'), or
        a correlation coefficient of the weights within the intersection,
        which is not defined for binary weights ('correlation').

        Parameters
        ----------
        other : Source
            The source to compute overlap with.

        method : str
            Which estimate of overlap to compute; options are
            'fraction' (symmetric), 'rates' (asymmetric), or 'correlation'
        """
        checkParams(method, ["fraction", "rates", "correlation"])

        coordsSelf = aslist(self.coordinates)
        coordsOther = aslist(other.coordinates)

        intersection = [a for a in coordsSelf if a in coordsOther]
        nhit = float(len(intersection))
        ntotal = float(len(set([tuple(x) for x in coordsSelf] + [tuple(x) for x in coordsOther])))

        if method == "rates":
            recall = nhit / len(coordsSelf)
            precision = nhit / len(coordsOther)
            return recall, precision

        if method == "fraction":
            return nhit / float(ntotal)

        if method == "correlation":
            from scipy.stats import spearmanr

            if not (hasattr(self, "values") and hasattr(other, "values")):
                raise ValueError("Sources must have values to compute correlation")
            else:
                valuesSelf = aslist(self.values)
                valuesOther = aslist(other.values)
            if len(intersection) > 0:
                left = [v for v, c in zip(valuesSelf, coordsSelf) if c in coordsOther]
                right = [v for v, c in zip(valuesOther, coordsOther) if c in coordsSelf]
                rho, _ = spearmanr(left, right)
            else:
                rho = 0.0
            return rho
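
A worked comparison of the 'fraction' and 'rates' outputs, for two three-pixel sources that share two pixels (a union of four):

    coordsSelf = [(0, 0), (0, 1), (1, 0)]
    coordsOther = [(0, 1), (1, 0), (1, 1)]

    nhit = float(len([c for c in coordsSelf if c in coordsOther]))   # 2 shared pixels
    ntotal = len(set(coordsSelf + coordsOther))                      # union of 4 pixels
    print(nhit / ntotal)               # fraction: 0.5
    print(nhit / len(coordsSelf))      # recall: ~0.67
    print(nhit / len(coordsOther))     # precision: ~0.67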
Example 19
    def export(self, data, filename, outputFormat=None, overwrite=False, varname=None):
        """
        Export local array data to a variety of formats.

        Can write to a local file system or S3 (destination inferred from the filename scheme).
        S3 writing is useful for persisting arrays when working in an environment without
        accessible local storage.

        Parameters
        ----------
        data : array-like
            The data to export

        filename : str
            Output location (path/to/file.ext)

        outputFormat : str, optional, default = None
            Output format ("npy", "mat", or "txt"); if not provided, will
            try to infer from the file extension.

        overwrite : boolean, optional, default = False
            Whether to overwrite if directory or file already exists

        varname : str, optional, default = None
            Variable name for writing "mat" formatted files
        """
        from numpy import save, savetxt, asarray
        from scipy.io import savemat
        from StringIO import StringIO

        from thunder.rdds.fileio.writers import getFileWriterForPath

        path, file, outputFormat = handleFormat(filename, outputFormat)
        checkParams(outputFormat, ["npy", "mat", "txt"])
        clazz = getFileWriterForPath(filename)
        writer = clazz(path, file, overwrite=overwrite, awsCredentialsOverride=self._credentials)

        stream = StringIO()

        if outputFormat == "mat":
            varname = os.path.splitext(file)[0] if varname is None else varname
            savemat(stream, mdict={varname: data}, oned_as='column', do_compression='true')
        if outputFormat == "npy":
            save(stream, data)
        if outputFormat == "txt":
            if asarray(data).ndim > 2:
                raise Exception("Cannot write data with more than two dimensions to text")
            savetxt(stream, data)

        stream.seek(0)
        writer.writeFile(stream.buf)
Example 20
    def loadExampleS3(self, dataset=None):
        """
        Load an example data set from S3.

        Info on the included datasets can be found at the CodeNeuro data repository
        (http://datasets.codeneuro.org/). If called with None, will return
        list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : a Data object (usually a Series or Images)
            The dataset as one of Thunder's data objects

        params : dict
            Parameters or metadata for dataset
        """
        DATASETS = {
            'ahrens.lab/direction.selectivity': 'ahrens.lab/direction.selectivity/1/',
            'ahrens.lab/optomotor.response': 'ahrens.lab/optomotor.response/1/',
            'svoboda.lab/tactile.navigation': 'svoboda.lab/tactile.navigation/1/'
        }

        if 'local' in self._sc.master:
            raise Exception("Must be running on an EC2 cluster to load this example data set")

        if dataset is None:
            return DATASETS.keys()

        checkParams(dataset, DATASETS.keys())

        basePath = 's3n://neuro.datasets/'
        dataPath = DATASETS[dataset]

        data = self.loadSeries(basePath + dataPath + 'series')
        params = self.loadParams(basePath + dataPath + 'params/covariates.json')

        return data, params
Example 21
    def similarity(self, other, metric='distance', thresh=5, minDistance=inf):
        """
        Estimate similarity between sources in self and other.

        Will compute the fraction of sources in self that are found
        in other, based on a given distance metric and a threshold.
        The fraction is estimated as the number of sources in self
        found in other, divided by the total number of sources in self.

        Before computing metrics, all sources in self are matched to other,
        and a minimum distance can be set to control matching.

        Parameters
        ----------
        other : SourceModel
            The sources to compare to

        metric : str, optional, default = "distance"
            Metric to use when computing distances,
            options include 'distance' and 'overlap'

        thresh : scalar, optional, default = 5
            The distance below which a source is considered found

        minDistance : scalar, optional, default = inf
            Minimum distance to use when matching indices
        """
        checkParams(metric, ['distance', 'overlap'])

        if metric == 'distance':
            # when evaluating distances,
            # minimum distance should be the threshold
            if minDistance == inf:
                minDistance = thresh
            vals = self.distance(other, minDistance=minDistance)
            vals[isnan(vals)] = inf
            compare = lambda x: x < thresh
        elif metric == 'overlap':
            vals = self.overlap(other, method='support', minDistance=minDistance)
            vals[isnan(vals)] = 0
            compare = lambda x: x > thresh
        else:
            raise Exception("Metric not recognized")

        hits = sum(map(compare, vals)) / float(len(self.sources))

        return hits
Example 22
    def loadSeriesLocal(self,
                        dataFilePath,
                        inputFormat='npy',
                        minPartitions=None,
                        keyFilePath=None,
                        varName=None):
        """
        Load a Series object from a local file (either npy or MAT format).

        File should contain a 1d or 2d matrix, where each row
        of the input matrix is a record.

        Keys can be provided in a separate file (with variable name 'keys', for MAT files).
        If not provided, linear indices will be used for keys.

        Parameters
        ----------
        dataFilePath: str
            File to import

        varName : str, optional, default = None
            Variable name to load (for MAT files only)

        keyFilePath : str, optional, default = None
            File containing the keys for each record as another 1d or 2d array

        minPartitions : int, optional, default = None
            Minimum number of partitions for the RDD
        """

        checkParams(inputFormat, ['mat', 'npy'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'mat':
            if varName is None:
                raise Exception(
                    'Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataFilePath, varName, keyFilePath)
        else:
            data = loader.fromNpyLocal(dataFilePath, keyFilePath)

        return data
Example 23
    def loadExampleS3(self, dataset=None):
        """
        Load an example data set from S3.

        Info on the included datasets can be found at the CodeNeuro data repository
        (http://datasets.codeneuro.org/). If called with None, will return
        list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : a Data object (usually a Series or Images)
            The dataset as one of Thunder's data objects

        params : dict
            Parameters or metadata for dataset
        """
        DATASETS = {
            'ahrens.lab/direction.selectivity': 'ahrens.lab/direction.selectivity/1/',
            'ahrens.lab/optomotor.response': 'ahrens.lab/optomotor.response/1/',
            'svoboda.lab/tactile.navigation': 'svoboda.lab/tactile.navigation/1/'
        }

        if dataset is None:
            return DATASETS.keys()

        if 'local' in self._sc.master:
            raise Exception("Must be running on an EC2 cluster to load this example data set")

        checkParams(dataset, DATASETS.keys())

        basePath = 's3n://neuro.datasets/'
        dataPath = DATASETS[dataset]

        data = self.loadSeries(basePath + dataPath + 'series')
        params = self.loadParams(basePath + dataPath + 'params/covariates.json')

        return data, params
Example 24
    def makeExample(self, dataset, **opts):
        """
        Make an example data set for testing analyses.

        Options include 'pca', 'kmeans', and 'ica'.
        See thunder.utils.datasets for detailed options.

        Parameters
        ----------
        dataset : str
            Which dataset to generate

        Returns
        -------
        data : RDD of (tuple, array) pairs
            Generated dataset

        """
        checkParams(dataset, ['kmeans', 'pca', 'ica'])

        return DataSets.make(self._sc, dataset, **opts)
Example 25
    def loadImages(self,
                   dataPath,
                   dims=None,
                   dtype=None,
                   inputFormat='stack',
                   ext=None,
                   startIdx=None,
                   stopIdx=None,
                   recursive=False,
                   nplanes=None,
                   npartitions=None,
                   renumber=False,
                   confFilename='conf.json'):
        """
        Loads an Images object from data stored as a binary image stack, or as tif or png files.

        Supports single files or multiple files, stored on a local file system, a networked file system
        (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include 'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
            '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

        dims: tuple of positive int, optional (but required if inputFormat is 'stack')
            Dimensions of input image data, similar to a numpy 'shape' parameter, for instance (1024, 1024, 48). Binary
            stack data will be interpreted as coming from a multidimensional array of the specified dimensions. Stack
            data should be stored in column-major order (Fortran or Matlab convention) rather than row-major order (C
            or python/numpy convention), where the first dimension corresponds to that which is changing most rapidly
            on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file
            should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
            (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)].
            If inputFormat is 'png' or 'tif', the dims parameter (if any) will be ignored; data dimensions
            will instead be read out from the image file headers.

        inputFormat: {'stack', 'png', 'tif'}, optional, default 'stack'
            Expected format of the input data. 'stack' indicates flat files of raw binary data. 'png' or 'tif' indicate
            image files of the corresponding formats ('tif-stack' is also accepted as an alias for 'tif'). Each page of
            a multipage tif file will be interpreted as a separate z-plane. For all formats, separate files are
            interpreted as distinct time points, with ordering given by lexicographic sorting of file names.

        ext: string, optional, default None
            Extension required on data files to be loaded. By default this will be 'stack' if inputFormat is 'stack',
            'tif' if inputFormat is 'tif', and 'png' if inputFormat is 'png'.

        dtype: string or numpy dtype, optional, default None
            Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is
            'tif' or 'png', the dtype parameter (if any) will be ignored; data type will instead be read out from the
            tif headers.

        startIdx: nonnegative int, optional
            startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These
            parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used
            after lexicographically sorting all input data files matching the dataPath argument. For example,
            startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read
            in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx
            and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an appropriate extension. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single image file to be subdivided into multiple records. Every
            `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the
            first `nplanes` planes of the first file being record 0, the next `nplanes` planes being record 1, and
            so on, until the first file is exhausted and record ordering continues with the first `nplanes` planes
            of the second file. With nplanes=None (the default), a single file will be considered as
            representing a single record. Keys are calculated assuming that all input files contain the same
            number of records; if the number of records per file is not the same across all files,
            then `renumber` should be set to True to ensure consistent keys.

        npartitions: positive int, optional
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.

        renumber: boolean, optional, default False
            If renumber evaluates to True, then the keys for each record will be explicitly recalculated after
            all images are loaded. This should only be necessary at load time when different files contain
            different numbers of records. See Images.renumber().

        Returns
        -------
        data: thunder.rdds.Images
            A newly-created Images object, wrapping an RDD of <int index, numpy array> key-value pairs.

        """
        checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if inputFormat.lower() == 'stack':
            data = loader.fromStack(dataPath,
                                    dims=dims,
                                    dtype=dtype,
                                    ext=ext,
                                    startIdx=startIdx,
                                    stopIdx=stopIdx,
                                    recursive=recursive,
                                    nplanes=nplanes,
                                    npartitions=npartitions,
                                    confFilename=confFilename)
        elif inputFormat.lower().startswith('tif'):
            data = loader.fromTif(dataPath,
                                  ext=ext,
                                  startIdx=startIdx,
                                  stopIdx=stopIdx,
                                  recursive=recursive,
                                  nplanes=nplanes,
                                  npartitions=npartitions)
        else:
            if nplanes:
                raise NotImplementedError(
                    "nplanes argument is not supported for png files")
            data = loader.fromPng(dataPath,
                                  ext=ext,
                                  startIdx=startIdx,
                                  stopIdx=stopIdx,
                                  recursive=recursive,
                                  npartitions=npartitions)

        if not renumber:
            return data
        else:
            return data.renumber()
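
The on-disk layout described for 'stack' data, with the first dimension varying fastest, corresponds to numpy's order='F'; a two-line check:

    from numpy import arange

    arr = arange(6).reshape((2, 3), order='F')  # first dimension changes fastest
    print(arr[:, 0])                            # [0 1]: the first two values on disk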
Example 26
    def loadImages(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16',
                   startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None,
                   renumber=False):
        """
        Loads an Images object from data stored as a binary image stack, or as tif or png files.

        Supports single files or multiple files, stored on a local file system, a networked file system
        (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include 'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
            '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

        dims: tuple of positive int, optional (but required if inputFormat is 'stack')
            Dimensions of input image data, similar to a numpy 'shape' parameter, for instance (1024, 1024, 48). Binary
            stack data will be interpreted as coming from a multidimensional array of the specified dimensions. Stack
            data should be stored in column-major order (Fortran or Matlab convention) rather than row-major order (C
            or python/numpy convention), where the first dimension corresponds to that which is changing most rapidly
            on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file
            should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
            (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)].
            If inputFormat is 'png' or 'tif', the dims parameter (if any) will be ignored; data dimensions
            will instead be read out from the image file headers.

        inputFormat: {'stack', 'png', 'tif'}, optional, default 'stack'
            Expected format of the input data. 'stack' indicates flat files of raw binary data. 'png' or 'tif' indicate
            image files of the corresponding formats ('tif-stack' is also accepted as an alias for 'tif'). Each page of
            a multipage tif file will be interpreted as a separate z-plane. For all formats, separate files are
            interpreted as distinct time points, with ordering given by lexicographic sorting of file names.

        ext: string, optional, default None
            Extension required on data files to be loaded. By default this will be 'stack' if inputFormat is 'stack',
            'tif' if inputFormat is 'tif', and 'png' if inputFormat is 'png'.

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is
            'tif' or 'png', the dtype parameter (if any) will be ignored; data type will instead be read out from the
            tif headers.

        startIdx: nonnegative int, optional
            startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These
            parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used
            after lexicographically sorting all input data files matching the dataPath argument. For example,
            startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read
            in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx
            and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an appropriate extension. Recursive loading is currently only implemented for local filesystems
            (not s3).

        nplanes: positive integer, default None
            If passed, will cause a single image file to be subdivided into multiple records. Every
            `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the
            first `nplanes` planes of the first file being record 0, the next `nplanes` planes being record 1, and so
            on until the first file is exhausted; record ordering then continues with the first `nplanes` planes of the
            second file, and so on. With nplanes=None (the default), a single file will be considered as
            representing a single record. Keys are calculated assuming that all input files contain the same
            number of records; if the number of records per file is not the same across all files,
            then `renumber` should be set to True to ensure consistent keys.

        npartitions: positive int, optional
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file.

        renumber: boolean, optional, default False
            If renumber evaluates to True, then the keys for each record will be explicitly recalculated after
            all images are loaded. This should only be necessary at load time when different files contain
            different number of records. See Images.renumber().

        Returns
        -------
        data: thunder.rdds.Images
            A newly-created Images object, wrapping an RDD of <int index, numpy array> key-value pairs.

        """
        checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if inputFormat.lower() == 'stack':
            data = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                    recursive=recursive, nplanes=nplanes, npartitions=npartitions)
        elif inputFormat.lower().startswith('tif'):
            data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                  nplanes=nplanes, npartitions=npartitions)
        else:
            if nplanes:
                raise NotImplementedError("nplanes argument is not supported for png files")
            data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                  recursive=recursive, npartitions=npartitions)

        if not renumber:
            return data
        else:
            return data.renumber()
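For illustration, a minimal usage sketch of loadImages follows; the ThunderContext instance `tsc` and the file paths are hypothetical, not part of the example above.

    # load a binary stack of 512x512x4 int16 volumes, one file per time point
    images = tsc.loadImages('/data/stacks/*.stack', dims=(512, 512, 4),
                            inputFormat='stack', dtype='int16')

    # load only the first ten tif files found under a directory tree
    subset = tsc.loadImages('/data/tifs/', inputFormat='tif',
                            stopIdx=10, recursive=True)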
Example No. 36
    def convertImagesToSeries(self,
                              dataPath,
                              outputDirPath,
                              dims=None,
                              inputFormat='stack',
                              ext=None,
                              dtype='int16',
                              blockSize="150M",
                              blockSizeUnits="pixels",
                              startIdx=None,
                              stopIdx=None,
                              shuffle=True,
                              overwrite=False,
                              recursive=False,
                              nplanes=None,
                              npartitions=None,
                              renumber=False):
        """
        Write out Images data as Series data, saved in a flat binary format.

        The resulting Series data files may subsequently be read in using the loadSeries() method. The Series data
        object that results will be equivalent to that which would be generated by loadImagesAsSeries(). It is expected
        that loading Series data directly from the series flat binary format, using loadSeries(), will be faster than
        converting image data to a Series object through loadImagesAsSeries().

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include "a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif",
            "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/".

        outputDirPath: string
            Path to a directory into which to write Series file output. An outputDirPath argument may be either a path
            on the local file system or a URI-like format, as in dataPath. Examples of valid outputDirPaths include
            "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/". If the
            directory specified by outputDirPath already exists and the 'overwrite' parameter is False, this method
            will throw a ValueError. If the directory exists and 'overwrite' is True, the existing directory and all
            its contents will be deleted and overwritten.

        dims: tuple of positive int, optional (but required if inputFormat is 'stack')
            Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as
            coming from a multidimensional array of the specified dimensions.

            The first dimension of the passed dims tuple should be the one that is changing most rapidly
            on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file
            should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
            (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy,
            which by default has the fastest-changing dimension listed last (row-major, C convention). Thus, if loading
            a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the
            corresponding dims parameter should be (x, y, z).
            If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead
            be read out from the tif file headers.

        inputFormat: {'stack', 'tif', 'tif-stack'}, optional, default 'stack'
            Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates
            greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate
            z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with
            ordering given by lexicographic sorting of file names.

        ext: string, optional, default None
            Extension required on data files to be loaded. By default will be "stack" if inputFormat=='stack', and
            "tif" for inputFormat=='tif'.

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is
            'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the
            tif headers.

        blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int, or instance of
                   BlockingStrategy. optional, default "150M"
            Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). blockSize can also
            be an instance of BlockingStrategy, or a tuple of int specifying either the number of pixels or of splits
            per dimension to apply to the loaded images. Whether a tuple of int is interpreted as pixels or as splits
            depends on the value of the blockSizeUnits parameter.  This parameter also indirectly controls the number
            of Spark partitions to be used, with one partition used per block created.

        blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels"
            Specifies units to be used in interpreting a tuple passed as blockSize when shuffle=True. If a string
            or a BlockingStrategy instance is passed as blockSize, or if shuffle=False, this parameter has no
            effect.

        startIdx: nonnegative int, optional
            startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These
            parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used
            after lexicographically sorting all input data files matching the dataPath argument. For example,
            startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read
            in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx
            and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.

        shuffle: boolean, optional, default True
            Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method.

        overwrite: boolean, optional, default False
            If true, the directory specified by outputDirPath will first be deleted, along with all its contents, if it
            already exists. (Use with caution.) If false, a ValueError will be thrown if outputDirPath is found to
            already exist.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an appropriate extension. Recursive loading is currently only implemented for local filesystems
            (not s3), and only with shuffle=True.

        nplanes: positive integer, default None
            If passed, will cause a single image file to be subdivided into multiple records. Every
            `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the
            first `nplanes` planes of the first file being record 0, the next `nplanes` planes being record 1, and so
            on until the first file is exhausted; record ordering then continues with the first `nplanes` planes of the
            second file, and so on. With nplanes=None (the default), a single file will be considered as
            representing a single record. Keys are calculated assuming that all input files contain the same
            number of records; if the number of records per file is not the same across all files,
            then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for
            shuffle=True (the default).

        npartitions: positive int, optional
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file. Only applies when shuffle=True.

        renumber: boolean, optional, default False
            If renumber evaluates to True, then the keys for each record will be explicitly recalculated after
            all images are loaded. This should only be necessary at load time when different files contain
            different number of records. renumber is only supported for shuffle=True (the default). See
            Images.renumber().
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if inputFormat.lower() == 'stack' and not dims:
            raise ValueError(
                "Dimensions ('dims' parameter) must be specified if loading from binary image stack"
                + " ('stack' value for 'inputFormat' parameter)")

        if not overwrite:
            raiseErrorIfPathExists(outputDirPath,
                                   awsCredentialsOverride=self._credentials)
            overwrite = True  # prevent additional downstream checks for this path

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath,
                                          dims,
                                          ext=ext,
                                          dtype=dtype,
                                          startIdx=startIdx,
                                          stopIdx=stopIdx,
                                          recursive=recursive,
                                          nplanes=nplanes,
                                          npartitions=npartitions)
            else:
                # 'tif' or 'tif-stack'
                images = loader.fromTif(dataPath,
                                        ext=ext,
                                        startIdx=startIdx,
                                        stopIdx=stopIdx,
                                        recursive=recursive,
                                        nplanes=nplanes,
                                        npartitions=npartitions)
            if renumber:
                images = images.renumber()
            images.toBlocks(blockSize,
                            units=blockSizeUnits).saveAsBinarySeries(
                                outputDirPath, overwrite=overwrite)
        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError(
                    "nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError(
                    "npartitions is not supported with shuffle=False")
            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                loader.saveFromStack(dataPath,
                                     outputDirPath,
                                     dims,
                                     ext=ext,
                                     dtype=dtype,
                                     blockSize=blockSize,
                                     overwrite=overwrite,
                                     startIdx=startIdx,
                                     stopIdx=stopIdx,
                                     recursive=recursive)
            else:
                # 'tif' or 'tif-stack'
                loader.saveFromTif(dataPath,
                                   outputDirPath,
                                   ext=ext,
                                   blockSize=blockSize,
                                   startIdx=startIdx,
                                   stopIdx=stopIdx,
                                   overwrite=overwrite,
                                   recursive=recursive)
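A hedged usage sketch of the conversion above, again assuming a ThunderContext `tsc` and hypothetical paths:

    # convert a directory of multipage tifs to Series files on local disk,
    # using the default shuffle-based method and ~150 MB blocks
    tsc.convertImagesToSeries('/data/tifs/', '/data/series-out/',
                              inputFormat='tif', overwrite=True)

    # the saved output can then be read back directly
    series = tsc.loadSeries('/data/series-out/')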
Example No. 37
def load(paramFile, classifyMode, **opts):
    from thunder.utils.common import checkParams
    checkParams(classifyMode.lower(), CLASSIFIERS.keys())
    return CLASSIFIERS[classifyMode.lower()](paramFile, **opts)
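The one-line dispatcher above assumes a module-level CLASSIFIERS registry mapping lowercase mode names to constructors. A minimal sketch of that pattern, with a purely hypothetical classifier class, might look like:

    class DummyClassifier(object):
        """Hypothetical classifier; stands in for the real registered classes."""
        def __init__(self, paramFile, **opts):
            self.paramFile = paramFile
            self.opts = opts

    # registry consulted by load(); keys are the accepted classifyMode values
    CLASSIFIERS = {
        'dummy': DummyClassifier,
    }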
Example No. 38
    def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                              dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                              shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                              renumber=False, confFilename='conf.json'):
        """
        Write out Images data as Series data, saved in a flat binary format.

        The resulting files may subsequently be read in using ThunderContext.loadSeries().
        Loading Series data directly will likely be faster than converting image data
        to a Series object through loadImagesAsSeries().

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        outputDirPath: string
            Path to directory to write Series file output. May be either a path on the local file system
            or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/",
            or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True,
            the existing directory and all its contents will be deleted and overwritten.

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in column-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Pages of a multipage tif file will extend along
            the third dimension. Separate files are interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        shuffle: boolean, optional, default = True
            Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method.

        overwrite: boolean, optional, default False
            If true, the directory specified by outputDirPath will be deleted (recursively) if it
            already exists. (Use with caution.)

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loaded. Only necessary if different files contain
            different numbers of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of the configuration file, if one is used to specify parameters for binary stack data.

        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not overwrite:
            raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials)
            overwrite = True  # prevent additional downstream checks for this path

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx,
                                          recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                          confFilename=confFilename)
            else:
                # 'tif' or 'tif-stack'
                images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                        recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            if renumber:
                images = images.renumber()
            images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError("nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError("npartitions is not supported with shuffle=False")
            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype,
                                     blockSize=blockSize, overwrite=overwrite, startIdx=startIdx,
                                     stopIdx=stopIdx, recursive=recursive)
            else:
                # 'tif' or 'tif-stack'
                loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize,
                                   startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite,
                                   recursive=recursive)
Example No. 39
    def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16',
                           blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                           shuffle=True, recursive=False, nplanes=None, npartitions=None,
                           renumber=False, confFilename='conf.json'):
        """
        Load Images data as Series data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in column-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Pages of a multipage tif file will extend along
            the third dimension. Separate files are interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        shuffle: boolean, optional, default = True
            Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loaded. Only necessary if different files contain
            different numbers of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of the configuration file, if one is used to specify parameters for binary stack data.

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs.
            Keys will be n-tuples of int, with n given by the dimensionality of the images, and correspond
            to indices into the image arrays. Values will have length equal to the number of image files,
            with each image contributing one point to the value array, in the order given by
            the lexicographic ordering of image file names.
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                          recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                          confFilename=confFilename)
            else:
                # tif / tif stack
                images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                        recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            if renumber:
                images = images.renumber()
            return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()

        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError("nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError("npartitions is not supported with shuffle=False")
            if renumber:
                raise NotImplementedError("renumber is not supported with shuffle=False")

            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, blockSize=blockSize,
                                        startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
            else:
                # tif / tif stack
                return loader.fromTif(dataPath, ext=ext, blockSize=blockSize,
                                      startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
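A short usage sketch of loadImagesAsSeries; the ThunderContext `tsc`, paths, and dimensions are hypothetical:

    # load a binary stack dataset directly as a Series, letting the default
    # shuffle-based path block the images before conversion
    series = tsc.loadImagesAsSeries('/data/stacks/*.stack', dims=(512, 512, 4))

    # keys are (x, y, z) tuples; each value is a time series across the files
    first = series.first()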
Example No. 40
    def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                              dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                              shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                              renumber=False):
        """
        Write out Images data as Series data, saved in a flat binary format.

        The resulting Series data files may subsequently be read in using the loadSeries() method. The Series data
        object that results will be equivalent to that which would be generated by loadImagesAsSeries(). It is expected
        that loading Series data directly from the series flat binary format, using loadSeries(), will be faster than
        converting image data to a Series object through loadImagesAsSeries().

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include "a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif",
            "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/".

        outputDirPath: string
            Path to a directory into which to write Series file output. An outputDirPath argument may be either a path
            on the local file system or a URI-like format, as in dataPath. Examples of valid outputDirPaths include
            "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/". If the
            directory specified by outputDirPath already exists and the 'overwrite' parameter is False, this method
            will throw a ValueError. If the directory exists and 'overwrite' is True, the existing directory and all
            its contents will be deleted and overwritten.

        dims: tuple of positive int, optional (but required if inputFormat is 'stack')
            Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as
            coming from a multidimensional array of the specified dimensions.

            The first dimension of the passed dims tuple should be the one that is changing most rapidly
            on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file
            should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
            (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy,
            which by default has the fastest-changing dimension listed last (row-major, C convention). Thus, if loading
            a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the
            corresponding dims parameter should be (x, y, z).
            If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead
            be read out from the tif file headers.

        inputFormat: {'stack', 'tif', 'tif-stack'}, optional, default 'stack'
            Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates
            greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate
            z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with
            ordering given by lexicographic sorting of file names.

        ext: string, optional, default None
            Extension required on data files to be loaded. By default will be "stack" if inputFormat=='stack', and
            "tif" for inputFormat=='tif'.

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is
            'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the
            tif headers.

        blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int, or instance of
                   BlockingStrategy. optional, default "150M"
            Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). blockSize can also
            be an instance of BlockingStrategy, or a tuple of int specifying either the number of pixels or of splits
            per dimension to apply to the loaded images. Whether a tuple of int is interpreted as pixels or as splits
            depends on the value of the blockSizeUnits parameter.  This parameter also indirectly controls the number
            of Spark partitions to be used, with one partition used per block created.

        blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels"
            Specifies units to be used in interpreting a tuple passed as blockSize when shuffle=True. If a string
            or a BlockingStrategy instance is passed as blockSize, or if shuffle=False, this parameter has no
            effect.

        startIdx: nonnegative int, optional
            startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These
            parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used
            after lexicographically sorting all input data files matching the dataPath argument. For example,
            startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read
            in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx
            and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.

        shuffle: boolean, optional, default True
            Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method.

        overwrite: boolean, optional, default False
            If true, the directory specified by outputDirPath will first be deleted, along with all its contents, if it
            already exists. (Use with caution.) If false, a ValueError will be thrown if outputDirPath is found to
            already exist.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an appropriate extension. Recursive loading is currently only implemented for local filesystems
            (not s3), and only with shuffle=True.

        nplanes: positive integer, default None
            If passed, will cause a single image file to be subdivided into multiple records. Every
            `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the
            first `nplanes` planes of the first file being record 0, the next `nplanes` planes being record 1, and so
            on until the first file is exhausted; record ordering then continues with the first `nplanes` planes of the
            second file, and so on. With nplanes=None (the default), a single file will be considered as
            representing a single record. Keys are calculated assuming that all input files contain the same
            number of records; if the number of records per file is not the same across all files,
            then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for
            shuffle=True (the default).

        npartitions: positive int, optional
            If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1
            partition per image file. Only applies when shuffle=True.

        renumber: boolean, optional, default False
            If renumber evaluates to True, then the keys for each record will be explicitly recalculated after
            all images are loaded. This should only be necessary at load time when different files contain
            different number of records. renumber is only supported for shuffle=True (the default). See
            Images.renumber().
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if inputFormat.lower() == 'stack' and not dims:
            raise ValueError("Dimensions ('dims' parameter) must be specified if loading from binary image stack" +
                             " ('stack' value for 'inputFormat' parameter)")

        if not overwrite:
            raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self.awsCredentials)
            overwrite = True  # prevent additional downstream checks for this path

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx,
                                          recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            else:
                # 'tif' or 'tif-stack'
                images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                        recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            if renumber:
                images = images.renumber()
            images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError("nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError("npartitions is not supported with shuffle=False")
            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype,
                                     blockSize=blockSize, overwrite=overwrite, startIdx=startIdx,
                                     stopIdx=stopIdx, recursive=recursive)
            else:
                # 'tif' or 'tif-stack'
                loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize,
                                   startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite,
                                   recursive=recursive)
Example No. 41
def load(modelFile, tuningMode):
    from thunder.utils.common import checkParams
    checkParams(tuningMode.lower(), TUNING_MODELS.keys())
    return TUNING_MODELS[tuningMode.lower()](modelFile)
Example No. 42
    def loadSeries(self,
                   dataPath,
                   nkeys=None,
                   nvalues=None,
                   inputFormat='binary',
                   minPartitions=None,
                   confFilename='conf.json',
                   keyType=None,
                   valueType=None):
        """
        Loads a Series object from data stored as text or binary files.

        Supports single files or multiple files stored on a local file system, a networked file system (mounted
        and available on all cluster nodes), Amazon S3, or HDFS.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include "a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif",
            "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/".

        nkeys: int, optional (but required if `inputFormat` is 'text')
            Dimensionality of the data keys. (For instance, (x,y,z) keyed data for 3-dimensional image time series.)
            For text data, the number of keys must be specified in this parameter; for binary data, the number of keys
            must be specified either in this parameter or in a configuration file named by the 'confFilename' argument
            if this parameter is not set.

        nvalues: int, optional (but required if `inputFormat` is 'text')
            Number of values expected to be read. For binary data, nvalues must be specified either in this parameter
            or in a configuration file named by the 'confFilename' argument if this parameter is not set.

        inputFormat: {'text', 'binary'}, optional, default 'binary'
            Format of data to be read.

        minPartitions: int, optional
            Explicitly specify minimum number of Spark partitions to be generated from this data. Used only for
            text data. Default is to use minParallelism attribute of Spark context object.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and 'valuetype'.
            If a file is not found at the given path, then the base directory given in 'dataPath'
            will also be checked. Parameters `nkeys` or `nvalues` that are specified as explicit arguments to this
            method will take priority over those found in the conf file if both are present.

        keyType: string or numpy dtype, optional, default None
            Numerical type of keys; will override the conf file.

        valueType: string or numpy dtype, optional, default None
            Numerical type of values; will override the conf file.

        Returns
        -------
        data: thunder.rdds.Series
            A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys an n-tuple
            of int, with n given by `nkeys` or the configuration passed in `confFilename`. RDD values will be a numpy
            array of length `nvalues` (or as specified in the passed configuration file).
        """
        checkParams(inputFormat, ['text', 'binary'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'text':
            data = loader.fromText(dataPath, nkeys=nkeys)
        else:
            # inputFormat is 'binary'
            data = loader.fromBinary(dataPath,
                                     confFilename=confFilename,
                                     nkeys=nkeys,
                                     nvalues=nvalues,
                                     keyType=keyType,
                                     valueType=valueType)
        return data
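For binary data, the configuration file named by confFilename is a small JSON document holding the fields listed above. A hedged sketch of the two equivalent ways to load (the ThunderContext `tsc`, paths, and sizes are hypothetical):

    # conf.json alongside the data files might contain:
    # {"nkeys": 3, "nvalues": 240, "keytype": "int16", "valuetype": "int16"}
    series = tsc.loadSeries('/data/series/', confFilename='conf.json')

    # explicit arguments take priority over the conf file
    series = tsc.loadSeries('/data/series/', nkeys=3, nvalues=240)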
Example No. 44
def load(modelFile, regressMode, **opts):
    from thunder.utils.common import checkParams
    checkParams(regressMode.lower(), REGRESSION_MODELS.keys())
    return REGRESSION_MODELS[regressMode.lower()](modelFile, **opts)
Example No. 45
    def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None,
                   startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None,
                   renumber=False, confFilename='conf.json'):
        """
        Loads an Images object from data stored as a binary image stack, tif, or png files.

        Supports single files or multiple files, stored on a local file system, a networked file system
        (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in column-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Pages of a multipage tif file will extend along
            the third dimension. Separate files are interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype, optional, default = 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loaded. Only necessary if different files contain
            different numbers of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of the configuration file, if one is used to specify parameters for binary stack data.

        Returns
        -------
        data: thunder.rdds.Images
            An Images object, wrapping an RDD of with (int) : (numpy array) pairs

        """
        checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        # check that startIdx is less than or equal to stopIdx
        if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
            raise ValueError("startIdx {} is larger than stopIdx {}".format(startIdx, stopIdx))

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if inputFormat.lower() == 'stack':
            data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                    recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                    confFilename=confFilename)
        elif inputFormat.lower().startswith('tif'):
            data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                  nplanes=nplanes, npartitions=npartitions)
        else:
            if nplanes:
                raise NotImplementedError("nplanes argument is not supported for png files")
            data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                  npartitions=npartitions)

        if not renumber:
            return data
        else:
            return data.renumber()
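A brief sketch of subset loading with the index check above (the ThunderContext `tsc` and paths are hypothetical):

    # read only the third file (zero-based index 2), per python slice convention
    single = tsc.loadImages('/data/tifs/', inputFormat='tif',
                            startIdx=2, stopIdx=3)

    # startIdx > stopIdx now raises before any files are read:
    # tsc.loadImages('/data/tifs/', inputFormat='tif', startIdx=5, stopIdx=3)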
Example No. 47
    def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
                   confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None):
        """
        Loads a Series object from data stored as binary, text, npy, or mat.

        For binary and text, supports single files or multiple files stored on a local file system,
        a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
        For local formats (npy and mat), only local file systems are currently supported.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        nkeys: int, optional (required if `inputFormat` is 'text'), default = None
            Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
            text data; can be specified here or in a configuration file for binary data.

        nvalues: int, optional (required if `inputFormat` is 'text')
            Number of values per record. Must be specified here or in a configuration file for binary data.

        inputFormat: {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
            Format of the data to be read.

        minPartitions: int, optional, default = SparkContext.minParallelism
            Minimum number of Spark partitions to use, only for text.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues',
            'keyType', and 'valueType'. If a file is not found at the given path, then the base
            directory in 'dataPath' will be checked. Parameters will override the conf file.

        keyType: string or numpy dtype, optional, default = None
            Numerical type of keys, will override conf file.

        valueType: string or numpy dtype, optional, default = None
            Numerical type of values, will override conf file.

        keyPath: string, optional, default = None
            Path to file with keys when loading from npy or mat.

        varName : str, optional, default = None
            Variable name to load (for MAT files only)

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
        """
        checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'binary':
            data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                     keyType=keyType, valueType=valueType)
        elif inputFormat.lower() == 'text':
            if nkeys is None:
                raise Exception('Must provide number of keys per record for loading from text')
            data = loader.fromText(dataPath, nkeys=nkeys)
        elif inputFormat.lower() == 'npy':
            data = loader.fromNpyLocal(dataPath, keyPath)
        else:
            if varName is None:
                raise Exception('Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataPath, varName, keyPath)

        return data
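A hedged sketch of the local-format paths above; the ThunderContext `tsc`, file names, and MAT variable name are hypothetical:

    # local npy array, with keys stored in a separate npy file
    series = tsc.loadSeries('/data/values.npy', inputFormat='npy',
                            keyPath='/data/keys.npy')

    # MAT file: the variable name inside the file must be provided
    series = tsc.loadSeries('/data/values.mat', inputFormat='mat',
                            varName='data', keyPath='/data/keys.mat')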
Example No. 49
    def loadExample(self, dataset=None):
        """
        Load a local example data set for testing analyses.

        Some of these data sets are extremely downsampled and should be considered
        useful only for testing the API. If called with dataset=None, will return
        a list of the available datasets.

        Parameters
        ----------
        dataset : str, optional, default = None
            Which dataset to load; if None, returns a sorted list of available datasets

        Returns
        -------
        data : Data object
            Generated dataset as a Thunder data object (e.g. Series or Images)
        """
        import atexit
        import shutil
        import tempfile
        from pkg_resources import resource_listdir, resource_filename

        DATASETS = {
            'iris': 'iris',
            'fish-series': 'fish/series',
            'fish-images': 'fish/images',
            'mouse-series': 'mouse/series',
            'mouse-images': 'mouse/images',
            'mouse-params': 'mouse/params'
        }

        if dataset is None:
            return sorted(DATASETS.keys())

        checkParams(dataset, DATASETS.keys())

        if 'ec2' in self._sc.master:
            tmpdir = os.path.join('/root/thunder/python/thunder/utils', 'data', DATASETS[dataset])
        else:
            tmpdir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, tmpdir)

            def copyLocal(target):
                files = resource_listdir('thunder.utils.data', target)
                for f in files:
                    path = resource_filename('thunder.utils.data', os.path.join(target, f))
                    shutil.copy(path, tmpdir)

            copyLocal(DATASETS[dataset])

        npartitions = self._sc.defaultParallelism

        if dataset == "iris":
            return self.loadSeries(tmpdir)
        elif dataset == "fish-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "fish-images":
            return self.loadImages(tmpdir, inputFormat="tif", npartitions=npartitions)
        elif dataset == "mouse-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "mouse-images":
            return self.loadImages(tmpdir, npartitions=npartitions)
        elif dataset == "mouse-params":
            return self.loadParams(os.path.join(tmpdir, 'covariates.json'))
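
Typical use, assuming a ThunderContext `tsc`: call once with no argument to list the datasets, then load one by name:

    # Dataset names come from the DATASETS dict above.
    print(tsc.loadExample())               # e.g. ['fish-images', 'fish-series', ...]
    images = tsc.loadExample('fish-images')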
Example No. 50
def export(data, outputDirPath, outputFilename, outputFormat, sorting=False):
    """
    Export data to a variety of local formats.

    Can export local arrays or a Series. If passed a Series,
    it will first be packed into one or more local arrays.

    Parameters
    ----------
    data : Series, or numpy array
        The data to export

    outputDirPath : str
        Output directory

    outputFilename : str
        Output filename

    outputFormat : str
        Output format ("matlab", "npy", or "text")

    """

    import os
    from numpy import save, savetxt, size
    from thunder.rdds.series import Series
    from scipy.io import savemat

    checkParams(outputFormat, ['matlab', 'npy', 'text'])

    if not os.path.exists(outputDirPath):
        os.makedirs(outputDirPath)

    filename = os.path.join(outputDirPath, outputFilename)

    def write(array, path, fileFormat, varname=None):
        if fileFormat == 'matlab':
            savemat(path + ".mat",
                    mdict={varname: array},
                    oned_as='column',
                    do_compression=True)
        elif fileFormat == 'npy':
            save(path, array)
        elif fileFormat == 'text':
            savetxt(path + ".txt", array, fmt="%.6f")

    if isinstance(data, Series):
        # force calculation of dimensions
        _tmp = data.dims
        if size(data.index) > 1:
            for ix in data.index:
                result = data.select(ix).pack(sorting=sorting)
                write(result,
                      filename + "_" + str(ix),
                      outputFormat,
                      varname=outputFilename + "_" + str(ix))
        else:
            result = data.pack(sorting=sorting)
            write(result,
                  filename,
                  outputFormat,
                  varname=outputFilename + "_" + str(data.index))
    else:
        write(data, filename, outputFormat, varname=outputFilename)
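
A small sketch of exporting a plain numpy array with the function above (the output path is illustrative):

    # Writes /tmp/out/myarray.mat containing a variable named 'myarray'.
    from numpy import arange
    export(arange(100).reshape(10, 10), '/tmp/out', 'myarray', 'matlab')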
Example No. 51
    def loadImagesAsSeries(self,
                           dataPath,
                           dims=None,
                           inputFormat='stack',
                           ext=None,
                           dtype='int16',
                           blockSize="150M",
                           blockSizeUnits="pixels",
                           startIdx=None,
                           stopIdx=None,
                           shuffle=True,
                           recursive=False):
        """
        Load Images data as Series data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, specified as either a local filesystem path or in a URI-like format,
            including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples
            of valid dataPaths include 'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
            '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

        dims: tuple of positive int, optional (but required if inputFormat is 'stack')
            Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as
            coming from a multidimensional array of the specified dimensions.

            The first dimension of the passed dims tuple should be the one that is changing most rapidly
            on disk. So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file
            should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ...,
            (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy,
            which by default has the fastest-changing dimension listed last ('C', or row-major, convention). Thus, if loading
            a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the
            corresponding dims parameter should be (x, y, z).
            If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead
            be read out from the tif file headers.

        inputFormat: {'stack', 'tif'}, optional, default 'stack'
            Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates
            greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate
            z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with
            ordering given by lexicographic sorting of file names.

        ext: string, optional, default None
            Extension required on data files to be loaded. By default this will be 'stack' if inputFormat=='stack',
            or 'tif' if inputFormat=='tif'.

        dtype: string or numpy dtype, optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is
            'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the
            tif headers.

        blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int. optional, default "150M"
            Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). If shuffle=True,
            blockSize can also be a tuple of int specifying either the number of pixels or of splits per dimension to
            apply to the loaded images, or an instance of BlockingStrategy. Whether a tuple of int is interpreted as
            pixels or as splits depends on the value of the blockSizeUnits parameter. blockSize also indirectly
            controls the number of Spark partitions to be used, with one partition used per block created.

        blockSizeUnits: string, either "pixels" or "splits" (or a unique prefix of each, such as "s"), default "pixels"
            Specifies the units to be used in interpreting a tuple passed as blockSize when shuffle=True. If a string
            or a BlockingStrategy instance is passed as blockSize, or if shuffle=False, this parameter has no
            effect.

        startIdx: nonnegative int, optional
            startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These
            parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used
            after lexicographically sorting all input data files matching the dataPath argument. For example,
            startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read
            in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx
            and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.

        shuffle: boolean, optional, default True
            Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an appropriate extension. Recursive loading is currently only implemented for local filesystems
            (not s3), and only with shuffle=True.

        Returns
        -------
        data: thunder.rdds.Series
            A newly-created Series object, wrapping an RDD of timeseries data generated from the images in dataPath.
            This RDD will have as keys an n-tuple of int, with n given by the dimensionality of the original images. The
            keys will be the zero-based spatial index of the timeseries data in the RDD value. The value will be
            a numpy array of length equal to the number of image files loaded. Each loaded image file will contribute
            one point to this value array, with ordering as implied by the lexicographic ordering of image file names.
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if inputFormat.lower() == 'stack' and not dims:
            raise ValueError(
                "Dimensions ('dims' parameter) must be specified if loading from binary image stack"
                + " ('stack' value for 'inputFormat' parameter)")

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath,
                                          dims,
                                          dtype=dtype,
                                          ext=ext,
                                          startIdx=startIdx,
                                          stopIdx=stopIdx,
                                          recursive=recursive)
            else:
                # tif / tif stack
                images = loader.fromTif(dataPath,
                                        ext=ext,
                                        startIdx=startIdx,
                                        stopIdx=stopIdx,
                                        recursive=recursive)
            return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()

        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                return loader.fromStack(dataPath,
                                        dims,
                                        ext=ext,
                                        dtype=dtype,
                                        blockSize=blockSize,
                                        startIdx=startIdx,
                                        stopIdx=stopIdx,
                                        recursive=recursive)
            else:
                # tif / tif stack
                return loader.fromTif(dataPath,
                                      ext=ext,
                                      blockSize=blockSize,
                                      startIdx=startIdx,
                                      stopIdx=stopIdx,
                                      recursive=recursive)
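
A usage sketch, assuming a ThunderContext `tsc` and binary stacks of shape (x=1024, y=1024, z=48) on disk; the path and dimensions are assumptions for illustration:

    series = tsc.loadImagesAsSeries('/mnt/data/stacks/',
                                    dims=(1024, 1024, 48),
                                    inputFormat='stack',
                                    dtype='int16',
                                    blockSize='150M')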
Example No. 52
    def calc(self, mat):
        """
        Calculate singular vectors.

        Parameters
        ----------
        mat :  Series or a subclass (e.g. RowMatrix)
            Matrix to compute singular vectors from

        Returns
        -------
        self : returns an instance of self.
        """

        from numpy import argsort, dot, outer, random, shape, sqrt, sum, zeros
        from scipy.linalg import inv, orth
        from numpy.linalg import eigh

        if not isinstance(mat, Series):
            raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

        if not isinstance(mat, RowMatrix):
            mat = mat.toRowMatrix()

        checkParams(self.method, ['auto', 'direct', 'em'])

        if self.method == 'auto':
            if len(mat.index) < 750:
                method = 'direct'
            else:
                method = 'em'
        else:
            method = self.method

        if method == 'direct':

            # get the normalized gramian matrix
            cov = mat.gramian() / mat.nrows

            # do a local eigendecomposition
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            v = eigv[:, inds[0:self.k]].T

            # project back into data, normalize by singular values
            u = mat.times(v.T / s)

            self.u = u
            self.s = s
            self.v = v

        if method == 'em':

            # initialize random matrix
            c = random.rand(self.k, mat.ncols)
            niter = 0
            error = 100

            # define an accumulator
            from pyspark.accumulators import AccumulatorParam

            class MatrixAccumulatorParam(AccumulatorParam):
                def zero(self, value):
                    return zeros(shape(value))

                def addInPlace(self, val1, val2):
                    val1 += val2
                    return val1

            # define an accumulator function
            global runSum

            def outerSumOther(x, y):
                global runSum
                runSum += outer(x, dot(x, y))

            # iterative update subspace using expectation maximization
            # e-step: x = (c'c)^-1 c' y
            # m-step: c = y x' (xx')^-1
            while niter < self.maxIter and error > self.tol:

                cOld = c

                # precompute (c'c)^-1 c'
                cInv = dot(c.T, inv(dot(c, c.T)))

                # compute (xx')^-1 through a map reduce
                xx = mat.times(cInv).gramian()
                xxInv = inv(xx)

                # precompute (c'c)^-1 c' (xx')^-1
                preMult2 = mat.rdd.context.broadcast(dot(cInv, xxInv))

                # compute the new c using an accumulator
                # direct approach: c = mat.rows().map(lambda x: outer(x, dot(x, preMult2.value))).sum()
                runSum = mat.rdd.context.accumulator(zeros((mat.ncols, self.k)), MatrixAccumulatorParam())
                mat.rows().foreach(lambda x: outerSumOther(x, preMult2.value))
                c = runSum.value

                # transpose result
                c = c.T

                error = sum(sum((c - cOld) ** 2))
                niter += 1

            # project data into subspace spanned by columns of c
            # use standard eigendecomposition to recover an orthonormal basis
            c = orth(c.T)
            cov = mat.times(c).gramian() / mat.nrows
            eigw, eigv = eigh(cov)
            inds = argsort(eigw)[::-1]
            s = sqrt(eigw[inds[0:self.k]]) * sqrt(mat.nrows)
            v = dot(eigv[:, inds[0:self.k]].T, c.T)
            u = mat.times(v.T / s)

            self.u = u
            self.s = s
            self.v = v

        return self
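
For intuition, the 'direct' branch above amounts to the following local NumPy computation on an ordinary matrix; this is a sketch for clarity, not part of the distributed implementation:

    import numpy as np

    def svdViaGramian(mat, k):
        # Local analogue of the 'direct' branch: eigendecompose the
        # normalized gramian, then project back to recover singular vectors.
        n = mat.shape[0]
        cov = mat.T.dot(mat) / n                # normalized gramian (ncols x ncols)
        eigw, eigv = np.linalg.eigh(cov)        # eigenvalues in ascending order
        inds = np.argsort(eigw)[::-1][:k]       # indices of the top-k eigenvalues
        s = np.sqrt(eigw[inds]) * np.sqrt(n)    # singular values
        v = eigv[:, inds].T                     # right singular vectors (k x ncols)
        u = mat.dot(v.T / s)                    # left singular vectors (n x k)
        return u, s, v

The results should agree with numpy.linalg.svd up to sign flips of corresponding pairs of singular vectors.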