Exemple #1
0
    def saveAsBinarySeries(self, outputDirPath, overwrite=False):
        """Writes out Series-formatted data.

        Subclasses are *not* expected to override this method.

        Parameters
        ----------
        outputdirname : string path or URI to directory to be created
            Output files will be written underneath outputdirname. This directory must not yet exist
            (unless overwrite is True), and must be no more than one level beneath an existing directory.
            It will be created as a result of this call.

        overwrite : bool
            If true, outputdirname and all its contents will be deleted and recreated as part
            of this call.
        """
        from thunder.rdds.fileio.writers import getParallelWriterForPath
        from thunder.rdds.fileio.seriesloader import writeSeriesConfig
        from thunder.utils.aws import AWSCredentials

        if not overwrite:
            self._checkOverwrite(outputDirPath)
            overwrite = True  # prevent additional downstream checks for this path

        awsCredentialsOverride = AWSCredentials.fromContext(self.rdd.ctx)
        writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                         awsCredentialsOverride=awsCredentialsOverride)

        binseriesRdd = self.toBinarySeries()

        binseriesRdd.foreach(writer.writerFcn)
        writeSeriesConfig(outputDirPath, len(self.dims), self.nimages, keyType='int16', valueType=self.dtype,
                          overwrite=overwrite, awsCredentialsOverride=awsCredentialsOverride)
Exemple #2
0
 def _checkOverwrite(self, outputDirPath):
     """ Checks for existence of outputDirPath, raising ValueError if it already exists """
     from thunder.utils.aws import AWSCredentials
     from thunder.utils.common import raiseErrorIfPathExists
     awsCredentialOverride = AWSCredentials.fromContext(self.rdd.ctx)
     raiseErrorIfPathExists(outputDirPath,
                            awsCredentialsOverride=awsCredentialOverride)
Exemple #3
0
    def __init__(self, sparkContext):
        """Initialize a new ImagesLoader object.

        Parameters
        ----------
        sparkcontext: SparkContext
            The pyspark SparkContext object used by the current Thunder environment.
        """
        from thunder.utils.aws import AWSCredentials
        self.sc = sparkContext
        self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)
Exemple #4
0
    def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
        """
        Manually set AWS access credentials to be used by Thunder.

        Provided for hosted cloud environments without filesystem access. If
        launching a cluster using the thunder-ec2 script, credentials will be
        configured automatically (inside core-site.xml and ~/.boto), so this
        method should not need to be called.

        Parameters
        ----------
        awsAccessKeyId : string
            AWS public key, usually starts with "AKIA"

        awsSecretAccessKey : string
            AWS private key
        
        """
        from thunder.utils.aws import AWSCredentials
        self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
        self._credentials.setOnContext(self._sc)
Exemple #5
0
    def saveAsPngs(self, outputDirPath, cmap=None, vmin=None, vmax=None, filePrefix="export", overwrite=False,
                     collectToDriver=True):
        """
        Write out basic png files for two-dimensional image data.

        Files will be written into a newly-created directory given by outputdirname.

        Parameters
        ----------
        outputDirPath : string
            Path to output directory to be created. Exception will be thrown if this directory already
            exists, unless overwrite is True. Directory must be one level below an existing directory.

        filePrefix : string
            String to prepend to all filenames. Files will be named <fileprefix>-00000.png, <fileprefix>-00001.png, etc

        overwrite : bool
            If true, the directory given by outputdirname will first be deleted if it already exists.

        collectToDriver : bool, default True
            If true, images will be collect()'ed at the driver first before being written out, allowing
            for use of a local filesystem at the expense of network overhead. If false, images will be written
            in parallel by each executor, presumably to a distributed or networked filesystem.
        """
        dims = self.dims
        if not len(dims) == 2:
            raise ValueError("Only two-dimensional images can be exported as .png files; image is %d-dimensional." %
                             len(dims))

        from matplotlib.pyplot import imsave
        from io import BytesIO
        from thunder.rdds.fileio.writers import getParallelWriterForPath, getCollectedFileWriterForPath
        from thunder.utils.aws import AWSCredentials

        def toFilenameAndPngBuf(kv):
            key, img = kv
            fname = filePrefix+"-"+"%05d.png" % int(key)
            bytebuf = BytesIO()
            imsave(bytebuf, img, vmin, vmax, cmap=cmap, format="png")
            return fname, bytebuf.getvalue()

        bufRdd = self.rdd.map(toFilenameAndPngBuf)

        awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
        if collectToDriver:
            writer = getCollectedFileWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                                  awsCredentialsOverride=awsCredentials)
            writer.writeCollectedFiles(bufRdd.collect())
        else:
            writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                             awsCredentialsOverride=awsCredentials)
            bufRdd.foreach(writer.writerFcn)
Exemple #6
0
    def saveAsBinaryImages(self,
                           outputDirPath,
                           prefix="image",
                           overwrite=False):
        """
        Write out images or volumes as flat binary files.

        Files will be written into a newly-created directory given by outputdirname.

        Parameters
        ----------
        outputDirPath : string
            Path to output directory to be created. Exception will be thrown if this directory already
            exists, unless overwrite is True. Directory must be one level below an existing directory.

        prefix : string
            String to prepend to all filenames. Files will be named <fileprefix>-00000.bin, <fileprefix>-00001.bin, etc

        overwrite : bool
            If true, the directory given by outputdirname will first be deleted if it already exists.
        """
        from thunder.rdds.fileio.writers import getParallelWriterForPath
        from thunder.rdds.fileio.imagesloader import writeBinaryImagesConfig
        from thunder.utils.aws import AWSCredentials
        import StringIO

        dimsTotal = list(asarray(self.dims.max) - asarray(self.dims.min) + 1)

        def toFilenameAndBinaryBuf(kv):
            key, img = kv
            fname = prefix + "-" + "%05d.bin" % int(key)
            buf = StringIO.StringIO()
            buf.write(img.transpose().copy().tostring())
            val = buf.getvalue()
            buf.close()
            return fname, val

        bufRdd = self.rdd.map(toFilenameAndBinaryBuf)

        awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
        writer = getParallelWriterForPath(outputDirPath)(
            outputDirPath,
            overwrite=overwrite,
            awsCredentialsOverride=awsCredentials)
        bufRdd.foreach(writer.writerFcn)
        writeBinaryImagesConfig(outputDirPath,
                                dims=dimsTotal,
                                dtype=self.dtype,
                                overwrite=overwrite,
                                awsCredentialsOverride=awsCredentials)
Exemple #7
0
    def __init__(self, sparkContext, minPartitions=None):
        """Initialize a new SeriesLoader object.

        Parameters
        ----------
        sparkcontext: SparkContext
            The pyspark SparkContext object used by the current Thunder environment.

        minPartitions: int
            minimum number of partitions to use when loading data. (Used by fromText, fromMatLocal, and fromNpyLocal)
        """
        from thunder.utils.aws import AWSCredentials
        self.sc = sparkContext
        self.minPartitions = minPartitions
        self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)
Exemple #8
0
    def __init__(self, sparkContext, minPartitions=None):
        """Initialize a new SeriesLoader object.

        Parameters
        ----------
        sparkcontext: SparkContext
            The pyspark SparkContext object used by the current Thunder environment.

        minPartitions: int
            minimum number of partitions to use when loading data. (Used by fromText, fromMatLocal, and fromNpyLocal)
        """
        from thunder.utils.aws import AWSCredentials
        self.sc = sparkContext
        self.minPartitions = minPartitions
        self.awsCredentialsOverride = AWSCredentials.fromContext(sparkContext)
Exemple #9
0
    def saveAsBinaryImages(self, outputDirPath, prefix="image", overwrite=False):
        """
        Write out images or volumes as flat binary files.

        Files will be written into a newly-created directory given by outputdirname.

        Parameters
        ----------
        outputDirPath : string
            Path to output directory to be created. Exception will be thrown if this directory already
            exists, unless overwrite is True. Directory must be one level below an existing directory.

        prefix : string
            String to prepend to all filenames. Files will be named <fileprefix>-00000.bin, <fileprefix>-00001.bin, etc

        overwrite : bool
            If true, the directory given by outputdirname will first be deleted if it already exists.
        """
        from thunder.rdds.fileio.writers import getParallelWriterForPath
        from thunder.rdds.fileio.imagesloader import writeBinaryImagesConfig
        from thunder.utils.aws import AWSCredentials
        import StringIO

        dimsTotal = list(asarray(self.dims.max)-asarray(self.dims.min)+1)

        def toFilenameAndBinaryBuf(kv):
            key, img = kv
            fname = prefix+"-"+"%05d.bin" % int(key)
            buf = StringIO.StringIO()
            buf.write(img.transpose().copy().tostring())
            val = buf.getvalue()
            buf.close()
            return fname, val

        bufRdd = self.rdd.map(toFilenameAndBinaryBuf)

        awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
        writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                         awsCredentialsOverride=awsCredentials)
        bufRdd.foreach(writer.writerFcn)
        writeBinaryImagesConfig(outputDirPath, dims=dimsTotal, dtype=self.dtype,
                                overwrite=overwrite, awsCredentialsOverride=awsCredentials)
Exemple #10
0
    def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
        """
        Manually set AWS access credentials to be used by Thunder.

        Provided for hosted cloud environments without filesystem access. If
        launching a cluster using the thunder-ec2 script, credentials will be
        configured automatically (inside core-site.xml and ~/.boto), so this
        method should not need to be called.

        Parameters
        ----------
        awsAccessKeyId : string
            AWS public key, usually starts with "AKIA"

        awsSecretAccessKey : string
            AWS private key
        
        """
        from thunder.utils.aws import AWSCredentials
        self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
        self._credentials.setOnContext(self._sc)
Exemple #11
0
    def __init__(self, awsCredentialsOverride=None):
        """Initialization; validates that AWS keys are available as environment variables.

        Will let boto library look up credentials itself according to its own rules - e.g. first looking for
        AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, then going through several possible config files and finally
        looking for a ~/.aws/credentials .ini-formatted file. See boto docs:
        http://boto.readthedocs.org/en/latest/boto_config_tut.html

        However, if an AWSCredentials object is provided, its `awsAccessKeyId` and `awsSecretAccessKey` attributes
        will be used instead of those found by the standard boto credential lookup process.
        """
        if not _haveBoto:
            raise ValueError("The boto package does not appear to be available; boto is required for BotoReader")
        self.awsCredentialsOverride = awsCredentialsOverride if awsCredentialsOverride else AWSCredentials()
Exemple #12
0
class ThunderContext():
    """
    Wrapper for a SparkContext that provides an entry point for loading and saving.

    Also supports creation of example datasets, and loading example
    data both locally and from EC2.

    Attributes
    ----------
    `_sc` : SparkContext
        Spark context for Spark functionality

    `_credentials` : AWSCredentials object, optional, default = None
        Stores public and private keys for AWS services. Typically available through
        configuration files, and but can optionally be set using :func:`ThunderContext.setAWSCredentials()`.
    """
    def __init__(self, sparkcontext):
        self._sc = sparkcontext
        self._credentials = None

    @classmethod
    def start(cls, *args, **kwargs):
        """
        Starts a ThunderContext using the same arguments as SparkContext
        """
        from pyspark import SparkContext
        return ThunderContext(SparkContext(*args, **kwargs))

    def addPyFile(self, path):
        """
        Adds a .zip or .py or .egg dependency for all tasks to be executed
        as part of this context.

        Uses the corresponding SparkContext method.

        Parameters
        ----------
        path : str
            Path to a file as either a local file, file in HDFS, or URI.
        """
        self._sc.addPyFile(path)
        
    def stop(self):
        """
        Shut down the context
        """
        self._sc.stop()

    def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
                   maxPartitionSize='32mb', confFilename='conf.json', keyType=None, valueType=None, keyPath=None,
                   varName=None):
        """
        Loads a Series object from data stored as binary, text, npy, or mat.

        For binary and text, supports single files or multiple files stored on a local file system,
        a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
        For local formats (npy and mat) only local file systems currently supported.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        nkeys: int, optional (required if `inputFormat` is 'text'), default = None
            Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
            text data; can be specified here or in a configuration file for binary data.

        nvalues: int, optional (required if `inputFormat` is 'text')
            Number of values per record. Must be specified here or in a configuration file for binary data.

        inputFormat: {'text', 'binary', 'npy', 'mat'}. optional, default = 'binary'
            inputFormat of data to be read.

        minPartitions: int, optional, default = SparkContext.minParallelism
            Minimum number of Spark partitions to use, only for text.

        maxPartitionSize : int, optional, default = '32mb'
            Maximum size of partitions as a Java-style memory string, e.g. '32mb' or '64mb',
            indirectly controls the number of Spark partitions, only for binary.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues',
            'keyType', and 'valueType'. If a file is not found at the given path, then the base
            directory in 'dataPath' will be checked. Parameters will override the conf file.

        keyType: string or numpy dtype, optional, default = None
            Numerical type of keys, will override conf file.

        valueType: string or numpy dtype, optional, default = None
            Numerical type of values, will override conf file.

        keyPath: string, optional, default = None
            Path to file with keys when loading from npy or mat.

        varName : str, optional, default = None
            Variable name to load (for MAT files only)

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
        """
        checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'binary':
            data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                     keyType=keyType, valueType=valueType, maxPartitionSize=maxPartitionSize)
        elif inputFormat.lower() == 'text':
            if nkeys is None:
                raise Exception('Must provide number of keys per record for loading from text')
            data = loader.fromText(dataPath, nkeys=nkeys)
        elif inputFormat.lower() == 'npy':
            data = loader.fromNpyLocal(dataPath, keyPath)
        else:
            if varName is None:
                raise Exception('Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataPath, varName, keyPath)

        return data

    def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None,
                   startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None,
                   renumber=False, confFilename='conf.json'):
        """
        Loads an Images object from data stored as a binary image stack, tif, or png files.

        Supports single files or multiple files, stored on a local file system, a networked file sytem
        (mounted and available on all nodes), Amazon S3, or Google Storage.
        HDFS is not currently supported for image file data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image format. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype, optional, default = 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        Returns
        -------
        data: thunder.rdds.Images
            An Images object, wrapping an RDD of with (int) : (numpy array) pairs

        """
        checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        # Checking StartIdx is smaller or equal to StopIdx
        if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
            raise Exception("Error. startIdx {} is larger than stopIdx {}".inputFormat(startIdx, stopIdx))

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if inputFormat.lower() == 'stack':
            data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                    recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                    confFilename=confFilename)
        elif inputFormat.lower().startswith('tif'):
            data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                  nplanes=nplanes, npartitions=npartitions)
        else:
            if nplanes:
                raise NotImplementedError("nplanes argument is not supported for png files")
            data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive,
                                  npartitions=npartitions)

        if not renumber:
            return data
        else:
            return data.renumber()

    def loadSeriesFromArray(self, values, index=None, npartitions=None):
        """
        Load Series data from a local array

        Parameters
        ----------
        values : list or ndarray
            A list of 1d numpy arrays, or a single 2d numpy array

        index : array-like, optional, deafult = None
            Index to set for Series object, if None will use linear indices.

        npartitions : position int, optional, default = None
            Number of partitions for RDD, if unspecified will use
            default parallelism.
        """
        from numpy import ndarray, asarray
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc)

        if not npartitions:
            npartitions = self._sc.defaultParallelism

        if isinstance(values, list):
            values = asarray(values)

        if isinstance(values, ndarray) and values.ndim > 1:
            values = list(values)

        data = loader.fromArrays(values, npartitions=npartitions)

        if index:
            data.index = index

        return data

    def loadImagesFromArray(self, values, npartitions=None):
        """
        Load Images data from a local array

        Parameters
        ----------
        values : list or ndarray
            A list of 2d or 3d numpy arrays,
            or a single 3d or 4d numpy array

        npartitions : position int, optional, default = None
            Number of partitions for RDD, if unspecified will use
            default parallelism.
        """
        from numpy import ndarray, asarray

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        if isinstance(values, list):
            values = asarray(values)

        if isinstance(values, ndarray) and values.ndim > 2:
            values = list(values)

        if not npartitions:
            npartitions = self._sc.defaultParallelism

        return loader.fromArrays(values, npartitions=npartitions)

    def loadImagesOCP(self, bucketName, resolution, server='ocp.me', startIdx=None, stopIdx=None,
                      minBound=None, maxBound=None):
        """
        Load Images from OCP (Open Connectome Project).

        The OCP is a web service for access to EM brain images and other neural image data.
        The web-service can be accessed at http://www.openconnectomeproject.org/.
        
        Parameters
        ----------
        bucketName: string
            Token name for the project in OCP. This name should exist on the server from which data is loaded.

        resolution: nonnegative int
            Resolution of the data in OCP

        server: string, optional, default = 'ocp.me'
            Name of the OCP server with the specified token.
        
        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.
        
        minBound, maxBound: tuple of nonnegative int, optional, default = None
            X,Y,Z bounds of the data to fetch from OCP. minBound contains the (xMin,yMin,zMin) while
            maxBound contains (xMax,yMax,zMax).

        Returns
        -------
        data: thunder.rdds.Images
             An Images object, wrapping an RDD of with (int) : (numpy array) pairs
        """
      
        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        
        # Checking StartIdx is smaller or equal to StopIdx
        if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
            raise Exception("Error. startIdx {} is larger than stopIdx {}".format(startIdx, stopIdx))
        data = loader.fromOCP(bucketName, resolution=resolution, server=server, startIdx=startIdx,
                              stopIdx=stopIdx, minBound=minBound, maxBound=maxBound)

        return data

    def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16',
                           blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                           shuffle=True, recursive=False, nplanes=None, npartitions=None,
                           renumber=False, confFilename='conf.json'):
        """
        Load Images data as Series data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype. optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        shuffle: boolean, optional, default = True
            Controls whether the conversion from Images to Series formats will use of a Spark shuffle-based method.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs.
            Keys will be n-tuples of int, with n given by dimensionality of the images, and correspond
            to indexes into the image arrays. Value will have length equal to the number of image files.
            With each image contributing one point to this value array, with ordering given by
            the lexicographic ordering of image file names.
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                          recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                          confFilename=confFilename)
            else:
                # tif / tif stack
                images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                        recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            if renumber:
                images = images.renumber()
            return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()

        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError("nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError("npartitions is not supported with shuffle=False")
            if renumber:
                raise NotImplementedError("renumber is not supported with shuffle=False")

            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, blockSize=blockSize,
                                        startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
            else:
                # tif / tif stack
                return loader.fromTif(dataPath, ext=ext, blockSize=blockSize,
                                      startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)

    def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None,
                              dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None,
                              shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None,
                              renumber=False, confFilename='conf.json'):
        """
        Write out Images data as Series data, saved in a flat binary format.

        The resulting files may subsequently be read in using ThunderContext.loadSeries().
        Loading Series data directly will likely be faster than converting image data
        to a Series object through loadImagesAsSeries().

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        outputDirPath: string
            Path to directory to write Series file output. May be either a path on the local file system
            or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/",
            or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True,
            the existing directory and all its contents will be deleted and overwritten.

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype. optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        shuffle: boolean, optional, default = True
            Controls whether the conversion from Images to Series formats will use of a Spark shuffle-based method.

        overwrite: boolean, optional, default False
            If true, the directory specified by outputDirPath will be deleted (recursively) if it
            already exists. (Use with caution.)

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not overwrite:
            raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials)
            overwrite = True  # prevent additional downstream checks for this path

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if shuffle:
            from thunder.rdds.fileio.imagesloader import ImagesLoader
            loader = ImagesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx,
                                          recursive=recursive, nplanes=nplanes, npartitions=npartitions,
                                          confFilename=confFilename)
            else:
                # 'tif' or 'tif-stack'
                images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx,
                                        recursive=recursive, nplanes=nplanes, npartitions=npartitions)
            if renumber:
                images = images.renumber()
            images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite)
        else:
            from thunder.rdds.fileio.seriesloader import SeriesLoader
            if nplanes is not None:
                raise NotImplementedError("nplanes is not supported with shuffle=False")
            if npartitions is not None:
                raise NotImplementedError("npartitions is not supported with shuffle=False")
            loader = SeriesLoader(self._sc)
            if inputFormat.lower() == 'stack':
                loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype,
                                     blockSize=blockSize, overwrite=overwrite, startIdx=startIdx,
                                     stopIdx=stopIdx, recursive=recursive)
            else:
                # 'tif' or 'tif-stack'
                loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize,
                                   startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite,
                                   recursive=recursive)

    def makeExample(self, dataset=None, **opts):
        """
        Make an example data set for testing analyses.

        Options include 'pca', 'factor', 'kmeans', 'ica', 'sources'
        See thunder.utils.datasets for detailed options.

        Parameters
        ----------
        dataset : str
            Which dataset to generate

        Returns
        -------
        data : RDD of (tuple, array) pairs
            Generated dataset

        """
        from thunder.utils.datasets import DATASET_MAKERS

        if dataset is None:
            return sorted(DATASET_MAKERS.keys())

        checkParams(dataset, DATASET_MAKERS.keys())

        return DataSets.make(self._sc, dataset, **opts)

    def loadExample(self, dataset=None):
        """
        Load a local example data set for testing analyses.

        Some of these data sets are extremely downsampled and should be considered
        useful only for testing the API. If called with None,
        will return list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : Data object
            Generated dataset as a Thunder data objects (e.g Series or Images)
        """
        import atexit
        import shutil
        import tempfile
        from pkg_resources import resource_listdir, resource_filename

        DATASETS = {
            'iris': 'iris',
            'fish-series': 'fish/series',
            'fish-images': 'fish/images',
            'mouse-series': 'mouse/series',
            'mouse-images': 'mouse/images',
            'mouse-params': 'mouse/params'
        }

        if dataset is None:
            return sorted(DATASETS.keys())

        checkParams(dataset, DATASETS.keys())

        if 'ec2' in self._sc.master:
            tmpdir = os.path.join('/root/thunder/python/thunder/utils', 'data', DATASETS[dataset])
        else:
            tmpdir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, tmpdir)

            def copyLocal(target):
                files = resource_listdir('thunder.utils.data', target)
                for f in files:
                    path = resource_filename('thunder.utils.data', os.path.join(target, f))
                    shutil.copy(path, tmpdir)

            copyLocal(DATASETS[dataset])

        npartitions = self._sc.defaultParallelism

        if dataset == "iris":
            return self.loadSeries(tmpdir)
        elif dataset == "fish-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "fish-images":
            return self.loadImages(tmpdir, inputFormat="tif", npartitions=npartitions)
        elif dataset == "mouse-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "mouse-images":
            return self.loadImages(tmpdir, npartitions=npartitions)
        elif dataset == "mouse-params":
            return self.loadParams(os.path.join(tmpdir, 'covariates.json'))

    def loadExampleS3(self, dataset=None):
        """
        Load an example data set from S3.

        Info on the included datasets can be found at the CodeNeuro data repository
        (http://datasets.codeneuro.org/). If called with None, will return
        list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : a Data object (usually a Series or Images)
            The dataset as one of Thunder's data objects

        params : dict
            Parameters or metadata for dataset
        """
        DATASETS = {
            'ahrens.lab/direction.selectivity': 'ahrens.lab/direction.selectivity/1/',
            'ahrens.lab/optomotor.response': 'ahrens.lab/optomotor.response/1/',
            'svoboda.lab/tactile.navigation': 'svoboda.lab/tactile.navigation/1/'
        }

        if dataset is None:
            return DATASETS.keys()

        if 'local' in self._sc.master:
            raise Exception("Must be running on an EC2 cluster to load this example data set")

        checkParams(dataset, DATASETS.keys())

        basePath = 's3n://neuro.datasets/'
        dataPath = DATASETS[dataset]

        data = self.loadSeries(basePath + dataPath + 'series')
        params = self.loadParams(basePath + dataPath + 'params/covariates.json')

        return data, params

    def loadJSON(self, path):
        """
        Generic function for loading JSON from a path, handling local file systems and S3 or GS

        Parameters
        ----------
        path : str
            Path to a file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A string with the JSON
        """

        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError

        reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return json.loads(buffer)

    def loadParams(self, path):
        """
        Load a file with parameters from a local file system or S3 or GS.

        Assumes file is JSON with basic types (strings, integers, doubles, lists),
        in either a single dict or list of dict-likes, and each dict has at least
        a "name" field and a "value" field.

        Useful for loading generic meta data, parameters, covariates, etc.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A dict or list with the parameters
        """
        blob = self.loadJSON(path)
        return Params(blob)

    def loadSources(self, path):
        """
        Load a file with sources from a local file system or S3 or GS.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A SourceModel

        See also
        --------
        SourceExtraction
        """
        from thunder import SourceExtraction

        blob = self.loadJSON(path)
        return SourceExtraction.deserialize(blob)

    def export(self, data, filename, outputFormat=None, overwrite=False, varname=None):
        """
        Export local array data to a variety of formats.

        Can write to a local file sytem or S3 or GS (destination inferred from filename schema).
        S3 or GS writing useful for persisting arrays when working in an environment without
        accessible local storage.

        Parameters
        ----------
        data : array-like
            The data to export

        filename : str
            Output location (path/to/file.ext)

        outputFormat : str, optional, default = None
            Ouput format ("npy", "mat", or "txt"), if not provided will
            try to infer from file extension.

        overwrite : boolean, optional, default = False
            Whether to overwrite if directory or file already exists

        varname : str, optional, default = None
            Variable name for writing "mat" formatted files
        """
        from numpy import save, savetxt, asarray
        from scipy.io import savemat
        from StringIO import StringIO

        from thunder.rdds.fileio.writers import getFileWriterForPath

        path, file, outputFormat = handleFormat(filename, outputFormat)
        checkParams(outputFormat, ["npy", "mat", "txt"])
        clazz = getFileWriterForPath(filename)
        writer = clazz(path, file, overwrite=overwrite, awsCredentialsOverride=self._credentials)

        stream = StringIO()

        if outputFormat == "mat":
            varname = os.path.splitext(file)[0] if varname is None else varname
            savemat(stream, mdict={varname: data}, oned_as='column', do_compression='true')
        if outputFormat == "npy":
            save(stream, data)
        if outputFormat == "txt":
            if asarray(data).ndim > 2:
                raise Exception("Cannot write data with more than two dimensions to text")
            savetxt(stream, data)

        stream.seek(0)
        writer.writeFile(stream.buf)

    def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
        """
        Manually set AWS access credentials to be used by Thunder.

        Provided for hosted cloud environments without filesystem access. If
        launching a cluster using the thunder-ec2 script, credentials will be
        configured automatically (inside core-site.xml and ~/.boto), so this
        method should not need to be called.

        Parameters
        ----------
        awsAccessKeyId : string
            AWS public key, usually starts with "AKIA"

        awsSecretAccessKey : string
            AWS private key
        
        """
        from thunder.utils.aws import AWSCredentials
        self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
        self._credentials.setOnContext(self._sc)
Exemple #13
0
 def _checkOverwrite(self, outputDirPath):
     """ Checks for existence of outputDirPath, raising ValueError if it already exists """
     from thunder.utils.aws import AWSCredentials
     from thunder.utils.common import raiseErrorIfPathExists
     awsCredentialOverride = AWSCredentials.fromContext(self.rdd.ctx)
     raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=awsCredentialOverride)
Exemple #14
0
    def saveAsBinarySeries(self, outputDirPath, overwrite=False):
        """
        Writes out Series-formatted data.

        This method (Series.saveAsBinarySeries) writes out binary series files using the current partitioning
        of this Series object. (That is, if mySeries.rdd.getNumPartitions() == 5, then 5 files will be written
        out, one per partition.) The records will not be resorted; the file names for each partition will be
        taken from the key of the first Series record in that partition. If the Series object is already
        sorted and no records have been removed by filtering, then the resulting output should be equivalent
        to what one would get from calling myImages.saveAsBinarySeries().

        If all one wishes to do is to save out Images data in a binary series format, then
        tsc.convertImagesToSeries() will likely be more efficient than
        tsc.loadImages().toSeries().saveAsBinarySeries().

        Parameters
        ----------
        outputDirPath : string path or URI to directory to be created
            Output files will be written underneath outputdirname. This directory must not yet exist
            (unless overwrite is True), and must be no more than one level beneath an existing directory.
            It will be created as a result of this call.

        overwrite : bool
            If true, outputdirname and all its contents will be deleted and recreated as part
            of this call.
        """
        import cStringIO as StringIO
        import struct
        from thunder.rdds.imgblocks.blocks import SimpleBlocks
        from thunder.rdds.fileio.writers import getParallelWriterForPath
        from thunder.rdds.fileio.seriesloader import writeSeriesConfig
        from thunder.utils.aws import AWSCredentials

        if not overwrite:
            self._checkOverwrite(outputDirPath)
            overwrite = True  # prevent additional downstream checks for this path

        def partitionToBinarySeries(kvIter):
            """ Collects all Series records in a partition into a single binary series record. """
            keypacker = None
            firstKey = None
            buf = StringIO.StringIO()
            for seriesKey, series in kvIter:
                if keypacker is None:
                    keypacker = struct.Struct('h'*len(seriesKey))
                    firstKey = seriesKey
                # print >> sys.stderr, seriesKey, series, series.tostring().encode('hex')
                buf.write(keypacker.pack(*seriesKey))
                buf.write(series.tostring())
            val = buf.getvalue()
            buf.close()
            # we might have an empty partition, in which case firstKey will still be None
            if firstKey is None:
                return iter([])
            else:
                label = SimpleBlocks.getBinarySeriesNameForKey(firstKey) + ".bin"
                return iter([(label, val)])

        awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
        writer = getParallelWriterForPath(outputDirPath)(outputDirPath, overwrite=overwrite,
                                                         awsCredentialsOverride=awsCredentials)

        binseriesrdd = self.rdd.mapPartitions(partitionToBinarySeries)

        binseriesrdd.foreach(writer.writerFcn)

        # TODO: all we really need here are the number of keys and number of values, which could in principle
        # be cached in _nkeys and _nvals attributes, removing the need for this .first() call in most cases.
        firstKey, firstVal = self.first()
        writeSeriesConfig(outputDirPath, len(firstKey), len(firstVal), keyType='int16', valueType=self.dtype,
                          overwrite=overwrite, awsCredentialsOverride=awsCredentials)
Exemple #15
0
    def saveAsPngs(self,
                   outputDirPath,
                   cmap=None,
                   vmin=None,
                   vmax=None,
                   filePrefix="export",
                   overwrite=False,
                   collectToDriver=True):
        """
        Write out basic png files for two-dimensional image data.

        Files will be written into a newly-created directory given by outputdirname.

        Parameters
        ----------
        outputDirPath : string
            Path to output directory to be created. Exception will be thrown if this directory already
            exists, unless overwrite is True. Directory must be one level below an existing directory.

        filePrefix : string
            String to prepend to all filenames. Files will be named <fileprefix>-00000.png, <fileprefix>-00001.png, etc

        overwrite : bool
            If true, the directory given by outputdirname will first be deleted if it already exists.

        collectToDriver : bool, default True
            If true, images will be collect()'ed at the driver first before being written out, allowing
            for use of a local filesystem at the expense of network overhead. If false, images will be written
            in parallel by each executor, presumably to a distributed or networked filesystem.
        """
        dims = self.dims
        if not len(dims) == 2:
            raise ValueError(
                "Only two-dimensional images can be exported as .png files; image is %d-dimensional."
                % len(dims))

        from matplotlib.pyplot import imsave
        from io import BytesIO
        from thunder.rdds.fileio.writers import getParallelWriterForPath, getCollectedFileWriterForPath
        from thunder.utils.aws import AWSCredentials

        def toFilenameAndPngBuf(kv):
            key, img = kv
            fname = filePrefix + "-" + "%05d.png" % int(key)
            bytebuf = BytesIO()
            imsave(bytebuf, img, vmin, vmax, cmap=cmap, format="png")
            return fname, bytebuf.getvalue()

        bufRdd = self.rdd.map(toFilenameAndPngBuf)

        awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
        if collectToDriver:
            writer = getCollectedFileWriterForPath(outputDirPath)(
                outputDirPath,
                overwrite=overwrite,
                awsCredentialsOverride=awsCredentials)
            writer.writeCollectedFiles(bufRdd.collect())
        else:
            writer = getParallelWriterForPath(outputDirPath)(
                outputDirPath,
                overwrite=overwrite,
                awsCredentialsOverride=awsCredentials)
            bufRdd.foreach(writer.writerFcn)
Exemple #16
0
class ThunderContext():
    """
    Wrapper for a SparkContext that provides an entry point for loading and saving.

    Also supports creation of example datasets, and loading example
    data both locally and from EC2.
    """
    def __init__(self, sparkcontext):
        self._sc = sparkcontext
        self._credentials = None

    @classmethod
    def start(cls, *args, **kwargs):
        """
        Starts a ThunderContext using the same arguments as SparkContext
        """
        from pyspark import SparkContext
        return ThunderContext(SparkContext(*args, **kwargs))

    def addPyFile(self, path):
        """
        Adds a .zip or .py or .egg dependency for all tasks to be executed
        as part of this context.

        Uses the corresponding SparkContext method.

        Parameters
        ----------
        path : str
            Path to a file as either a local file, file in HDFS, or URI.
        """
        self._sc.addPyFile(path)

    def stop(self):
        """
        Shut down the context
        """
        self._sc.stop()

    def loadSeries(self,
                   dataPath,
                   nkeys=None,
                   nvalues=None,
                   inputFormat='binary',
                   minPartitions=None,
                   maxPartitionSize='32mb',
                   confFilename='conf.json',
                   keyType=None,
                   valueType=None,
                   keyPath=None,
                   varName=None):
        """
        Loads a Series object from data stored as binary, text, npy, or mat.

        For binary and text, supports single files or multiple files stored on a local file system,
        a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
        For local formats (npy and mat) only local file systems currently supported.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        nkeys: int, optional (required if `inputFormat` is 'text'), default = None
            Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
            text data; can be specified here or in a configuration file for binary data.

        nvalues: int, optional (required if `inputFormat` is 'text')
            Number of values per record. Must be specified here or in a configuration file for binary data.

        inputFormat: {'text', 'binary', 'npy', 'mat'}. optional, default = 'binary'
            inputFormat of data to be read.

        minPartitions: int, optional, default = SparkContext.minParallelism
            Minimum number of Spark partitions to use, only for text.

        maxPartitionSize : int, optional, default = '32mb'
            Maximum size of partitions as a Java-style memory string, e.g. '32mb' or '64mb',
            indirectly controls the number of Spark partitions, only for binary.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues',
            'keyType', and 'valueType'. If a file is not found at the given path, then the base
            directory in 'dataPath' will be checked. Parameters will override the conf file.

        keyType: string or numpy dtype, optional, default = None
            Numerical type of keys, will override conf file.

        valueType: string or numpy dtype, optional, default = None
            Numerical type of values, will override conf file.

        keyPath: string, optional, default = None
            Path to file with keys when loading from npy or mat.

        varName : str, optional, default = None
            Variable name to load (for MAT files only)

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
        """
        checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'binary':
            data = loader.fromBinary(dataPath,
                                     confFilename=confFilename,
                                     nkeys=nkeys,
                                     nvalues=nvalues,
                                     keyType=keyType,
                                     valueType=valueType,
                                     maxPartitionSize=maxPartitionSize)
        elif inputFormat.lower() == 'text':
            if nkeys is None:
                raise Exception(
                    'Must provide number of keys per record for loading from text'
                )
            data = loader.fromText(dataPath, nkeys=nkeys)
        elif inputFormat.lower() == 'npy':
            data = loader.fromNpyLocal(dataPath, keyPath)
        else:
            if varName is None:
                raise Exception(
                    'Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataPath, varName, keyPath)

        return data

    def loadImages(self,
                   dataPath,
                   dims=None,
                   dtype=None,
                   inputFormat='stack',
                   ext=None,
                   startIdx=None,
                   stopIdx=None,
                   recursive=False,
                   nplanes=None,
                   npartitions=None,
                   renumber=False,
                   confFilename='conf.json'):
        """
        Loads an Images object from data stored as a binary image stack, tif, or png files.

        Supports single files or multiple files, stored on a local file system, a networked file sytem
        (mounted and available on all nodes), Amazon S3, or Google Storage.
        HDFS is not currently supported for image file data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image format. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype, optional, default = 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use as many partitions
            as available cores

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        Returns
        -------
        data: thunder.rdds.Images
            An Images object, wrapping an RDD of with (int) : (numpy array) pairs

        """
        checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack'])

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        if npartitions is None:
            npartitions = self._sc.defaultParallelism

        # Checking StartIdx is smaller or equal to StopIdx
        if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
            raise Exception(
                "Error. startIdx {} is larger than stopIdx {}".inputFormat(
                    startIdx, stopIdx))

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        if inputFormat.lower() == 'stack':
            data = loader.fromStack(dataPath,
                                    dims=dims,
                                    dtype=dtype,
                                    ext=ext,
                                    startIdx=startIdx,
                                    stopIdx=stopIdx,
                                    recursive=recursive,
                                    nplanes=nplanes,
                                    npartitions=npartitions,
                                    confFilename=confFilename)
        elif inputFormat.lower().startswith('tif'):
            data = loader.fromTif(dataPath,
                                  ext=ext,
                                  startIdx=startIdx,
                                  stopIdx=stopIdx,
                                  recursive=recursive,
                                  nplanes=nplanes,
                                  npartitions=npartitions)
        else:
            if nplanes:
                raise NotImplementedError(
                    "nplanes argument is not supported for png files")
            data = loader.fromPng(dataPath,
                                  ext=ext,
                                  startIdx=startIdx,
                                  stopIdx=stopIdx,
                                  recursive=recursive,
                                  npartitions=npartitions)

        if not renumber:
            return data
        else:
            return data.renumber()

    def loadSeriesFromArray(self, values, index=None, npartitions=None):
        """
        Load Series data from a local array

        Parameters
        ----------
        values : list or ndarray
            A list of 1d numpy arrays, or a single 2d numpy array

        index : array-like, optional, deafult = None
            Index to set for Series object, if None will use linear indices.

        npartitions : position int, optional, default = None
            Number of partitions for RDD, if unspecified will use
            default parallelism.
        """
        from numpy import ndarray, asarray
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc)

        if not npartitions:
            npartitions = self._sc.defaultParallelism

        if isinstance(values, list):
            values = asarray(values)

        if isinstance(values, ndarray) and values.ndim > 1:
            values = list(values)

        data = loader.fromArrays(values, npartitions=npartitions)

        if index:
            data.index = index

        return data

    def loadImagesFromArray(self, values, npartitions=None):
        """
        Load Images data from a local array

        Parameters
        ----------
        values : list or ndarray
            A list of 2d or 3d numpy arrays,
            or a single 3d or 4d numpy array

        npartitions : position int, optional, default = None
            Number of partitions for RDD, if unspecified will use
            default parallelism.
        """
        from numpy import ndarray, asarray

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        if isinstance(values, list):
            values = asarray(values)

        if isinstance(values, ndarray) and values.ndim > 2:
            values = list(values)

        if not npartitions:
            npartitions = self._sc.defaultParallelism

        return loader.fromArrays(values, npartitions=npartitions)

    def loadImagesOCP(self,
                      bucketName,
                      resolution,
                      server='ocp.me',
                      startIdx=None,
                      stopIdx=None,
                      minBound=None,
                      maxBound=None):
        """
        Load Images from OCP (Open Connectome Project).

        The OCP is a web service for access to EM brain images and other neural image data.
        The web-service can be accessed at http://www.openconnectomeproject.org/.
        
        Parameters
        ----------
        bucketName: string
            Token name for the project in OCP. This name should exist on the server from which data is loaded.

        resolution: nonnegative int
            Resolution of the data in OCP

        server: string, optional, default = 'ocp.me'
            Name of the OCP server with the specified token.
        
        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position).

        stopIdx: nonnegative int, optional
            See startIdx.
        
        minBound, maxBound: tuple of nonnegative int, optional, default = None
            X,Y,Z bounds of the data to fetch from OCP. minBound contains the (xMin,yMin,zMin) while
            maxBound contains (xMax,yMax,zMax).

        Returns
        -------
        data: thunder.rdds.Images
             An Images object, wrapping an RDD of with (int) : (numpy array) pairs
        """

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)

        # Checking StartIdx is smaller or equal to StopIdx
        if startIdx is not None and stopIdx is not None and startIdx > stopIdx:
            raise Exception(
                "Error. startIdx {} is larger than stopIdx {}".format(
                    startIdx, stopIdx))
        data = loader.fromOCP(bucketName,
                              resolution=resolution,
                              server=server,
                              startIdx=startIdx,
                              stopIdx=stopIdx,
                              minBound=minBound,
                              maxBound=maxBound)

        return data

    def loadImagesAsSeries(self,
                           dataPath,
                           dims=None,
                           inputFormat='stack',
                           ext=None,
                           dtype='int16',
                           blockSize="150M",
                           blockSizeUnits="pixels",
                           startIdx=None,
                           stopIdx=None,
                           recursive=False,
                           nplanes=None,
                           npartitions=None,
                           renumber=False,
                           confFilename='conf.json'):
        """
        Load Images data as Series data.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype. optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        Returns
        -------
        data: thunder.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs.
            Keys will be n-tuples of int, with n given by dimensionality of the images, and correspond
            to indexes into the image arrays. Value will have length equal to the number of image files.
            With each image contributing one point to this value array, with ordering given by
            the lexicographic ordering of image file names.
        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        if inputFormat.lower() == 'stack':
            images = loader.fromStack(dataPath,
                                      dims,
                                      dtype=dtype,
                                      ext=ext,
                                      startIdx=startIdx,
                                      stopIdx=stopIdx,
                                      recursive=recursive,
                                      nplanes=nplanes,
                                      npartitions=npartitions,
                                      confFilename=confFilename)
        else:
            # tif / tif stack
            images = loader.fromTif(dataPath,
                                    ext=ext,
                                    startIdx=startIdx,
                                    stopIdx=stopIdx,
                                    recursive=recursive,
                                    nplanes=nplanes,
                                    npartitions=npartitions)
        if renumber:
            images = images.renumber()
        return images.toBlocks(blockSize, units=blockSizeUnits).toSeries()

    def convertImagesToSeries(self,
                              dataPath,
                              outputDirPath,
                              dims=None,
                              inputFormat='stack',
                              ext=None,
                              dtype='int16',
                              blockSize="150M",
                              blockSizeUnits="pixels",
                              startIdx=None,
                              stopIdx=None,
                              overwrite=False,
                              recursive=False,
                              nplanes=None,
                              npartitions=None,
                              renumber=False,
                              confFilename='conf.json'):
        """
        Write out Images data as Series data, saved in a flat binary format.

        The resulting files may subsequently be read in using ThunderContext.loadSeries().
        Loading Series data directly will likely be faster than converting image data
        to a Series object through loadImagesAsSeries().

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        outputDirPath: string
            Path to directory to write Series file output. May be either a path on the local file system
            or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/",
            or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True,
            the existing directory and all its contents will be deleted and overwritten.

        dims: tuple of positive int, optional (required if inputFormat is 'stack')
            Image dimensions. Binary stack data will be interpreted as a multidimensional array
            with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention),
            where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions
            will be read from the image file headers.

        inputFormat: str, optional, default = 'stack'
            Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks.
            'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along
            the third dimension. Separate files interpreted as distinct records, with ordering
            given by lexicographic sorting of file names.

        ext: string, optional, default = None
            File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif',
            and 'png' for inputFormat=="png".

        dtype: string or numpy dtype. optional, default 'int16'
            Data type of the image files to be loaded, specified as a numpy "dtype" string.
            Ignored for 'tif' or 'png' (data will be inferred from image formats).

        blockSize: string or positive int, optional, default "150M"
            Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a
            tuple of int specifying the number of pixels or splits per dimension. Indirectly
            controls the number of Spark partitions, with one partition per block.

        blockSizeUnits: string, either "pixels" or "splits", default "pixels"
            Units for interpreting a tuple passed as blockSize when shuffle=True.

        startIdx: nonnegative int, optional, default = None
            Convenience parameters to read only a subset of input files. Uses python slice conventions
            (zero-based indexing with exclusive final position). These parameters give the starting
            and final index after lexicographic sorting.

        stopIdx: nonnegative int, optional, default = None
            See startIdx.

        overwrite: boolean, optional, default False
            If true, the directory specified by outputDirPath will be deleted (recursively) if it
            already exists. (Use with caution.)

        recursive: boolean, optional, default = False
            If true, will recursively descend directories rooted at dataPath, loading all files
            in the tree with an appropriate extension.

        nplanes: positive integer, optional, default = None
            Subdivide individual image files. Every `nplanes` from each file will be considered a new record.
            With nplanes=None (the default), a single file will be considered as representing a single record.
            If the number of records per file is not the same across all files, then `renumber` should be set
            to True to ensure consistent keys.

        npartitions: positive int, optional, default = None
            Specify number of partitions for the RDD, if unspecified will use 1 partition per image.

        renumber: boolean, optional, default = False
            Recalculate keys for records after images are loading. Only necessary if different files contain
            different number of records (e.g. due to specifying nplanes). See Images.renumber().

        confFilename : string, optional, default = 'conf.json'
            Name of conf file if using to specify parameters for binary stack data

        """
        checkParams(inputFormat, ['stack', 'tif', 'tif-stack'])

        if not overwrite:
            raiseErrorIfPathExists(outputDirPath,
                                   awsCredentialsOverride=self._credentials)
            overwrite = True  # prevent additional downstream checks for this path

        if not ext:
            ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None)

        from thunder.rdds.fileio.imagesloader import ImagesLoader
        loader = ImagesLoader(self._sc)
        if inputFormat.lower() == 'stack':
            images = loader.fromStack(dataPath,
                                      dims,
                                      ext=ext,
                                      dtype=dtype,
                                      startIdx=startIdx,
                                      stopIdx=stopIdx,
                                      recursive=recursive,
                                      nplanes=nplanes,
                                      npartitions=npartitions,
                                      confFilename=confFilename)
        else:
            # 'tif' or 'tif-stack'
            images = loader.fromTif(dataPath,
                                    ext=ext,
                                    startIdx=startIdx,
                                    stopIdx=stopIdx,
                                    recursive=recursive,
                                    nplanes=nplanes,
                                    npartitions=npartitions)
        if renumber:
            images = images.renumber()
        images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(
            outputDirPath, overwrite=overwrite)

    def makeExample(self, dataset=None, **opts):
        """
        Make an example data set for testing analyses.

        Options include 'pca', 'factor', 'kmeans', 'ica', 'sources'
        See thunder.utils.datasets for detailed options.

        Parameters
        ----------
        dataset : str
            Which dataset to generate

        Returns
        -------
        data : RDD of (tuple, array) pairs
            Generated dataset

        """
        from thunder.utils.datasets import DATASET_MAKERS

        if dataset is None:
            return sorted(DATASET_MAKERS.keys())

        checkParams(dataset, DATASET_MAKERS.keys())

        return DataSets.make(self._sc, dataset, **opts)

    def loadExample(self, dataset=None):
        """
        Load a local example data set for testing analyses.

        Some of these data sets are extremely downsampled and should be considered
        useful only for testing the API. If called with None,
        will return list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : Data object
            Generated dataset as a Thunder data objects (e.g Series or Images)
        """
        import atexit
        import shutil
        import tempfile
        from pkg_resources import resource_listdir, resource_filename

        DATASETS = {
            'iris': 'iris',
            'fish-series': 'fish/series',
            'fish-images': 'fish/images',
            'mouse-series': 'mouse/series',
            'mouse-images': 'mouse/images',
            'mouse-params': 'mouse/params'
        }

        if dataset is None:
            return sorted(DATASETS.keys())

        checkParams(dataset, DATASETS.keys())

        if 'ec2' in self._sc.master:
            tmpdir = os.path.join('/root/thunder/thunder/utils', 'data',
                                  DATASETS[dataset])
        else:
            tmpdir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, tmpdir)

            def copyLocal(target):
                files = resource_listdir('thunder.utils.data', target)
                for f in files:
                    path = resource_filename('thunder.utils.data',
                                             os.path.join(target, f))
                    shutil.copy(path, tmpdir)

            copyLocal(DATASETS[dataset])

        npartitions = self._sc.defaultParallelism

        if dataset == "iris":
            return self.loadSeries(tmpdir)
        elif dataset == "fish-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "fish-images":
            return self.loadImages(tmpdir,
                                   inputFormat="tif",
                                   npartitions=npartitions)
        elif dataset == "mouse-series":
            return self.loadSeries(tmpdir).astype('float')
        elif dataset == "mouse-images":
            return self.loadImages(tmpdir, npartitions=npartitions)
        elif dataset == "mouse-params":
            return self.loadParams(os.path.join(tmpdir, 'covariates.json'))

    def loadExampleS3(self, dataset=None):
        """
        Load an example data set from S3.

        Info on the included datasets can be found at the CodeNeuro data repository
        (http://datasets.codeneuro.org/). If called with None, will return
        list of available datasets.

        Parameters
        ----------
        dataset : str
            Which dataset to load

        Returns
        -------
        data : a Data object (usually a Series or Images)
            The dataset as one of Thunder's data objects

        params : dict
            Parameters or metadata for dataset
        """
        DATASETS = {
            'ahrens.lab/direction.selectivity':
            'ahrens.lab/direction.selectivity/1/',
            'ahrens.lab/optomotor.response':
            'ahrens.lab/optomotor.response/1/',
            'svoboda.lab/tactile.navigation':
            'svoboda.lab/tactile.navigation/1/'
        }

        if dataset is None:
            return DATASETS.keys()

        if 'local' in self._sc.master:
            raise Exception(
                "Must be running on an EC2 cluster to load this example data set"
            )

        checkParams(dataset, DATASETS.keys())

        basePath = 's3n://neuro.datasets/'
        dataPath = DATASETS[dataset]

        data = self.loadSeries(basePath + dataPath + 'series')
        params = self.loadParams(basePath + dataPath +
                                 'params/covariates.json')

        return data, params

    def loadJSON(self, path):
        """
        Generic function for loading JSON from a path, handling local file systems and S3 or GS

        Parameters
        ----------
        path : str
            Path to a file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A string with the JSON
        """

        import json
        from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError
        from thunder.utils.serializable import _decode_dict

        reader = getFileReaderForPath(path)(
            awsCredentialsOverride=self._credentials)
        try:
            buffer = reader.read(path)
        except FileNotFoundError:
            raise Exception("Cannot find file %s" % path)

        return json.loads(buffer, object_hook=_decode_dict)

    def loadParams(self, path):
        """
        Load a file with parameters from a local file system or S3 or GS.

        Assumes file is JSON with basic types (strings, integers, doubles, lists),
        in either a single dict or list of dict-likes, and each dict has at least
        a "name" field and a "value" field.

        Useful for loading generic meta data, parameters, covariates, etc.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A dict or list with the parameters
        """
        blob = self.loadJSON(path)
        return Params(blob)

    def loadSources(self, path):
        """
        Load a file with sources from a local file system or S3 or GS.

        Parameters
        ----------
        path : str
            Path to file, can be on a local file system or an S3 or GS bucket

        Returns
        -------
        A SourceModel

        See also
        --------
        SourceExtraction
        """
        from thunder import SourceExtraction

        blob = self.loadJSON(path)
        return SourceExtraction.deserialize(blob)

    def export(self,
               data,
               filename,
               outputFormat=None,
               overwrite=False,
               varname=None):
        """
        Export local array data to a variety of formats.

        Can write to a local file sytem or S3 or GS (destination inferred from filename schema).
        S3 or GS writing useful for persisting arrays when working in an environment without
        accessible local storage.

        Parameters
        ----------
        data : array-like
            The data to export

        filename : str
            Output location (path/to/file.ext)

        outputFormat : str, optional, default = None
            Ouput format ("npy", "mat", or "txt"), if not provided will
            try to infer from file extension.

        overwrite : boolean, optional, default = False
            Whether to overwrite if directory or file already exists

        varname : str, optional, default = None
            Variable name for writing "mat" formatted files
        """
        from numpy import save, savetxt, asarray
        from scipy.io import savemat
        from StringIO import StringIO

        from thunder.rdds.fileio.writers import getFileWriterForPath

        path, file, outputFormat = handleFormat(filename, outputFormat)
        checkParams(outputFormat, ["npy", "mat", "txt"])
        clazz = getFileWriterForPath(filename)
        writer = clazz(path,
                       file,
                       overwrite=overwrite,
                       awsCredentialsOverride=self._credentials)

        stream = StringIO()

        if outputFormat == "mat":
            varname = os.path.splitext(file)[0] if varname is None else varname
            savemat(stream,
                    mdict={varname: data},
                    oned_as='column',
                    do_compression='true')
        if outputFormat == "npy":
            save(stream, data)
        if outputFormat == "txt":
            if asarray(data).ndim > 2:
                raise Exception(
                    "Cannot write data with more than two dimensions to text")
            savetxt(stream, data)

        stream.seek(0)
        writer.writeFile(stream.buf)

    def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
        """
        Manually set AWS access credentials to be used by Thunder.

        Provided for hosted cloud environments without filesystem access. If
        launching a cluster using the thunder-ec2 script, credentials will be
        configured automatically (inside core-site.xml and ~/.boto), so this
        method should not need to be called.

        Parameters
        ----------
        awsAccessKeyId : string
            AWS public key, usually starts with "AKIA"

        awsSecretAccessKey : string
            AWS private key
        
        """
        from thunder.utils.aws import AWSCredentials
        self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
        self._credentials.setOnContext(self._sc)