def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Write this object's data out in binary Series format, followed by a Series config file.

    Subclasses are *not* expected to override this method.

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing directory.
        It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part
        of this call.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        # the path has just been checked, so prevent additional downstream checks for it
        overwrite = True

    credentials = AWSCredentials.fromContext(self.rdd.ctx)
    writerClass = getParallelWriterForPath(outputDirPath)
    writer = writerClass(outputDirPath, overwrite=overwrite, awsCredentialsOverride=credentials)

    # each binary series record is written out by the parallel writer
    self.toBinarySeries().foreach(writer.writerFcn)

    writeSeriesConfig(
        outputDirPath,
        len(self.dims),
        self.nimages,
        keyType='int16',
        valueType=self.dtype,
        overwrite=overwrite,
        awsCredentialsOverride=credentials)
def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
    """
    Manually set the AWS access credentials to be used by Thunder.

    Provided for hosted cloud environments without filesystem access. If launching a cluster
    using the thunder-ec2 script, credentials will be configured automatically (inside
    core-site.xml and ~/.boto), so this method should not need to be called.

    Parameters
    ----------
    awsAccessKeyId : string
        AWS public key, usually starts with "AKIA"

    awsSecretAccessKey : string
        AWS private key
    """
    from thunder.utils.common import AWSCredentials

    credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
    # keep the credentials on this object first, then push them onto the Spark context
    self._credentials = credentials
    credentials.setOnContext(self._sc)
def __init__(self, sparkContext):
    """
    Initialize a new ImagesLoader object.

    Parameters
    ----------
    sparkContext : SparkContext
        The pyspark SparkContext object used by the current Thunder environment. Any AWS
        credentials found on it via AWSCredentials.fromContext are kept in
        self.awsCredentialsOverride.
    """
    from thunder.utils.common import AWSCredentials

    self.sc = sparkContext
    contextCredentials = AWSCredentials.fromContext(sparkContext)
    self.awsCredentialsOverride = contextCredentials
def exportAsPngs(self, outputDirPath, filePrefix="export", overwrite=False, collectToDriver=True):
    """
    Write out basic png files for two-dimensional image data.

    Files will be written into a newly-created directory given by outputDirPath. When
    collectToDriver is False, all workers must be able to see the output directory
    (via an NFS share or similar).

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>00000.png,
        <filePrefix>00001.png, etc

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.

    collectToDriver : bool, default True
        If true, images will be collect()'ed at the driver first before being written out, allowing
        for use of a local filesystem at the expense of network overhead. If false, images will be
        written in parallel by each executor, presumably to a distributed or networked filesystem.

    Raises
    ------
    ValueError
        If self.dims does not have exactly two entries.
    """
    dims = self.dims
    ndims = len(dims)
    if ndims != 2:
        raise ValueError("Only two-dimensional images can be exported as .png files; image is %d-dimensional." %
                         ndims)

    # imports are deferred until after the cheap dimensionality check above
    from matplotlib.pyplot import imsave
    from io import BytesIO
    from thunder.rdds.fileio.writers import getParallelWriterForPath, getCollectedFileWriterForPath
    from thunder.utils.common import AWSCredentials

    def encodeRecordAsPng(kv):
        # maps an (index, image) record onto a (filename, png bytes) pair
        key, img = kv
        suffix = "%05d.png" % int(key)
        fname = filePrefix + suffix
        pngBuffer = BytesIO()
        imsave(pngBuffer, img, format="png")
        return fname, pngBuffer.getvalue()

    pngRdd = self.rdd.map(encodeRecordAsPng)
    credentials = AWSCredentials.fromContext(self.rdd.ctx)
    writerOptions = dict(overwrite=overwrite, awsCredentialsOverride=credentials)

    if not collectToDriver:
        # every executor writes its own records
        writer = getParallelWriterForPath(outputDirPath)(outputDirPath, **writerOptions)
        pngRdd.foreach(writer.writerFcn)
        return

    # gather all encoded images at the driver and write them from there
    writer = getCollectedFileWriterForPath(outputDirPath)(outputDirPath, **writerOptions)
    writer.writeCollectedFiles(pngRdd.collect())
def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
    """
    Manually set the AWS access credentials to be used by Thunder.

    This method is provided primarily for hosted environments that do not provide
    filesystem access (e.g. Databricks Cloud).

    Typically AWS credentials can be set and read from core-site.xml (for Hadoop input
    format readers, such as Series binary files), ~/.boto or other boto credential file
    locations, or the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
    These credentials should be configured automatically in clusters launched by the
    thunder-ec2 script, and so this method should not have to be called.

    Parameters
    ----------
    awsAccessKeyId : string
        AWS public key, usually starts with "AKIA"

    awsSecretAccessKey : string
        AWS private key
    """
    from thunder.utils.common import AWSCredentials

    newCredentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
    # record the credentials on this object before applying them to the Spark context
    self._credentials = newCredentials
    newCredentials.setOnContext(self._sc)
def __init__(self, sparkContext, minPartitions=None):
    """
    Initialize a new SeriesLoader object.

    Parameters
    ----------
    sparkContext : SparkContext
        The pyspark SparkContext object used by the current Thunder environment. Any AWS
        credentials found on it via AWSCredentials.fromContext are kept in
        self.awsCredentialsOverride.

    minPartitions : int
        minimum number of partitions to use when loading data. (Used by fromText, fromMatLocal,
        and fromNpyLocal)
    """
    from thunder.utils.common import AWSCredentials

    self.sc = sparkContext
    self.minPartitions = minPartitions
    contextCredentials = AWSCredentials.fromContext(sparkContext)
    self.awsCredentialsOverride = contextCredentials
def saveAsBinaryImages(self, outputDirPath, filePrefix="export", overwrite=False):
    """
    Write out images or volumes as flat binary files, followed by a binary images config file.

    Files will be written into a newly-created directory given by outputDirPath.

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>-00000.bin,
        <filePrefix>-00001.bin, etc

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.imagesloader import writeBinaryImagesConfig
    from thunder.utils.common import AWSCredentials

    # extent along each dimension, computed from the inclusive max and min
    dimsMax = asarray(self.dims.max)
    dimsMin = asarray(self.dims.min)
    dimsTotal = list(dimsMax - dimsMin + 1)

    def encodeRecordAsBinary(kv):
        # maps an (index, image) record onto a (filename, transposed copy of the image) pair
        key, img = kv
        prefix = filePrefix + "-"
        fname = prefix + "%05d.bin" % int(key)
        return fname, img.transpose().copy()

    binaryRdd = self.rdd.map(encodeRecordAsBinary)
    credentials = AWSCredentials.fromContext(self.rdd.ctx)
    writerClass = getParallelWriterForPath(outputDirPath)
    writer = writerClass(outputDirPath, overwrite=overwrite, awsCredentialsOverride=credentials)
    binaryRdd.foreach(writer.writerFcn)

    writeBinaryImagesConfig(
        outputDirPath,
        dims=dimsTotal,
        dtype=self.dtype,
        overwrite=overwrite,
        awsCredentialsOverride=credentials)
def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey):
    """
    Manually set the AWS access credentials to be used by Thunder.

    This method is provided primarily for hosted environments that do not provide
    filesystem access (e.g. Databricks Cloud).

    Typically AWS credentials can be set and read from core-site.xml (for Hadoop input
    format readers, such as Series binary files), ~/.boto or other boto credential file
    locations, or the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
    These credentials should be configured automatically in clusters launched by the
    thunder-ec2 script, and so this method should not have to be called.

    Parameters
    ----------
    awsAccessKeyId : string
        AWS public key, usually starts with "AKIA"

    awsSecretAccessKey : string
        AWS private key
    """
    from thunder.utils.common import AWSCredentials

    newCredentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey)
    # expose the credentials on the public attribute first, then apply them to the Spark context
    self.awsCredentials = newCredentials
    newCredentials.setOnContext(self._sc)
def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Write this object's data out in binary Series format, followed by a Series config file.

    Subclasses are *not* expected to override this method.

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing directory.
        It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part
        of this call.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        # the existence check has been done once; prevent additional downstream checks for this path
        overwrite = True

    contextCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    sharedOptions = dict(overwrite=overwrite, awsCredentialsOverride=contextCredentials)

    writer = getParallelWriterForPath(outputDirPath)(outputDirPath, **sharedOptions)
    seriesRecords = self.toBinarySeries()
    seriesRecords.foreach(writer.writerFcn)

    # the config file records the key count, value count, and the key/value data types
    writeSeriesConfig(outputDirPath, len(self.dims), self.nimages,
                      keyType='int16', valueType=self.dtype, **sharedOptions)
def saveAsBinaryImages(self, outputDirPath, filePrefix="export", overwrite=False):
    """
    Write out images or volumes as flat binary files, followed by a binary images config file.

    Files will be written into a newly-created directory given by outputDirPath.

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>-00000.bin,
        <filePrefix>-00001.bin, etc

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.
    """
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.imagesloader import writeBinaryImagesConfig
    from thunder.utils.common import AWSCredentials

    # per-dimension extent: inclusive max minus min, plus one
    extent = asarray(self.dims.max) - asarray(self.dims.min) + 1
    dimsTotal = list(extent)

    def toNamedBinaryRecord(kv):
        # (index, image) -> (filename, transposed copy of the image)
        key, img = kv
        prefix = filePrefix + "-"
        return prefix + "%05d.bin" % int(key), img.transpose().copy()

    namedRecords = self.rdd.map(toNamedBinaryRecord)
    contextCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    sharedOptions = dict(overwrite=overwrite, awsCredentialsOverride=contextCredentials)

    writer = getParallelWriterForPath(outputDirPath)(outputDirPath, **sharedOptions)
    namedRecords.foreach(writer.writerFcn)

    writeBinaryImagesConfig(outputDirPath, dims=dimsTotal, dtype=self.dtype, **sharedOptions)
def _checkOverwrite(self, outputDirPath):
    """
    Check whether outputDirPath already exists, raising an error if so.

    The check is delegated to thunder.utils.common.raiseErrorIfPathExists, using any AWS
    credentials found on this object's Spark context. (That helper is expected to raise
    ValueError for an existing path; confirm against its definition.)
    """
    from thunder.utils.common import AWSCredentials, raiseErrorIfPathExists

    contextCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=contextCredentials)
def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Writes out Series-formatted data.

    This method (Series.saveAsBinarySeries) writes out binary series files using the current
    partitioning of this Series object. (That is, if mySeries.rdd.getNumPartitions() == 5, then
    up to 5 files will be written out, one per non-empty partition.) The records will not be
    resorted; the file names for each partition will be taken from the key of the first Series
    record in that partition. If the Series object is already sorted and no records have been
    removed by filtering, then the resulting output should be equivalent to what one would get
    from calling myImages.saveAsBinarySeries().

    If all one wishes to do is to save out Images data in a binary series format, then
    tsc.convertImagesToSeries() will likely be more efficient than
    tsc.loadImages().toSeries().saveAsBinarySeries().

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing directory.
        It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part
        of this call.
    """
    import struct
    # io.BytesIO is used rather than the Python-2-only cStringIO module
    from io import BytesIO
    from thunder.rdds.imgblocks.blocks import SimpleBlocks
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        overwrite = True  # prevent additional downstream checks for this path

    def partitionToBinarySeries(kvIter):
        """
        Collects all Series records in a partition into a single binary series record.

        Yields at most one (filename, bytes) pair; nothing is yielded for an empty partition.
        """
        keypacker = None
        firstKey = None
        buf = BytesIO()
        try:
            for seriesKey, series in kvIter:
                if keypacker is None:
                    # keys are packed as native-order 16-bit signed ints ('h'), one per key element
                    keypacker = struct.Struct('h' * len(seriesKey))
                    firstKey = seriesKey
                buf.write(keypacker.pack(*seriesKey))
                # NOTE(review): ndarray.tostring() is deprecated in newer numpy in favor of
                # tobytes(); kept as-is here pending confirmation of the minimum supported numpy
                buf.write(series.tostring())
            val = buf.getvalue()
        finally:
            # release the buffer even if packing or serialization fails part-way through
            buf.close()

        # we might have an empty partition, in which case firstKey will still be None
        if firstKey is None:
            return iter([])
        label = SimpleBlocks.getBinarySeriesNameForKey(firstKey) + ".bin"
        return iter([(label, val)])

    awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    writer = getParallelWriterForPath(outputDirPath)(
        outputDirPath, overwrite=overwrite, awsCredentialsOverride=awsCredentials)

    binseriesrdd = self.rdd.mapPartitions(partitionToBinarySeries)
    binseriesrdd.foreach(writer.writerFcn)

    # TODO: all we really need here are the number of keys and number of values, which could in principle
    # be cached in _nkeys and _nvals attributes, removing the need for this .first() call in most cases.
    firstKey, firstVal = self.first()
    writeSeriesConfig(outputDirPath, len(firstKey), len(firstVal), keyType='int16', valueType=self.dtype,
                      overwrite=overwrite, awsCredentialsOverride=awsCredentials)
def saveAsBinarySeries(self, outputDirPath, overwrite=False):
    """
    Writes out Series-formatted data.

    This method (Series.saveAsBinarySeries) writes out binary series files using the current
    partitioning of this Series object. (That is, if mySeries.rdd.getNumPartitions() == 5, then
    up to 5 files will be written out, one per non-empty partition.) The records will not be
    resorted; the file names for each partition will be taken from the key of the first Series
    record in that partition. If the Series object is already sorted and no records have been
    removed by filtering, then the resulting output should be equivalent to what one would get
    from calling myImages.saveAsBinarySeries().

    If all one wishes to do is to save out Images data in a binary series format, then
    tsc.convertImagesToSeries() will likely be more efficient than
    tsc.loadImages().toSeries().saveAsBinarySeries().

    Parameters
    ----------
    outputDirPath : string path or URI to directory to be created
        Output files will be written underneath outputDirPath. This directory must not yet exist
        (unless overwrite is True), and must be no more than one level beneath an existing directory.
        It will be created as a result of this call.

    overwrite : bool
        If true, outputDirPath and all its contents will be deleted and recreated as part
        of this call.
    """
    import struct
    # io.BytesIO is used rather than the Python-2-only cStringIO module
    from io import BytesIO
    from thunder.rdds.imgblocks.blocks import SimpleBlocks
    from thunder.rdds.fileio.writers import getParallelWriterForPath
    from thunder.rdds.fileio.seriesloader import writeSeriesConfig
    from thunder.utils.common import AWSCredentials

    if not overwrite:
        self._checkOverwrite(outputDirPath)
        overwrite = True  # prevent additional downstream checks for this path

    def partitionToBinarySeries(kvIter):
        """
        Collects all Series records in a partition into a single binary series record.

        Yields at most one (filename, bytes) pair; nothing is yielded for an empty partition.
        """
        keypacker = None
        firstKey = None
        buf = BytesIO()
        try:
            for seriesKey, series in kvIter:
                if keypacker is None:
                    # keys are packed as native-order 16-bit signed ints ('h'), one per key element
                    keypacker = struct.Struct('h' * len(seriesKey))
                    firstKey = seriesKey
                buf.write(keypacker.pack(*seriesKey))
                # NOTE(review): ndarray.tostring() is deprecated in newer numpy in favor of
                # tobytes(); kept as-is here pending confirmation of the minimum supported numpy
                buf.write(series.tostring())
            val = buf.getvalue()
        finally:
            # release the buffer even if packing or serialization fails part-way through
            buf.close()

        # we might have an empty partition, in which case firstKey will still be None
        if firstKey is None:
            return iter([])
        label = SimpleBlocks.getBinarySeriesNameForKey(firstKey) + ".bin"
        return iter([(label, val)])

    awsCredentials = AWSCredentials.fromContext(self.rdd.ctx)
    writer = getParallelWriterForPath(outputDirPath)(
        outputDirPath, overwrite=overwrite, awsCredentialsOverride=awsCredentials)

    binseriesrdd = self.rdd.mapPartitions(partitionToBinarySeries)
    binseriesrdd.foreach(writer.writerFcn)

    # TODO: all we really need here are the number of keys and number of values, which could in principle
    # be cached in _nkeys and _nvals attributes, removing the need for this .first() call in most cases.
    firstKey, firstVal = self.first()
    writeSeriesConfig(outputDirPath, len(firstKey), len(firstVal), keyType='int16', valueType=self.dtype,
                      overwrite=overwrite, awsCredentialsOverride=awsCredentials)
def exportAsPngs(self, outputDirPath, filePrefix="export", overwrite=False, collectToDriver=True):
    """
    Write out basic png files for two-dimensional image data.

    Files will be written into a newly-created directory given by outputDirPath. When
    collectToDriver is False, all workers must be able to see the output directory
    (via an NFS share or similar).

    Parameters
    ----------
    outputDirPath : string
        Path to output directory to be created. Exception will be thrown if this directory already
        exists, unless overwrite is True. Directory must be one level below an existing directory.

    filePrefix : string
        String to prepend to all filenames. Files will be named <filePrefix>00000.png,
        <filePrefix>00001.png, etc

    overwrite : bool
        If true, the directory given by outputDirPath will first be deleted if it already exists.

    collectToDriver : bool, default True
        If true, images will be collect()'ed at the driver first before being written out, allowing
        for use of a local filesystem at the expense of network overhead. If false, images will be
        written in parallel by each executor, presumably to a distributed or networked filesystem.

    Raises
    ------
    ValueError
        If self.dims does not have exactly two entries.
    """
    dims = self.dims
    if len(dims) != 2:
        message = "Only two-dimensional images can be exported as .png files; image is %d-dimensional." % len(dims)
        raise ValueError(message)

    # the heavier imports only happen once the dimensionality check has passed
    from matplotlib.pyplot import imsave
    from io import BytesIO
    from thunder.rdds.fileio.writers import getParallelWriterForPath, getCollectedFileWriterForPath
    from thunder.utils.common import AWSCredentials

    def toNamedPngRecord(kv):
        # (index, image) -> (filename, png-encoded bytes)
        key, img = kv
        numbered = "%05d.png" % int(key)
        fname = filePrefix + numbered
        pngBytes = BytesIO()
        imsave(pngBytes, img, format="png")
        return fname, pngBytes.getvalue()

    namedPngs = self.rdd.map(toNamedPngRecord)
    contextCredentials = AWSCredentials.fromContext(self.rdd.ctx)

    if collectToDriver:
        # write everything from the driver after collecting the encoded images
        writerClass = getCollectedFileWriterForPath(outputDirPath)
        writer = writerClass(outputDirPath, overwrite=overwrite, awsCredentialsOverride=contextCredentials)
        writer.writeCollectedFiles(namedPngs.collect())
    else:
        # write in parallel from the executors
        writerClass = getParallelWriterForPath(outputDirPath)
        writer = writerClass(outputDirPath, overwrite=overwrite, awsCredentialsOverride=contextCredentials)
        namedPngs.foreach(writer.writerFcn)
class ThunderContext(): """ Wrapper for a SparkContext that provides functionality for loading data. Also supports creation of example datasets, and loading example data both locally and from EC2. Attributes ---------- `_sc` : SparkContext Spark context for Spark functionality awsAccessKeyId: None, or string awsSecretAccessKey: None, or string Public and private keys for AWS services. Typically the credentials should be accessible through any of several different configuration files, and so should not have to be set on the ThunderContext. See setAWSCredentials(). """ def __init__(self, sparkcontext): self._sc = sparkcontext self.awsCredentials = None @classmethod def start(cls, *args, **kwargs): """Starts a ThunderContext using the same arguments as SparkContext""" from pyspark import SparkContext return ThunderContext(SparkContext(*args, **kwargs)) def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None, confFilename='conf.json', keyType=None, valueType=None): """ Loads a Series object from data stored as text or binary files. Supports single files or multiple files stored on a local file system, a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". nkeys: int, optional (but required if `inputFormat` is 'text') dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image timeseries data.) 
For text data, number of keys must be specified in this parameter; for binary data, number of keys must be specified either in this parameter or in a configuration file named by the 'conffile' argument if this parameter is not set. nvalues: int, optional (but required if `inputFormat` is 'text') Number of values expected to be read. For binary data, nvalues must be specified either in this parameter or in a configuration file named by the 'conffile' argument if this parameter is not set. inputFormat: {'text', 'binary'}. optional, default 'binary' Format of data to be read. minPartitions: int, optional Explicitly specify minimum number of Spark partitions to be generated from this data. Used only for text data. Default is to use minParallelism attribute of Spark context object. confFilename: string, optional, default 'conf.json' Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and 'valuetype'. If a file is not found at the given path, then the base directory given in 'datafile' will also be checked. Parameters `nkeys` or `nvalues` that are specified as explicit arguments to this method will take priority over those found in conffile if both are present. Returns ------- data: thunder.rdds.Series A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys an n-tuple of int, with n given by `nkeys` or the configuration passed in `conffile`. RDD values will be a numpy array of length `nvalues` (or as specified in the passed configuration file). 
""" checkParams(inputFormat, ['text', 'binary']) from thunder.rdds.fileio.seriesloader import SeriesLoader loader = SeriesLoader(self._sc, minPartitions=minPartitions) if inputFormat.lower() == 'text': data = loader.fromText(dataPath, nkeys=nkeys) else: # must be either 'text' or 'binary' data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues, keyType=keyType, valueType=valueType) return data def loadImages(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Loads an Images object from data stored as a binary image stack, tif, or png files. Supports single files or multiple files, stored on a local file system, a networked file sytem (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, similar to a numpy 'shape' parameter, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. Stack data should be stored in row-major order (Fortran or Matlab convention) rather than column-major order (C or python/numpy convention), where the first dimension corresponds to that which is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, zo), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. If inputFormat is 'png' or 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the image file headers. inputFormat: {'stack', 'png', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data. 'png' or 'tif' indicate image files of the corresponding formats. Each page of a multipage tif file will be interpreted as a separate z-plane. For all formats, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif' or 'png', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). 
stopIdx: nonnegative int, optional See startIdx. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3). nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. See Images.renumber(). Returns ------- data: thunder.rdds.Images A newly-created Images object, wrapping an RDD of <int index, numpy array> key-value pairs. 
""" checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack']) from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if inputFormat.lower() == 'stack': data = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) elif inputFormat.lower().startswith('tif'): data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: if nplanes: raise NotImplementedError("nplanes argument is not supported for png files") data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, npartitions=npartitions) if not renumber: return data else: return data.renumber() def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Load Images data as Series data. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. The first dimension of the passed dims tuple should be the one that is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy, which by default has the fastest-changing dimension listed last (column-major convention). Thus, if loading a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the tif file headers. inputFormat: {'stack', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif'. dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int. optional, default "150M" Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). If shuffle=True, blockSize can also be a tuple of int specifying either the number of pixels or of splits per dimension to apply to the loaded images, or an instance of BlockingStrategy. 
Whether a tuple of int is interpreted as pixels or as splits depends on the value of the blockSizeUnits parameter. blockSize also indirectly controls the number of Spark partitions to be used, with one partition used per block created. blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels" Specifies units to be used in interpreting a tuple passed as blockSizeSpec when shuffle=True. If a string or a BlockingStrategy instance is passed as blockSizeSpec, or if shuffle=False, this parameter has no effect. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). stopIdx: nonnegative int, optional See startIdx. shuffle: boolean, optional, default True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3), and only with shuffle=True. nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. 
Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for shuffle=True (the default). npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. Only applies when shuffle=True. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. renumber is only supported for shuffle=True (the default). See Images.renumber(). Returns ------- data: thunder.rdds.Series A newly-created Series object, wrapping an RDD of timeseries data generated from the images in dataPath. This RDD will have as keys an n-tuple of int, with n given by the dimensionality of the original images. The keys will be the zero-based spatial index of the timeseries data in the RDD value. The value will be a numpy array of length equal to the number of image files loaded. Each loaded image file will contribute one point to this value array, with ordering as implied by the lexicographic ordering of image file names. 
""" checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if inputFormat.lower() == 'stack' and not dims: raise ValueError("Dimensions ('dims' parameter) must be specified if loading from binary image stack" + " ('stack' value for 'inputFormat' parameter)") if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: # tif / tif stack images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() return images.toBlocks(blockSize, units=blockSizeUnits).toSeries() else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError("renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # tif / tif stack return loader.fromTif(dataPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Write out Images data as Series data, saved in a flat binary format. 
The resulting Series data files may subsequently be read in using the loadSeries() method. The Series data object that results will be equivalent to that which would be generated by loadImagesAsSeries(). It is expected that loading Series data directly from the series flat binary format, using loadSeries(), will be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". outputDirPath: string Path to a directory into which to write Series file output. An outputdir argument may be either a path on the local file system or a URI-like format, as in dataPath. Examples of valid outputDirPaths include "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/". If the directory specified by outputDirPath already exists and the 'overwrite' parameter is False, this method will throw a ValueError. If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. The first dimension of the passed dims tuple should be the one that is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy, which by default has the fastest-changing dimension listed last (column-major convention). Thus, if loading a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the tif file headers. inputFormat: {'stack', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif'. dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int, or instance of BlockingStrategy. optional, default "150M" Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). blockSize can also be an instance of blockingStrategy, or a tuple of int specifying either the number of pixels or of splits per dimension to apply to the loaded images. 
Whether a tuple of int is interpreted as pixels or as splits depends on the value of the blockSizeUnits parameter. This parameter also indirectly controls the number of Spark partitions to be used, with one partition used per block created. blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels" Specifies units to be used in interpreting a tuple passed as blockSizeSpec when shuffle=True. If a string or a BlockingStrategy instance is passed as blockSizeSpec, or if shuffle=False, this parameter has no effect. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). stopIdx: nonnegative int, optional See startIdx. shuffle: boolean, optional, default True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will first be deleted, along with all its contents, if it already exists. (Use with caution.) If false, a ValueError will be thrown if outputDirPath is found to already exist. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3), and only with shuffle=True. 
nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for shuffle=True (the default). npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. Only applies when shuffle=True. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. renumber is only supported for shuffle=True (the default). See Images.renumber(). 
""" checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if inputFormat.lower() == 'stack' and not dims: raise ValueError("Dimensions ('dims' parameter) must be specified if loading from binary image stack" + " ('stack' value for 'inputFormat' parameter)") if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self.awsCredentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite) else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, overwrite=overwrite, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # 'tif' or 'tif-stack' loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite, recursive=recursive) def makeExample(self, dataset, **opts): """ Make an example data set for testing analyses. Options include 'pca', 'kmeans', and 'ica'. See thunder.utils.datasets for detailed options. 
Parameters ---------- dataset : str Which dataset to generate Returns ------- data : RDD of (tuple, array) pairs Generated dataset """ checkParams(dataset, ['kmeans', 'pca', 'ica']) return DataSets.make(self._sc, dataset, **opts) def loadExample(self, dataset): """ Load a local example data set for testing analyses. Parameters ---------- dataset : str Which dataset to load Returns ------- data : RDD of (tuple, array) pairs Generated dataset """ import os path = os.path.dirname(os.path.realpath(__file__)) # this path might actually be inside an .egg file (appears to happen with Spark 1.2) # check whether data/ directory actually exists on the filesystem, and if not, try # a hardcoded path that should work on ec2 clusters launched via the thunder-ec2 script if not os.path.isdir(os.path.join(path, 'data')): path = "/root/thunder/python/thunder/utils" if dataset == "iris": return self.loadSeries(os.path.join(path, 'data/iris/iris.bin')) elif dataset == "fish-series": return self.loadSeries(os.path.join(path, 'data/fish/bin/')).astype('float') elif dataset == "fish-images": return self.loadImages(os.path.join(path, 'data/fish/tif-stack'), inputFormat="tif") else: raise NotImplementedError("Dataset '%s' not known; should be one of 'iris', 'fish-series', 'fish-images'" % dataset) def loadExampleEC2(self, dataset): """ Load an example data set from EC2. 
Parameters ---------- dataset : str Which dataset to load Returns ------- data : RDD of (tuple, array) pairs Generated dataset params : Tuple or numpy array Parameters or metadata for dataset """ import json from numpy import asarray if 'ec' not in self._sc.master: raise Exception("must be running on EC2 to load this example data sets") elif dataset == "zebrafish-optomotor-response": path = 'zebrafish.datasets/optomotor-response/1/' data = self.loadSeries("s3n://" + path + 'data/dat_plane*.txt', inputFormat='text', minPartitions=1000, nkeys=3) paramFile = self._sc.textFile("s3n://" + path + "params.json") params = json.loads(paramFile.first()) modelFile = asarray(params['trials']) return data, modelFile else: raise NotImplementedError("dataset '%s' not availiable" % dataset) def loadSeriesLocal(self, dataFilePath, inputFormat='npy', minPartitions=None, keyFilePath=None, varName=None): """ Load a Series object from a local file (either npy or MAT format). File should contain a 1d or 2d matrix, where each row of the input matrix is a record. Keys can be provided in a separate file (with variable name 'keys', for MAT files). If not provided, linear indices will be used for keys. 
Parameters ---------- dataFilePath: str File to import varName : str, optional, default = None Variable name to load (for MAT files only) keyFilePath : str, optional, default = None File containing the keys for each record as another 1d or 2d array minPartitions : Int, optional, default = 1 Number of partitions for RDD """ checkParams(inputFormat, ['mat', 'npy']) from thunder.rdds.fileio.seriesloader import SeriesLoader loader = SeriesLoader(self._sc, minPartitions=minPartitions) if inputFormat.lower() == 'mat': if varName is None: raise Exception('Must provide variable name for loading MAT files') data = loader.fromMatLocal(dataFilePath, varName, keyFilePath) else: data = loader.fromNpyLocal(dataFilePath, keyFilePath) return data def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey): """Manually set AWS access credentials to be used by Thunder. This method is provided primarily for hosted environments that do not provide filesystem access (e.g. Databricks Cloud). Typically AWS credentials can be set and read from core-site.xml (for Hadoop input format readers, such as Series binary files), ~/.boto or other boto credential file locations, or the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY. These credentials should be configured automatically in clusters launched by the thunder-ec2 script, and so this method should not have to be called. Parameters ---------- awsAccessKeyId: string AWS public key, usually starts with "AKIA" awsSecretAccessKey: string AWS private key """ from thunder.utils.common import AWSCredentials self.awsCredentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey) self.awsCredentials.setOnContext(self._sc)
class ThunderContext(): """ Wrapper for a SparkContext that provides functionality for loading data. Also supports creation of example datasets, and loading example data both locally and from EC2. Attributes ---------- `_sc` : SparkContext Spark context for Spark functionality `_credentials` : AWSCredentials object, optional, default = None Stores public and private keys for AWS services. Typically available through configuration files, and but can optionally be set on the ThunderContext. See setAWSCredentials(). """ def __init__(self, sparkcontext): self._sc = sparkcontext self._credentials = None @classmethod def start(cls, *args, **kwargs): """Starts a ThunderContext using the same arguments as SparkContext""" from pyspark import SparkContext return ThunderContext(SparkContext(*args, **kwargs)) def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None, confFilename='conf.json', keyType=None, valueType=None): """ Loads a Series object from data stored as text or binary files. Supports single files or multiple files stored on a local file system, a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". nkeys: int, optional (but required if `inputFormat` is 'text') dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image timeseries data.) 
For text data, number of keys must be specified in this parameter; for binary data, number of keys must be specified either in this parameter or in a configuration file named by the 'conffile' argument if this parameter is not set. nvalues: int, optional (but required if `inputFormat` is 'text') Number of values expected to be read. For binary data, nvalues must be specified either in this parameter or in a configuration file named by the 'conffile' argument if this parameter is not set. inputFormat: {'text', 'binary'}. optional, default 'binary' Format of data to be read. minPartitions: int, optional Explicitly specify minimum number of Spark partitions to be generated from this data. Used only for text data. Default is to use minParallelism attribute of Spark context object. confFilename: string, optional, default 'conf.json' Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype', and 'valuetype'. If a file is not found at the given path, then the base directory given in 'datafile' will also be checked. Parameters `nkeys` or `nvalues` that are specified as explicit arguments to this method will take priority over those found in conffile if both are present. Returns ------- data: thunder.rdds.Series A newly-created Series object, wrapping an RDD of series data. This RDD will have as keys an n-tuple of int, with n given by `nkeys` or the configuration passed in `conffile`. RDD values will be a numpy array of length `nvalues` (or as specified in the passed configuration file). 
""" checkParams(inputFormat, ['text', 'binary']) from thunder.rdds.fileio.seriesloader import SeriesLoader loader = SeriesLoader(self._sc, minPartitions=minPartitions) if inputFormat.lower() == 'text': data = loader.fromText(dataPath, nkeys=nkeys) else: # must be either 'text' or 'binary' data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues, keyType=keyType, valueType=valueType) return data def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None, startIdx=None, stopIdx=None, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Loads an Images object from data stored as a binary image stack, tif, or png files. Supports single files or multiple files, stored on a local file system, a networked file sytem (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, similar to a numpy 'shape' parameter, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. Stack data should be stored in row-major order (Fortran or Matlab convention) rather than column-major order (C or python/numpy convention), where the first dimension corresponds to that which is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, zo), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. If inputFormat is 'png' or 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the image file headers. inputFormat: {'stack', 'png', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data. 'png' or 'tif' indicate image files of the corresponding formats. Each page of a multipage tif file will be interpreted as a separate z-plane. For all formats, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif' or 'png', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). 
stopIdx: nonnegative int, optional See startIdx. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3). nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. See Images.renumber(). Returns ------- data: thunder.rdds.Images A newly-created Images object, wrapping an RDD of <int index, numpy array> key-value pairs. 
""" checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack']) from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if inputFormat.lower() == 'stack': data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) elif inputFormat.lower().startswith('tif'): data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: if nplanes: raise NotImplementedError( "nplanes argument is not supported for png files") data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, npartitions=npartitions) if not renumber: return data else: return data.renumber() def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Load Images data as Series data. Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. The first dimension of the passed dims tuple should be the one that is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy, which by default has the fastest-changing dimension listed last (column-major convention). Thus, if loading a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the tif file headers. inputFormat: {'stack', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif'. dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int. optional, default "150M" Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). If shuffle=True, blockSize can also be a tuple of int specifying either the number of pixels or of splits per dimension to apply to the loaded images, or an instance of BlockingStrategy. 
Whether a tuple of int is interpreted as pixels or as splits depends on the value of the blockSizeUnits parameter. blockSize also indirectly controls the number of Spark partitions to be used, with one partition used per block created. blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels" Specifies units to be used in interpreting a tuple passed as blockSizeSpec when shuffle=True. If a string or a BlockingStrategy instance is passed as blockSizeSpec, or if shuffle=False, this parameter has no effect. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). stopIdx: nonnegative int, optional See startIdx. shuffle: boolean, optional, default True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3), and only with shuffle=True. nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. 
Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for shuffle=True (the default). npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. Only applies when shuffle=True. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. renumber is only supported for shuffle=True (the default). See Images.renumber(). Returns ------- data: thunder.rdds.Series A newly-created Series object, wrapping an RDD of timeseries data generated from the images in dataPath. This RDD will have as keys an n-tuple of int, with n given by the dimensionality of the original images. The keys will be the zero-based spatial index of the timeseries data in the RDD value. The value will be a numpy array of length equal to the number of image files loaded. Each loaded image file will contribute one point to this value array, with ordering as implied by the lexicographic ordering of image file names. 
""" checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if inputFormat.lower() == 'stack' and not dims: raise ValueError( "Dimensions ('dims' parameter) must be specified if loading from binary image stack" + " ('stack' value for 'inputFormat' parameter)") if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: # tif / tif stack images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() return images.toBlocks(blockSize, units=blockSizeUnits).toSeries() else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError( "nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError( "npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError( "renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': return loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # tif / tif stack return loader.fromTif(dataPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False): """ Write out Images data as Series data, saved in a flat binary format. 
The resulting Series data files may subsequently be read in using the loadSeries() method. The Series data object that results will be equivalent to that which would be generated by loadImagesAsSeries(). It is expected that loading Series data directly from the series flat binary format, using loadSeries(), will be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, specified as either a local filesystem path or in a URI-like format, including scheme. A dataPath argument may include a single '*' wildcard character in the filename. Examples of valid dataPaths include 'a/local/relative/directory/*.stack", "s3n:///my-s3-bucket/data/mydatafile.tif", "/mnt/my/absolute/data/directory/", or "file:///mnt/another/data/directory/". outputDirPath: string Path to a directory into which to write Series file output. An outputdir argument may be either a path on the local file system or a URI-like format, as in dataPath. Examples of valid outputDirPaths include "a/relative/directory/", "s3n:///my-s3-bucket/data/myoutput/", or "file:///mnt/a/new/directory/". If the directory specified by outputDirPath already exists and the 'overwrite' parameter is False, this method will throw a ValueError. If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (but required if inputFormat is 'stack') Dimensions of input image data, for instance (1024, 1024, 48). Binary stack data will be interpreted as coming from a multidimensional array of the specified dimensions. The first dimension of the passed dims tuple should be the one that is changing most rapidly on disk. 
So for instance given dims of (x, y, z), the coordinates of the data in a binary stack file should be ordered as [(x0, y0, z0), (x1, y0, z0), ..., (xN, y0, z0), (x0, y1, z0), (x1, y1, z0), ..., (xN, yM, z0), (x0, y0, z1), ..., (xN, yM, zP)]. This is the opposite convention from that used by numpy, which by default has the fastest-changing dimension listed last (column-major convention). Thus, if loading a numpy array `ary`, where `ary.shape == (z, y, x)`, written to disk by `ary.tofile("myarray.stack")`, the corresponding dims parameter should be (x, y, z). If inputFormat is 'tif', the dims parameter (if any) will be ignored; data dimensions will instead be read out from the tif file headers. inputFormat: {'stack', 'tif'}. optional, default 'stack' Expected format of the input data. 'stack' indicates flat files of raw binary data, while 'tif' indicates greyscale / luminance TIF images. Each page of a multipage tif file will be interpreted as a separate z-plane. For both stacks and tif stacks, separate files are interpreted as distinct time points, with ordering given by lexicographic sorting of file names. ext: string, optional, default None Extension required on data files to be loaded. By default will be "stack" if inputFormat=="stack", "tif" for inputFormat=='tif'. dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. If inputFormat is 'tif', the dtype parameter (if any) will be ignored; data type will instead be read out from the tif headers. blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int, tuple of positive int, or instance of BlockingStrategy. optional, default "150M" Requested size of individual output files in bytes (or kilobytes, megabytes, gigabytes). blockSize can also be an instance of blockingStrategy, or a tuple of int specifying either the number of pixels or of splits per dimension to apply to the loaded images. 
Whether a tuple of int is interpreted as pixels or as splits depends on the value of the blockSizeUnits parameter. This parameter also indirectly controls the number of Spark partitions to be used, with one partition used per block created. blockSizeUnits: string, either "pixels" or "splits" (or unique prefix of each, such as "s"), default "pixels" Specifies units to be used in interpreting a tuple passed as blockSizeSpec when shuffle=True. If a string or a BlockingStrategy instance is passed as blockSizeSpec, or if shuffle=False, this parameter has no effect. startIdx: nonnegative int, optional startIdx and stopIdx are convenience parameters to allow only a subset of input files to be read in. These parameters give the starting index (inclusive) and final index (exclusive) of the data files to be used after lexicographically sorting all input data files matching the dataPath argument. For example, startIdx=None (the default) and stopIdx=10 will cause only the first 10 data files in dataPath to be read in; startIdx=2 and stopIdx=3 will cause only the third file (zero-based index of 2) to be read in. startIdx and stopIdx use the python slice indexing convention (zero-based indexing with an exclusive final position). stopIdx: nonnegative int, optional See startIdx. shuffle: boolean, optional, default True Controls whether the conversion from Images to Series formats will make use of a Spark shuffle-based method. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will first be deleted, along with all its contents, if it already exists. (Use with caution.) If false, a ValueError will be thrown if outputDirPath is found to already exist. recursive: boolean, default False If true, will recursively descend directories rooted at dataPath, loading all files in the tree that have an appropriate extension. Recursive loading is currently only implemented for local filesystems (not s3), and only with shuffle=True. 
nplanes: positive integer, default None If passed, will cause a single image file to be subdivided into multiple records. Every `nplanes` z-planes (or multipage tif pages) in the file will be taken as a new record, with the first nplane planes of the first file being record 0, the second nplane planes being record 1, etc, until the first file is exhausted and record ordering continues with the first nplane planes of the second file, and so on. With nplanes=None (the default), a single file will be considered as representing a single record. Keys are calculated assuming that all input files contain the same number of records; if the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. nplanes is only supported for shuffle=True (the default). npartitions: positive int, optional If specified, request a certain number of partitions for the underlying Spark RDD. Default is 1 partition per image file. Only applies when shuffle=True. renumber: boolean, optional, default False If renumber evaluates to True, then the keys for each record will be explicitly recalculated after all images are loaded. This should only be necessary at load time when different files contain different number of records. renumber is only supported for shuffle=True (the default). See Images.renumber(). 
""" checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if inputFormat.lower() == 'stack' and not dims: raise ValueError( "Dimensions ('dims' parameter) must be specified if loading from binary image stack" + " ('stack' value for 'inputFormat' parameter)") if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries( outputDirPath, overwrite=overwrite) else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError( "nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError( "npartitions is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, overwrite=overwrite, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # 'tif' or 'tif-stack' loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite, recursive=recursive) def makeExample(self, dataset, **opts): """ Make an example data set for testing analyses. Options include 'pca', 'kmeans', and 'ica'. See thunder.utils.datasets for detailed options. 
Parameters ---------- dataset : str Which dataset to generate Returns ------- data : RDD of (tuple, array) pairs Generated dataset """ checkParams(dataset, ['kmeans', 'pca', 'ica']) return DataSets.make(self._sc, dataset, **opts) def loadExample(self, dataset=None): """ Load a local example data set for testing analyses. Some of these data sets are extremely downsampled and should be considered useful only for testing the API. If called with None, will return list of available datasets. Parameters ---------- dataset : str Which dataset to load Returns ------- data : Data object Generated dataset as a Thunder data objects (e.g Series or Images) """ import atexit import shutil import tempfile from pkg_resources import resource_listdir, resource_filename DATASETS = { 'iris': 'iris', 'fish-series': 'fish/bin', 'fish-images': 'fish/tif-stack' } if dataset is None: return DATASETS.keys() checkParams(dataset, DATASETS.keys()) if 'ec2' in self._sc.master: tmpdir = os.path.join('/root/thunder/python/thunder/utils', 'data', DATASETS[dataset]) else: tmpdir = tempfile.mkdtemp() atexit.register(shutil.rmtree, tmpdir) def copyLocal(target): files = resource_listdir('thunder.utils.data', target) for f in files: path = resource_filename('thunder.utils.data', os.path.join(target, f)) shutil.copy(path, tmpdir) copyLocal(DATASETS[dataset]) if dataset == "iris": return self.loadSeries(tmpdir) elif dataset == "fish-series": return self.loadSeries(tmpdir).astype('float') elif dataset == "fish-images": return self.loadImages(tmpdir, inputFormat="tif") def loadExampleS3(self, dataset=None): """ Load an example data set from S3. Info on the included datasets can be found at the CodeNeuro data repository (http://datasets.codeneuro.org/). If called with None, will return list of available datasets. 
Parameters ---------- dataset : str Which dataset to load Returns ------- data : a Data object (usually a Series or Images) The dataset as one of Thunder's data objects params : dict Parameters or metadata for dataset """ DATASETS = { 'ahrens.lab/direction.selectivity': 'ahrens.lab/direction.selectivity/1/', 'ahrens.lab/optomotor.response': 'ahrens.lab/optomotor.response/1/', 'svoboda.lab/tactile.navigation': 'svoboda.lab/tactile.navigation/1/' } if 'local' in self._sc.master: raise Exception( "Must be running on an EC2 cluster to load this example data set" ) if dataset is None: return DATASETS.keys() checkParams(dataset, DATASETS.keys()) basePath = 's3n://neuro.datasets/' dataPath = DATASETS[dataset] data = self.loadSeries(basePath + dataPath + 'series') params = self.loadParams(basePath + dataPath + 'params/covariates.json') return data, params def loadParams(self, path): """ Load a file with parameters from a local file system or S3. Assumes file is JSON with basic types (strings, integers, doubles, lists), in either a single dict or list of dict-likes, and each dict has at least a "name" field and a "value" field. Useful for loading generic meta data, parameters, covariates, etc. Parameters ---------- path : str Path to file, can be on a local file system or an S3 bucket Returns ------- A dict or list with the parameters """ import json from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError reader = getFileReaderForPath(path)( awsCredentialsOverride=self._credentials) try: buffer = reader.read(path) except FileNotFoundError: raise Exception("Cannot find file %s" % path) return Params(json.loads(buffer)) def loadSeriesLocal(self, dataFilePath, inputFormat='npy', minPartitions=None, keyFilePath=None, varName=None): """ Load a Series object from a local file (either npy or MAT format). File should contain a 1d or 2d matrix, where each row of the input matrix is a record. 
Keys can be provided in a separate file (with variable name 'keys', for MAT files). If not provided, linear indices will be used for keys. Parameters ---------- dataFilePath: str File to import varName : str, optional, default = None Variable name to load (for MAT files only) keyFilePath : str, optional, default = None File containing the keys for each record as another 1d or 2d array minPartitions : Int, optional, default = 1 Number of partitions for RDD """ checkParams(inputFormat, ['mat', 'npy']) from thunder.rdds.fileio.seriesloader import SeriesLoader loader = SeriesLoader(self._sc, minPartitions=minPartitions) if inputFormat.lower() == 'mat': if varName is None: raise Exception( 'Must provide variable name for loading MAT files') data = loader.fromMatLocal(dataFilePath, varName, keyFilePath) else: data = loader.fromNpyLocal(dataFilePath, keyFilePath) return data def export(self, data, filename, format=None, overwrite=False, varname=None): """ Export local array data to a variety of formats. Can write to a local file sytem or S3 (destination inferred from filename schema). S3 writing useful for persisting arrays when working in an environment without accessible local storage. Parameters ---------- data : array-like The data to export filename : str Output location (path/to/file.ext) format : str, optional, default = None Ouput format ("npy", "mat", or "txt"), if not provided will try to infer from file extension. 
overwrite : boolean, optional, default = False Whether to overwrite if directory or file already exists varname : str, optional, default = None Variable name for writing "mat" formatted files """ from numpy import save, savetxt from scipy.io import savemat from StringIO import StringIO from thunder.rdds.fileio.writers import getFileWriterForPath path, file, format = handleFormat(filename, format) checkParams(format, ["npy", "mat", "txt"]) clazz = getFileWriterForPath(filename) writer = clazz(path, file, overwrite=overwrite, awsCredentialsOverride=self._credentials) stream = StringIO() if format == "mat": varname = os.path.splitext(file)[0] if varname is None else varname savemat(stream, mdict={varname: data}, oned_as='column', do_compression='true') if format == "npy": save(stream, data) if format == "txt": savetxt(stream, data) stream.seek(0) writer.writeFile(stream.buf) def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey): """ Manually set AWS access credentials to be used by Thunder. This method is provided primarily for hosted environments that do not provide filesystem access (e.g. Databricks Cloud). Typically AWS credentials can be set and read from core-site.xml (for Hadoop input format readers, such as Series binary files), ~/.boto or other boto credential file locations, or the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY. These credentials should be configured automatically in clusters launched by the thunder-ec2 script, and so this method should not have to be called. Parameters ---------- awsAccessKeyId : string AWS public key, usually starts with "AKIA" awsSecretAccessKey : string AWS private key """ from thunder.utils.common import AWSCredentials self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey) self._credentials.setOnContext(self._sc)
class ThunderContext(): """ Wrapper for a SparkContext that provides an entry point for loading and saving. Also supports creation of example datasets, and loading example data both locally and from EC2. Attributes ---------- `_sc` : SparkContext Spark context for Spark functionality `_credentials` : AWSCredentials object, optional, default = None Stores public and private keys for AWS services. Typically available through configuration files, and but can optionally be set using :func:`ThunderContext.setAWSCredentials()`. """ def __init__(self, sparkcontext): self._sc = sparkcontext self._credentials = None @classmethod def start(cls, *args, **kwargs): """ Starts a ThunderContext using the same arguments as SparkContext """ from pyspark import SparkContext return ThunderContext(SparkContext(*args, **kwargs)) def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None, confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None): """ Loads a Series object from data stored as binary, text, npy, or mat. For binary and text, supports single files or multiple files stored on a local file system, a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS. For local formats (npy and mat) only local file systems currently supported. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". nkeys: int, optional (required if `inputFormat` is 'text'), default = None Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for text data; can be specified here or in a configuration file for binary data. nvalues: int, optional (required if `inputFormat` is 'text') Number of values per record. 
Must be specified here or in a configuration file for binary data. inputFormat: {'text', 'binary', 'npy', 'mat'}. optional, default = 'binary' inputFormat of data to be read. minPartitions: int, optional, default = SparkContext.minParallelism Minimum number of Spark partitions to use, only for text. confFilename: string, optional, default 'conf.json' Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keyType', and 'valueType'. If a file is not found at the given path, then the base directory in 'dataPath' will be checked. Parameters will override the conf file. keyType: string or numpy dtype, optional, default = None Numerical type of keys, will override conf file. valueType: string or numpy dtype, optional, default = None Numerical type of values, will override conf file. keyPath: string, optional, default = None Path to file with keys when loading from npy or mat. varName : str, optional, default = None Variable name to load (for MAT files only) Returns ------- data: thunder.rdds.Series A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs """ checkParams(inputFormat, ['text', 'binary', 'npy', 'mat']) from thunder.rdds.fileio.seriesloader import SeriesLoader loader = SeriesLoader(self._sc, minPartitions=minPartitions) if inputFormat.lower() == 'binary': data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues, keyType=keyType, valueType=valueType) elif inputFormat.lower() == 'text': if nkeys is None: raise Exception('Must provide number of keys per record for loading from text') data = loader.fromText(dataPath, nkeys=nkeys) elif inputFormat.lower() == 'npy': data = loader.fromNpyLocal(dataPath, keyPath) else: if varName is None: raise Exception('Must provide variable name for loading MAT files') data = loader.fromMatLocal(dataPath, varName, keyPath) return data def loadImages(self, dataPath, dims=None, dtype=None, inputFormat='stack', ext=None, startIdx=None, stopIdx=None, 
recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Loads an Images object from data stored as a binary image stack, tif, or png files. Supports single files or multiple files, stored on a local file system, a networked file sytem (mounted and available on all nodes), or Amazon S3. HDFS is not currently supported for image file data. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks. 'png' or 'tif' indicate image format. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype, optional, default = 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). 
These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use 1 partition per image. renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data Returns ------- data: thunder.rdds.Images An Images object, wrapping an RDD of with (int) : (numpy array) pairs """ checkParams(inputFormat, ['stack', 'png', 'tif', 'tif-stack']) from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) # Checking StartIdx is smaller or equal to StopIdx if startIdx is not None and stopIdx is not None and startIdx > stopIdx: raise Exception("Error. 
startIdx {} is larger than stopIdx {}".inputFormat(startIdx, stopIdx)) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if inputFormat.lower() == 'stack': data = loader.fromStack(dataPath, dims=dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) elif inputFormat.lower().startswith('tif'): data = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) else: if nplanes: raise NotImplementedError("nplanes argument is not supported for png files") data = loader.fromPng(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, npartitions=npartitions) if not renumber: return data else: return data.renumber() def loadImagesOCP(self, bucketName, resolution, server='ocp.me', startIdx=None, stopIdx=None, minBound=None, maxBound=None): """ Load Images from OCP (Open Connectome Project). The OCP is a web service for access to EM brain images and other neural image data. The web-service can be accessed at http://www.openconnectomeproject.org/. Parameters ---------- bucketName: string Token name for the project in OCP. This name should exist on the server from which data is loaded. resolution: nonnegative int Resolution of the data in OCP server: string, optional, default = 'ocp.me' Name of the OCP server with the specified token. startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). stopIdx: nonnegative int, optional See startIdx. minBound, maxBound: tuple of nonnegative int, optional, default = None X,Y,Z bounds of the data to fetch from OCP. minBound contains the (xMin,yMin,zMin) while maxBound contains (xMax,yMax,zMax). 
Returns ------- data: thunder.rdds.Images An Images object, wrapping an RDD of with (int) : (numpy array) pairs """ from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) # Checking StartIdx is smaller or equal to StopIdx if startIdx is not None and stopIdx is not None and startIdx > stopIdx: raise Exception("Error. startIdx {} is larger than stopIdx {}".format(startIdx, stopIdx)) data = loader.fromOCP(bucketName, resolution=resolution, server=server, startIdx=startIdx, stopIdx=stopIdx, minBound=minBound, maxBound=maxBound) return data def loadImagesAsSeries(self, dataPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Load Images data as Series data. Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 'stack' indicates flat binary stacks. 'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. 
ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. shuffle: boolean, optional, default = True Controls whether the conversion from Images to Series formats will use of a Spark shuffle-based method. recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use 1 partition per image. 
renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data Returns ------- data: thunder.rdds.Series A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs. Keys will be n-tuples of int, with n given by dimensionality of the images, and correspond to indexes into the image arrays. Value will have length equal to the number of image files. With each image contributing one point to this value array, with ordering given by the lexicographic ordering of image file names. """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, dtype=dtype, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # tif / tif stack images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() return images.toBlocks(blockSize, units=blockSizeUnits).toSeries() else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") if renumber: raise NotImplementedError("renumber is not supported with shuffle=False") loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': return loader.fromStack(dataPath, dims, ext=ext, 
dtype=dtype, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # tif / tif stack return loader.fromTif(dataPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) def convertImagesToSeries(self, dataPath, outputDirPath, dims=None, inputFormat='stack', ext=None, dtype='int16', blockSize="150M", blockSizeUnits="pixels", startIdx=None, stopIdx=None, shuffle=True, overwrite=False, recursive=False, nplanes=None, npartitions=None, renumber=False, confFilename='conf.json'): """ Write out Images data as Series data, saved in a flat binary format. The resulting files may subsequently be read in using ThunderContext.loadSeries(). Loading Series data directly will likely be faster than converting image data to a Series object through loadImagesAsSeries(). Parameters ---------- dataPath: string Path to data files or directory, as either a local filesystem path or a URI. May include a single '*' wildcard in the filename. Examples of valid dataPaths include 'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". outputDirPath: string Path to directory to write Series file output. May be either a path on the local file system or a URI-like format, such as "local/directory", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/". If the directory exists and 'overwrite' is True, the existing directory and all its contents will be deleted and overwritten. dims: tuple of positive int, optional (required if inputFormat is 'stack') Image dimensions. Binary stack data will be interpreted as a multidimensional array with the given dimensions, and should be stored in row-major order (Fortran or Matlab convention), where the first dimension changes most rapidly. For 'png' or 'tif' data dimensions will be read from the image file headers. inputFormat: str, optional, default = 'stack' Expected format of the input data: 'stack', 'png', or 'tif'. 
'stack' indicates flat binary stacks. 'png' or 'tif' indicate image formats. Page of a multipage tif file will be extend along the third dimension. Separate files interpreted as distinct records, with ordering given by lexicographic sorting of file names. ext: string, optional, default = None File extension, default will be "bin" if inputFormat=="stack", "tif" for inputFormat=='tif', and 'png' for inputFormat=="png". dtype: string or numpy dtype. optional, default 'int16' Data type of the image files to be loaded, specified as a numpy "dtype" string. Ignored for 'tif' or 'png' (data will be inferred from image formats). blockSize: string or positive int, optional, default "150M" Requested size of blocks (e.g "64M", "512k", "2G"). If shuffle=True, can also be a tuple of int specifying the number of pixels or splits per dimension. Indirectly controls the number of Spark partitions, with one partition per block. blockSizeUnits: string, either "pixels" or "splits", default "pixels" Units for interpreting a tuple passed as blockSize when shuffle=True. startIdx: nonnegative int, optional, default = None Convenience parameters to read only a subset of input files. Uses python slice conventions (zero-based indexing with exclusive final position). These parameters give the starting and final index after lexicographic sorting. stopIdx: nonnegative int, optional, default = None See startIdx. shuffle: boolean, optional, default = True Controls whether the conversion from Images to Series formats will use of a Spark shuffle-based method. overwrite: boolean, optional, default False If true, the directory specified by outputDirPath will be deleted (recursively) if it already exists. (Use with caution.) recursive: boolean, optional, default = False If true, will recursively descend directories rooted at dataPath, loading all files in the tree with an appropriate extension. nplanes: positive integer, optional, default = None Subdivide individual image files. 
Every `nplanes` from each file will be considered a new record. With nplanes=None (the default), a single file will be considered as representing a single record. If the number of records per file is not the same across all files, then `renumber` should be set to True to ensure consistent keys. npartitions: positive int, optional, default = None Specify number of partitions for the RDD, if unspecified will use 1 partition per image. renumber: boolean, optional, default = False Recalculate keys for records after images are loading. Only necessary if different files contain different number of records (e.g. due to specifying nplanes). See Images.renumber(). confFilename : string, optional, default = 'conf.json' Name of conf file if using to specify parameters for binary stack data """ checkParams(inputFormat, ['stack', 'tif', 'tif-stack']) if not overwrite: raiseErrorIfPathExists(outputDirPath, awsCredentialsOverride=self._credentials) overwrite = True # prevent additional downstream checks for this path if not ext: ext = DEFAULT_EXTENSIONS.get(inputFormat.lower(), None) if shuffle: from thunder.rdds.fileio.imagesloader import ImagesLoader loader = ImagesLoader(self._sc) if inputFormat.lower() == 'stack': images = loader.fromStack(dataPath, dims, ext=ext, dtype=dtype, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions, confFilename=confFilename) else: # 'tif' or 'tif-stack' images = loader.fromTif(dataPath, ext=ext, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive, nplanes=nplanes, npartitions=npartitions) if renumber: images = images.renumber() images.toBlocks(blockSize, units=blockSizeUnits).saveAsBinarySeries(outputDirPath, overwrite=overwrite) else: from thunder.rdds.fileio.seriesloader import SeriesLoader if nplanes is not None: raise NotImplementedError("nplanes is not supported with shuffle=False") if npartitions is not None: raise NotImplementedError("npartitions is not supported with shuffle=False") 
loader = SeriesLoader(self._sc) if inputFormat.lower() == 'stack': loader.saveFromStack(dataPath, outputDirPath, dims, ext=ext, dtype=dtype, blockSize=blockSize, overwrite=overwrite, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive) else: # 'tif' or 'tif-stack' loader.saveFromTif(dataPath, outputDirPath, ext=ext, blockSize=blockSize, startIdx=startIdx, stopIdx=stopIdx, overwrite=overwrite, recursive=recursive) def makeExample(self, dataset, **opts): """ Make an example data set for testing analyses. Options include 'pca', 'kmeans', and 'ica'. See thunder.utils.datasets for detailed options. Parameters ---------- dataset : str Which dataset to generate Returns ------- data : RDD of (tuple, array) pairs Generated dataset """ from thunder.utils.datasets import DATASET_MAKERS checkParams(dataset, DATASET_MAKERS.keys()) return DataSets.make(self._sc, dataset, **opts) def loadExample(self, dataset=None): """ Load a local example data set for testing analyses. Some of these data sets are extremely downsampled and should be considered useful only for testing the API. If called with None, will return list of available datasets. 
Parameters ---------- dataset : str Which dataset to load Returns ------- data : Data object Generated dataset as a Thunder data objects (e.g Series or Images) """ import atexit import shutil import tempfile from pkg_resources import resource_listdir, resource_filename DATASETS = { 'iris': 'iris', 'fish-series': 'fish/series', 'fish-images': 'fish/images', 'mouse-series': 'mouse/series', 'mouse-images': 'mouse/images', 'mouse-params': 'mouse/params' } if dataset is None: return sorted(DATASETS.keys()) checkParams(dataset, DATASETS.keys()) if 'ec2' in self._sc.master: tmpdir = os.path.join('/root/thunder/python/thunder/utils', 'data', DATASETS[dataset]) else: tmpdir = tempfile.mkdtemp() atexit.register(shutil.rmtree, tmpdir) def copyLocal(target): files = resource_listdir('thunder.utils.data', target) for f in files: path = resource_filename('thunder.utils.data', os.path.join(target, f)) shutil.copy(path, tmpdir) copyLocal(DATASETS[dataset]) npartitions = self._sc.defaultParallelism if dataset == "iris": return self.loadSeries(tmpdir) elif dataset == "fish-series": return self.loadSeries(tmpdir).astype('float') elif dataset == "fish-images": return self.loadImages(tmpdir, inputFormat="tif", npartitions=npartitions) elif dataset == "mouse-series": return self.loadSeries(tmpdir).astype('float') elif dataset == "mouse-images": return self.loadImages(tmpdir, npartitions=npartitions) elif dataset == "mouse-params": return self.loadParams(os.path.join(tmpdir, 'covariates.json')) def loadExampleS3(self, dataset=None): """ Load an example data set from S3. Info on the included datasets can be found at the CodeNeuro data repository (http://datasets.codeneuro.org/). If called with None, will return list of available datasets. 
Parameters ---------- dataset : str Which dataset to load Returns ------- data : a Data object (usually a Series or Images) The dataset as one of Thunder's data objects params : dict Parameters or metadata for dataset """ DATASETS = { 'ahrens.lab/direction.selectivity': 'ahrens.lab/direction.selectivity/1/', 'ahrens.lab/optomotor.response': 'ahrens.lab/optomotor.response/1/', 'svoboda.lab/tactile.navigation': 'svoboda.lab/tactile.navigation/1/' } if dataset is None: return DATASETS.keys() if 'local' in self._sc.master: raise Exception("Must be running on an EC2 cluster to load this example data set") checkParams(dataset, DATASETS.keys()) basePath = 's3n://neuro.datasets/' dataPath = DATASETS[dataset] data = self.loadSeries(basePath + dataPath + 'series') params = self.loadParams(basePath + dataPath + 'params/covariates.json') return data, params def loadJSON(self, path): """ Generic function for loading JSON from a path, handling local file systems and S3 Parameters ---------- path : str Path to a file, can be on a local file system or an S3 bucket Returns ------- A string with the JSON """ import json from thunder.rdds.fileio.readers import getFileReaderForPath, FileNotFoundError reader = getFileReaderForPath(path)(awsCredentialsOverride=self._credentials) try: buffer = reader.read(path) except FileNotFoundError: raise Exception("Cannot find file %s" % path) return json.loads(buffer) def loadParams(self, path): """ Load a file with parameters from a local file system or S3. Assumes file is JSON with basic types (strings, integers, doubles, lists), in either a single dict or list of dict-likes, and each dict has at least a "name" field and a "value" field. Useful for loading generic meta data, parameters, covariates, etc. 
Parameters ---------- path : str Path to file, can be on a local file system or an S3 bucket Returns ------- A dict or list with the parameters """ blob = self.loadJSON(path) return Params(blob) def loadSources(self, path): """ Load a file with sources from a local file system or S3. Parameters ---------- path : str Path to file, can be on a local file system or an S3 bucket Returns ------- A SourceModel See also -------- thunder.SourceExtraction """ from thunder import SourceExtraction blob = self.loadJSON(path) return SourceExtraction.deserialize(blob) def export(self, data, filename, outputFormat=None, overwrite=False, varname=None): """ Export local array data to a variety of formats. Can write to a local file sytem or S3 (destination inferred from filename schema). S3 writing useful for persisting arrays when working in an environment without accessible local storage. Parameters ---------- data : array-like The data to export filename : str Output location (path/to/file.ext) outputFormat : str, optional, default = None Ouput format ("npy", "mat", or "txt"), if not provided will try to infer from file extension. 
overwrite : boolean, optional, default = False Whether to overwrite if directory or file already exists varname : str, optional, default = None Variable name for writing "mat" formatted files """ from numpy import save, savetxt, asarray from scipy.io import savemat from StringIO import StringIO from thunder.rdds.fileio.writers import getFileWriterForPath path, file, outputFormat = handleFormat(filename, outputFormat) checkParams(outputFormat, ["npy", "mat", "txt"]) clazz = getFileWriterForPath(filename) writer = clazz(path, file, overwrite=overwrite, awsCredentialsOverride=self._credentials) stream = StringIO() if outputFormat == "mat": varname = os.path.splitext(file)[0] if varname is None else varname savemat(stream, mdict={varname: data}, oned_as='column', do_compression='true') if outputFormat == "npy": save(stream, data) if outputFormat == "txt": if asarray(data).ndim > 2: raise Exception("Cannot write data with more than two dimensions to text") savetxt(stream, data) stream.seek(0) writer.writeFile(stream.buf) def setAWSCredentials(self, awsAccessKeyId, awsSecretAccessKey): """ Manually set AWS access credentials to be used by Thunder. Provided for hosted cloud environments without filesystem access. If launching a cluster using the thunder-ec2 script, credentials will be configured automatically (inside core-site.xml and ~/.boto), so this method should not need to be called. Parameters ---------- awsAccessKeyId : string AWS public key, usually starts with "AKIA" awsSecretAccessKey : string AWS private key """ from thunder.utils.common import AWSCredentials self._credentials = AWSCredentials(awsAccessKeyId, awsSecretAccessKey) self._credentials.setOnContext(self._sc)
def __init__(self, awsCredentialsOverride=None):
    """Initialization; requires the boto package and records which credentials to use.

    By default the boto library is left to look up credentials according to its own
    rules - e.g. first looking for AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, then
    going through several possible config files and finally looking for a
    ~/.aws/credentials .ini-formatted file. See boto docs:
    http://boto.readthedocs.org/en/latest/boto_config_tut.html

    However, if an AWSCredentials object is provided, its `awsAccessKeyId` and
    `awsSecretAccessKey` attributes will be used instead of those found by the
    standard boto credential lookup process.

    Parameters
    ----------
    awsCredentialsOverride : AWSCredentials, optional, default = None
        Credentials to store on this instance. Any falsy value is replaced by a
        default-constructed AWSCredentials().

    Raises
    ------
    ValueError
        If the module-level `_haveBoto` flag is false.
    """
    if not _haveBoto:
        message = "The boto package does not appear to be available; boto is required for BotoS3Reader"
        raise ValueError(message)

    # Fall back to an empty credentials object so the attribute is never None.
    self.awsCredentialsOverride = awsCredentialsOverride or AWSCredentials()