def test_fromMultipleArrays(self):
    """fromArraysAsImages with two 2d arrays: key order, dims, values, pack."""
    first = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    second = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))

    series = SeriesLoader(self.sc).fromArraysAsImages([first, second])
    records = series.collect()
    packed = series.pack()

    # keys come back with the first coordinate varying fastest
    expectedKeys = {0: (0, 0), 1: (1, 0), 3: (3, 0), 4: (0, 1), 7: (3, 1)}
    for recIdx, key in expectedKeys.items():
        assert_equals(key, records[recIdx][0])

    # dimensions tuple is the reverse of the numpy shape
    assert_equals(first.shape[::-1], series.dims.count)

    # each record holds one value per input array, in input order
    stacked = array([kv[1] for kv in records], dtype=dtypeFunc('int16'))
    assert_true(array_equal(first.ravel(), stacked[:, 0]))
    assert_true(array_equal(second.ravel(), stacked[:, 1]))

    # packing concatenates the transposed inputs with time as the first axis
    assert_true(array_equal(first.T, packed[0]))
    assert_true(array_equal(second.T, packed[1]))
def loadSeriesFromArray(self, values, index=None, npartitions=None):
    """
    Load Series data from a local array.

    Parameters
    ----------
    values : list or ndarray
        A list of 1d numpy arrays, or a single 2d numpy array

    index : array-like, optional, default = None
        Index to set for Series object, if None will use linear indices.

    npartitions : positive int, optional, default = None
        Number of partitions for RDD, if unspecified will use default parallelism.
    """
    from numpy import ndarray, asarray

    from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc)

    if not npartitions:
        npartitions = self._sc.defaultParallelism

    # normalize input: a list becomes an ndarray, and a 2d ndarray is then
    # split into a list of 1d rows as expected by fromArrays
    if isinstance(values, list):
        values = asarray(values)

    if isinstance(values, ndarray) and values.ndim > 1:
        values = list(values)

    data = loader.fromArrays(values, npartitions=npartitions)

    # compare against None explicitly: truth-testing a multi-element numpy
    # array raises "truth value of an array is ambiguous", and a falsy
    # (e.g. empty) index would be silently ignored
    if index is not None:
        data.index = index

    return data
def test_fromArrays(self):
    """fromArraysAsImages with one 2d array: key order, dims, values, pack."""
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

    series = SeriesLoader(self.sc).fromArraysAsImages(ary)
    records = series.collect()
    packed = series.pack()

    # keys enumerate positions with the first coordinate varying fastest
    allKeys = [(0, 0), (1, 0), (2, 0), (3, 0), (0, 1), (1, 1), (2, 1), (3, 1)]
    for recIdx, expectedKey in enumerate(allKeys):
        assert_equals(expectedKey, records[recIdx][0])

    # dimensions tuple is the reverse of the numpy shape
    assert_equals(ary.shape[::-1], series.dims.count)

    # values come back in the original raveled order
    collected = array([kv[1] for kv in records], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(), collected))

    # packing returns the transpose of the original array
    assert_true(array_equal(ary.T, packed))
def _run_tst_fromBinary(self, useConfJson=False):
    """Write SeriesBinaryTestData items to disk and verify fromBinary round-trips
    keys, values, and value dtype — either from explicit parameters or from a
    conf.json configuration file (when `useConfJson` is True)."""
    # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
    # data will be a sequence of test data
    # all keys and all values in a test data item must be of the same length
    # keys get converted to ints regardless of raw input format
    DATA = [
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
    ]

    for itemidx, item in enumerate(DATA):
        # each test item gets its own subdirectory and binary input file
        outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
        os.mkdir(outSubdir)

        fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
        with open(fname, 'wb') as f:
            item.writeToFile(f)

        loader = SeriesLoader(self.sc)
        if not useConfJson:
            # pass record layout and dtypes explicitly
            series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals,
                                       keyType=str(item.keyDtype), valueType=str(item.valDtype))
        else:
            # write configuration file; fromBinary then discovers the layout itself
            conf = {'input': outSubdir,
                    'nkeys': item.nkeys, 'nvalues': item.nvals,
                    'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
            with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                json.dump(conf, f, indent=2)
            series = loader.fromBinary(outSubdir)

        seriesData = series.rdd.collect()

        expectedData = item.data
        assert_equals(len(expectedData), len(seriesData),
                      "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                      (itemidx, len(expectedData), len(seriesData)))

        for expected, actual in zip(expectedData, seriesData):
            expectedKeys = tuple(expected[0])
            # values are expected to be promoted to the smallest float type
            # that can represent the original value dtype
            expectedType = smallestFloatType(item.valDtype)
            expectedVals = array(expected[1], dtype=expectedType)
            assert_equals(expectedKeys, actual[0],
                          "Key mismatch in item %d; expected %s, got %s" %
                          (itemidx, str(expectedKeys), str(actual[0])))
            assert_true(allclose(expectedVals, actual[1]),
                        "Value mismatch in item %d; expected %s, got %s" %
                        (itemidx, str(expectedVals), str(actual[1])))
            assert_equals(expectedType, str(actual[1].dtype),
                          "Value type mismatch in item %d; expected %s, got %s" %
                          (itemidx, expectedType, str(actual[1].dtype)))
def test_castToFloat(self):
    """astype('smallfloat') promotes uint8 series data to float16."""
    from numpy import arange

    shape = (3, 2, 2)
    nelements = 3 * 2 * 2
    base = arange(nelements, dtype=dtypeFunc('uint8')).reshape(shape)
    shifted = base + nelements

    from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
    series = SeriesLoader(self.sc).fromArraysAsImages([base, shifted])
    castSeries = series.astype("smallfloat")

    # both the declared dtype and an actual record's dtype must be promoted
    assert_equals('float16', str(castSeries.dtype))
    assert_equals('float16', str(castSeries.first()[1].dtype))
def test_maxProject(self):
    """maxProject along each axis matches numpy amax on the transposed array."""
    from lambdaimage.rdds.fileio.seriesloader import SeriesLoader

    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    series = SeriesLoader(self.sc).fromArraysAsImages(ary)

    # note: pack() produces the transpose of the source array, hence ary.T below
    packedAxis0 = series.maxProject(axis=0).pack()
    packedAxis1 = series.maxProject(axis=1).pack(sorting=True)

    assert_true(array_equal(amax(ary.T, 0), packedAxis0))
    assert_true(array_equal(amax(ary.T, 1), packedAxis1))
def _run_tst_roundtripConvertToSeries(self, images, strategy):
    """Blocks converted to Series in memory must match Series reloaded from binary files."""
    outdir = os.path.join(self.outputdir, "fish-series-dir")

    blocks = images.toBlocks(strategy)
    series = blocks.toSeries()
    packedDirect = series.pack()

    # round-trip the same blocks through binary files on disk
    blocks.saveAsBinarySeries(outdir)
    reloaded = SeriesLoader(self.sc).fromBinary(outdir)
    packedReloaded = reloaded.pack()

    assert_equals(images.dims.count, series.dims.count)

    # packed array is (record, spatial dims...)
    expectedShape = tuple([images.nrecords] + list(images.dims.count))
    assert_equals(expectedShape, packedDirect.shape)

    assert_true(array_equal(packedDirect, packedReloaded))
def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               maxPartitionSize='32mb', confFilename='conf.json', keyType=None, valueType=None,
               keyPath=None, varName=None):
    """
    Load a Series object from data stored as binary, text, npy, or mat.

    Binary and text data may be single or multiple files on a local file
    system, a networked file system (mounted and available on all cluster
    nodes), Amazon S3, or HDFS. The local formats (npy and mat) currently
    support local file systems only.

    Parameters
    ----------
    dataPath : string
        Path to data files or directory, as a local filesystem path or a URI;
        may include a single '*' wildcard in the filename. Valid examples:
        "local/directory/*.stack", "s3n:///my-s3-bucket/data/", or
        "file:///mnt/another/directory/".

    nkeys : int, optional (required if `inputFormat` is 'text'), default = None
        Number of keys per record (e.g. 3 for (x, y, z) coordinate keys).
        Must be specified for text data; for binary data it may be given here
        or in a configuration file.

    nvalues : int, optional (required if `inputFormat` is 'text')
        Number of values per record; for binary data it may be given here or
        in a configuration file.

    inputFormat : {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
        Format of the data to be read.

    minPartitions : int, optional, default = SparkContext.minParallelism
        Minimum number of Spark partitions to use; text input only.

    maxPartitionSize : int, optional, default = '32mb'
        Maximum partition size as a Java-style memory string, e.g. '32mb' or
        '64mb'; indirectly controls the number of Spark partitions; binary
        input only.

    confFilename : string, optional, default 'conf.json'
        Path to a JSON file with configuration options including 'nkeys',
        'nvalues', 'keyType', and 'valueType'. If no file is found at the
        given path, the base directory of `dataPath` is checked. Explicit
        parameters override the conf file.

    keyType : string or numpy dtype, optional, default = None
        Numerical type of keys; overrides the conf file.

    valueType : string or numpy dtype, optional, default = None
        Numerical type of values; overrides the conf file.

    keyPath : string, optional, default = None
        Path to a file with keys, used when loading from npy or mat.

    varName : str, optional, default = None
        Variable name to load (for MAT files only).

    Returns
    -------
    data : lambdaimage.rdds.Series
        A Series object, wrapping an RDD, with (n-tuples of ints) :
        (numpy array) pairs.
    """
    checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

    from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    # normalize once; dispatch with guard-clause returns
    fmt = inputFormat.lower()
    if fmt == 'binary':
        return loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType, maxPartitionSize=maxPartitionSize)
    if fmt == 'text':
        if nkeys is None:
            raise Exception('Must provide number of keys per record for loading from text')
        return loader.fromText(dataPath, nkeys=nkeys)
    if fmt == 'npy':
        return loader.fromNpyLocal(dataPath, keyPath)
    # checkParams restricted inputFormat above, so only 'mat' remains
    if varName is None:
        raise Exception('Must provide variable name for loading MAT files')
    return loader.fromMatLocal(dataPath, varName, keyPath)