def astype(self, dtype, casting='safe'):
    """
    Cast values to the specified numpy dtype.

    If 'smallfloat' is passed, values will be cast to the smallest floating-point representation
    to which they can be cast safely, as determined by the thunder.utils.common smallest_float_type
    function. Typically this will be a float type larger than a passed integer type (for instance,
    float16 for int8 or uint8).

    If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values
    are already in floating point, then this method will return self unchanged.

    Parameters
    ----------
    dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
        Data type to which RDD values are to be cast. Will return immediately, performing no cast,
        if None is passed.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's astype() method; see numpy documentation for details.

    Returns
    -------
    New Data object, of same type as self, with values cast to the requested dtype; or self if
    no cast is performed.
    """
    if dtype is None or dtype == '':
        return self
    if dtype == 'smallfloat':
        # get the smallest floating point type that can be safely cast to from our current type
        from thunder.utils.common import smallestFloatType
        dtype = smallestFloatType(self.dtype)
    nextRdd = self.rdd.mapValues(lambda v: v.astype(dtype, casting=casting, copy=False))
    return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)

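# Illustrative sketch (not part of the original source): the 'smallfloat' promotion
# described in the docstring above can be reproduced with plain numpy, assuming
# smallestFloatType follows numpy's promote_types rules for integer-to-float casts.
from numpy import promote_types

for intType in ('int8', 'uint8', 'int16', 'uint16', 'int32'):
    # int8/uint8 promote to float16; int16/uint16 to float32; int32 to float64
    print intType, '->', promote_types(intType, 'float16')
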
def _run_tst_fromBinary(self, useConfJson=False):
    # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
    # data will be a sequence of test data
    # all keys and all values in a test data item must be of the same length
    # keys get converted to ints regardless of raw input format
    DATA = [
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
    ]

    for itemidx, item in enumerate(DATA):
        outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
        os.mkdir(outSubdir)

        fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
        with open(fname, 'wb') as f:
            item.writeToFile(f)

        loader = SeriesLoader(self.sc)
        if not useConfJson:
            series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals,
                                       keyType=str(item.keyDtype), valueType=str(item.valDtype))
        else:
            # write configuration file
            conf = {'input': outSubdir,
                    'nkeys': item.nkeys, 'nvalues': item.nvals,
                    'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
            with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                json.dump(conf, f, indent=2)
            series = loader.fromBinary(outSubdir)

        seriesData = series.rdd.collect()

        expectedData = item.data
        assert_equals(len(expectedData), len(seriesData),
                      "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                      (itemidx, len(expectedData), len(seriesData)))

        for expected, actual in zip(expectedData, seriesData):
            expectedKeys = tuple(expected[0])
            expectedType = smallestFloatType(item.valDtype)
            expectedVals = array(expected[1], dtype=expectedType)
            assert_equals(expectedKeys, actual[0],
                          "Key mismatch in item %d; expected %s, got %s" %
                          (itemidx, str(expectedKeys), str(actual[0])))
            assert_true(allclose(expectedVals, actual[1]),
                        "Value mismatch in item %d; expected %s, got %s" %
                        (itemidx, str(expectedVals), str(actual[1])))
            assert_equals(expectedType, str(actual[1].dtype),
                          "Value type mismatch in item %d; expected %s, got %s" %
                          (itemidx, expectedType, str(actual[1].dtype)))

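# Hypothetical sketch of the flat binary record layout exercised by the test above,
# assuming SeriesBinaryTestData.writeToFile packs each record as its keys followed
# immediately by its values, with no delimiters between records.
from numpy import array

keys = array([1, 2, 3], dtype='int16')
vals = array([11, 12, 13], dtype='int16')
with open('inputfile0.bin', 'wb') as f:
    keys.tofile(f)  # nkeys key entries...
    vals.tofile(f)  # ...then nvalues data entries, one record per k/v pair
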
def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    # print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    inSubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(inSubdir)
    outSubdir = os.path.join(self.outputdir, 'output%d' % testCount)
    # os.mkdir(outSubdir)

    for aryCount, ary in enumerate(arrays):
        # array.tofile always writes in row-major (C) order...
        ary.tofile(os.path.join(inSubdir, "img%02d.stack" % aryCount))

    # ... but the stack reader interprets the data as column-major, so we reverse
    # the dimensions to compensate
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(inSubdir, outSubdir, dims, blockSize=blockSize, dtype=str(arrays[0].dtype))
    series = underTest.fromStack(inSubdir, dims, dtype=str(arrays[0].dtype))

    roundtrippedSeries = underTest.fromBinary(outSubdir)
    roundtripped = roundtrippedSeries.collect()
    direct = series.collect()

    expectedDtype = str(smallestFloatType(arrays[0].dtype))
    assert_equals(expectedDtype, roundtrippedSeries.dtype)
    assert_equals(expectedDtype, series.dtype)
    assert_equals(expectedDtype, str(roundtripped[0][1].dtype))
    assert_equals(expectedDtype, str(direct[0][1].dtype))

    with open(os.path.join(outSubdir, "conf.json"), 'r') as fp:
        # check that binary series file data type *matches* input stack data type
        # (not yet converted to float), at least according to conf.json
        conf = json.load(fp)
        assert_equals(str(arrays[0].dtype), conf["valuetype"])

    for ((seriesKeys, seriesValues), (directKeys, directValues)) in zip(roundtripped, direct):
        assert_equals(directKeys, seriesKeys)
        assert_equals(directValues, seriesValues)
        for seriesIdx, seriesVal in enumerate(seriesValues):
            # print "seriesIdx: %d; seriesKeys: %s; seriesVal: %g" % (seriesIdx, seriesKeys, seriesVal)
            # flip indices again for row vs col-major insanity
            aryKeys = list(seriesKeys)
            aryKeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesIdx, str(tuple(aryKeys)))
            try:
                assert_almost_equal(arrays[seriesIdx][tuple(aryKeys)], seriesVal, places=4)
            except AssertionError as e:
                raise AssertionError(msg, e)

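# Standalone demonstration (added for illustration) of the dimension flip used in the
# test above: data written by tofile in C (row-major) order reads back identically as
# a Fortran-ordered array with the dimensions reversed.
from numpy import arange, fromfile, array_equal

ary = arange(6, dtype='int16').reshape(2, 3)
ary.tofile('tmp.stack')  # tofile always writes in C order
flat = fromfile('tmp.stack', dtype='int16')
assert array_equal(ary, flat.reshape((3, 2), order='F').T)  # reversed dims, F order
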
def astype(self, dtype, casting='safe'):
    """
    Cast values to the specified numpy dtype.

    If 'smallfloat' is passed, values will be cast to the smallest floating-point representation
    to which they can be cast safely, as determined by the thunder.utils.common smallest_float_type
    function. Typically this will be a float type larger than a passed integer type (for instance,
    float16 for int8 or uint8).

    If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values
    are already in floating point, then this method will return self unchanged.

    Parameters
    ----------
    dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
        Data type to which RDD values are to be cast. Will return without cast if None is passed.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's astype() method; see numpy documentation for details.

    Returns
    -------
    New Data object, of same type as self, with values cast to the requested dtype; or self if
    no cast is performed.
    """
    if dtype is None or dtype == '':
        return self

    from numpy import asarray, ndarray  # asarray is needed by cast() below
    from numpy import dtype as dtypeFunc

    if dtype == 'smallfloat':
        # get the smallest floating point type that can be safely cast to from our current type
        from thunder.utils.common import smallestFloatType
        dtype = smallestFloatType(self.dtype)

    def cast(v, dtype_, casting_):
        if isinstance(v, ndarray):
            return v.astype(dtype_, casting=casting_, copy=False)
        else:
            # assume we have a scalar, either a numpy scalar or a python scalar;
            # turn it into a numpy scalar of the appropriate type
            return asarray([v]).astype(dtype_, casting=casting_, copy=False)[0]

    nextRdd = self.rdd.mapValues(lambda v: cast(v, dtypeFunc(dtype), casting))
    return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)

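# Minimal check (illustrative only) of the scalar-handling branch in cast() above:
# wrapping a bare python scalar in a length-1 array yields a numpy scalar of the
# requested dtype, matching the behavior for ndarray inputs.
from numpy import asarray, float64

castScalar = asarray([5]).astype('float64', casting='safe', copy=False)[0]
assert isinstance(castScalar, float64)
assert castScalar == 5.0
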
def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec, npartitions):
    testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(nimages, aryShape, dtypeSpec)
    loader = SeriesLoader(self.sc)
    series = loader.fromArrays(testArrays)

    saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
    series.repartition(npartitions)  # note: this does an elementwise shuffle! won't be in sorted order
    series.saveAsBinarySeries(saveDirPath)

    nnonemptyPartitions = 0
    for partitionList in series.rdd.glom().collect():
        if partitionList:
            nnonemptyPartitions += 1
    del partitionList
    nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

    roundtrippedSeries = loader.fromBinary(saveDirPath)

    with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
        conf = json.load(fp)

    # sorting is required here b/c of the randomization induced by the repartition.
    # orig and roundtripped will in general be different from each other, since roundtripped
    # will have (0, 0, 0) index as first element (since it will be the lexicographically first
    # file) while orig has only a 1 in npartitions chance of starting with (0, 0, 0) after repartition.
    expectedPackedAry = series.pack(sorting=True)
    actualPackedAry = roundtrippedSeries.pack(sorting=True)
    assert_true(array_equal(expectedPackedAry, actualPackedAry))

    assert_equals(nnonemptyPartitions, nsaveFiles)

    assert_equals(len(aryShape), conf["nkeys"])
    assert_equals(nimages, conf["nvalues"])
    assert_equals("int16", conf["keytype"])
    assert_equals(str(series.dtype), conf["valuetype"])
    # check that we have converted ourselves to an appropriate float after reloading
    assert_equals(str(smallestFloatType(series.dtype)), str(roundtrippedSeries.dtype))

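# Illustrative alternative (not in the original test) to the glom/collect loop above:
# assuming saveAsBinarySeries writes one part file per non-empty partition, the
# non-empty partition count can also be computed on the cluster, without collecting
# the partition contents to the driver.
nnonemptyPartitions = series.rdd.glom().map(lambda part: 1 if part else 0).sum()
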
def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M",
                                 newDtype='smallfloat', casting='safe',
                                 startIdx=None, stopIdx=None, recursive=False):
    import thunder.rdds.fileio.multitif as multitif
    import itertools
    from PIL import Image
    import io

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)
    blockSize = parseMemoryString(blockSize)

    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
    if not filenames:
        raise IOError("No files found for path '%s'" % dataPath)
    ntimepoints = len(filenames)

    doMinimizeReads = dataPath.lower().startswith("s3")
    # check PIL version to see whether it is actually pillow or indeed old PIL and choose
    # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array
    # for more explanation.
    isPillow = hasattr(Image, "PILLOW_VERSION")
    if isPillow:
        conversionFcn = array  # use numpy's array() function
    else:
        from thunder.utils.common import pil_to_array
        conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

    height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
    if dtype.startswith('int'):
        raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' +
                         ' please try loading as Images (shuffle=True)')
    pixelBytesize = dtypeFunc(dtype).itemsize
    if newDtype is None or str(newDtype) == '':
        newDtype = str(dtype)
    elif newDtype == 'smallfloat':
        newDtype = str(smallestFloatType(dtype))
    else:
        newDtype = str(newDtype)

    # initialize at one block per plane
    bytesPerPlane = height * width * pixelBytesize * ntimepoints
    bytesPerBlock = bytesPerPlane
    blocksPerPlane = 1
    # keep dividing while cutting the block size in half still leaves us at or above the
    # requested size; blocks should end up no more than 2x blockSize.
    while bytesPerBlock >= blockSize * 2:
        bytesPerBlock /= 2
        blocksPerPlane *= 2

    blocklenPixels = max((height * width) / blocksPerPlane, 1)  # integer division
    while blocksPerPlane * blocklenPixels < height * width:  # make sure we're reading the plane fully
        blocksPerPlane += 1

    # prevent bringing in self in closure:
    awsCredentialsOverride = self.awsCredentialsOverride

    # keys will be planeidx, blockidx:
    keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane)))

    def readBlockFromTiff(planeIdxBlockIdx):
        planeIdx, blockIdx = planeIdxBlockIdx
        blocks = []
        planeShape = None
        blockStart = None
        blockEnd = None
        for fname in filenames:
            reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
            fp = reader_.open(fname)
            try:
                if doMinimizeReads:
                    # use multitif module to generate a fake, in-memory one-page tif file
                    # the advantage of this is that it cuts way down on the many small reads
                    # that PIL/pillow will make otherwise, which would be a problem for s3
                    tiffParser_ = multitif.TiffParser(fp, debug=False)
                    tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                    byteBuf = io.BytesIO(tiffFilebuffer)
                    try:
                        pilImg = Image.open(byteBuf)
                        ary = conversionFcn(pilImg).T
                    finally:
                        byteBuf.close()
                    del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                else:
                    # read tif using PIL directly
                    pilImg = Image.open(fp)
                    pilImg.seek(planeIdx)
                    ary = conversionFcn(pilImg).T
                    del pilImg

                if planeShape is None:
                    planeShape = ary.shape[:]
                    blockStart = blockIdx * blocklenPixels
                    blockEnd = min(blockStart + blocklenPixels, planeShape[0] * planeShape[1])
                blocks.append(ary.ravel(order='C')[blockStart:blockEnd])
                del ary
            finally:
                fp.close()

        buf = vstack(blocks).T  # dimensions are now linindex x time (images)
        del blocks
        buf = buf.astype(newDtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        linearIdx = arange(blockStart, blockEnd)  # zero-based
        seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
        # add plane index to end of keys
        if npages > 1:
            seriesKeys = [tuple(list(keys_)[::-1] + [planeIdx]) for keys_ in seriesKeys]
        else:
            seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
        return zip(seriesKeys, buf)

    # map over blocks
    rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff)

    if npages > 1:
        dims = (npages, width, height)
    else:
        dims = (width, height)

    metadata = (dims, ntimepoints, newDtype)
    return rdd, metadata

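# Standalone sketch (illustrative) of the block-sizing loop above: halve the per-plane
# byte count, doubling the block count, while a block is still at least twice the
# requested size; the resulting blocks fall between 1x and 2x blockSize.
def blocksForPlane(bytesPerPlane, blockSize):
    bytesPerBlock, blocksPerPlane = bytesPerPlane, 1
    while bytesPerBlock >= blockSize * 2:
        bytesPerBlock /= 2
        blocksPerPlane *= 2
    return blocksPerPlane

# e.g. a 512x512 uint16 plane across 100 timepoints against a 10M block budget
print blocksForPlane(512 * 512 * 2 * 100, 10 * 1024 * 1024)  # -> 4
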
def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                              newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None,
                              recursive=False):
    """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)>

    Parameters
    ----------
    dataPath: string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with scheme
        (e.g. "file://" or "s3n://"). If no scheme is present, will be interpreted as a path on the
        local filesystem. This path must be valid on all workers. dataPath may also refer to a single
        file, or to a range of files specified by a glob-style expression using a single wildcard
        character '*'.

    dims: tuple of positive int
        Dimensions of input image data, ordered with the fastest-changing dimension first.

    blockSize: string memory specifier (e.g. "150M") or positive int, optional, default "150M"
        Requested maximum size of an individual data block, in bytes.

    dtype: dtype or dtype specifier, optional, default 'int16'
        Numpy dtype of input stack data

    newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional,
        default 'smallfloat'
        Numpy dtype of output series data. Series data must be floating-point. Input data will be
        cast to the requested `newDtype` - see numpy `astype()` method.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath, loading all files in the
        tree that have an extension matching 'ext'. Recursive loading is currently only implemented
        for local filesystems (not s3).

    Returns
    -------
    tuple of (RDD, ntimepoints, newDtype)
        RDD: sequence of key, value pairs (call using flatMap)
            RDD Key: tuple of int
                zero-based indices of position within original image volume
            RDD Value: numpy array of datatype
                series of values at position across loaded image volumes
        ntimepoints: int
            number of time points in returned series, determined from number of stack files
            found at dataPath
        newDtype: string
            string representation of numpy data type of returned blocks
    """
    dataPath = self.__normalizeDatafilePattern(dataPath, ext)
    blockSize = parseMemoryString(blockSize)
    totalDim = reduce(lambda x_, y_: x_ * y_, dims)
    dtype = dtypeFunc(dtype)
    if newDtype is None or newDtype == '':
        newDtype = str(dtype)
    elif newDtype == 'smallfloat':
        newDtype = str(smallestFloatType(dtype))
    else:
        newDtype = str(newDtype)

    reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
    filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
    if not filenames:
        raise IOError("No files found for path '%s'" % dataPath)

    dataSize = totalDim * len(filenames) * dtype.itemsize
    nblocks = max(dataSize / blockSize, 1)  # integer division

    if len(dims) >= 3:
        # for 3D stacks, do calculations to ensure that
        # different planes appear in distinct files
        blocksPerPlane = max(nblocks / dims[-1], 1)
        pixPerPlane = reduce(lambda x_, y_: x_ * y_, dims[:-1])  # all but last dimension

        # get the greatest number of blocks in a plane (up to as many as requested) that still
        # divide the plane evenly. This will always be at least one.
        kUpdated = [x for x in range(1, blocksPerPlane + 1) if not pixPerPlane % x][-1]
        nblocks = kUpdated * dims[-1]
        blockSizePerStack = (totalDim / nblocks) * dtype.itemsize
    else:
        # otherwise just round to make contents divide into nearly even blocks
        blockSizePerStack = int(math.ceil(totalDim / float(nblocks)))
        nblocks = int(math.ceil(totalDim / float(blockSizePerStack)))
        blockSizePerStack *= dtype.itemsize

    fileSize = totalDim * dtype.itemsize

    def readBlock(blockNum):
        # copy size out from closure; will modify later:
        blockSizePerStack_ = blockSizePerStack
        # get start position for this block
        position = blockNum * blockSizePerStack_

        # adjust if at end of file
        if (position + blockSizePerStack_) > fileSize:
            blockSizePerStack_ = int(fileSize - position)

        # loop over files, loading one block from each
        bufs = []
        for fname in filenames:
            buf = reader.read(fname, startOffset=position, size=blockSizePerStack_)
            bufs.append(frombuffer(buf, dtype=dtype))

        buf = vstack(bufs).T  # dimensions are now linindex x time (images)
        del bufs
        buf = buf.astype(newDtype, casting=casting, copy=False)

        # append subscript keys based on dimensions
        itemPosition = position / dtype.itemsize
        itemBlocksize = blockSizePerStack_ / dtype.itemsize
        linearIdx = arange(itemPosition, itemPosition + itemBlocksize)  # zero-based
        keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
        return zip(keys, buf)

    # map over blocks
    return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)),
            len(filenames), newDtype)

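# Illustrative check (added) of the subscript-key generation above: linear offsets
# into a Fortran-ordered stack unravel into index tuples with the first
# (fastest-changing) dimension varying quickest.
from numpy import arange, unravel_index

dims = (4, 3, 2)  # fastest-changing dimension first
keys = zip(*map(tuple, unravel_index(arange(5), dims, order='F')))
print keys  # [(0, 0, 0), (1, 0, 0), (2, 0, 0), (3, 0, 0), (0, 1, 0)]
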