def test_loadMultipleMultipointStacksAsSeries(self):
    rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary01.bin")
    rangeAry.tofile(filePath)
    expectedAry = rangeAry.reshape((32, 32, 8), order='F')
    rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary02.bin")
    rangeAry2.tofile(filePath)
    expectedAry2 = rangeAry2.reshape((32, 32, 8), order='F')

    rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(32, 32, 8), nplanes=2)
    assert_equals('float32', rangeSeries._dtype)

    rangeSeriesAry = rangeSeries.pack()

    assert_equals((32, 32, 2), rangeSeries.dims.count)
    assert_equals((8, 32, 32, 2), rangeSeriesAry.shape)
    assert_equals('float32', str(rangeSeriesAry.dtype))
    assert_true(array_equal(expectedAry[:, :, :2], rangeSeriesAry[0]))
    assert_true(array_equal(expectedAry[:, :, 2:4], rangeSeriesAry[1]))
    assert_true(array_equal(expectedAry[:, :, 4:6], rangeSeriesAry[2]))
    assert_true(array_equal(expectedAry[:, :, 6:], rangeSeriesAry[3]))
    assert_true(array_equal(expectedAry2[:, :, :2], rangeSeriesAry[4]))
    assert_true(array_equal(expectedAry2[:, :, 2:4], rangeSeriesAry[5]))
    assert_true(array_equal(expectedAry2[:, :, 4:6], rangeSeriesAry[6]))
    assert_true(array_equal(expectedAry2[:, :, 6:], rangeSeriesAry[7]))

def test_fromStackToSeriesWithPack(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))
    filename = os.path.join(self.outputdir, "test.stack")
    ary.tofile(filename)

    image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))
    strategy = SimpleBlockingStrategy.generateFromBlockSize(image, "150M")
    series = image.toBlocks(strategy).toSeries()

    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0), seriesVals[2][0])
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((1, 1), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple is reversed from numpy shape
    assert_equals(ary.shape[::-1], series.dims.count)

    # check that values are in original order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
    assert_true(array_equal(ary.ravel(), collectedVals))

    # check that packing returns transpose of original array
    assert_true(array_equal(ary.T, seriesAry))

def test_loadMultipleTifsAsSeriesWithShuffle(self):
    tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
    rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
    pilImg = Image.fromarray(rangeAry)
    filePath = os.path.join(self.outputdir, "rangetif01.tif")
    pilImg.save(filePath)
    tmpAry = arange(60*120, 2*60*120, dtype=dtypeFunc('uint16'))
    rangeAry2 = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
    pilImg = Image.fromarray(rangeAry2)
    filePath = os.path.join(self.outputdir, "rangetif02.tif")
    pilImg.save(filePath)
    del pilImg, tmpAry

    rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
    assert_equals('float16', rangeSeries._dtype)

    rangeSeriesAry = rangeSeries.pack()
    rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

    assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
    assert_equals((2, 60, 120), rangeSeriesAry.shape)
    assert_equals((2, 120, 60), rangeSeriesAry_xpose.shape)
    assert_equals('float16', str(rangeSeriesAry.dtype))
    assert_true(array_equal(rangeAry, rangeSeriesAry[0]))
    assert_true(array_equal(rangeAry2, rangeSeriesAry[1]))
    assert_true(array_equal(rangeAry.T, rangeSeriesAry_xpose[0]))
    assert_true(array_equal(rangeAry2.T, rangeSeriesAry_xpose[1]))

def test_toSeriesWithInefficientSplitAndSortedPack(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((4, 2))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks((2, 1), units="s").toSeries()

    seriesVals = series.collect()
    seriesAry = series.pack(sorting=True)

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((0, 1), seriesVals[2][0])
    assert_equals((1, 1), seriesVals[3][0])  # end of first block
    # beginning of second block
    assert_equals((2, 0), seriesVals[4][0])
    assert_equals((3, 0), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(ary.shape, series.dims.count)

    # check that values are in expected order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
    assert_true(array_equal(ary[:2, :].ravel(order="F"), collectedVals[:4]))  # first block
    assert_true(array_equal(ary[2:4, :].ravel(order="F"), collectedVals[4:]))  # second block

    # check that packing returns original array (after sort)
    assert_true(array_equal(ary, seriesAry))

def __run_loadMultipleStacksAsSeries(self, shuffle):
    rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary01.stack")
    rangeAry.tofile(filePath)
    expectedAry = rangeAry.reshape((128, 64), order='F')
    rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary02.stack")
    rangeAry2.tofile(filePath)
    expectedAry2 = rangeAry2.reshape((128, 64), order='F')

    rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(128, 64), shuffle=shuffle)
    assert_equals('float32', rangeSeries._dtype)

    rangeSeriesAry = rangeSeries.pack()
    rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

    assert_equals((128, 64), rangeSeries.dims.count)
    assert_equals((2, 128, 64), rangeSeriesAry.shape)
    assert_equals((2, 64, 128), rangeSeriesAry_xpose.shape)
    assert_equals('float32', str(rangeSeriesAry.dtype))
    assert_true(array_equal(expectedAry, rangeSeriesAry[0]))
    assert_true(array_equal(expectedAry2, rangeSeriesAry[1]))
    assert_true(array_equal(expectedAry.T, rangeSeriesAry_xpose[0]))
    assert_true(array_equal(expectedAry2.T, rangeSeriesAry_xpose[1]))

def test_fromMultiTimepointStacks(self):
    ary = arange(16, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
    ary2 = arange(16, 32, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
    ary.tofile(os.path.join(self.outputdir, "test01.stack"))
    ary2.tofile(os.path.join(self.outputdir, "test02.stack"))

    image = ImagesLoader(self.sc).fromStack(self.outputdir, dtype="uint8", dims=(2, 2, 4), nplanes=2)
    collectedImage = image.collect()

    # we don't expect to have nrecords cached, since we get an unknown number of images per file
    assert_true(image._nrecords is None)
    assert_equals(4, image.nrecords)
    assert_equals(4, len(collectedImage))
    # check keys:
    assert_equals(0, collectedImage[0][0])
    assert_equals(1, collectedImage[1][0])
    assert_equals(2, collectedImage[2][0])
    assert_equals(3, collectedImage[3][0])
    # check values:
    assert_true(array_equal(ary[:2].T, collectedImage[0][1]))
    assert_true(array_equal(ary[2:].T, collectedImage[1][1]))
    assert_true(array_equal(ary2[:2].T, collectedImage[2][1]))
    assert_true(array_equal(ary2[2:].T, collectedImage[3][1]))

    # 3 planes does not divide 4
    assert_raises(ValueError, ImagesLoader(self.sc).fromStack, self.outputdir,
                  dtype="uint8", dims=(2, 2, 4), nplanes=3)

def _run_tst_toSeriesWithSplitsAndPack(self, strategy):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((4, 2))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks(strategy).toSeries()

    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0), seriesVals[2][0])
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((1, 1), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(ary.shape, series.dims.count)

    # check that values are in Fortran-convention order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(order='F'), collectedVals))

    # check that packing returns original array
    assert_true(array_equal(ary, seriesAry))

def test_fromArrays(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

    series = SeriesLoader(self.sc).fromArraysAsImages(ary)

    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0), seriesVals[2][0])
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((1, 1), seriesVals[5][0])
    assert_equals((2, 1), seriesVals[6][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple is reversed from numpy shape
    assert_equals(ary.shape[::-1], series.dims.count)

    # check that values are in original order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(), collectedVals))

    # check that packing returns transpose of original array
    assert_true(array_equal(ary.T, seriesAry))

def test_fromMultipleArrays(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))

    series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

    seriesVals = series.collect()
    seriesAry = series.pack()

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((3, 0), seriesVals[3][0])
    assert_equals((0, 1), seriesVals[4][0])
    assert_equals((3, 1), seriesVals[7][0])

    # check dimensions tuple is reversed from numpy shape
    assert_equals(ary.shape[::-1], series.dims.count)

    # check that values are in original order, with subsequent point concatenated in values
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16'))
    assert_true(array_equal(ary.ravel(), collectedVals[:, 0]))
    assert_true(array_equal(ary2.ravel(), collectedVals[:, 1]))

    # check that packing returns concatenation of input arrays, with time as first dimension
    assert_true(array_equal(ary.T, seriesAry[0]))
    assert_true(array_equal(ary2.T, seriesAry[1]))

def fromBinary(self, dataPath, ext='bin', confFilename='conf.json', nkeys=None, nvalues=None,
               keyType=None, valueType=None, newDtype='smallfloat', casting='safe',
               maxPartitionSize='32mb'):
    """
    Load a Series object from a directory of binary files.

    Parameters
    ----------
    dataPath : string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with
        scheme (e.g. "file://", "s3n://", or "gs://"). If no scheme is present, will be
        interpreted as a path on the local filesystem. This path must be valid on all workers.
        Datafile may also refer to a single file, or to a range of files specified by a
        glob-style expression using a single wildcard character '*'.

    newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data to be floating-point.
        Input data will be cast to the requested `newdtype` if not None - see Data `astype()` method.

    casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    maxPartitionSize : str, optional, default = '32mb'
        Maximum size of partitions as Java-style memory, will indirectly control the number of partitions.
    """
    paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
    self.__checkBinaryParametersAreSpecified(paramsObj)

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    keyDtype = dtypeFunc(paramsObj.keytype)
    valDtype = dtypeFunc(paramsObj.valuetype)

    keySize = paramsObj.nkeys * keyDtype.itemsize
    recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

    lines = self.sc.binaryRecords(dataPath, recordSize)

    get = lambda v: (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                     frombuffer(buffer(v, keySize), dtype=valDtype))
    data = lines.map(get)

    return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)

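# A hedged usage sketch for fromBinary above. The directory path and parameter values
# are hypothetical, chosen only to illustrate the conf.json contract; the field names
# mirror those checked in _run_tstSaveAsBinarySeries elsewhere in this file set.
#
#   conf.json alongside the .bin files might look like:
#     {"input": "/data/series", "nkeys": 3, "nvalues": 2,
#      "keytype": "int16", "valuetype": "uint8"}
#
#   series = SeriesLoader(sc).fromBinary("/data/series")
#   # with newDtype left at 'smallfloat', uint8 values should come back as float16
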
def test_toSeriesWithPack(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks("150M").toSeries()

    seriesVals = series.collect()
    seriesAry = series.pack()
    seriesAry_xpose = series.pack(transpose=True)

    # check ordering of keys
    assert_equals((0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0), seriesVals[1][0])  # second key
    assert_equals((0, 1), seriesVals[2][0])
    assert_equals((1, 1), seriesVals[3][0])
    assert_equals((0, 2), seriesVals[4][0])
    assert_equals((1, 2), seriesVals[5][0])
    assert_equals((0, 3), seriesVals[6][0])
    assert_equals((1, 3), seriesVals[7][0])

    # check dimensions tuple matches numpy shape
    assert_equals(image.dims.count, series.dims.count)
    assert_equals(ary.shape, series.dims.count)

    # check that values are in Fortran-convention order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(order='F'), collectedVals))

    # check that packing returns original array
    assert_true(array_equal(ary, seriesAry))
    assert_true(array_equal(ary.T, seriesAry_xpose))

def fromBinary(self, dataPath, ext='bin', confFilename='conf.json', nkeys=None, nvalues=None,
               keyType=None, valueType=None, newDtype='smallfloat', casting='safe',
               maxPartitionSize='32mb'):
    """
    Load a Series object from a directory of binary files.

    Parameters
    ----------
    dataPath : string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with
        scheme (e.g. "file://" or "s3n://"). If no scheme is present, will be interpreted as a
        path on the local filesystem. This path must be valid on all workers. Datafile may also
        refer to a single file, or to a range of files specified by a glob-style expression
        using a single wildcard character '*'.

    newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data to be floating-point.
        Input data will be cast to the requested `newdtype` if not None - see Data `astype()` method.

    casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    maxPartitionSize : str, optional, default = '32mb'
        Maximum size of partitions as Java-style memory, will indirectly control the number of partitions.
    """
    paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
    self.__checkBinaryParametersAreSpecified(paramsObj)

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    keyDtype = dtypeFunc(paramsObj.keytype)
    valDtype = dtypeFunc(paramsObj.valuetype)

    keySize = paramsObj.nkeys * keyDtype.itemsize
    recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

    from thunder.utils.common import parseMemoryString
    if isinstance(maxPartitionSize, basestring):
        size = parseMemoryString(maxPartitionSize)
    else:
        raise Exception("Invalid size specification")
    hadoopConf = {'recordLength': str(recordSize), 'mapred.max.split.size': str(size)}

    lines = self.sc.newAPIHadoopFile(dataPath, 'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                     'org.apache.hadoop.io.LongWritable',
                                     'org.apache.hadoop.io.BytesWritable',
                                     conf=hadoopConf)

    data = lines.map(lambda (_, v):
                     (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                      frombuffer(buffer(v, keySize), dtype=valDtype)))

    return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)

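# Worked example of the fixed record layout assumed above (illustrative numbers, not
# from a real conf.json): with nkeys=3 'int16' keys and nvalues=2 'float32' values,
#   keySize    = 3 * 2     = 6 bytes
#   recordSize = 6 + 2 * 4 = 14 bytes
# so every 14-byte Hadoop record decodes to one (key tuple, value array) pair.
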
def smallestFloatType(dtype):
    """
    Returns the smallest floating point dtype to which the passed dtype can be safely cast.

    For integers and unsigned ints, this will generally be the next floating point type larger
    than the integer type. So for instance, smallestFloatType('uint8') -> dtype('float16'),
    smallestFloatType('int16') -> dtype('float32'), smallestFloatType('uint32') -> dtype('float64').

    This function relies on numpy's promote_types function.
    """
    from numpy import dtype as dtypeFunc
    from numpy import promote_types
    inType = dtypeFunc(dtype)
    compSize = max(2, inType.itemsize)  # smallest float is at least 16 bits
    compType = dtypeFunc('=f' + str(compSize))  # compare to a float of the same size
    return promote_types(inType, compType)

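# A minimal sanity check of the promotion rules documented above; these follow
# directly from numpy's promote_types semantics:
#   smallestFloatType('uint8')    # -> dtype('float16')
#   smallestFloatType('int16')    # -> dtype('float32')
#   smallestFloatType('uint32')   # -> dtype('float64')
#   smallestFloatType('float64')  # -> dtype('float64'); floats map to themselves
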
def calcAverageBlockSize(self):
    if self._splitsPerDim is not None:
        elts = _BlockMemoryAsSequence.avgElementsPerBlock(self.dims, self._splitsPerDim)
    else:
        elts = reduce(lambda x, y: x * y, self._pixPerDim)
    return elts * dtypeFunc(self.dtype).itemsize * self.nimages

def generateFromBlockSize(cls, series, blockSize, **kwargs):
    """
    Returns a new SeriesBlockingStrategy that yields blocks closely matching the requested
    size in bytes.

    Parameters
    ----------
    series : Series object
        Series for which blocking strategy is to be generated.

    blockSize : positive int or string
        Requests an average size for the intermediate blocks in bytes. A passed string should
        be in a format like "256k" or "150M" (see util.common.parseMemoryString). If
        blocksPerDim or groupingDim are passed, they will take precedence over this argument.
        See strategy._BlockMemoryAsSequence for a description of the blocking strategy used.

    Returns
    -------
    SeriesBlockingStrategy or subclass
        A new BlockingStrategy will be created and setSource() called on it with the passed
        series object.
    """
    dims, nimages, dtype = series.dims, len(series.index), series.dtype
    elementSize = nimages * dtypeFunc(dtype).itemsize
    splitsPerDim = _calcSplitsForBlockSize(blockSize, elementSize, dims)
    strategy = cls(splitsPerDim, units="splits", **kwargs)
    strategy.setSource(series)
    return strategy

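# Hypothetical usage sketch for generateFromBlockSize; the series object and the
# "256k" request are assumptions for illustration, not taken from a test run:
#   strategy = SeriesBlockingStrategy.generateFromBlockSize(series, "256k")
#   strategy.calcAverageBlockSize()  # expected to land near 256 * 1024 bytes
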
def calcAverageBlockSize(self):
    if self._splitsPerDim is None:
        raise Exception("setSource() must be called before calcAverageBlockSize()")
    elts = _BlockMemoryAsSequence.avgElementsPerBlock(self.dims, self._splitsPerDim)
    return elts * dtypeFunc(self.dtype).itemsize * self.nimages

def test_threeDArrayToSeriesWithPack(self):
    ary = arange(24, dtype=dtypeFunc('int16')).reshape((3, 4, 2))

    image = ImagesLoader(self.sc).fromArrays(ary)
    series = image.toBlocks("150M").toSeries()

    seriesVals = series.collect()
    seriesAry = series.pack()
    seriesAry_xpose = series.pack(transpose=True)

    # check ordering of keys
    assert_equals((0, 0, 0), seriesVals[0][0])  # first key
    assert_equals((1, 0, 0), seriesVals[1][0])  # second key
    assert_equals((2, 0, 0), seriesVals[2][0])
    assert_equals((0, 1, 0), seriesVals[3][0])
    assert_equals((1, 1, 0), seriesVals[4][0])
    assert_equals((2, 1, 0), seriesVals[5][0])
    assert_equals((0, 2, 0), seriesVals[6][0])
    assert_equals((1, 2, 0), seriesVals[7][0])
    assert_equals((2, 2, 0), seriesVals[8][0])
    assert_equals((0, 3, 0), seriesVals[9][0])
    assert_equals((1, 3, 0), seriesVals[10][0])
    assert_equals((2, 3, 0), seriesVals[11][0])
    assert_equals((0, 0, 1), seriesVals[12][0])
    assert_equals((1, 0, 1), seriesVals[13][0])
    assert_equals((2, 0, 1), seriesVals[14][0])
    assert_equals((0, 1, 1), seriesVals[15][0])
    assert_equals((1, 1, 1), seriesVals[16][0])
    assert_equals((2, 1, 1), seriesVals[17][0])
    assert_equals((0, 2, 1), seriesVals[18][0])
    assert_equals((1, 2, 1), seriesVals[19][0])
    assert_equals((2, 2, 1), seriesVals[20][0])
    assert_equals((0, 3, 1), seriesVals[21][0])
    assert_equals((1, 3, 1), seriesVals[22][0])
    assert_equals((2, 3, 1), seriesVals[23][0])

    # check dimensions tuple matches numpy shape
    assert_equals(ary.shape, series.dims.count)

    # check that values are in Fortran-convention order
    collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
    assert_true(array_equal(ary.ravel(order='F'), collectedVals))

    # check that packing returns original array, and transposed packing its transpose
    assert_true(array_equal(ary, seriesAry))
    assert_true(array_equal(ary.T, seriesAry_xpose))

def test_fromStacks(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))
    filename = os.path.join(self.outputdir, "test01.stack")
    ary.tofile(filename)
    filename = os.path.join(self.outputdir, "test02.stack")
    ary2.tofile(filename)

    image = ImagesLoader(self.sc).fromStack(self.outputdir, dims=(4, 2))

    collectedImage = image.collect()
    assert_equals(2, len(collectedImage))
    assert_equals(0, collectedImage[0][0])  # check key
    assert_equals(image.dims.count, collectedImage[0][1].shape)
    assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
    assert_equals(1, collectedImage[1][0])  # check image 2
    assert_true(array_equal(ary2.T, collectedImage[1][1]))

def _generateTestArrays(narys, dtype_='int16'):
    sh = 4, 3, 3
    sz = reduce(lambda x, y: x * y, sh, 1)
    arys = [arange(i, i + sz, dtype=dtypeFunc(dtype_)).reshape(sh) for i in xrange(0, sz * narys, sz)]
    return arys, sh, sz

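# Worked example of the fixture above: _generateTestArrays(2) uses sh == (4, 3, 3),
# so sz == 36, and returns two int16 arrays of that shape holding 0..35 and 36..71.
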
def _run_tstSaveAsBinarySeries(self, testIdx, narys_, valDtype, groupingDim_):
    """Pseudo-parameterized test fixture, allows reusing existing spark context
    """
    paramStr = "(groupingdim=%d, valuedtype='%s')" % (groupingDim_, valDtype)
    arys, aryShape, arySize = _generateTestArrays(narys_, dtype_=valDtype)
    dims = aryShape[:]
    outdir = os.path.join(self.outputdir, "anotherdir%02d" % testIdx)

    images = ImagesLoader(self.sc).fromArrays(arys)
    slicesPerDim = [1]*arys[0].ndim
    slicesPerDim[groupingDim_] = arys[0].shape[groupingDim_]
    images.toBlocks(slicesPerDim, units="splits").saveAsBinarySeries(outdir)

    ndims = len(aryShape)
    # prevent padding to 4-byte boundaries: "=" specifies no alignment
    unpacker = struct.Struct('=' + 'h'*ndims + dtypeFunc(valDtype).char*narys_)

    def calcExpectedNKeys():
        tmpShape = list(dims[:])
        del tmpShape[groupingDim_]
        return prod(tmpShape)
    expectedNKeys = calcExpectedNKeys()

    def byrec(f_, unpacker_, nkeys_):
        rec = True
        while rec:
            rec = f_.read(unpacker_.size)
            if rec:
                allRecVals = unpacker_.unpack(rec)
                yield allRecVals[:nkeys_], allRecVals[nkeys_:]

    outFilenames = glob.glob(os.path.join(outdir, "*.bin"))
    assert_equals(dims[groupingDim_], len(outFilenames))
    for outFilename in outFilenames:
        with open(outFilename, 'rb') as f:
            nkeys = 0
            for keys, vals in byrec(f, unpacker, ndims):
                nkeys += 1
                assert_equals(narys_, len(vals))
                for valIdx, val in enumerate(vals):
                    assert_equals(arys[valIdx][keys], val,
                                  "Expected %g, got %g, for test %d %s" %
                                  (arys[valIdx][keys], val, testIdx, paramStr))
            assert_equals(expectedNKeys, nkeys)

    confName = os.path.join(outdir, "conf.json")
    assert_true(os.path.isfile(confName))
    with open(os.path.join(outdir, "conf.json"), 'r') as fconf:
        import json
        conf = json.load(fconf)
        assert_equals(outdir, conf['input'])
        assert_equals(len(aryShape), conf['nkeys'])
        assert_equals(narys_, conf['nvalues'])
        assert_equals(valDtype, conf['valuetype'])
        assert_equals('int16', conf['keytype'])

    assert_true(os.path.isfile(os.path.join(outdir, 'SUCCESS')))

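# Illustrative instance of the record format built above: for 3-d arrays
# (ndims == 3) with narys_ == 2 and valDtype == 'uint8', the format string is
# '=hhhBB' (three int16 keys, then two uint8 values), so unpacker.size == 8 bytes,
# with '=' suppressing any alignment padding.
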
def test_fromArrays(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)

    collectedImage = image.collect()
    assert_equals(1, len(collectedImage))
    assert_equals(ary.shape, image.dims.count)
    assert_equals(0, collectedImage[0][0])  # check key
    assert_true(array_equal(ary, collectedImage[0][1]))  # check value

def test_loadStacksAsSeries(self):
    rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
    rangeAry.shape = (64, 128)
    filepath = os.path.join(self.outputdir, "rangeAry.stack")
    rangeAry.tofile(filepath)

    series = SeriesLoader(self.sc).fromStack(filepath, dims=(128, 64))
    seriesAry = series.pack()

    assert_equals((128, 64), series.dims.count)
    assert_equals((128, 64), seriesAry.shape)
    assert_true(array_equal(rangeAry.T, seriesAry))

def test_subtract(self):
    narys = 3
    arys, sh, sz = _generateTestArrays(narys)
    subVals = [1, arange(sz, dtype=dtypeFunc("int16")).reshape(sh)]

    imageData = ImagesLoader(self.sc).fromArrays(arys)
    for subVal in subVals:
        subData = imageData.subtract(subVal)
        subtracted = subData.collect()
        expectedArys = map(lambda ary: ary - subVal, arys)
        for actual, expected in zip(subtracted, expectedArys):
            assert_true(allclose(expected, actual[1]))

def test_planes(self):
    dims = (2, 2, 4)
    sz = reduce(lambda x, y: x * y, dims)
    origAry = arange(sz, dtype=dtypeFunc('int16')).reshape(dims)
    imageData = ImagesLoader(self.sc).fromArrays([origAry])
    planedData = imageData.planes(0, 2)
    planed = planedData.collect()[0][1]

    expected = squeeze(origAry[slice(None), slice(None), slice(0, 2)])
    assert_true(array_equal(expected, planed))
    assert_equals(tuple(expected.shape), planedData._dims.count)
    assert_equals(str(expected.dtype), planedData._dtype)

def test_toBlocksWithSplit(self):
    ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

    image = ImagesLoader(self.sc).fromArrays(ary)
    groupedblocks = image.toBlocks((1, 2), units="s")

    # collectedblocks = blocks.collect()
    collectedgroupedblocks = groupedblocks.collect()
    assert_equals((0, 0), collectedgroupedblocks[0][0].spatialKey)
    assert_true(array_equal(ary[:, :2].ravel(), collectedgroupedblocks[0][1].ravel()))
    assert_equals((0, 2), collectedgroupedblocks[1][0].spatialKey)
    assert_true(array_equal(ary[:, 2:].ravel(), collectedgroupedblocks[1][1].ravel()))

def test_fromStack(self):
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    filename = os.path.join(self.outputdir, "test.stack")
    ary.tofile(filename)

    image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))

    collectedImage = image.collect()
    assert_equals(1, len(collectedImage))
    assert_equals(0, collectedImage[0][0])  # check key
    # assert that image shape *matches* that in image dimensions:
    assert_equals(image.dims.count, collectedImage[0][1].shape)
    assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value

def test_maxProject(self):
    from thunder.rdds.fileio.seriesloader import SeriesLoader
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    series = SeriesLoader(self.sc).fromArrays(ary)

    project0Series = series.maxProject(axis=0)
    project0 = project0Series.pack()

    project1Series = series.maxProject(axis=1)
    project1 = project1Series.pack(sorting=True)

    assert_true(array_equal(amax(ary.T, 0), project0))
    assert_true(array_equal(amax(ary.T, 1), project1))

def test_castToFloat(self):
    from numpy import arange
    shape = (3, 2, 2)
    size = 3*2*2
    ary = arange(size, dtype=dtypeFunc('uint8')).reshape(shape)
    ary2 = ary + size
    from thunder.rdds.fileio.seriesloader import SeriesLoader
    series = SeriesLoader(self.sc).fromArrays([ary, ary2])

    castSeries = series.astype("smallfloat")

    assert_equals('float16', str(castSeries.dtype))
    assert_equals('float16', str(castSeries.first()[1].dtype))

def test_castToFloat(self):
    from numpy import arange
    shape = (3, 2, 2)
    size = 3 * 2 * 2
    ary = arange(size, dtype=dtypeFunc('uint8')).reshape(shape)
    ary2 = ary + size
    from thunder.rdds.fileio.seriesloader import SeriesLoader
    series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

    castSeries = series.astype("smallfloat")

    assert_equals('float16', str(castSeries.dtype))
    assert_equals('float16', str(castSeries.first()[1].dtype))

def test_maxProject(self):
    from thunder.rdds.fileio.seriesloader import SeriesLoader
    ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
    series = SeriesLoader(self.sc).fromArraysAsImages(ary)

    project0Series = series.maxProject(axis=0)
    project0 = project0Series.pack()

    project1Series = series.maxProject(axis=1)
    project1 = project1Series.pack(sorting=True)

    assert_true(array_equal(amax(ary.T, 0), project0))
    assert_true(array_equal(amax(ary.T, 1), project1))

def test_fromStacksWithConf(self):
    ary = arange(8, dtype=dtypeFunc('int32')).reshape((2, 4))
    ary2 = arange(8, 16, dtype=dtypeFunc('int32')).reshape((2, 4))
    filename = os.path.join(self.outputdir, "test01.stack")
    ary.tofile(filename)
    filename = os.path.join(self.outputdir, "test02.stack")
    ary2.tofile(filename)
    conf = {"dims": [4, 2], "dtype": "int32"}
    with open(os.path.join(self.outputdir, "conf.json"), 'w') as fp:
        json.dump(conf, fp)

    image = ImagesLoader(self.sc).fromStack(self.outputdir)
    assert_equals("int32", image._dtype)
    assert_equals(2, image._nrecords)
    assert_equals((4, 2), image._dims.count)

    collectedImage = image.collect()
    assert_equals(2, len(collectedImage))
    assert_equals(0, collectedImage[0][0])  # check key
    assert_equals(image.dims.count, collectedImage[0][1].shape)
    assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
    assert_equals(1, collectedImage[1][0])  # check image 2
    assert_true(array_equal(ary2.T, collectedImage[1][1]))

def test_loadStacksAsSeriesWithShuffle(self):
    rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary.stack")
    rangeAry.tofile(filePath)
    expectedAry = rangeAry.reshape((128, 64), order='F')

    rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(128, 64))
    assert_equals('float32', rangeSeries._dtype)  # check before any potential first() calls update this val
    rangeSeriesAry = rangeSeries.pack()

    assert_equals((128, 64), rangeSeries.dims.count)
    assert_equals((128, 64), rangeSeriesAry.shape)
    assert_equals('float32', str(rangeSeriesAry.dtype))
    assert_true(array_equal(expectedAry, rangeSeriesAry))

def test_load3dStackAsSeriesWithShuffle(self):
    rangeAry = arange(32*64*4, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary.stack")
    rangeAry.tofile(filePath)
    expectedAry = rangeAry.reshape((32, 64, 4), order='F')

    rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(32, 64, 4))
    assert_equals('float32', rangeSeries._dtype)
    rangeSeriesAry = rangeSeries.pack()

    assert_equals((32, 64, 4), rangeSeries.dims.count)
    assert_equals((32, 64, 4), rangeSeriesAry.shape)
    assert_equals('float32', str(rangeSeriesAry.dtype))
    assert_true(array_equal(expectedAry, rangeSeriesAry))

def __run_loadMultipleStacksAsSeries(self):
    rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary01.bin")
    rangeAry.tofile(filePath)
    expectedAry = rangeAry.reshape((128, 64), order='F')
    rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
    filePath = os.path.join(self.outputdir, "rangeary02.bin")
    rangeAry2.tofile(filePath)
    expectedAry2 = rangeAry2.reshape((128, 64), order='F')

    rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(128, 64))
    assert_equals('float32', rangeSeries._dtype)

    rangeSeriesAry = rangeSeries.pack()
    rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

    assert_equals((128, 64), rangeSeries.dims.count)
    assert_equals((2, 128, 64), rangeSeriesAry.shape)
    assert_equals((2, 64, 128), rangeSeriesAry_xpose.shape)
    assert_equals('float32', str(rangeSeriesAry.dtype))
    assert_true(array_equal(expectedAry, rangeSeriesAry[0]))
    assert_true(array_equal(expectedAry2, rangeSeriesAry[1]))
    assert_true(array_equal(expectedAry.T, rangeSeriesAry_xpose[0]))
    assert_true(array_equal(expectedAry2.T, rangeSeriesAry_xpose[1]))

def __run_loadTifAsSeries(self):
    tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
    rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
    pilImg = Image.fromarray(rangeAry)
    filePath = os.path.join(self.outputdir, "rangetif01.tif")
    pilImg.save(filePath)
    del pilImg, tmpAry

    rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
    assert_equals('float16', rangeSeries._dtype)  # check before any potential first() calls update this val
    rangeSeriesAry = rangeSeries.pack()

    assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
    assert_equals((60, 120), rangeSeriesAry.shape)
    assert_equals('float16', str(rangeSeriesAry.dtype))
    assert_true(array_equal(rangeAry, rangeSeriesAry))

def astype(self, dtype, casting='safe'):
    """
    Cast values to the specified numpy dtype.

    If 'smallfloat' is passed, values will be cast to the smallest floating point representation
    to which they can be cast safely, as determined by the thunder.utils.common
    smallestFloatType function. Typically this will be a float type larger than a passed
    integer type (for instance, float16 for int8 or uint8).

    If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when
    values are already in floating point, then this method will return self unchanged.

    Parameters
    ----------
    dtype : numpy dtype or dtype specifier, or string 'smallfloat', or None
        Data type to which RDD values are to be cast. Will return without cast if None is passed.

    casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's astype() method; see numpy documentation for details.

    Returns
    -------
    New Data object, of same type as self, with values cast to the requested dtype; or self
    if no cast is performed.
    """
    if dtype is None or dtype == '':
        return self
    from numpy import asarray, ndarray
    from numpy import dtype as dtypeFunc
    if dtype == 'smallfloat':
        # get the smallest floating point type that can be safely cast to from our current type
        from thunder.utils.common import smallestFloatType
        dtype = smallestFloatType(self.dtype)

    def cast(v, dtype_, casting_):
        if isinstance(v, ndarray):
            return v.astype(dtype_, casting=casting_, copy=False)
        else:
            # assume we are a scalar, either a numpy scalar or a python scalar
            # turn ourself into a numpy scalar of the appropriate type
            return asarray([v]).astype(dtype_, casting=casting_, copy=False)[0]

    nextRdd = self.rdd.mapValues(lambda v: cast(v, dtypeFunc(dtype), casting))
    return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)

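# Hedged usage sketch for astype, mirroring test_castToFloat above: uint8 data
# promoted through the 'smallfloat' shortcut should come back as float16.
#   castSeries = series.astype('smallfloat')
#   str(castSeries.dtype)  # 'float16' for uint8 input
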