Example #1
    def astype(self, dtype, casting='safe'):
        """
        Cast values to specified numpy dtype.

        If 'smallfloat' is passed, values will be cast to the smallest floating point representation
        to which they can be cast safely, as determined by the thunder.utils.common smallestFloatType function.
        Typically this will be a float type larger than the original integer type (for instance, float16 for int8 or uint8).

        If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values are already
        in floating point, then this method will return self unchanged.

        Parameters
        ----------
        dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
            Data type to which RDD values are to be cast. Will return immediately, performing no cast, if None is passed.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's astype() method; see numpy documentation for details.

        Returns
        -------
        New Data object, of same type as self, with values cast to the requested dtype; or self if no cast is performed.
        """
        if dtype is None or dtype == '':
            return self
        if dtype == 'smallfloat':
            # get the smallest floating point type that can be safely cast to from our current type
            from thunder.utils.common import smallestFloatType
            dtype = smallestFloatType(self.dtype)

        nextRdd = self.rdd.mapValues(lambda v: v.astype(dtype, casting=casting, copy=False))
        return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)
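For orientation, here is a minimal sketch of what the smallestFloatType helper referenced above could look like. This is an assumption based on the documented behavior (float16 for int8/uint8, safe casting only), not the actual thunder.utils.common implementation; it simply leans on numpy's promote_types:

    from numpy import dtype as dtypeFunc, promote_types

    def smallestFloatType(dtype):
        # promote against float16: int8/uint8 -> float16, int16 -> float32,
        # int32/int64 -> float64, and any float type maps to itself
        return promote_types(dtypeFunc(dtype), 'float16')

Under that assumption, astype('smallfloat') would map int16 Series values to float32 and leave float32 values as float32.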
Example #2
    def astype(self, dtype, casting='safe'):
        """Cast values to specified numpy dtype

        Calls numpy's astype() method.

        If the string 'smallfloat' is passed, then the values will be cast to the smallest floating point representation
        to which they can be cast safely, as determined by the thunder.utils.common smallestFloatType function.
        Typically this will be a float type larger than the original integer type (for instance, float16 for int8 or uint8).

        If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values are already
        in floating point, then this method will return self unchanged.

        Parameters
        ----------
        dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
            Data type to which RDD values are to be cast. Will return immediately, performing no cast, if None is passed.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's astype() method; see numpy documentation for details.

        Returns
        -------
        New Data object, of same type as self, with values cast to the requested dtype; or self if no cast is performed.
        """
        if dtype is None or dtype == '':
            return self
        if dtype == 'smallfloat':
            # get the smallest floating point type that can be safely cast to from our current type
            from thunder.utils.common import smallestFloatType
            dtype = smallestFloatType(self.dtype)

        nextRdd = self.rdd.mapValues(
            lambda v: v.astype(dtype, casting=casting, copy=False))
        return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)
Example #3
    def _run_tst_fromBinary(self, useConfJson=False):
        # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
        # data will be a sequence of test data
        # all keys and all values in a test data item must be of the same length
        # keys get converted to ints regardless of raw input format
        DATA = [
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
            SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        ]

        for itemidx, item in enumerate(DATA):
            outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
            os.mkdir(outSubdir)

            fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
            with open(fname, 'wb') as f:
                item.writeToFile(f)

            loader = SeriesLoader(self.sc)
            if not useConfJson:
                series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals, keyType=str(item.keyDtype),
                                           valueType=str(item.valDtype))
            else:
                # write configuration file
                conf = {'input': outSubdir,
                        'nkeys': item.nkeys, 'nvalues': item.nvals,
                        'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
                with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                    json.dump(conf, f, indent=2)
                series = loader.fromBinary(outSubdir)

            seriesData = series.rdd.collect()

            expectedData = item.data
            assert_equals(len(expectedData), len(seriesData),
                          "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                          (itemidx, len(expectedData), len(seriesData)))

            for expected, actual in zip(expectedData, seriesData):
                expectedKeys = tuple(expected[0])
                expectedType = smallestFloatType(item.valDtype)
                expectedVals = array(expected[1], dtype=expectedType)
                assert_equals(expectedKeys, actual[0],
                              "Key mismatch in item %d; expected %s, got %s" %
                              (itemidx, str(expectedKeys), str(actual[0])))
                assert_true(allclose(expectedVals, actual[1]),
                            "Value mismatch in item %d; expected %s, got %s" %
                            (itemidx, str(expectedVals), str(actual[1])))
                assert_equals(expectedType, str(actual[1].dtype),
                              "Value type mismatch in item %d; expected %s, got %s" %
                              (itemidx, expectedType, str(actual[1].dtype)))
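For reference, a small sketch of the flat binary record layout these tests exercise. It assumes, as the conf.json fields (nkeys, nvalues, keytype, valuetype) suggest, that each record is simply its keys followed by its values, packed back to back with no delimiters; writeRecord below is a hypothetical stand-in for SeriesBinaryTestData.writeToFile:

    from numpy import array

    def writeRecord(f, keys, vals, keyDtype='int16', valDtype='int16'):
        # one record = the keys, then the values, contiguous in the file
        array(keys, dtype=keyDtype).tofile(f)
        array(vals, dtype=valDtype).tofile(f)

    with open('inputfile0.bin', 'wb') as f:
        writeRecord(f, [1, 2, 3], [11, 12, 13])  # mirrors the first DATA item above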
Example #4
    def _run_roundtrip_tst(self, testCount, arrays, blockSize):
        #  print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
        inSubdir = os.path.join(self.outputdir, 'input%d' % testCount)
        os.mkdir(inSubdir)

        outSubdir = os.path.join(self.outputdir, 'output%d' % testCount)
        # os.mkdir(outSubdir)

        for aryCount, ary in enumerate(arrays):
            # array.tofile always writes in row-major (C) order...
            ary.tofile(os.path.join(inSubdir, "img%02d.stack" % aryCount))

        # ... but fromStack will interpret the flat data in column-major (Fortran) order, so reverse the dims
        dims = list(arrays[0].shape)
        dims.reverse()

        underTest = SeriesLoader(self.sc)

        underTest.saveFromStack(inSubdir, outSubdir, dims, blockSize=blockSize, dtype=str(arrays[0].dtype))
        series = underTest.fromStack(inSubdir, dims, dtype=str(arrays[0].dtype))

        roundtrippedSeries = underTest.fromBinary(outSubdir)
        roundtripped = roundtrippedSeries.collect()
        direct = series.collect()

        expectedDtype = str(smallestFloatType(arrays[0].dtype))
        assert_equals(expectedDtype, roundtrippedSeries.dtype)
        assert_equals(expectedDtype, series.dtype)
        assert_equals(expectedDtype, str(roundtripped[0][1].dtype))
        assert_equals(expectedDtype, str(direct[0][1].dtype))

        with open(os.path.join(outSubdir, "conf.json"), 'r') as fp:
            # check that binary series file data type *matches* input stack data type (not yet converted to float)
            # at least according to conf.json
            conf = json.load(fp)
            assert_equals(str(arrays[0].dtype), conf["valuetype"])

        for ((seriesKeys, seriesValues), (directKeys, directValues)) in zip(roundtripped, direct):
            assert_equals(directKeys, seriesKeys)
            assert_equals(directValues, seriesValues)

            for seriesIdx, seriesVal in enumerate(seriesValues):
                # print "seriesIdx: %d; seriesKeys: %s; seriesVal: %g" % (seriesIdx, seriesKeys, seriesVal)
                # flip indices again for row vs col-major insanity
                aryKeys = list(seriesKeys)
                aryKeys.reverse()
                msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesIdx, str(tuple(aryKeys)))
                try:
                    assert_almost_equal(arrays[seriesIdx][tuple(aryKeys)], seriesVal, places=4)
                except AssertionError, e:
                    raise AssertionError(msg, e)
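A standalone illustration (plain numpy, nothing thunder-specific) of the ordering gymnastics noted in the comments above: bytes written in row-major (C) order and reinterpreted in column-major (Fortran) order under the reversed shape come back as the transpose, which is why the test reverses both the dims and the per-key indices:

    import numpy as np

    ary = np.arange(6, dtype='int16').reshape(2, 3)  # written to disk in C (row-major) order
    flat = np.frombuffer(ary.tobytes(), dtype='int16')
    asFortran = flat.reshape((3, 2), order='F')      # reversed dims, Fortran interpretation
    assert np.array_equal(asFortran, ary.T)          # the round trip yields the transpose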
Example #5
    def astype(self, dtype, casting='safe'):
        """
        Cast values to specified numpy dtype.

        If 'smallfloat' is passed, values will be cast to the smallest floating point representation
        to which they can be cast safely, as determined by the thunder.utils.common smallestFloatType function.
        Typically this will be a float type larger than the original integer type (for instance, float16 for int8 or uint8).

        If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values are already
        in floating point, then this method will return self unchanged.

        Parameters
        ----------
        dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
            Data type to which RDD values are to be cast. Will return without cast if None is passed.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's astype() method; see numpy documentation for details.

        Returns
        -------
        New Data object, of same type as self, with values cast to the requested dtype; or self if no cast is performed.
        """
        if dtype is None or dtype == '':
            return self
        from numpy import asarray, ndarray
        from numpy import dtype as dtypeFunc
        if dtype == 'smallfloat':
            # get the smallest floating point type that can be safely cast to from our current type
            from thunder.utils.common import smallestFloatType
            dtype = smallestFloatType(self.dtype)

        def cast(v, dtype_, casting_):
            if isinstance(v, ndarray):
                return v.astype(dtype_, casting=casting_, copy=False)
            else:
                # assume v is a scalar, either a numpy scalar or a python scalar;
                # convert it to a numpy scalar of the appropriate type via a one-element array
                return asarray([v]).astype(dtype_,
                                           casting=casting_,
                                           copy=False)[0]

        nextRdd = self.rdd.mapValues(
            lambda v: cast(v, dtypeFunc(dtype), casting))
        return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)
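A quick standalone check of the two branches of the cast helper above, using plain numpy and illustrative dtypes (no Spark required):

    from numpy import array, asarray, ndarray

    def cast(v, dtype_, casting_='safe'):
        if isinstance(v, ndarray):
            return v.astype(dtype_, casting=casting_, copy=False)
        # scalar path: round-trip through a one-element array
        return asarray([v]).astype(dtype_, casting=casting_, copy=False)[0]

    cast(array([1, 2, 3], dtype='int16'), 'float32')  # ndarray branch -> float32 array
    cast(7, 'float64')                                # scalar branch -> numpy float64 scalar
    # with the default 'safe' casting, narrowing casts raise TypeError, e.g.:
    # cast(array([1.5]), 'float32') -> TypeError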
Example #6
    def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec,
                           npartitions):
        testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(
            nimages, aryShape, dtypeSpec)
        loader = SeriesLoader(self.sc)
        series = loader.fromArrays(testArrays)

        saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
        series.repartition(
            npartitions
        )  # note: this does an elementwise shuffle! won't be in sorted order
        series.saveAsBinarySeries(saveDirPath)

        nnonemptyPartitions = 0
        for partitionList in series.rdd.glom().collect():
            if partitionList:
                nnonemptyPartitions += 1
        del partitionList
        nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

        roundtrippedSeries = loader.fromBinary(saveDirPath)

        with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
            conf = json.load(fp)

        # sorting is required here b/c of the randomization induced by the repartition.
        # orig and roundtripped will in general be different from each other, since roundtripped
        # will have (0, 0, 0) index as first element (since it will be the lexicographically first
        # file) while orig has only a 1 in npartitions chance of starting with (0, 0, 0) after repartition.
        expectedPackedAry = series.pack(sorting=True)
        actualPackedAry = roundtrippedSeries.pack(sorting=True)

        assert_true(array_equal(expectedPackedAry, actualPackedAry))

        assert_equals(nnonemptyPartitions, nsaveFiles)

        assert_equals(len(aryShape), conf["nkeys"])
        assert_equals(nimages, conf["nvalues"])
        assert_equals("int16", conf["keytype"])
        assert_equals(str(series.dtype), conf["valuetype"])
        # check that we have converted ourselves to an appropriate float after reloading
        assert_equals(str(smallestFloatType(series.dtype)),
                      str(roundtrippedSeries.dtype))
Example #7
    def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec, npartitions):
        testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(nimages, aryShape, dtypeSpec)
        loader = SeriesLoader(self.sc)
        series = loader.fromArrays(testArrays)

        saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
        series.repartition(npartitions)  # note: this does an elementwise shuffle! won't be in sorted order
        series.saveAsBinarySeries(saveDirPath)

        nnonemptyPartitions = 0
        for partitionList in series.rdd.glom().collect():
            if partitionList:
                nnonemptyPartitions += 1
        del partitionList
        nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

        roundtrippedSeries = loader.fromBinary(saveDirPath)

        with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
            conf = json.load(fp)

        # sorting is required here b/c of the randomization induced by the repartition.
        # orig and roundtripped will in general be different from each other, since roundtripped
        # will have (0, 0, 0) index as first element (since it will be the lexicographically first
        # file) while orig has only a 1 in npartitions chance of starting with (0, 0, 0) after repartition.
        expectedPackedAry = series.pack(sorting=True)
        actualPackedAry = roundtrippedSeries.pack(sorting=True)

        assert_true(array_equal(expectedPackedAry, actualPackedAry))

        assert_equals(nnonemptyPartitions, nsaveFiles)

        assert_equals(len(aryShape), conf["nkeys"])
        assert_equals(nimages, conf["nvalues"])
        assert_equals("int16", conf["keytype"])
        assert_equals(str(series.dtype), conf["valuetype"])
        # check that we have converted ourselves to an appropriate float after reloading
        assert_equals(str(smallestFloatType(series.dtype)), str(roundtrippedSeries.dtype))
Example #8
    def _run_tst_fromBinary(self, useConfJson=False):
        # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
        # data will be a sequence of test data
        # all keys and all values in a test data item must be of the same length
        # keys get converted to ints regardless of raw input format
        DATA = [
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]],
                                            'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]],
                                            [[11], [12]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]],
                                            'int16', 'int32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]],
                                            'int32', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]],
                                            'int16', 'float32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]],
                                            'float32', 'float32'),
            SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]],
                                            'float32', 'float32'),
        ]

        for itemidx, item in enumerate(DATA):
            outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
            os.mkdir(outSubdir)

            fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
            with open(fname, 'wb') as f:
                item.writeToFile(f)

            loader = SeriesLoader(self.sc)
            if not useConfJson:
                series = loader.fromBinary(outSubdir,
                                           nkeys=item.nkeys,
                                           nvalues=item.nvals,
                                           keyType=str(item.keyDtype),
                                           valueType=str(item.valDtype))
            else:
                # write configuration file
                conf = {
                    'input': outSubdir,
                    'nkeys': item.nkeys,
                    'nvalues': item.nvals,
                    'valuetype': str(item.valDtype),
                    'keytype': str(item.keyDtype)
                }
                with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                    json.dump(conf, f, indent=2)
                series = loader.fromBinary(outSubdir)

            seriesData = series.rdd.collect()

            expectedData = item.data
            assert_equals(
                len(expectedData), len(seriesData),
                "Differing numbers of k/v pairs in item %d; expected %d, got %d"
                % (itemidx, len(expectedData), len(seriesData)))

            for expected, actual in zip(expectedData, seriesData):
                expectedKeys = tuple(expected[0])
                expectedType = smallestFloatType(item.valDtype)
                expectedVals = array(expected[1], dtype=expectedType)
                assert_equals(
                    expectedKeys, actual[0],
                    "Key mismatch in item %d; expected %s, got %s" %
                    (itemidx, str(expectedKeys), str(actual[0])))
                assert_true(
                    allclose(expectedVals, actual[1]),
                    "Value mismatch in item %d; expected %s, got %s" %
                    (itemidx, str(expectedVals), str(actual[1])))
                assert_equals(
                    expectedType, str(actual[1].dtype),
                    "Value type mismatch in item %d; expected %s, got %s" %
                    (itemidx, expectedType, str(actual[1].dtype)))
Example #9
    def _getSeriesBlocksFromMultiTif(self, dataPath, ext="tif", blockSize="150M",
                                     newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None,
                                     recursive=False):
        import thunder.rdds.fileio.multitif as multitif
        import itertools
        from PIL import Image
        import io

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)
        ntimepoints = len(filenames)

        doMinimizeReads = dataPath.lower().startswith("s3")
        # check PIL version to see whether it is actually pillow or indeed old PIL and choose
        # conversion function appropriately. See ImagesLoader.fromMultipageTif and common.pil_to_array
        # for more explanation.
        isPillow = hasattr(Image, "PILLOW_VERSION")
        if isPillow:
            conversionFcn = array  # use numpy's array() function
        else:
            from thunder.utils.common import pil_to_array
            conversionFcn = pil_to_array  # use our modified version of matplotlib's pil_to_array

        height, width, npages, dtype = SeriesLoader.__readMetadataFromFirstPageOfMultiTif(reader, filenames[0])
        if dtype.startswith('int'):
            raise ValueError('Signed integer tiff images are not supported in SeriesLoader (shuffle=False);' +
                             ' please try loading as Images (shuffle=True)')
        pixelBytesize = dtypeFunc(dtype).itemsize
        if newDtype is None or str(newDtype) == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        # initialize at one block per plane
        bytesPerPlane = height * width * pixelBytesize * ntimepoints
        bytesPerBlock = bytesPerPlane
        blocksPerPlane = 1
        # keep dividing while cutting our size in half still leaves us bigger than the requested size
        # should end up no more than 2x blockSize.
        while bytesPerBlock >= blockSize * 2:
            bytesPerBlock /= 2
            blocksPerPlane *= 2

        blocklenPixels = max((height * width) / blocksPerPlane, 1)  # integer division
        while blocksPerPlane * blocklenPixels < height * width:  # make sure we're reading the plane fully
            blocksPerPlane += 1

        # prevent bringing in self in closure:
        awsCredentialsOverride = self.awsCredentialsOverride

        # keys will be planeidx, blockidx:
        keys = list(itertools.product(xrange(npages), xrange(blocksPerPlane)))

        def readBlockFromTiff(planeIdxBlockIdx):
            planeIdx, blockIdx = planeIdxBlockIdx
            blocks = []
            planeShape = None
            blockStart = None
            blockEnd = None
            for fname in filenames:
                reader_ = getFileReaderForPath(fname)(awsCredentialsOverride=awsCredentialsOverride)
                fp = reader_.open(fname)
                try:
                    if doMinimizeReads:
                        # use multitif module to generate a fake, in-memory one-page tif file
                        # the advantage of this is that it cuts way down on the many small reads
                        # that PIL/pillow will make otherwise, which would be a problem for s3
                        tiffParser_ = multitif.TiffParser(fp, debug=False)
                        tiffFilebuffer = multitif.packSinglePage(tiffParser_, pageIdx=planeIdx)
                        byteBuf = io.BytesIO(tiffFilebuffer)
                        try:
                            pilImg = Image.open(byteBuf)
                            ary = conversionFcn(pilImg).T
                        finally:
                            byteBuf.close()
                        del tiffFilebuffer, tiffParser_, pilImg, byteBuf
                    else:
                        # read tif using PIL directly
                        pilImg = Image.open(fp)
                        pilImg.seek(planeIdx)
                        ary = conversionFcn(pilImg).T
                        del pilImg

                    if not planeShape:
                        planeShape = ary.shape[:]
                        blockStart = blockIdx * blocklenPixels
                        blockEnd = min(blockStart+blocklenPixels, planeShape[0]*planeShape[1])
                    blocks.append(ary.ravel(order='C')[blockStart:blockEnd])
                    del ary
                finally:
                    fp.close()

            buf = vstack(blocks).T  # dimensions are now linindex x time (images)
            del blocks
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            linearIdx = arange(blockStart, blockEnd)  # zero-based

            seriesKeys = zip(*map(tuple, unravel_index(linearIdx, planeShape, order='C')))
            # add plane index to end of keys
            if npages > 1:
                seriesKeys = [tuple(list(keys_)[::-1]+[planeIdx]) for keys_ in seriesKeys]
            else:
                seriesKeys = [tuple(list(keys_)[::-1]) for keys_ in seriesKeys]
            return zip(seriesKeys, buf)

        # map over blocks
        rdd = self.sc.parallelize(keys, len(keys)).flatMap(readBlockFromTiff)
        if npages > 1:
            dims = (npages, width, height)
        else:
            dims = (width, height)

        metadata = (dims, ntimepoints, newDtype)
        return rdd, metadata
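To make the block-splitting arithmetic in the middle of this method concrete, here is the same loop run standalone on illustrative numbers: a hypothetical 1024x1024 plane, 500 timepoints, 2-byte pixels, and a 150 MB target (assuming parseMemoryString("150M") yields 150*1024*1024):

    height, width, ntimepoints, pixelBytesize = 1024, 1024, 500, 2
    blockSize = 150 * 1024 * 1024

    bytesPerBlock = height * width * pixelBytesize * ntimepoints  # one block per plane, ~1000 MB
    blocksPerPlane = 1
    while bytesPerBlock >= blockSize * 2:  # halve until within 2x of the requested size
        bytesPerBlock //= 2
        blocksPerPlane *= 2
    # -> blocksPerPlane == 4, bytesPerBlock ~250 MB (between 1x and 2x blockSize)

    blocklenPixels = max((height * width) // blocksPerPlane, 1)
    while blocksPerPlane * blocklenPixels < height * width:  # make sure the plane is fully covered
        blocksPerPlane += 1
    # -> blocklenPixels == 262144, and 4 * 262144 covers the plane exactly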
Example #10
    def _getSeriesBlocksFromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
                                  newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False):
        """Create an RDD of <string blocklabel, (int k-tuple indices, array of datatype values)>

        Parameters
        ----------

        dataPath: string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://"). If no scheme is present, the path will be interpreted as being on the local filesystem. This path
            must be valid on all workers. dataPath may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        dims: tuple of positive int
            Dimensions of input image data, ordered with the fastest-changing dimension first.

        dtype: dtype or dtype specifier, optional, default 'int16'
            Numpy dtype of input stack data

        newDtype: floating-point dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Series data must be floating-point. Input data will be cast to the
            requested `newDtype`; see numpy's `astype()` method.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        recursive: boolean, default False
            If true, will recursively descend directories rooted at dataPath, loading all files in the tree that
            have an extension matching 'ext'. Recursive loading is currently only implemented for local filesystems
            (not s3).

        Returns
        ---------
        tuple of (RDD, ntimepoints, newDtype)

        RDD: sequence of (key, value) pairs
            (built via flatMap)

        RDD Key: tuple of int
            zero-based indices of position within the original image volume

        RDD Value: numpy array of datatype
            series of values at position across loaded image volumes

        ntimepoints: int
            number of time points in returned series, determined from number of stack files found at dataPath

        newDtype: string
            string representation of numpy data type of returned blocks

        """
        dataPath = self.__normalizeDatafilePattern(dataPath, ext)
        blockSize = parseMemoryString(blockSize)
        totalDim = reduce(lambda x_, y_: x_*y_, dims)
        dtype = dtypeFunc(dtype)
        if newDtype is None or newDtype == '':
            newDtype = str(dtype)
        elif newDtype == 'smallfloat':
            newDtype = str(smallestFloatType(dtype))
        else:
            newDtype = str(newDtype)

        reader = getFileReaderForPath(dataPath)(awsCredentialsOverride=self.awsCredentialsOverride)
        filenames = reader.list(dataPath, startIdx=startIdx, stopIdx=stopIdx, recursive=recursive)
        if not filenames:
            raise IOError("No files found for path '%s'" % dataPath)

        dataSize = totalDim * len(filenames) * dtype.itemsize
        nblocks = max(dataSize / blockSize, 1)  # integer division

        if len(dims) >= 3:
            # for 3D stacks, do calculations to ensure that
            # different planes appear in distinct files
            blocksPerPlane = max(nblocks / dims[-1], 1)

            pixPerPlane = reduce(lambda x_, y_: x_*y_, dims[:-1])  # all but last dimension

            # get the greatest number of blocks in a plane (up to as many as requested) that still divide the plane
            # evenly. This will always be at least one.
            kUpdated = [x for x in range(1, blocksPerPlane+1) if not pixPerPlane % x][-1]
            nblocks = kUpdated * dims[-1]
            blockSizePerStack = (totalDim / nblocks) * dtype.itemsize
        else:
            # otherwise just round to make contents divide into nearly even blocks
            blockSizePerStack = int(math.ceil(totalDim / float(nblocks)))
            nblocks = int(math.ceil(totalDim / float(blockSizePerStack)))
            blockSizePerStack *= dtype.itemsize

        fileSize = totalDim * dtype.itemsize

        def readBlock(blockNum):
            # copy size out from closure; will modify later:
            blockSizePerStack_ = blockSizePerStack
            # get start position for this block
            position = blockNum * blockSizePerStack_

            # adjust if at end of file
            if (position + blockSizePerStack_) > fileSize:
                blockSizePerStack_ = int(fileSize - position)
            # loop over files, loading one block from each
            bufs = []

            for fname in filenames:
                buf = reader.read(fname, startOffset=position, size=blockSizePerStack_)
                bufs.append(frombuffer(buf, dtype=dtype))

            buf = vstack(bufs).T  # dimensions are now linindex x time (images)
            del bufs
            buf = buf.astype(newDtype, casting=casting, copy=False)

            # append subscript keys based on dimensions
            itemPosition = position / dtype.itemsize
            itemBlocksize = blockSizePerStack_ / dtype.itemsize
            linearIdx = arange(itemPosition, itemPosition + itemBlocksize)  # zero-based

            keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
            return zip(keys, buf)

        # map over blocks
        return (self.sc.parallelize(range(0, nblocks), nblocks).flatMap(lambda bn: readBlock(bn)),
                len(filenames), newDtype)
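A small standalone illustration of the key generation at the end of readBlock: linear offsets into the flat stack are turned into subscript tuples with Fortran ordering, so the first entry of dims (the fastest-changing dimension) varies quickest. The shape here is illustrative only:

    from numpy import arange, unravel_index

    dims = (2, 3)          # fastest-changing dimension first
    linearIdx = arange(4)  # first four positions within a block
    keys = zip(*map(tuple, unravel_index(linearIdx, dims, order='F')))
    # keys -> [(0, 0), (1, 0), (0, 1), (1, 1)]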
Example #11
    def _run_roundtrip_tst(self, testCount, arrays, blockSize):
        print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
        inSubdir = os.path.join(self.outputdir, 'input%d' % testCount)
        os.mkdir(inSubdir)

        outSubdir = os.path.join(self.outputdir, 'output%d' % testCount)
        # os.mkdir(outSubdir)

        for aryCount, ary in enumerate(arrays):
            # array.tofile always writes in row-major (C) order...
            ary.tofile(os.path.join(inSubdir, "img%02d.stack" % aryCount))

        # ... but fromStack will interpret the flat data in column-major (Fortran) order, so reverse the dims
        dims = list(arrays[0].shape)
        dims.reverse()

        underTest = SeriesLoader(self.sc)

        underTest.saveFromStack(inSubdir,
                                outSubdir,
                                dims,
                                blockSize=blockSize,
                                dtype=str(arrays[0].dtype))
        series = underTest.fromStack(inSubdir,
                                     dims,
                                     dtype=str(arrays[0].dtype))

        roundtrippedSeries = underTest.fromBinary(outSubdir)
        roundtripped = roundtrippedSeries.collect()
        direct = series.collect()

        expectedDtype = str(smallestFloatType(arrays[0].dtype))
        assert_equals(expectedDtype, roundtrippedSeries.dtype)
        assert_equals(expectedDtype, series.dtype)
        assert_equals(expectedDtype, str(roundtripped[0][1].dtype))
        assert_equals(expectedDtype, str(direct[0][1].dtype))

        with open(os.path.join(outSubdir, "conf.json"), 'r') as fp:
            # check that binary series file data type *matches* input stack data type (not yet converted to float)
            # at least according to conf.json
            conf = json.load(fp)
            assert_equals(str(arrays[0].dtype), conf["valuetype"])

        for ((seriesKeys, seriesValues),
             (directKeys, directValues)) in zip(roundtripped, direct):
            assert_equals(directKeys, seriesKeys)
            assert_equals(directValues, seriesValues)

            for seriesIdx, seriesVal in enumerate(seriesValues):
                # print "seriesIdx: %d; seriesKeys: %s; seriesVal: %g" % (seriesIdx, seriesKeys, seriesVal)
                # flip indices again for row vs col-major insanity
                aryKeys = list(seriesKeys)
                aryKeys.reverse()
                msg = "Failure on test #%d, time point %d, indices %s" % (
                    testCount, seriesIdx, str(tuple(aryKeys)))
                try:
                    assert_almost_equal(arrays[seriesIdx][tuple(aryKeys)],
                                        seriesVal,
                                        places=4)
                except AssertionError, e:
                    raise AssertionError(msg, e)