Example #1
0
    def test_loadMultipleMultipointStacksAsSeries(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary01.bin")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((32, 32, 8), order='F')
        rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary02.bin")
        rangeAry2.tofile(filePath)
        expectedAry2 = rangeAry2.reshape((32, 32, 8), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(32, 32, 8), nplanes=2)
        assert_equals('float32', rangeSeries._dtype)

        rangeSeriesAry = rangeSeries.pack()

        assert_equals((32, 32, 2), rangeSeries.dims.count)
        assert_equals((8, 32, 32, 2), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry[:, :, :2], rangeSeriesAry[0]))
        assert_true(array_equal(expectedAry[:, :, 2:4], rangeSeriesAry[1]))
        assert_true(array_equal(expectedAry[:, :, 4:6], rangeSeriesAry[2]))
        assert_true(array_equal(expectedAry[:, :, 6:], rangeSeriesAry[3]))
        assert_true(array_equal(expectedAry2[:, :, :2], rangeSeriesAry[4]))
        assert_true(array_equal(expectedAry2[:, :, 2:4], rangeSeriesAry[5]))
        assert_true(array_equal(expectedAry2[:, :, 4:6], rangeSeriesAry[6]))
        assert_true(array_equal(expectedAry2[:, :, 6:], rangeSeriesAry[7]))
Example #2
0
    def test_fromStackToSeriesWithPack(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test.stack")
        ary.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))
        strategy = SimpleBlockingStrategy.generateFromBlockSize(image, "150M")
        series = image.toBlocks(strategy).toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
        assert_true(array_equal(ary.ravel(), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary.T, seriesAry))
Example #3
0
    def test_fromStackToSeriesWithPack(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test.stack")
        ary.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))
        strategy = SimpleBlockingStrategy.generateFromBlockSize(image, "150M")
        series = image.toBlocks(strategy).toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order
        collectedVals = array([kv[1] for kv in seriesVals],
                              dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary.T, seriesAry))
Example #4
0
    def test_loadMultipleTifsAsSeriesWithShuffle(self):
        tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
        rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry)
        filePath = os.path.join(self.outputdir, "rangetif01.tif")
        pilImg.save(filePath)

        tmpAry = arange(60*120, 2*60*120, dtype=dtypeFunc('uint16'))
        rangeAry2 = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry2)
        filePath = os.path.join(self.outputdir, "rangetif02.tif")
        pilImg.save(filePath)

        del pilImg, tmpAry

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
        assert_equals('float16', rangeSeries._dtype)
        rangeSeriesAry = rangeSeries.pack()
        rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

        assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
        assert_equals((2, 60, 120), rangeSeriesAry.shape)
        assert_equals((2, 120, 60), rangeSeriesAry_xpose.shape)
        assert_equals('float16', str(rangeSeriesAry.dtype))
        assert_true(array_equal(rangeAry, rangeSeriesAry[0]))
        assert_true(array_equal(rangeAry2, rangeSeriesAry[1]))
        assert_true(array_equal(rangeAry.T, rangeSeriesAry_xpose[0]))
        assert_true(array_equal(rangeAry2.T, rangeSeriesAry_xpose[1]))
Example #5
0
    def test_toSeriesWithInefficientSplitAndSortedPack(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((4, 2))

        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks((2, 1), units="s").toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack(sorting=True)

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((0, 1), seriesVals[2][0])
        assert_equals((1, 1), seriesVals[3][0])
        # end of first block
        # beginning of second block
        assert_equals((2, 0), seriesVals[4][0])
        assert_equals((3, 0), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple matches numpy shape
        assert_equals(ary.shape, series.dims.count)

        # check that values are in expected order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
        assert_true(array_equal(ary[:2, :].ravel(order="F"), collectedVals[:4]))  # first block
        assert_true(array_equal(ary[2:4, :].ravel(order="F"), collectedVals[4:]))  # second block

        # check that packing returns original array (after sort)
        assert_true(array_equal(ary, seriesAry))
Example #6
0
    def __run_loadMultipleStacksAsSeries(self, shuffle):
        rangeAry = arange(64 * 128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary01.stack")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((128, 64), order='F')
        rangeAry2 = arange(64 * 128, 2 * 64 * 128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary02.stack")
        rangeAry2.tofile(filePath)
        expectedAry2 = rangeAry2.reshape((128, 64), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir,
                                                  dims=(128, 64),
                                                  shuffle=shuffle)
        assert_equals('float32', rangeSeries._dtype)

        rangeSeriesAry = rangeSeries.pack()
        rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

        assert_equals((128, 64), rangeSeries.dims.count)
        assert_equals((2, 128, 64), rangeSeriesAry.shape)
        assert_equals((2, 64, 128), rangeSeriesAry_xpose.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry[0]))
        assert_true(array_equal(expectedAry2, rangeSeriesAry[1]))
        assert_true(array_equal(expectedAry.T, rangeSeriesAry_xpose[0]))
        assert_true(array_equal(expectedAry2.T, rangeSeriesAry_xpose[1]))
Example #7
0
    def test_fromMultiTimepointStacks(self):
        ary = arange(16, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
        ary2 = arange(16, 32, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
        ary.tofile(os.path.join(self.outputdir, "test01.stack"))
        ary2.tofile(os.path.join(self.outputdir, "test02.stack"))

        image = ImagesLoader(self.sc).fromStack(self.outputdir, dtype="uint8", dims=(2, 2, 4), nplanes=2)
        collectedImage = image.collect()

        # we don't expect to have nrecords cached, since we get an unknown number of images per file
        assert_true(image._nrecords is None)
        assert_equals(4, image.nrecords)
        assert_equals(4, len(collectedImage))
        # check keys:
        assert_equals(0, collectedImage[0][0])
        assert_equals(1, collectedImage[1][0])
        assert_equals(2, collectedImage[2][0])
        assert_equals(3, collectedImage[3][0])
        # check values:
        assert_true(array_equal(ary[:2].T, collectedImage[0][1]))
        assert_true(array_equal(ary[2:].T, collectedImage[1][1]))
        assert_true(array_equal(ary2[:2].T, collectedImage[2][1]))
        assert_true(array_equal(ary2[2:].T, collectedImage[3][1]))

        # 3 planes does not divide 4
        assert_raises(ValueError, ImagesLoader(self.sc).fromStack, self.outputdir, dtype="uint8",
                      dims=(2, 2, 4), nplanes=3)
Example #8
0
    def _run_tst_toSeriesWithSplitsAndPack(self, strategy):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((4, 2))
        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks(strategy).toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple matches numpy shape
        assert_equals(ary.shape, series.dims.count)

        # check that values are in Fortran-convention order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(order='F'), collectedVals))

        # check that packing returns original array
        assert_true(array_equal(ary, seriesAry))
Example #9
0
    def test_fromMultiTimepointStacks(self):
        ary = arange(16, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
        ary2 = arange(16, 32, dtype=dtypeFunc('uint8')).reshape((4, 2, 2))
        ary.tofile(os.path.join(self.outputdir, "test01.stack"))
        ary2.tofile(os.path.join(self.outputdir, "test02.stack"))

        image = ImagesLoader(self.sc).fromStack(self.outputdir, dtype="uint8", dims=(2, 2, 4), nplanes=2)
        collectedImage = image.collect()

        # we don't expect to have nrecords cached, since we get an unknown number of images per file
        assert_true(image._nrecords is None)
        assert_equals(4, image.nrecords)
        assert_equals(4, len(collectedImage))
        # check keys:
        assert_equals(0, collectedImage[0][0])
        assert_equals(1, collectedImage[1][0])
        assert_equals(2, collectedImage[2][0])
        assert_equals(3, collectedImage[3][0])
        # check values:
        assert_true(array_equal(ary[:2].T, collectedImage[0][1]))
        assert_true(array_equal(ary[2:].T, collectedImage[1][1]))
        assert_true(array_equal(ary2[:2].T, collectedImage[2][1]))
        assert_true(array_equal(ary2[2:].T, collectedImage[3][1]))

        # 3 planes does not divide 4
        assert_raises(ValueError, ImagesLoader(self.sc).fromStack, self.outputdir, dtype="uint8",
                      dims=(2, 2, 4), nplanes=3)
Example #10
0
    def test_loadMultipleTifsAsSeriesWithShuffle(self):
        tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
        rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry)
        filePath = os.path.join(self.outputdir, "rangetif01.tif")
        pilImg.save(filePath)

        tmpAry = arange(60*120, 2*60*120, dtype=dtypeFunc('uint16'))
        rangeAry2 = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry2)
        filePath = os.path.join(self.outputdir, "rangetif02.tif")
        pilImg.save(filePath)

        del pilImg, tmpAry

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
        assert_equals('float16', rangeSeries._dtype)
        rangeSeriesAry = rangeSeries.pack()
        rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

        assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
        assert_equals((2, 60, 120), rangeSeriesAry.shape)
        assert_equals((2, 120, 60), rangeSeriesAry_xpose.shape)
        assert_equals('float16', str(rangeSeriesAry.dtype))
        assert_true(array_equal(rangeAry, rangeSeriesAry[0]))
        assert_true(array_equal(rangeAry2, rangeSeriesAry[1]))
        assert_true(array_equal(rangeAry.T, rangeSeriesAry_xpose[0]))
        assert_true(array_equal(rangeAry2.T, rangeSeriesAry_xpose[1]))
Example #11
0
    def test_fromArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages(ary)

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order
        collectedVals = array([kv[1] for kv in seriesVals],
                              dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary.T, seriesAry))
Example #12
0
    def test_fromMultipleArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order, with subsequent point concatenated in values
        collectedVals = array([kv[1] for kv in seriesVals],
                              dtype=dtypeFunc('int16'))
        assert_true(array_equal(ary.ravel(), collectedVals[:, 0]))
        assert_true(array_equal(ary2.ravel(), collectedVals[:, 1]))

        # check that packing returns concatenation of input arrays, with time as first dimension
        assert_true(array_equal(ary.T, seriesAry[0]))
        assert_true(array_equal(ary2.T, seriesAry[1]))
Example #13
0
    def fromBinary(self,
                   dataPath,
                   ext='bin',
                   confFilename='conf.json',
                   nkeys=None,
                   nvalues=None,
                   keyType=None,
                   valueType=None,
                   newDtype='smallfloat',
                   casting='safe',
                   maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://", or "gs://"). If no scheme is present, will be interpreted as a path on the local filesystem. This path
            must be valid on all workers. Datafile may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newdtype` if not None - see Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions as Java-style memory, will indirectly control the number of partitions

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename,
                                                     nkeys, nvalues, keyType,
                                                     valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        lines = self.sc.binaryRecords(dataPath, recordSize)

        get = lambda v: (tuple(
            int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                         frombuffer(buffer(v, keySize), dtype=valDtype))

        data = lines.map(get)

        return Series(data,
                      dtype=str(valDtype),
                      index=arange(paramsObj.nvalues)).astype(
                          newDtype, casting)
Example #14
0
    def test_loadMultipleMultipointStacksAsSeries(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary01.bin")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((32, 32, 8), order='F')
        rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary02.bin")
        rangeAry2.tofile(filePath)
        expectedAry2 = rangeAry2.reshape((32, 32, 8), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(32, 32, 8), nplanes=2)
        assert_equals('float32', rangeSeries._dtype)

        rangeSeriesAry = rangeSeries.pack()

        assert_equals((32, 32, 2), rangeSeries.dims.count)
        assert_equals((8, 32, 32, 2), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry[:, :, :2], rangeSeriesAry[0]))
        assert_true(array_equal(expectedAry[:, :, 2:4], rangeSeriesAry[1]))
        assert_true(array_equal(expectedAry[:, :, 4:6], rangeSeriesAry[2]))
        assert_true(array_equal(expectedAry[:, :, 6:], rangeSeriesAry[3]))
        assert_true(array_equal(expectedAry2[:, :, :2], rangeSeriesAry[4]))
        assert_true(array_equal(expectedAry2[:, :, 2:4], rangeSeriesAry[5]))
        assert_true(array_equal(expectedAry2[:, :, 4:6], rangeSeriesAry[6]))
        assert_true(array_equal(expectedAry2[:, :, 6:], rangeSeriesAry[7]))
    def test_fromMultipleArrays(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc("int16")).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order, with subsequent point concatenated in values
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16"))
        assert_true(array_equal(ary.ravel(), collectedVals[:, 0]))
        assert_true(array_equal(ary2.ravel(), collectedVals[:, 1]))

        # check that packing returns concatenation of input arrays, with time as first dimension
        assert_true(array_equal(ary.T, seriesAry[0]))
        assert_true(array_equal(ary2.T, seriesAry[1]))
    def test_fromArrays(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages(ary)

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
        assert_true(array_equal(ary.ravel(), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary.T, seriesAry))
Example #17
0
    def test_toSeriesWithPack(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks("150M").toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()
        seriesAry_xpose = series.pack(transpose=True)

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((0, 1), seriesVals[2][0])
        assert_equals((1, 1), seriesVals[3][0])
        assert_equals((0, 2), seriesVals[4][0])
        assert_equals((1, 2), seriesVals[5][0])
        assert_equals((0, 3), seriesVals[6][0])
        assert_equals((1, 3), seriesVals[7][0])

        # check dimensions tuple matches numpy shape
        assert_equals(image.dims.count, series.dims.count)
        assert_equals(ary.shape, series.dims.count)

        # check that values are in Fortran-convention order
        collectedVals = array([kv[1] for kv in seriesVals],
                              dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(order='F'), collectedVals))

        # check that packing returns original array
        assert_true(array_equal(ary, seriesAry))
        assert_true(array_equal(ary.T, seriesAry_xpose))
Example #18
0
    def test_toSeriesWithPack(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks("150M").toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()
        seriesAry_xpose = series.pack(transpose=True)

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((0, 1), seriesVals[2][0])
        assert_equals((1, 1), seriesVals[3][0])
        assert_equals((0, 2), seriesVals[4][0])
        assert_equals((1, 2), seriesVals[5][0])
        assert_equals((0, 3), seriesVals[6][0])
        assert_equals((1, 3), seriesVals[7][0])

        # check dimensions tuple matches numpy shape
        assert_equals(image.dims.count, series.dims.count)
        assert_equals(ary.shape, series.dims.count)

        # check that values are in Fortran-convention order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc("int16")).ravel()
        assert_true(array_equal(ary.ravel(order="F"), collectedVals))

        # check that packing returns original array
        assert_true(array_equal(ary, seriesAry))
        assert_true(array_equal(ary.T, seriesAry_xpose))
Example #19
0
    def test_toSeriesWithInefficientSplitAndSortedPack(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((4, 2))

        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks((2, 1), units="s").toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack(sorting=True)

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((0, 1), seriesVals[2][0])
        assert_equals((1, 1), seriesVals[3][0])
        # end of first block
        # beginning of second block
        assert_equals((2, 0), seriesVals[4][0])
        assert_equals((3, 0), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple matches numpy shape
        assert_equals(ary.shape, series.dims.count)

        # check that values are in expected order
        collectedVals = array([kv[1] for kv in seriesVals],
                              dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary[:2, :].ravel(order='F'),
                                collectedVals[:4]))  # first block
        assert_true(
            array_equal(ary[2:4, :].ravel(order='F'),
                        collectedVals[4:]))  # second block

        # check that packing returns original array (after sort)
        assert_true(array_equal(ary, seriesAry))
Example #20
0
    def fromBinary(self, dataPath, ext='bin', confFilename='conf.json',
                   nkeys=None, nvalues=None, keyType=None, valueType=None,
                   newDtype='smallfloat', casting='safe', maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://". If no scheme is present, will be interpreted as a path on the local filesystem. This path
            must be valid on all workers. Datafile may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newdtype` if not None - see Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions as Java-style memory, will indirectly control the number of partitions

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        from thunder.utils.common import parseMemoryString
        if isinstance(maxPartitionSize, basestring):
            size = parseMemoryString(maxPartitionSize)
        else:
            raise Exception("Invalid size specification")
        hadoopConf = {'recordLength': str(recordSize), 'mapred.max.split.size': str(size)}

        lines = self.sc.newAPIHadoopFile(dataPath, 'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                         'org.apache.hadoop.io.LongWritable',
                                         'org.apache.hadoop.io.BytesWritable',
                                         conf=hadoopConf)

        data = lines.map(lambda (_, v):
                         (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                          frombuffer(buffer(v, keySize), dtype=valDtype)))

        return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)
Example #21
0
def smallestFloatType(dtype):
    """Returns the smallest floating point dtype to which the passed dtype can be safely cast.

    For integers and unsigned ints, this will generally be next floating point type larger than the integer type. So
    for instance, smallest_float_type('uint8') -> dtype('float16'), smallest_float_type('int16') -> dtype('float32'),
    smallest_float_type('uint32') -> dtype('float64').

    This function relies on numpy's promote_types function.
    """
    from numpy import dtype as dtypeFunc
    from numpy import promote_types
    inType = dtypeFunc(dtype)
    compSize = max(2, inType.itemsize)  # smallest float is at least 16 bits
    compType = dtypeFunc('=f'+str(compSize))  # compare to a float of the same size
    return promote_types(inType, compType)
Example #22
0
def smallestFloatType(dtype):
    """
    Returns the smallest floating point dtype to which the passed dtype can be safely cast.

    For integers and unsigned ints, this will generally be next floating point type larger than the integer type. So
    for instance, smallest_float_type('uint8') -> dtype('float16'), smallest_float_type('int16') -> dtype('float32'),
    smallest_float_type('uint32') -> dtype('float64').

    This function relies on numpy's promote_types function.
    """
    from numpy import dtype as dtypeFunc
    from numpy import promote_types
    inType = dtypeFunc(dtype)
    compSize = max(2, inType.itemsize)  # smallest float is at least 16 bits
    compType = dtypeFunc('=f'+str(compSize))  # compare to a float of the same size
    return promote_types(inType, compType)
Example #23
0
 def calcAverageBlockSize(self):
     if not (self._splitsPerDim is None):
         elts = _BlockMemoryAsSequence.avgElementsPerBlock(
             self.dims, self._splitsPerDim)
     else:
         elts = reduce(lambda x, y: x * y, self._pixPerDim)
     return elts * dtypeFunc(self.dtype).itemsize * self.nimages
Example #24
0
    def generateFromBlockSize(cls, series, blockSize, **kwargs):
        """Returns a new SeriesBlockingStrategy, that yields blocks
        closely matching the requested size in bytes.

        Parameters
        ----------
        series : Series object
            Series for which blocking strategy is to be generated.

        blockSize : positive int or string
            Requests an average size for the intermediate blocks in bytes. A passed string should
            be in a format like "256k" or "150M" (see util.common.parseMemoryString). If blocksPerDim
            or groupingDim are passed, they will take precedence over this argument. See
            strategy._BlockMemoryAsSequence for a description of the blocking strategy used.

        Returns
        -------
        SeriesBlockingStrategy or subclass
            new BlockingStrategy will be created and setSource() called on it with the passed series object
        """
        dims, nimages, dtype = series.dims, len(series.index), series.dtype
        elementSize = nimages * dtypeFunc(dtype).itemsize

        splitsPerDim = _calcSplitsForBlockSize(blockSize, elementSize, dims)
        strategy = cls(splitsPerDim, units="splits", **kwargs)
        strategy.setSource(series)
        return strategy
Example #25
0
    def generateFromBlockSize(cls, series, blockSize, **kwargs):
        """Returns a new SeriesBlockingStrategy, that yields blocks
        closely matching the requested size in bytes.

        Parameters
        ----------
        series : Series object
            Series for which blocking strategy is to be generated.

        blockSize : positive int or string
            Requests an average size for the intermediate blocks in bytes. A passed string should
            be in a format like "256k" or "150M" (see util.common.parseMemoryString). If blocksPerDim
            or groupingDim are passed, they will take precedence over this argument. See
            strategy._BlockMemoryAsSequence for a description of the blocking strategy used.

        Returns
        -------
        SeriesBlockingStrategy or subclass
            new BlockingStrategy will be created and setSource() called on it with the passed series object
        """
        dims, nimages, dtype = series.dims, len(series.index), series.dtype
        elementSize = nimages * dtypeFunc(dtype).itemsize

        splitsPerDim = _calcSplitsForBlockSize(blockSize, elementSize, dims)
        strategy = cls(splitsPerDim, units="splits", **kwargs)
        strategy.setSource(series)
        return strategy
Example #26
0
 def calcAverageBlockSize(self):
     if self._splitsPerDim is None:
         raise Exception(
             "setSource() must be called before calcAverageBlockSize()")
     elts = _BlockMemoryAsSequence.avgElementsPerBlock(
         self.dims, self._splitsPerDim)
     return elts * dtypeFunc(self.dtype).itemsize * self.nimages
Example #27
0
    def test_threeDArrayToSeriesWithPack(self):
        ary = arange(24, dtype=dtypeFunc('int16')).reshape((3, 4, 2))

        image = ImagesLoader(self.sc).fromArrays(ary)
        series = image.toBlocks("150M").toSeries()

        seriesVals = series.collect()
        seriesAry = series.pack()
        seriesAry_xpose = series.pack(transpose=True)

        # check ordering of keys
        assert_equals((0, 0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0, 0), seriesVals[2][0])
        assert_equals((0, 1, 0), seriesVals[3][0])
        assert_equals((1, 1, 0), seriesVals[4][0])
        assert_equals((2, 1, 0), seriesVals[5][0])
        assert_equals((0, 2, 0), seriesVals[6][0])
        assert_equals((1, 2, 0), seriesVals[7][0])
        assert_equals((2, 2, 0), seriesVals[8][0])
        assert_equals((0, 3, 0), seriesVals[9][0])
        assert_equals((1, 3, 0), seriesVals[10][0])
        assert_equals((2, 3, 0), seriesVals[11][0])
        assert_equals((0, 0, 1), seriesVals[12][0])
        assert_equals((1, 0, 1), seriesVals[13][0])
        assert_equals((2, 0, 1), seriesVals[14][0])
        assert_equals((0, 1, 1), seriesVals[15][0])
        assert_equals((1, 1, 1), seriesVals[16][0])
        assert_equals((2, 1, 1), seriesVals[17][0])
        assert_equals((0, 2, 1), seriesVals[18][0])
        assert_equals((1, 2, 1), seriesVals[19][0])
        assert_equals((2, 2, 1), seriesVals[20][0])
        assert_equals((0, 3, 1), seriesVals[21][0])
        assert_equals((1, 3, 1), seriesVals[22][0])
        assert_equals((2, 3, 1), seriesVals[23][0])

        # check dimensions tuple matches numpy shape
        assert_equals(ary.shape, series.dims.count)

        # check that values are in Fortran-convention order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(order='F'), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary, seriesAry))
        assert_true(array_equal(ary.T, seriesAry_xpose))
Example #28
0
    def test_fromStacks(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test01.stack")
        ary.tofile(filename)
        filename = os.path.join(self.outputdir, "test02.stack")
        ary2.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(self.outputdir, dims=(4, 2))

        collectedImage = image.collect()
        assert_equals(2, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
        assert_equals(1, collectedImage[1][0])  # check image 2
        assert_true(array_equal(ary2.T, collectedImage[1][1]))
Example #29
0
def _generateTestArrays(narys, dtype_='int16'):
    sh = 4, 3, 3
    sz = reduce(lambda x, y: x * y, sh, 1)
    arys = [
        arange(i, i + sz, dtype=dtypeFunc(dtype_)).reshape(sh)
        for i in xrange(0, sz * narys, sz)
    ]
    return arys, sh, sz
Example #30
0
    def test_fromStacks(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test01.stack")
        ary.tofile(filename)
        filename = os.path.join(self.outputdir, "test02.stack")
        ary2.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(self.outputdir, dims=(4, 2))

        collectedImage = image.collect()
        assert_equals(2, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
        assert_equals(1, collectedImage[1][0])  # check image 2
        assert_true(array_equal(ary2.T, collectedImage[1][1]))
Example #31
0
    def fromBinary(self, dataPath, ext='bin', confFilename='conf.json',
                   nkeys=None, nvalues=None, keyType=None, valueType=None,
                   newDtype='smallfloat', casting='safe', maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://", or "gs://"). If no scheme is present, will be interpreted as a path on the local filesystem. This path
            must be valid on all workers. Datafile may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newdtype` if not None - see Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions as Java-style memory, will indirectly control the number of partitions

        """

        paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
        self.__checkBinaryParametersAreSpecified(paramsObj)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(paramsObj.keytype)
        valDtype = dtypeFunc(paramsObj.valuetype)

        keySize = paramsObj.nkeys * keyDtype.itemsize
        recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

        lines = self.sc.binaryRecords(dataPath, recordSize)

        get = lambda v: (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                         frombuffer(buffer(v, keySize), dtype=valDtype))

        data = lines.map(get)

        return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)
Example #32
0
    def _run_tstSaveAsBinarySeries(self, testIdx, narys_, valDtype, groupingDim_):
        """Pseudo-parameterized test fixture, allows reusing existing spark context
        """
        paramStr = "(groupingdim=%d, valuedtype='%s')" % (groupingDim_, valDtype)
        arys, aryShape, arySize = _generateTestArrays(narys_, dtype_=valDtype)
        dims = aryShape[:]
        outdir = os.path.join(self.outputdir, "anotherdir%02d" % testIdx)

        images = ImagesLoader(self.sc).fromArrays(arys)

        slicesPerDim = [1]*arys[0].ndim
        slicesPerDim[groupingDim_] = arys[0].shape[groupingDim_]
        images.toBlocks(slicesPerDim, units="splits").saveAsBinarySeries(outdir)

        ndims = len(aryShape)
        # prevent padding to 4-byte boundaries: "=" specifies no alignment
        unpacker = struct.Struct('=' + 'h'*ndims + dtypeFunc(valDtype).char*narys_)

        def calcExpectedNKeys():
            tmpShape = list(dims[:])
            del tmpShape[groupingDim_]
            return prod(tmpShape)
        expectedNKeys = calcExpectedNKeys()

        def byrec(f_, unpacker_, nkeys_):
            rec = True
            while rec:
                rec = f_.read(unpacker_.size)
                if rec:
                    allRecVals = unpacker_.unpack(rec)
                    yield allRecVals[:nkeys_], allRecVals[nkeys_:]

        outFilenames = glob.glob(os.path.join(outdir, "*.bin"))
        assert_equals(dims[groupingDim_], len(outFilenames))
        for outFilename in outFilenames:
            with open(outFilename, 'rb') as f:
                nkeys = 0
                for keys, vals in byrec(f, unpacker, ndims):
                    nkeys += 1
                    assert_equals(narys_, len(vals))
                    for valIdx, val in enumerate(vals):
                        assert_equals(arys[valIdx][keys], val, "Expected %g, got %g, for test %d %s" %
                                      (arys[valIdx][keys], val, testIdx, paramStr))
                assert_equals(expectedNKeys, nkeys)

        confName = os.path.join(outdir, "conf.json")
        assert_true(os.path.isfile(confName))
        with open(os.path.join(outdir, "conf.json"), 'r') as fconf:
            import json
            conf = json.load(fconf)
            assert_equals(outdir, conf['input'])
            assert_equals(len(aryShape), conf['nkeys'])
            assert_equals(narys_, conf['nvalues'])
            assert_equals(valDtype, conf['valuetype'])
            assert_equals('int16', conf['keytype'])

        assert_true(os.path.isfile(os.path.join(outdir, 'SUCCESS')))
Example #33
0
    def _run_tstSaveAsBinarySeries(self, testIdx, narys_, valDtype, groupingDim_):
        """Pseudo-parameterized test fixture, allows reusing existing spark context
        """
        paramStr = "(groupingdim=%d, valuedtype='%s')" % (groupingDim_, valDtype)
        arys, aryShape, arySize = _generateTestArrays(narys_, dtype_=valDtype)
        dims = aryShape[:]
        outdir = os.path.join(self.outputdir, "anotherdir%02d" % testIdx)

        images = ImagesLoader(self.sc).fromArrays(arys)

        slicesPerDim = [1]*arys[0].ndim
        slicesPerDim[groupingDim_] = arys[0].shape[groupingDim_]
        images.toBlocks(slicesPerDim, units="splits").saveAsBinarySeries(outdir)

        ndims = len(aryShape)
        # prevent padding to 4-byte boundaries: "=" specifies no alignment
        unpacker = struct.Struct('=' + 'h'*ndims + dtypeFunc(valDtype).char*narys_)

        def calcExpectedNKeys():
            tmpShape = list(dims[:])
            del tmpShape[groupingDim_]
            return prod(tmpShape)
        expectedNKeys = calcExpectedNKeys()

        def byrec(f_, unpacker_, nkeys_):
            rec = True
            while rec:
                rec = f_.read(unpacker_.size)
                if rec:
                    allRecVals = unpacker_.unpack(rec)
                    yield allRecVals[:nkeys_], allRecVals[nkeys_:]

        outFilenames = glob.glob(os.path.join(outdir, "*.bin"))
        assert_equals(dims[groupingDim_], len(outFilenames))
        for outFilename in outFilenames:
            with open(outFilename, 'rb') as f:
                nkeys = 0
                for keys, vals in byrec(f, unpacker, ndims):
                    nkeys += 1
                    assert_equals(narys_, len(vals))
                    for valIdx, val in enumerate(vals):
                        assert_equals(arys[valIdx][keys], val, "Expected %g, got %g, for test %d %s" %
                                      (arys[valIdx][keys], val, testIdx, paramStr))
                assert_equals(expectedNKeys, nkeys)

        confName = os.path.join(outdir, "conf.json")
        assert_true(os.path.isfile(confName))
        with open(os.path.join(outdir, "conf.json"), 'r') as fconf:
            import json
            conf = json.load(fconf)
            assert_equals(outdir, conf['input'])
            assert_equals(len(aryShape), conf['nkeys'])
            assert_equals(narys_, conf['nvalues'])
            assert_equals(valDtype, conf['valuetype'])
            assert_equals('int16', conf['keytype'])

        assert_true(os.path.isfile(os.path.join(outdir, 'SUCCESS')))
Example #34
0
    def test_fromArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)

        collectedImage = image.collect()
        assert_equals(1, len(collectedImage))
        assert_equals(ary.shape, image.dims.count)
        assert_equals(0, collectedImage[0][0])  # check key
        assert_true(array_equal(ary, collectedImage[0][1]))  # check value
Example #35
0
    def test_fromArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)

        collectedImage = image.collect()
        assert_equals(1, len(collectedImage))
        assert_equals(ary.shape, image.dims.count)
        assert_equals(0, collectedImage[0][0])  # check key
        assert_true(array_equal(ary, collectedImage[0][1]))  # check value
    def test_loadStacksAsSeries(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        rangeAry.shape = (64, 128)
        filepath = os.path.join(self.outputdir, "rangeAry.stack")
        rangeAry.tofile(filepath)

        series = SeriesLoader(self.sc).fromStack(filepath, dims=(128, 64))
        seriesAry = series.pack()

        assert_equals((128, 64), series.dims.count)
        assert_equals((128, 64), seriesAry.shape)
        assert_true(array_equal(rangeAry.T, seriesAry))
Example #37
0
    def test_loadStacksAsSeries(self):
        rangeAry = arange(64 * 128, dtype=dtypeFunc('int16'))
        rangeAry.shape = (64, 128)
        filepath = os.path.join(self.outputdir, "rangeAry.stack")
        rangeAry.tofile(filepath)

        series = SeriesLoader(self.sc).fromStack(filepath, dims=(128, 64))
        seriesAry = series.pack()

        assert_equals((128, 64), series.dims.count)
        assert_equals((128, 64), seriesAry.shape)
        assert_true(array_equal(rangeAry.T, seriesAry))
Example #38
0
    def test_subtract(self):
        narys = 3
        arys, sh, sz = _generateTestArrays(narys)
        subVals = [1, arange(sz, dtype=dtypeFunc("int16")).reshape(sh)]

        imageData = ImagesLoader(self.sc).fromArrays(arys)
        for subVal in subVals:
            subData = imageData.subtract(subVal)
            subtracted = subData.collect()
            expectedArys = map(lambda ary: ary - subVal, arys)
            for actual, expected in zip(subtracted, expectedArys):
                assert_true(allclose(expected, actual[1]))
Example #39
0
    def test_planes(self):
        dims = (2, 2, 4)
        sz = reduce(lambda x, y: x * y, dims)
        origAry = arange(sz, dtype=dtypeFunc('int16')).reshape(dims)
        imageData = ImagesLoader(self.sc).fromArrays([origAry])
        planedData = imageData.planes(0, 2)
        planed = planedData.collect()[0][1]

        expected = squeeze(origAry[slice(None), slice(None), slice(0, 2)])
        assert_true(array_equal(expected, planed))
        assert_equals(tuple(expected.shape), planedData._dims.count)
        assert_equals(str(expected.dtype), planedData._dtype)
Example #40
0
    def test_subtract(self):
        narys = 3
        arys, sh, sz = _generateTestArrays(narys)
        subVals = [1, arange(sz, dtype=dtypeFunc('int16')).reshape(sh)]

        imageData = ImagesLoader(self.sc).fromArrays(arys)
        for subVal in subVals:
            subData = imageData.subtract(subVal)
            subtracted = subData.collect()
            expectedArys = map(lambda ary: ary - subVal, arys)
            for actual, expected in zip(subtracted, expectedArys):
                assert_true(allclose(expected, actual[1]))
Example #41
0
    def test_planes(self):
        dims = (2, 2, 4)
        sz = reduce(lambda x, y: x * y, dims)
        origAry = arange(sz, dtype=dtypeFunc("int16")).reshape(dims)
        imageData = ImagesLoader(self.sc).fromArrays([origAry])
        planedData = imageData.planes(0, 2)
        planed = planedData.collect()[0][1]

        expected = squeeze(origAry[slice(None), slice(None), slice(0, 2)])
        assert_true(array_equal(expected, planed))
        assert_equals(tuple(expected.shape), planedData._dims.count)
        assert_equals(str(expected.dtype), planedData._dtype)
Example #42
0
    def test_toBlocksWithSplit(self):
        ary = arange(8, dtype=dtypeFunc("int16")).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)
        groupedblocks = image.toBlocks((1, 2), units="s")

        # collectedblocks = blocks.collect()
        collectedgroupedblocks = groupedblocks.collect()
        assert_equals((0, 0), collectedgroupedblocks[0][0].spatialKey)
        assert_true(array_equal(ary[:, :2].ravel(), collectedgroupedblocks[0][1].ravel()))
        assert_equals((0, 2), collectedgroupedblocks[1][0].spatialKey)
        assert_true(array_equal(ary[:, 2:].ravel(), collectedgroupedblocks[1][1].ravel()))
Example #43
0
    def test_toBlocksWithSplit(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        image = ImagesLoader(self.sc).fromArrays(ary)
        groupedblocks = image.toBlocks((1, 2), units="s")

        # collectedblocks = blocks.collect()
        collectedgroupedblocks = groupedblocks.collect()
        assert_equals((0, 0), collectedgroupedblocks[0][0].spatialKey)
        assert_true(array_equal(ary[:, :2].ravel(), collectedgroupedblocks[0][1].ravel()))
        assert_equals((0, 2), collectedgroupedblocks[1][0].spatialKey)
        assert_true(array_equal(ary[:, 2:].ravel(), collectedgroupedblocks[1][1].ravel()))
Example #44
0
    def test_fromStack(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test.stack")
        ary.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))

        collectedImage = image.collect()
        assert_equals(1, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        # assert that image shape *matches* that in image dimensions:
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
Example #45
0
    def test_maxProject(self):
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArrays(ary)
        project0Series = series.maxProject(axis=0)
        project0 = project0Series.pack()

        project1Series = series.maxProject(axis=1)
        project1 = project1Series.pack(sorting=True)

        assert_true(array_equal(amax(ary.T, 0), project0))
        assert_true(array_equal(amax(ary.T, 1), project1))
Example #46
0
    def test_castToFloat(self):
        from numpy import arange
        shape = (3, 2, 2)
        size = 3*2*2
        ary = arange(size, dtype=dtypeFunc('uint8')).reshape(shape)
        ary2 = ary + size
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        series = SeriesLoader(self.sc).fromArrays([ary, ary2])

        castSeries = series.astype("smallfloat")

        assert_equals('float16', str(castSeries.dtype))
        assert_equals('float16', str(castSeries.first()[1].dtype))
Example #47
0
    def test_castToFloat(self):
        from numpy import arange
        shape = (3, 2, 2)
        size = 3 * 2 * 2
        ary = arange(size, dtype=dtypeFunc('uint8')).reshape(shape)
        ary2 = ary + size
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

        castSeries = series.astype("smallfloat")

        assert_equals('float16', str(castSeries.dtype))
        assert_equals('float16', str(castSeries.first()[1].dtype))
Example #48
0
    def test_maxProject(self):
        from thunder.rdds.fileio.seriesloader import SeriesLoader
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages(ary)
        project0Series = series.maxProject(axis=0)
        project0 = project0Series.pack()

        project1Series = series.maxProject(axis=1)
        project1 = project1Series.pack(sorting=True)

        assert_true(array_equal(amax(ary.T, 0), project0))
        assert_true(array_equal(amax(ary.T, 1), project1))
Example #49
0
    def test_fromStack(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test.stack")
        ary.tofile(filename)

        image = ImagesLoader(self.sc).fromStack(filename, dims=(4, 2))

        collectedImage = image.collect()
        assert_equals(1, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        # assert that image shape *matches* that in image dimensions:
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
Example #50
0
    def test_fromStacksWithConf(self):
        ary = arange(8, dtype=dtypeFunc('int32')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int32')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test01.stack")
        ary.tofile(filename)
        filename = os.path.join(self.outputdir, "test02.stack")
        ary2.tofile(filename)
        conf = {"dims": [4, 2], "dtype": "int32"}
        with open(os.path.join(self.outputdir, "conf.json"), 'w') as fp:
            json.dump(conf, fp)

        image = ImagesLoader(self.sc).fromStack(self.outputdir)
        assert_equals("int32", image._dtype)
        assert_equals(2, image._nrecords)
        assert_equals((4, 2), image._dims.count)

        collectedImage = image.collect()
        assert_equals(2, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
        assert_equals(1, collectedImage[1][0])  # check image 2
        assert_true(array_equal(ary2.T, collectedImage[1][1]))
Example #51
0
    def test_loadStacksAsSeriesWithShuffle(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary.stack")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((128, 64), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(128, 64))
        assert_equals('float32', rangeSeries._dtype)  # check before any potential first() calls update this val
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((128, 64), rangeSeries.dims.count)
        assert_equals((128, 64), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry))
Example #52
0
    def test_load3dStackAsSeriesWithShuffle(self):
        rangeAry = arange(32*64*4, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary.stack")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((32, 64, 4), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(32, 64, 4))
        assert_equals('float32', rangeSeries._dtype)
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((32, 64, 4), rangeSeries.dims.count)
        assert_equals((32, 64, 4), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry))
Example #53
0
    def test_load3dStackAsSeriesWithShuffle(self):
        rangeAry = arange(32*64*4, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary.stack")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((32, 64, 4), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(32, 64, 4))
        assert_equals('float32', rangeSeries._dtype)
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((32, 64, 4), rangeSeries.dims.count)
        assert_equals((32, 64, 4), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry))
Example #54
0
    def test_loadStacksAsSeriesWithShuffle(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary.stack")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((128, 64), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(filePath, dims=(128, 64))
        assert_equals('float32', rangeSeries._dtype)  # check before any potential first() calls update this val
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((128, 64), rangeSeries.dims.count)
        assert_equals((128, 64), rangeSeriesAry.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry))
Example #55
0
    def test_fromStacksWithConf(self):
        ary = arange(8, dtype=dtypeFunc('int32')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int32')).reshape((2, 4))
        filename = os.path.join(self.outputdir, "test01.stack")
        ary.tofile(filename)
        filename = os.path.join(self.outputdir, "test02.stack")
        ary2.tofile(filename)
        conf = {"dims": [4, 2], "dtype": "int32"}
        with open(os.path.join(self.outputdir, "conf.json"), 'w') as fp:
            json.dump(conf, fp)

        image = ImagesLoader(self.sc).fromStack(self.outputdir)
        assert_equals("int32", image._dtype)
        assert_equals(2, image._nrecords)
        assert_equals((4, 2), image._dims.count)

        collectedImage = image.collect()
        assert_equals(2, len(collectedImage))
        assert_equals(0, collectedImage[0][0])  # check key
        assert_equals(image.dims.count, collectedImage[0][1].shape)
        assert_true(array_equal(ary.T, collectedImage[0][1]))  # check value
        assert_equals(1, collectedImage[1][0])  # check image 2
        assert_true(array_equal(ary2.T, collectedImage[1][1]))
Example #56
0
    def __run_loadMultipleStacksAsSeries(self):
        rangeAry = arange(64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary01.bin")
        rangeAry.tofile(filePath)
        expectedAry = rangeAry.reshape((128, 64), order='F')
        rangeAry2 = arange(64*128, 2*64*128, dtype=dtypeFunc('int16'))
        filePath = os.path.join(self.outputdir, "rangeary02.bin")
        rangeAry2.tofile(filePath)
        expectedAry2 = rangeAry2.reshape((128, 64), order='F')

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, dims=(128, 64))
        assert_equals('float32', rangeSeries._dtype)

        rangeSeriesAry = rangeSeries.pack()
        rangeSeriesAry_xpose = rangeSeries.pack(transpose=True)

        assert_equals((128, 64), rangeSeries.dims.count)
        assert_equals((2, 128, 64), rangeSeriesAry.shape)
        assert_equals((2, 64, 128), rangeSeriesAry_xpose.shape)
        assert_equals('float32', str(rangeSeriesAry.dtype))
        assert_true(array_equal(expectedAry, rangeSeriesAry[0]))
        assert_true(array_equal(expectedAry2, rangeSeriesAry[1]))
        assert_true(array_equal(expectedAry.T, rangeSeriesAry_xpose[0]))
        assert_true(array_equal(expectedAry2.T, rangeSeriesAry_xpose[1]))
Example #57
0
    def __run_loadTifAsSeries(self):
        tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
        rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry)
        filePath = os.path.join(self.outputdir, "rangetif01.tif")
        pilImg.save(filePath)
        del pilImg, tmpAry

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
        assert_equals('float16', rangeSeries._dtype)  # check before any potential first() calls update this val
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
        assert_equals((60, 120), rangeSeriesAry.shape)
        assert_equals('float16', str(rangeSeriesAry.dtype))
        assert_true(array_equal(rangeAry, rangeSeriesAry))
Example #58
0
    def __run_loadTifAsSeries(self):
        tmpAry = arange(60*120, dtype=dtypeFunc('uint16'))
        rangeAry = mod(tmpAry, 255).astype('uint8').reshape((60, 120))
        pilImg = Image.fromarray(rangeAry)
        filePath = os.path.join(self.outputdir, "rangetif01.tif")
        pilImg.save(filePath)
        del pilImg, tmpAry

        rangeSeries = self.tsc.loadImagesAsSeries(self.outputdir, inputFormat="tif-stack")
        assert_equals('float16', rangeSeries._dtype)  # check before any potential first() calls update this val
        rangeSeriesAry = rangeSeries.pack()

        assert_equals((60, 120), rangeSeries.dims.count)  # 2d tif now loaded as 2d image; was 3d with singleton z dim
        assert_equals((60, 120), rangeSeriesAry.shape)
        assert_equals('float16', str(rangeSeriesAry.dtype))
        assert_true(array_equal(rangeAry, rangeSeriesAry))
Example #59
0
    def astype(self, dtype, casting='safe'):
        """
        Cast values to specified numpy dtype.

        If 'smallfloat' is passed, values will be cast to the smallest floating point representation
        to which they can be cast safely, as determined by the thunder.utils.common smallest_float_type function.
        Typically this will be a float type larger than a passed integer type (for instance, float16 for int8 or uint8).

        If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values are already
        in floating point, then this method will return self unchanged.

        Parameters
        ----------
        dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
            Data type to which RDD values are to be cast. Will return without cast if None is passed.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's astype() method; see numpy documentation for details.

        Returns
        -------
        New Data object, of same type as self, with values cast to the requested dtype; or self if no cast is performed.
        """
        if dtype is None or dtype == '':
            return self
        from numpy import ndarray
        from numpy import dtype as dtypeFunc
        if dtype == 'smallfloat':
            # get the smallest floating point type that can be safely cast to from our current type
            from thunder.utils.common import smallestFloatType
            dtype = smallestFloatType(self.dtype)

        def cast(v, dtype_, casting_):
            if isinstance(v, ndarray):
                return v.astype(dtype_, casting=casting_, copy=False)
            else:
                # assume we are a scalar, either a numpy scalar or a python scalar
                # turn ourself into a numpy scalar of the appropriate type
                return asarray([v]).astype(dtype_,
                                           casting=casting_,
                                           copy=False)[0]

        nextRdd = self.rdd.mapValues(
            lambda v: cast(v, dtypeFunc(dtype), casting))
        return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)
Example #60
0
    def astype(self, dtype, casting='safe'):
        """
        Cast values to specified numpy dtype.

        If 'smallfloat' is passed, values will be cast to the smallest floating point representation
        to which they can be cast safely, as determined by the thunder.utils.common smallest_float_type function.
        Typically this will be a float type larger than a passed integer type (for instance, float16 for int8 or uint8).

        If the passed dtype is the same as the current dtype, or if 'smallfloat' is passed when values are already
        in floating point, then this method will return self unchanged.

        Parameters
        ----------
        dtype: numpy dtype or dtype specifier, or string 'smallfloat', or None
            Data type to which RDD values are to be cast. Will return without cast if None is passed.

        casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's astype() method; see numpy documentation for details.

        Returns
        -------
        New Data object, of same type as self, with values cast to the requested dtype; or self if no cast is performed.
        """
        if dtype is None or dtype == '':
            return self
        from numpy import ndarray
        from numpy import dtype as dtypeFunc
        if dtype == 'smallfloat':
            # get the smallest floating point type that can be safely cast to from our current type
            from thunder.utils.common import smallestFloatType
            dtype = smallestFloatType(self.dtype)

        def cast(v, dtype_, casting_):
            if isinstance(v, ndarray):
                return v.astype(dtype_, casting=casting_, copy=False)
            else:
                # assume we are a scalar, either a numpy scalar or a python scalar
                # turn ourself into a numpy scalar of the appropriate type
                return asarray([v]).astype(dtype_, casting=casting_, copy=False)[0]

        nextRdd = self.rdd.mapValues(lambda v: cast(v, dtypeFunc(dtype), casting))
        return self._constructor(nextRdd, dtype=str(dtype)).__finalize__(self)