def test_fromMultipleArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))
        ary2 = arange(8, 16, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order, with subsequent point concatenated in values
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16'))
        assert_true(array_equal(ary.ravel(), collectedVals[:, 0]))
        assert_true(array_equal(ary2.ravel(), collectedVals[:, 1]))

        # check that packing returns concatenation of input arrays, with time as first dimension
        assert_true(array_equal(ary.T, seriesAry[0]))
        assert_true(array_equal(ary2.T, seriesAry[1]))
Esempio n. 2
0
    def loadSeriesFromArray(self, values, index=None, npartitions=None):
        """
        Load Series data from a local array

        Parameters
        ----------
        values : list or ndarray
            A list of 1d numpy arrays, or a single 2d numpy array

        index : array-like, optional, deafult = None
            Index to set for Series object, if None will use linear indices.

        npartitions : position int, optional, default = None
            Number of partitions for RDD, if unspecified will use
            default parallelism.
        """
        from numpy import ndarray, asarray
        from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc)

        if not npartitions:
            npartitions = self._sc.defaultParallelism

        if isinstance(values, list):
            values = asarray(values)

        if isinstance(values, ndarray) and values.ndim > 1:
            values = list(values)

        data = loader.fromArrays(values, npartitions=npartitions)

        if index:
            data.index = index

        return data
    def test_fromArrays(self):
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages(ary)

        seriesVals = series.collect()
        seriesAry = series.pack()

        # check ordering of keys
        assert_equals((0, 0), seriesVals[0][0])  # first key
        assert_equals((1, 0), seriesVals[1][0])  # second key
        assert_equals((2, 0), seriesVals[2][0])
        assert_equals((3, 0), seriesVals[3][0])
        assert_equals((0, 1), seriesVals[4][0])
        assert_equals((1, 1), seriesVals[5][0])
        assert_equals((2, 1), seriesVals[6][0])
        assert_equals((3, 1), seriesVals[7][0])

        # check dimensions tuple is reversed from numpy shape
        assert_equals(ary.shape[::-1], series.dims.count)

        # check that values are in original order
        collectedVals = array([kv[1] for kv in seriesVals], dtype=dtypeFunc('int16')).ravel()
        assert_true(array_equal(ary.ravel(), collectedVals))

        # check that packing returns transpose of original array
        assert_true(array_equal(ary.T, seriesAry))
    def _run_tst_fromBinary(self, useConfJson=False):
        # run this as a single big test so as to avoid repeated setUp and tearDown of the spark context
        # data will be a sequence of test data
        # all keys and all values in a test data item must be of the same length
        # keys get converted to ints regardless of raw input format
        DATA = [
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
            SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
            SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        ]

        for itemidx, item in enumerate(DATA):
            outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
            os.mkdir(outSubdir)

            fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
            with open(fname, 'wb') as f:
                item.writeToFile(f)

            loader = SeriesLoader(self.sc)
            if not useConfJson:
                series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals, keyType=str(item.keyDtype),
                                           valueType=str(item.valDtype))
            else:
                # write configuration file
                conf = {'input': outSubdir,
                        'nkeys': item.nkeys, 'nvalues': item.nvals,
                        'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
                with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                    json.dump(conf, f, indent=2)
                series = loader.fromBinary(outSubdir)

            seriesData = series.rdd.collect()

            expectedData = item.data
            assert_equals(len(expectedData), len(seriesData),
                          "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                          (itemidx, len(expectedData), len(seriesData)))

            for expected, actual in zip(expectedData, seriesData):
                expectedKeys = tuple(expected[0])
                expectedType = smallestFloatType(item.valDtype)
                expectedVals = array(expected[1], dtype=expectedType)
                assert_equals(expectedKeys, actual[0],
                              "Key mismatch in item %d; expected %s, got %s" %
                              (itemidx, str(expectedKeys), str(actual[0])))
                assert_true(allclose(expectedVals, actual[1]),
                            "Value mismatch in item %d; expected %s, got %s" %
                            (itemidx, str(expectedVals), str(actual[1])))
                assert_equals(expectedType, str(actual[1].dtype),
                              "Value type mismatch in item %d; expected %s, got %s" %
                              (itemidx, expectedType, str(actual[1].dtype)))
Esempio n. 5
0
    def test_castToFloat(self):
        from numpy import arange
        shape = (3, 2, 2)
        size = 3*2*2
        ary = arange(size, dtype=dtypeFunc('uint8')).reshape(shape)
        ary2 = ary + size
        from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
        series = SeriesLoader(self.sc).fromArraysAsImages([ary, ary2])

        castSeries = series.astype("smallfloat")

        assert_equals('float16', str(castSeries.dtype))
        assert_equals('float16', str(castSeries.first()[1].dtype))
Esempio n. 6
0
    def test_maxProject(self):
        from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
        ary = arange(8, dtype=dtypeFunc('int16')).reshape((2, 4))

        series = SeriesLoader(self.sc).fromArraysAsImages(ary)
        project0Series = series.maxProject(axis=0)
        project0 = project0Series.pack()

        project1Series = series.maxProject(axis=1)
        project1 = project1Series.pack(sorting=True)

        assert_true(array_equal(amax(ary.T, 0), project0))
        assert_true(array_equal(amax(ary.T, 1), project1))
Esempio n. 7
0
    def _run_tst_roundtripConvertToSeries(self, images, strategy):
        outdir = os.path.join(self.outputdir, "fish-series-dir")

        partitionedimages = images.toBlocks(strategy)
        series = partitionedimages.toSeries()
        series_ary = series.pack()

        partitionedimages.saveAsBinarySeries(outdir)
        converted_series = SeriesLoader(self.sc).fromBinary(outdir)
        converted_series_ary = converted_series.pack()

        assert_equals(images.dims.count, series.dims.count)
        expected_shape = tuple([images.nrecords] + list(images.dims.count))
        assert_equals(expected_shape, series_ary.shape)
        assert_true(array_equal(series_ary, converted_series_ary))
Esempio n. 8
0
    def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
                   maxPartitionSize='32mb', confFilename='conf.json', keyType=None, valueType=None, keyPath=None,
                   varName=None):
        """
        Loads a Series object from data stored as binary, text, npy, or mat.

        For binary and text, supports single files or multiple files stored on a local file system,
        a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
        For local formats (npy and mat) only local file systems currently supported.

        Parameters
        ----------
        dataPath: string
            Path to data files or directory, as either a local filesystem path or a URI.
            May include a single '*' wildcard in the filename. Examples of valid dataPaths include
            'local/directory/*.stack", "s3n:///my-s3-bucket/data/", or "file:///mnt/another/directory/".

        nkeys: int, optional (required if `inputFormat` is 'text'), default = None
            Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
            text data; can be specified here or in a configuration file for binary data.

        nvalues: int, optional (required if `inputFormat` is 'text')
            Number of values per record. Must be specified here or in a configuration file for binary data.

        inputFormat: {'text', 'binary', 'npy', 'mat'}. optional, default = 'binary'
            inputFormat of data to be read.

        minPartitions: int, optional, default = SparkContext.minParallelism
            Minimum number of Spark partitions to use, only for text.

        maxPartitionSize : int, optional, default = '32mb'
            Maximum size of partitions as a Java-style memory string, e.g. '32mb' or '64mb',
            indirectly controls the number of Spark partitions, only for binary.

        confFilename: string, optional, default 'conf.json'
            Path to JSON file with configuration options including 'nkeys', 'nvalues',
            'keyType', and 'valueType'. If a file is not found at the given path, then the base
            directory in 'dataPath' will be checked. Parameters will override the conf file.

        keyType: string or numpy dtype, optional, default = None
            Numerical type of keys, will override conf file.

        valueType: string or numpy dtype, optional, default = None
            Numerical type of values, will override conf file.

        keyPath: string, optional, default = None
            Path to file with keys when loading from npy or mat.

        varName : str, optional, default = None
            Variable name to load (for MAT files only)

        Returns
        -------
        data: lambdaimage.rdds.Series
            A Series object, wrapping an RDD, with (n-tuples of ints) : (numpy array) pairs
        """
        checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

        from lambdaimage.rdds.fileio.seriesloader import SeriesLoader
        loader = SeriesLoader(self._sc, minPartitions=minPartitions)

        if inputFormat.lower() == 'binary':
            data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                     keyType=keyType, valueType=valueType, maxPartitionSize=maxPartitionSize)
        elif inputFormat.lower() == 'text':
            if nkeys is None:
                raise Exception('Must provide number of keys per record for loading from text')
            data = loader.fromText(dataPath, nkeys=nkeys)
        elif inputFormat.lower() == 'npy':
            data = loader.fromNpyLocal(dataPath, keyPath)
        else:
            if varName is None:
                raise Exception('Must provide variable name for loading MAT files')
            data = loader.fromMatLocal(dataPath, varName, keyPath)

        return data