Example #1
0
    def test_ind_to_sub_rdd(self):
        """Linear keys 1..12 convert to 1-based 3-tuple subscripts for dims [2, 3, 2]."""
        records = [(ind, array([1.0])) for ind in range(1, 13)]
        series = Series(self.sc.parallelize(records))

        # expected subscripts, listed with the first dimension changing fastest
        expected = array([
            (1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
            (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2),
        ])
        actual = series.indToSub(dims=[2, 3, 2]).keys().collect()
        assert allclose(actual, expected)
Example #2
0
 def test_toTimeSeries(self):
     """Check that Series.toTimeSeries() returns a TimeSeries instance."""
     from thunder.rdds.timeseries import TimeSeries
     records = [
         (0, array([4, 5, 6, 7])),
         (1, array([8, 9, 10, 11])),
     ]
     series = Series(self.sc.parallelize(records))
     converted = series.toTimeSeries()
     assert isinstance(converted, TimeSeries)
Example #3
0
 def test_to_row_matrix(self):
     """Check that toRowMatrix() yields a RowMatrix reporting 2 rows and 4 columns."""
     from thunder.rdds.matrices import RowMatrix
     records = [
         (0, array([4, 5, 6, 7])),
         (1, array([8, 9, 10, 11])),
     ]
     series = Series(self.sc.parallelize(records))
     matrix = series.toRowMatrix()
     assert isinstance(matrix, RowMatrix)
     assert matrix.nrows == 2
     assert matrix.ncols == 4
Example #4
0
 def predictAndScore(self, X, y):
     """
     Predict with every stored model and score the predictions against y.

     The stored transforms are applied to X, the models are joined by key with
     the records of y.rdd, and predictWithStats is called on each joined pair.

     Parameters
     ----------
     X : input accepted by self._transforms.apply
         Passed, after transformation, to every model's predictWithStats.

     y : object exposing an `rdd` attribute
         Keyed records joined with self._models (presumably a Series of
         observed responses - confirm against callers).

     Returns
     -------
     (Series, Series)
         The first and the second element of each predictWithStats result,
         each wrapped in its own Series.
     """
     X = self._transforms.apply(X)
     joined = self._models.join(y.rdd)
     # each joined value is a (model, observed) pair; index into it rather than
     # unpacking in the lambda signature, which is Python-2-only syntax
     results = joined.mapValues(lambda pair: pair[0].predictWithStats(X, pair[1]))
     yhat = results.mapValues(lambda v: v[0])
     stats = results.mapValues(lambda v: v[1])
     return Series(yhat), Series(stats)
Example #5
0
    def test_sub_to_ind_rdd(self):
        """Twelve 1-based 3-tuple subscript keys convert to linear keys 1..12."""
        subscripts = [
            (1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
            (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2),
        ]
        records = [(sub, array([1.0])) for sub in subscripts]

        series = Series(self.sc.parallelize(records))
        linear = array(series.subToInd().keys().collect())
        expected = array(range(1, 13))
        assert allclose(linear, expected)
Example #6
0
 def test_select(self):
     """select() accepts one label (bare or in a list) or several labels."""
     records = [(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))]
     labels = ['label1', 'label2', 'label3', 'label4']
     series = Series(self.sc.parallelize(records), index=labels)

     # one label, given as a one-element list
     picked = series.select(['label1'])
     assert allclose(picked.first()[1], 4)

     # one label, given as a bare string
     picked = series.select('label1')
     assert allclose(picked.first()[1], 4)

     # two labels at once
     picked = series.select(['label1', 'label2'])
     assert allclose(picked.first()[1], array([4, 5]))
Example #7
0
 def test_squelch(self):
     """Check squelch() at thresholds 5, 3 and 1 against the expected record values."""
     records = [(0, array([1, 2])), (0, array([3, 4]))]
     series = Series(self.sc.parallelize(records))

     # (threshold, expected values of both records after squelching)
     cases = [
         (5, [[0, 0], [0, 0]]),
         (3, [[0, 0], [3, 4]]),
         (1, [[1, 2], [3, 4]]),
     ]
     for threshold, expected in cases:
         result = series.squelch(threshold)
         assert allclose(result.collectValuesAsArray(), expected)
Example #8
0
 def test_to_row_matrix(self):
     """Convert a two-record Series to a RowMatrix and check its type and shape."""
     from thunder.rdds.matrices import RowMatrix
     firstRow = (0, array([4, 5, 6, 7]))
     secondRow = (1, array([8, 9, 10, 11]))
     series = Series(self.sc.parallelize([firstRow, secondRow]))
     matrix = series.toRowMatrix()
     assert isinstance(matrix, RowMatrix)
     # two records of four values each
     assert matrix.nrows == 2
     assert matrix.ncols == 4
Example #9
0
 def test_correlate(self):
     """correlate() gives 1 for one rising signal and [1, -1] for a rising/falling pair."""
     records = [(0, array([1, 2, 3, 4, 5]))]
     series = Series(self.sc.parallelize(records))

     # single signal
     single = series.correlate([4, 5, 6, 7, 8]).values().collect()
     assert allclose(single[0], 1)

     # two signals: the same rising one and its reverse
     signals = [[4, 5, 6, 7, 8], [8, 7, 6, 5, 4]]
     paired = series.correlate(signals).values().collect()
     assert allclose(paired[0], [1, -1])
Example #10
0
 def test_select(self):
     """Select by a single label (list and string form) and by two labels."""
     rows = [(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))]
     series = Series(self.sc.parallelize(rows),
                     index=['label1', 'label2', 'label3', 'label4'])

     byList = series.select(['label1'])
     assert allclose(byList.first()[1], 4)

     byString = series.select('label1')
     assert allclose(byString.first()[1], 4)

     byPair = series.select(['label1', 'label2'])
     assert allclose(byPair.first()[1], array([4, 5]))
Example #11
0
 def test_standardization_axis0(self):
     """center(0), standardize(0) and zscore(0) on a single float16 record."""
     records = [(0, array([1, 2, 3, 4, 5], dtype='float16'))]
     series = Series(self.sc.parallelize(records), dtype='float16')

     centered = series.center(0)
     standardized = series.standardize(0)
     zscored = series.zscore(0)

     expectedCentered = array([-2, -1, 0, 1, 2])
     expectedStandardized = array([0.70710,  1.41421,  2.12132,  2.82842,  3.53553])
     expectedZscored = array([-1.41421, -0.70710,  0,  0.70710,  1.41421])

     assert allclose(centered.first()[1], expectedCentered, atol=1e-3)
     assert allclose(standardized.first()[1], expectedStandardized, atol=1e-3)
     assert allclose(zscored.first()[1], expectedZscored, atol=1e-3)
Example #12
0
 def test_standardization_axis1(self):
     """center(1), standardize(1) and zscore(1) across two float16 records; first record checked."""
     records = [
         (0, array([1, 2], dtype='float16')),
         (0, array([3, 4], dtype='float16')),
     ]
     series = Series(self.sc.parallelize(records), dtype='float16')

     centered = series.center(1)
     standardized = series.standardize(1)
     zscored = series.zscore(1)

     assert allclose(centered.first()[1], array([-1, -1]), atol=1e-3)
     assert allclose(standardized.first()[1], array([1, 2]), atol=1e-3)
     assert allclose(zscored.first()[1], array([-1, -1]), atol=1e-3)
Example #13
0
 def test_squelch(self):
     """squelch() results for thresholds 5, 3 and 1 on two small records."""
     series = Series(self.sc.parallelize([(0, array([1, 2])), (0, array([3, 4]))]))

     expectedByThreshold = (
         (5, [[0, 0], [0, 0]]),
         (3, [[0, 0], [3, 4]]),
         (1, [[1, 2], [3, 4]]),
     )
     for level, wanted in expectedByThreshold:
         got = series.squelch(level).collectValuesAsArray()
         assert allclose(got, wanted)
Example #14
0
    def test_round_trip_rdd(self):
        """subToInd() followed by indToSub() gives back the original subscript keys."""
        subscripts = [
            (1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
            (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2),
        ]
        records = [(sub, array([1.0])) for sub in subscripts]

        series = Series(self.sc.parallelize(records))
        before = series.keys().collect()
        after = series.subToInd().indToSub().keys().collect()
        assert allclose(array(before), array(after))
Example #15
0
 def test_standardization_axis0(self):
     """Axis-0 centering, standardizing and z-scoring of one float16 record."""
     values = array([1, 2, 3, 4, 5], dtype='float16')
     series = Series(self.sc.parallelize([(0, values)]), dtype='float16')

     centered = series.center(0)
     standardized = series.standardize(0)
     zscored = series.zscore(0)

     assert allclose(centered.first()[1],
                     array([-2, -1, 0, 1, 2]),
                     atol=1e-3)
     assert allclose(standardized.first()[1],
                     array([0.70710,  1.41421,  2.12132,  2.82842,  3.53553]),
                     atol=1e-3)
     assert allclose(zscored.first()[1],
                     array([-1.41421, -0.70710,  0,  0.70710,  1.41421]),
                     atol=1e-3)
Example #16
0
 def test_standardization_axis1(self):
     """Axis-1 centering, standardizing and z-scoring over two float16 records."""
     first = (0, array([1, 2], dtype='float16'))
     second = (0, array([3, 4], dtype='float16'))
     series = Series(self.sc.parallelize([first, second]), dtype='float16')

     centered = series.center(1)
     standardized = series.standardize(1)
     zscored = series.zscore(1)

     # only the first record of each result is inspected
     assert allclose(centered.first()[1], array([-1, -1]), atol=1e-3)
     assert allclose(standardized.first()[1], array([1, 2]), atol=1e-3)
     assert allclose(zscored.first()[1], array([-1, -1]), atol=1e-3)
Example #17
0
 def setUp(self):
     """Create four local records keyed by 2-tuples and a Series built from them."""
     super(TestSeriesRegionMeanMethods, self).setUp()
     keys = [(0, 0), (0, 1), (1, 0), (1, 1)]
     values = [
         [1.0, 2.0, 3.0],
         [2.0, 2.0, 4.0],
         [4.0, 2.0, 1.0],
         [3.0, 1.0, 1.0],
     ]
     self.dataLocal = [(key, array(vals)) for key, vals in zip(keys, values)]
     # the Series dtype is taken from the first record's value array
     firstValues = self.dataLocal[0][1]
     self.series = Series(self.sc.parallelize(self.dataLocal),
                          dtype=firstValues.dtype,
                          index=arange(3))
Example #18
0
 def test_normalization(self):
     """normalize('percentile') keeps float16 as declared and actual dtype and rescales the values."""
     source = array([1, 2, 3, 4, 5], dtype='float16')
     series = Series(self.sc.parallelize([(0, source)]), dtype='float16')
     normalized = series.normalize('percentile')
     # inspect _dtype *before* first(): first() refreshes the value even when it
     # was never set correctly, which would hide the problem
     assert_equals('float16', str(normalized._dtype))
     firstValues = normalized.first()[1]
     assert_equals('float16', str(firstValues.dtype))
     expected = array([-0.42105,  0.10526,  0.63157,  1.15789,  1.68421])
     assert allclose(firstValues, expected, atol=1e-3)
Example #19
0
    def test_toImages(self):
        """toImages() returns an Images object whose first image is [[1, 2], [3, 4]]."""
        from thunder.rdds.images import Images
        records = [
            ((0, 0), array([1])),
            ((0, 1), array([2])),
            ((1, 0), array([3])),
            ((1, 1), array([4])),
        ]
        series = Series(self.sc.parallelize(records))
        images = series.toImages()
        assert isinstance(images, Images)

        firstImage = images.values().first()
        assert allclose(firstImage, [[1, 2], [3, 4]])
Example #20
0
    def test_toImages(self):
        """Four single-value records keyed by 2-d subscripts become one 2 x 2 image."""
        from thunder.rdds.images import Images
        pixels = [((0, 0), array([1])), ((0, 1), array([2])),
                  ((1, 0), array([3])), ((1, 1), array([4]))]
        converted = Series(self.sc.parallelize(pixels)).toImages()
        assert isinstance(converted, Images)

        image = converted.values().first()
        expected = [[1, 2], [3, 4]]
        assert allclose(image, expected)
Example #21
0
 def test_subset(self):
     """subset() honours the requested count and the stat/thresh filter."""
     records = [
         (0, array([1, 5], dtype='float16')),
         (0, array([1, 10], dtype='float16')),
         (0, array([1, 15], dtype='float16')),
     ]
     series = Series(self.sc.parallelize(records))

     # a threshold of 0 on the minimum keeps all three records
     assert_equal(len(series.subset(3, stat='min', thresh=0)), 3)

     # each of these filters is expected to leave only the [1, 15] record
     onlyLargest = [[1, 15]]
     for statName, threshold in (('max', 10), ('mean', 6), ('std', 6)):
         assert_array_equal(series.subset(1, stat=statName, thresh=threshold), onlyLargest)

     # same expectation when the stat argument is left at its default
     assert_array_equal(series.subset(1, thresh=6), onlyLargest)
Example #22
0
 def test_correlate(self):
     """correlate() on float16 data reports a float64 _dtype and the expected coefficients."""
     records = [(0, array([1, 2, 3, 4, 5], dtype='float16'))]
     series = Series(self.sc.parallelize(records), dtype='float16')

     # single signal: check the declared dtype first, then the coefficient
     correlated = series.correlate([4, 5, 6, 7, 8])
     assert_equals('float64', correlated._dtype)
     single = correlated.values().collect()
     assert allclose(single[0], 1)

     # two signals: the rising one and its reverse
     signals = [[4, 5, 6, 7, 8], [8, 7, 6, 5, 4]]
     paired = series.correlate(signals).values().collect()
     assert allclose(paired[0], [1, -1])
Example #23
0
    def test_query_subscripts(self):
        """query() on subscript-keyed data averages the records named by each index group."""
        records = [
            ((1, 1), array([1.0, 2.0, 3.0])),
            ((2, 1), array([2.0, 2.0, 4.0])),
            ((1, 2), array([4.0, 2.0, 1.0])),
        ]
        series = Series(self.sc.parallelize(records))

        # first group asks for linear indices 1 and 2, second group for index 3
        groups = array([array([1, 2]), array([3])])
        keys, values = series.query(groups)

        expectedFirst = array([1.5, 2., 3.5])
        expectedSecond = array([4.0, 2.0, 1.0])
        assert allclose(values[0, :], expectedFirst)
        assert allclose(values[1, :], expectedSecond)
Example #24
0
    def test_query_linear_singleton(self):
        """query() with one index group on 1-tuple keys returns the group mean in the data's dtype."""
        records = [
            ((1, ), array([1.0, 2.0, 3.0])),
            ((2, ), array([2.0, 2.0, 4.0])),
            ((3, ), array([4.0, 2.0, 1.0])),
        ]
        series = Series(self.sc.parallelize(records))

        groups = array([array([1, 2])])
        keys, values = series.query(groups)

        firstRow = values[0, :]
        assert allclose(firstRow, array([1.5, 2., 3.5]))
        assert_equals(series.dtype, values[0, :].dtype)
Example #25
0
 def test_correlate(self):
     """Correlating float16 data yields float64 output with coefficients 1 and [1, -1]."""
     values = array([1, 2, 3, 4, 5], dtype='float16')
     series = Series(self.sc.parallelize([(0, values)]), dtype='float16')

     oneSignal = [4, 5, 6, 7, 8]
     result = series.correlate(oneSignal)
     # the declared dtype is checked before any values are collected
     assert_equals('float64', result._dtype)
     coefficients = result.values().collect()
     assert allclose(coefficients[0], 1)

     twoSignals = [[4, 5, 6, 7, 8], [8, 7, 6, 5, 4]]
     coefficients = series.correlate(twoSignals).values().collect()
     assert allclose(coefficients[0], [1, -1])
Example #26
0
 def test_normalization_bymean(self):
     """normalize('mean') keeps float16 as declared and actual dtype and rescales the values."""
     source = array([1, 2, 3, 4, 5], dtype='float16')
     series = Series(self.sc.parallelize([(0, source)]), dtype='float16')
     normalized = series.normalize('mean')
     # inspect _dtype *before* first(): first() refreshes the value even when it
     # was never set correctly, which would hide the problem
     assert_equals('float16', str(normalized._dtype))
     firstValues = normalized.first()[1]
     assert_equals('float16', str(firstValues.dtype))
     expected = array([-0.64516,  -0.32258,  0.0,  0.32258,  0.64516])
     assert allclose(normalized.first()[1], expected, atol=1e-3)
Example #27
0
    def test_query_linear_singleton(self):
        """A single query group [1, 2] over 1-tuple keys returns the mean of those two records."""
        keyedValues = (
            ((1,), [1.0, 2.0, 3.0]),
            ((2,), [2.0, 2.0, 4.0]),
            ((3,), [4.0, 2.0, 1.0]),
        )
        records = [(key, array(vals)) for key, vals in keyedValues]

        series = Series(self.sc.parallelize(records))

        groups = array([array([1, 2])])
        keys, values = series.query(groups)
        assert allclose(values[0, :], array([1.5, 2., 3.5]))
Example #28
0
 def test_seriesStats(self):
     """Per-record statistics of the record [1, 2, 3, 4, 5]."""
     series = Series(self.sc.parallelize([(0, array([1, 2, 3, 4, 5]))]))

     # dedicated statistic methods
     assert allclose(series.seriesMean().first()[1], 3.0)
     assert allclose(series.seriesSum().first()[1], 15.0)
     assert allclose(series.seriesMedian().first()[1], 3.0)
     assert allclose(series.seriesStdev().first()[1], 1.4142135)

     # statistic chosen by name, and the combined stats result
     assert allclose(series.seriesStat('mean').first()[1], 3.0)
     assert allclose(series.seriesStats().select('mean').first()[1], 3.0)
     assert allclose(series.seriesStats().select('count').first()[1], 5)

     # one percentile and a pair of percentiles
     assert allclose(series.seriesPercentile(25).first()[1], 2.0)
     assert allclose(series.seriesPercentile((25, 75)).first()[1], array([2.0, 4.0]))
Example #29
0
 def test_normalization(self):
     """Percentile normalization of a float16 record: dtype bookkeeping and values."""
     records = [(0, array([1, 2, 3, 4, 5], dtype='float16'))]
     series = Series(self.sc.parallelize(records), dtype='float16')
     normalized = series.normalize('percentile')

     # _dtype has to be checked *before* first() runs, because first() updates
     # it even when it had not been set properly
     assert_equals('float16', str(normalized._dtype))

     values = normalized.first()[1]
     assert_equals('float16', str(values.dtype))

     expected = array([-0.42105, 0.10526, 0.63157, 1.15789, 1.68421])
     assert allclose(values, expected, atol=1e-3)
Example #30
0
    def test_toRowMatrix(self):
        """toRowMatrix() gives a 2 x 4 RowMatrix that still supports applyValues()."""
        from thunder.rdds.matrices import RowMatrix
        records = [
            (0, array([4, 5, 6, 7])),
            (1, array([8, 9, 10, 11])),
        ]
        series = Series(self.sc.parallelize(records))
        matrix = series.toRowMatrix()
        assert isinstance(matrix, RowMatrix)
        assert matrix.nrows == 2
        assert matrix.ncols == 4

        # a basic operation from the superclass: add one to every value
        incremented = matrix.applyValues(lambda x: x + 1)
        collected = incremented.collectValuesAsArray()
        expected = array([[5, 6, 7, 8], [9, 10, 11, 12]])
        assert array_equal(collected, expected)
Example #31
0
    def test_meanByFixedLength(self):
        """meanByFixedLength(n) on an 8-value record gives n averaged values and index 0..n-1."""
        records = [((0,), array([0, 1, 2, 3, 4, 5, 6, 7], dtype='float16'))]
        series = Series(self.sc.parallelize(records))

        # (length, expected index, expected averaged values)
        cases = [
            (4, [0, 1, 2, 3], [2, 3, 4, 5]),
            (2, [0, 1], [3, 4]),
        ]
        for length, expectedIndex, expectedValues in cases:
            averaged = series.meanByFixedLength(length)
            assert averaged.keys().collect() == [(0,)]
            assert allclose(averaged.index, array(expectedIndex))
            assert allclose(averaged.values().collect(), [expectedValues])
Example #32
0
 def test_normalization_bymean(self):
     """Mean normalization of a float16 record: dtype bookkeeping and values."""
     records = [(0, array([1, 2, 3, 4, 5], dtype='float16'))]
     series = Series(self.sc.parallelize(records), dtype='float16')
     normalized = series.normalize('mean')

     # _dtype has to be checked *before* first() runs, because first() updates
     # it even when it had not been set properly
     assert_equals('float16', str(normalized._dtype))

     values = normalized.first()[1]
     assert_equals('float16', str(values.dtype))

     expected = array([-0.64516, -0.32258, 0.0, 0.32258, 0.64516])
     assert allclose(normalized.first()[1], expected, atol=1e-3)
Example #33
0
    def test_query_subscripts(self):
        """Query two index groups against records keyed by 2-d subscripts."""
        keyedValues = (
            ((1, 1), [1.0, 2.0, 3.0]),
            ((2, 1), [2.0, 2.0, 4.0]),
            ((1, 2), [4.0, 2.0, 1.0]),
        )
        records = [(key, array(vals)) for key, vals in keyedValues]

        series = Series(self.sc.parallelize(records))

        groups = array([array([1, 2]), array([3])])
        keys, values = series.query(groups)
        # group one averages two records; group two is a single record
        assert allclose(values[0, :], array([1.5, 2., 3.5]))
        assert allclose(values[1, :], array([4.0, 2.0, 1.0]))
Example #34
0
    def test_toRowMatrix(self):
        """Build a RowMatrix from a Series, check its shape, then apply a function to its values."""
        from thunder.rdds.matrices import RowMatrix
        firstRow = (0, array([4, 5, 6, 7]))
        secondRow = (1, array([8, 9, 10, 11]))
        matrix = Series(self.sc.parallelize([firstRow, secondRow])).toRowMatrix()
        assert isinstance(matrix, RowMatrix)
        assert matrix.nrows == 2
        assert matrix.ncols == 4

        # check a basic operation from superclass
        plusOne = matrix.applyValues(lambda x: x + 1).collectValuesAsArray()
        assert array_equal(plusOne, array([[5, 6, 7, 8], [9, 10, 11, 12]]))
Example #35
0
    def test_meanByFixedLength(self):
        """Averaging an 8-value record with fixed lengths 4 and 2."""
        values = array([0, 1, 2, 3, 4, 5, 6, 7], dtype='float16')
        series = Series(self.sc.parallelize([((0, ), values)]))

        # fixed length 4
        byFour = series.meanByFixedLength(4)
        assert byFour.keys().collect() == [(0, )]
        assert allclose(byFour.index, array([0, 1, 2, 3]))
        assert allclose(byFour.values().collect(), [[2, 3, 4, 5]])

        # fixed length 2
        byTwo = series.meanByFixedLength(2)
        assert byTwo.keys().collect() == [(0, )]
        assert allclose(byTwo.index, array([0, 1]))
        assert allclose(byTwo.values().collect(), [[3, 4]])
Example #36
0
    def test_index_setter_getter(self):
        """The index defaults to 0..2, can be replaced, and rejects values of the wrong form."""
        records = [
            ((1, ), array([1.0, 2.0, 3.0])),
            ((2, ), array([2.0, 2.0, 4.0])),
            ((3, ), array([4.0, 2.0, 1.0])),
        ]
        series = Series(self.sc.parallelize(records))

        # default index
        assert_true(array_equal(series.index, array([0, 1, 2])))

        # explicit replacement is read back unchanged
        series.index = [3, 2, 1]
        assert_true(series.index == [3, 2, 1])

        def assignIndex(target, newIndex):
            # attribute assignment wrapped in a callable so assert_raises can invoke it
            target.index = newIndex

        # a scalar and a list of the wrong length are both expected to raise ValueError
        assert_raises(ValueError, assignIndex, series, 5)
        assert_raises(ValueError, assignIndex, series, [1, 2])
Example #37
0
    def test_mass_univariate_classification_ttest_2d(self):
        """Simple classification problem, 2d features"""
        X = array([-1, -2, -0.1, -2, -0.1, -2.1, 1, 1.1, 1, 1, 1.1, 2])
        features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
        samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels, 'features': features, 'samples': samples}

        classifier = MassUnivariateClassifier.load(params, "ttest")

        # results should match a direct calculation using scipy
        series = Series(self.sc.parallelize([(1, X)]))
        featureOne = X[features == 1]
        featureTwo = X[features == 2]

        # first feature only: first three samples against the last three
        fitted = classifier.fit(series, [[1]]).values().collect()
        expected = ttest_ind(featureOne[:3], featureOne[3:])
        assert_array_almost_equal(fitted[0], expected[0])

        # both features, stacked as columns
        fitted = classifier.fit(series, [[1, 2]]).values().collect()
        firstGroup = vstack((featureOne[:3], featureTwo[:3])).T
        secondGroup = vstack((featureOne[3:], featureTwo[3:])).T
        expected = ttest_ind(firstGroup, secondGroup)
        assert_array_almost_equal(fitted[0][0], expected[0])
Example #38
0
    def fromArrays(self, arrays):
        """Create a Series object from a sequence of numpy ndarrays resident in memory on the driver.

        Each array is treated as a single time point, much like converting Images to a Series with
        every array being the volume at one point in time. In the resulting Series, the record with
        key (0,0,0) therefore has the value
        array([arrays[0][0,0,0], arrays[1][0,0,0], ... arrays[n][0,0,0]]).

        The dimensions of the resulting Series are the *reverse* of the shape of the passed arrays.
        The dtype is not changed.

        Parameters
        ----------
        arrays : ndarray or sequence of ndarray
            A single array is treated as a sequence of length one. All arrays must share
            one shape and one dtype.

        Raises
        ------
        ValueError
            If any array's shape or dtype differs from that of the first array.
        """
        # a lone array becomes a one-element sequence
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        # the first array defines the shape and dtype all others must match
        reference = arrays[0]
        shape = reference.shape
        dtype = reference.dtype
        for candidate in arrays:
            if candidate.shape != shape:
                raise ValueError("Inconsistent array shapes: first array had shape %s, but other array has shape %s" %
                                 (str(shape), str(candidate.shape)))
            if candidate.dtype != dtype:
                raise ValueError("Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s" %
                                 (str(dtype), str(candidate.dtype)))

        # product() varies the last axis fastest, matching ravel(); reversing each
        # tuple gives keys whose first element changes fastest
        axisRanges = [xrange(extent) for extent in shape]
        keys = [subscript[::-1] for subscript in itertools.product(*axisRanges)]

        # one row per key, one column per input array
        flattened = [candidate.ravel() for candidate in arrays]
        values = vstack(flattened).T

        dims = Dimensions.fromTuple(shape[::-1])
        records = self.sc.parallelize(zip(keys, values), self.minPartitions)

        return Series(records, dims=dims, dtype=str(dtype))
Example #39
0
    def test_mass_univariate_classification_gnb_2d(self):
        """Simple classification problem, 2d features"""

        X = array([-1, 1, -2, -1, -3, -2, 1, 1, 2, 1, 3, 2])
        features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
        samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels, 'features': features, 'samples': samples}
        classifier = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)

        series = Series(self.sc.parallelize([(1, X)]))

        # first feature predicts perfectly
        scores = classifier.fit(series, [[1]]).values().collect()
        assert_array_almost_equal(scores[0], [1.0])

        # second feature gets one wrong
        scores = classifier.fit(series, [[2]]).values().collect()
        assert_array_almost_equal(scores[0], [5.0 / 6.0])

        # two features together predict perfectly
        scores = classifier.fit(series, [[1, 2]]).values().collect()
        assert_array_almost_equal(scores[0], [1.0])

        # test iteration over multiple feature sets
        scores = classifier.fit(series, [[1, 2], [2]]).values().collect()
        assert_array_almost_equal(scores[0], [1.0, 5.0 / 6.0])
Example #40
0
    def toSeries(self, newDType="smallfloat", casting="safe"):
        """
        Convert this object's records into a Series.

        Every (key, value) record of self.rdd is expanded through
        SimpleBlocks._toSeriesIter; the expanded records are wrapped in a Series
        carrying this object's dims and dtype, which is then cast with astype().

        Parameters
        ----------
        newDType : default "smallfloat"
            Target dtype handed to the new Series' astype().

        casting : default "safe"
            Passed to astype() as its `casting` keyword.
        """
        from thunder.rdds.series import Series

        def expand(kv):
            # generator of (z, y, x) array data for all z, y, x
            return SimpleBlocks._toSeriesIter(kv[0], kv[1])

        # index only when the number of images is known (and non-zero)
        if self._nimages:
            index = arange(self._nimages)
        else:
            index = None

        series = Series(self.rdd.flatMap(expand), dims=self.dims, index=index, dtype=self.dtype)
        return series.astype(newDType, casting=casting)
Example #41
0
    def fromBinary(self,
                   dataPath,
                   ext='bin',
                   confFilename='conf.json',
                   nkeys=None,
                   nvalues=None,
                   keyType=None,
                   valueType=None,
                   newDtype='smallfloat',
                   casting='safe',
                   maxPartitionSize='32mb'):
        """
        Load a Series object from a directory of binary files.

        Every fixed-size record holds `nkeys` key fields followed by `nvalues` value
        fields. The keys become a tuple of ints, the values a numpy array.

        Parameters
        ----------

        dataPath : string URI or local filesystem path
            Specifies the directory or files to be loaded. May be formatted as a URI string with scheme (e.g. "file://",
            "s3n://", or "gs://"). If no scheme is present, will be interpreted as a path on the local filesystem. This path
            must be valid on all workers. Datafile may also refer to a single file, or to a range of files specified
            by a glob-style expression using a single wildcard character '*'.

        ext : string, optional, default 'bin'
            File extension, used when normalizing dataPath into a file pattern.

        confFilename : string, optional, default 'conf.json'
            Name of the configuration file handed to the parameter loader.

        nkeys, nvalues : int, optional, default None
            Number of key fields and of value fields per record.

        keyType, valueType : dtype specifier, optional, default None
            Types of the key and value fields. NOTE(review): these four are passed to the
            parameter loader together with confFilename - presumably explicit values
            take precedence over the configuration file; confirm against the loader.

        newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
            Numpy dtype of output series data. Most methods expect Series data to be floating-point. Input data will be
            cast to the requested `newDtype` if not None - see Data `astype()` method.

        casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
            Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

        maxPartitionSize : str, optional, default = '32mb'
            Maximum size of partitions as Java-style memory, will indirectly control the number of partitions.
            NOTE(review): this argument is not referenced anywhere in the method body.

        """
        params = self.__loadParametersAndDefaults(dataPath, confFilename,
                                                  nkeys, nvalues, keyType,
                                                  valueType)
        self.__checkBinaryParametersAreSpecified(params)

        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        keyDtype = dtypeFunc(params.keytype)
        valDtype = dtypeFunc(params.valuetype)

        # a record is all key bytes followed by all value bytes
        keySize = params.nkeys * keyDtype.itemsize
        recordSize = keySize + params.nvalues * valDtype.itemsize

        def parseRecord(raw):
            # split one raw record into (tuple of int keys, array of values)
            keyFields = frombuffer(buffer(raw, 0, keySize), dtype=keyDtype)
            values = frombuffer(buffer(raw, keySize), dtype=valDtype)
            return tuple(int(field) for field in keyFields), values

        records = self.sc.binaryRecords(dataPath, recordSize).map(parseRecord)

        series = Series(records, dtype=str(valDtype), index=arange(params.nvalues))
        return series.astype(newDtype, casting)
Example #42
0
    def fourier(self, freq=None):
        """
        Compute statistics of a Fourier decomposition on time series data

        Parameters
        ----------
        freq : int
            Digital frequency at which to compute coherence and phase.
            Required, despite the None default in the signature.

        Returns
        -------
        Series
            One record per input record holding [coherence, phase], with
            index ['coherence', 'phase'].

        Raises
        ------
        ValueError
            If freq is None.
        Exception
            If freq is not below half the length of the series index.
        """
        # fail fast on a missing frequency; otherwise the job only breaks later,
        # inside the workers, when None is used to index the spectrum
        if freq is None:
            raise ValueError('A frequency must be specified to compute Fourier statistics')

        def get(y, freq):
            y = y - mean(y)
            nframes = len(y)
            ft = fft.fft(y)
            # keep the first half of the spectrum only
            ft = ft[0:int(fix(nframes/2))]
            ampFt = 2*abs(ft)/nframes
            amp = ampFt[freq]
            ampSum = sqrt(sum(ampFt**2))
            # coherence: amplitude at freq relative to the overall amplitude
            co = amp / ampSum
            ph = -(pi/2) - angle(ft[freq])
            # wrap negative phases into [0, 2*pi)
            if ph < 0:
                ph += pi * 2
            return array([co, ph])

        if freq >= int(fix(size(self.index)/2)):
            raise Exception('Requested frequency, %g, is too high, must be less than half the series duration' % freq)

        rdd = self.rdd.mapValues(lambda x: get(x, freq))
        return Series(rdd, index=['coherence', 'phase']).__finalize__(self)
Example #43
0
    def fromArrays(self, arrays, npartitions=None):
        """
        Create a Series object from a sequence of 1d numpy arrays on the driver.

        Each array becomes one record, keyed by a 1-tuple holding its position
        in the sequence.

        Parameters
        ----------
        arrays : ndarray or sequence of ndarray
            A single array is treated as a sequence of length one. All arrays
            must share one shape and one dtype.

        npartitions : optional, default None
            Passed to parallelize() as its second argument.

        Raises
        ------
        ValueError
            If any array's shape or dtype differs from that of the first array.
        """
        # a lone array becomes a one-element sequence
        if isinstance(arrays, ndarray):
            arrays = [arrays]

        # the first array defines the shape and dtype all others must match
        reference = arrays[0]
        shape = reference.shape
        dtype = reference.dtype
        for candidate in arrays:
            if candidate.shape != shape:
                raise ValueError(
                    "Inconsistent array shapes: first array had shape %s, but other array has shape %s"
                    % (str(shape), str(candidate.shape)))
            if candidate.dtype != dtype:
                raise ValueError(
                    "Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s"
                    % (str(dtype), str(candidate.dtype)))

        # linear keys: (0,), (1,), ...
        keys = [(position, ) for position in xrange(0, len(arrays))]

        records = self.sc.parallelize(zip(keys, arrays), npartitions)
        return Series(records, dtype=str(dtype))
Example #44
0
    def fit(self, data, comps=None):
        """
        Fit mass univariate regression models

        Parameters
        ----------
        data : Series or a subclass (e.g. RowMatrix)
            The data to fit regression models to, a collection of
            key-value pairs where the keys are identifiers and the values are
            one-dimensional arrays

        comps : array-like, optional, default = None
            When given, no Series is returned. Instead, for every record the
            first output of the fit is mean-subtracted and combined with comps
            through inner(); the outer product of the record with that result
            is then averaged over all records.

        Returns
        -------
        result : Series
            Fitted model parameters: betas, summary statistic, and residuals
            (when comps is None)

        traj : array
            Average over records of the outer products described above
            (when comps is given)

        Raises
        ------
        Exception
            If data is not a Series (or subclass) instance.
        """

        if not isinstance(data, Series):
            raise Exception(
                'Input must be Series or a subclass (e.g. RowMatrix)')

        if comps is None:
            fitted = data.rdd.mapValues(lambda x: self.get(x))
            return Series(fitted,
                          index=['betas', 'stats', 'resid']).__finalize__(data)

        def project(record):
            # record is a (key, value) pair; the key is not needed here.
            # Fit the value once and reuse the result for both the betas and their mean.
            x = record[1]
            betas = self.get(x)[0]
            return outer(x, inner(betas - mean(betas), comps))

        traj = data.rdd.map(project).sum() / data.count()
        return traj
Example #45
0
 def coeffs(self):
     """
     Series containing the coefficients of the model.

     Built from each model's `betas` on first access and cached on the
     instance as `_coeffs`; later calls return the cached object.
     """
     if hasattr(self, '_coeffs'):
         return self._coeffs
     betas = self._models.mapValues(lambda model: model.betas)
     self._coeffs = Series(betas)
     return self._coeffs
Example #46
0
    def fromText(self, dataPath, nkeys=None, ext="txt", dtype='float64'):
        """
        Loads Series data from text files.

        Every line is read as whitespace-separated numbers: the first `nkeys`
        become the integer key tuple, the remaining ones the values.

        Parameters
        ----------
        dataPath : string
            Specifies the file or files to be loaded. dataPath may be either a URI (with scheme specified) or a path
            on the local filesystem.
            If a path is passed (determined by the absence of a scheme component when attempting to parse as a URI),
            and it is not already a wildcard expression and does not end in <ext>, then it will be converted into a
            wildcard pattern by appending '/*.ext'. This conversion can be avoided by passing a "file://" URI.

        nkeys : int, optional, default None
            Number of leading fields on each line that form the record key.
            NOTE(review): with the default None both slices in parse() cover the whole
            line, so every number ends up in the key *and* in the values - callers
            presumably always pass nkeys; confirm.

        ext : string, optional, default "txt"
            File extension used when normalizing dataPath into a file pattern.

        dtype: dtype or dtype specifier, default 'float64'
            dtype of the value arrays.

        """
        dataPath = self.__normalizeDatafilePattern(dataPath, ext)

        def parse(line, nkeys_):
            # split() with no argument tolerates repeated, leading/trailing and tab
            # whitespace, where split(' ') would produce empty fields that float() rejects
            vec = [float(x) for x in line.split()]
            ts = array(vec[nkeys_:], dtype=dtype)
            keys = tuple(int(x) for x in vec[:nkeys_])
            return keys, ts

        lines = self.sc.textFile(dataPath, self.minPartitions)
        data = lines.map(lambda x: parse(x, nkeys))
        return Series(data, dtype=str(dtype))
Example #47
0
    def test_index_setter_getter(self):
        """Read the default index, replace it, and check that malformed indices raise ValueError."""
        keyedValues = (
            ((1,), [1.0, 2.0, 3.0]),
            ((2,), [2.0, 2.0, 4.0]),
            ((3,), [4.0, 2.0, 1.0]),
        )
        records = [(key, array(vals)) for key, vals in keyedValues]
        series = Series(self.sc.parallelize(records))

        assert_true(array_equal(series.index, array([0, 1, 2])))
        replacement = [3, 2, 1]
        series.index = replacement
        assert_true(series.index == [3, 2, 1])

        def assignIndex(target, newIndex):
            # wraps the attribute assignment so it can be passed to assert_raises
            target.index = newIndex

        for badIndex in (5, [1, 2]):
            assert_raises(ValueError, assignIndex, series, badIndex)
Example #48
0
    def test_normalization_bywindow(self):
        """normalize('window') with windows 2 and 6 matches (y - b) / (b + 0.1) for hand-computed baselines b."""
        y = array([1, 2, 3, 4, 5], dtype='float16')
        series = Series(self.sc.parallelize([(0, y)]), dtype='float16')

        # window of 2
        normalized = series.normalize('window', window=2)
        # inspect _dtype *before* first(): first() refreshes the value even when it
        # was never set correctly, which would hide the problem
        assert_equals('float16', str(normalized._dtype))
        values = normalized.first()[1]
        assert_equals('float64', str(values.dtype))
        baseline = array([1.2,  1.4,  2.4,  3.4,  4.2])
        expected = (y - baseline) / (baseline + 0.1)
        assert allclose(values, expected, atol=1e-3)

        # window of 6, wider than the record itself
        normalized = series.normalize('window', window=6)
        values = normalized.first()[1]
        baseline = array([1.6,  1.8,  1.8,  1.8,  2.6])
        expected = (y - baseline) / (baseline + 0.1)
        assert allclose(values, expected, atol=1e-3)
Example #49
0
 def setUp(self):
     """Prepare self.dataLocal (four keyed records) and self.series (a Series over them)."""
     super(TestSeriesRegionMeanMethods, self).setUp()
     keyedValues = (
         ((0, 0), [1.0, 2.0, 3.0]),
         ((0, 1), [2.0, 2.0, 4.0]),
         ((1, 0), [4.0, 2.0, 1.0]),
         ((1, 1), [3.0, 1.0, 1.0]),
     )
     self.dataLocal = [(key, array(vals)) for key, vals in keyedValues]
     # dtype comes from the first record's values; the index covers their three positions
     valueDtype = self.dataLocal[0][1].dtype
     rdd = self.sc.parallelize(self.dataLocal)
     self.series = Series(rdd, dtype=valueDtype, index=arange(3))
Example #50
0
    def test_seriesAggregateByIndex(self):
        """seriesAggregateByIndex(sum) totals the values sharing an index entry.

        Checked on a flat index and on levels [0, 1] of a three-level index.
        """
        dataLocal = [((1,), arange(12))]
        index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(dataLocal), index=index)

        # flat index: three groups of four consecutive values
        result = data.seriesAggregateByIndex(sum)
        assert_true(array_equal(result.values().first(), array([6, 22, 38])))
        assert_true(array_equal(result.index, array([0, 1, 2])))

        # three-level index, one row per level before transposing
        index = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
        ]
        data.index = array(index).T

        # aggregate over the first two levels only
        result = data.seriesAggregateByIndex(sum, level=[0, 1])
        assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
        assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
Example #51
0
    def test_selectByIndex(self):
        """Exercise selectByIndex() on a flat index and on a three-level index."""
        records = [((1,), arange(12))]
        flatIndex = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        series = Series(self.sc.parallelize(records), index=flatIndex)

        # flat index, value 1
        picked = series.selectByIndex(1)
        assert_true(array_equal(picked.values().first(), array([4, 5, 6, 7])))
        assert_true(array_equal(picked.index, array([1, 1, 1, 1])))

        # same selection with squeeze=True: the index is expected to be 0..3
        picked = series.selectByIndex(1, squeeze=True)
        assert_true(array_equal(picked.index, array([0, 1, 2, 3])))

        # switch to a three-level index, one row per level before transposing
        levels = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3],
        ]
        series.index = array(levels).T

        # value 0 on level 2, also asking for the selection mask
        picked, mask = series.selectByIndex(0, level=2, returnMask=True)
        assert_true(array_equal(picked.values().first(), array([0, 2, 6, 8])))
        assert_true(array_equal(picked.index, array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]])))
        assert_true(array_equal(mask, array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])))

        # same selection with squeeze=True: the selected level is absent from the index
        picked = series.selectByIndex(0, level=2, squeeze=True)
        assert_true(array_equal(picked.values().first(), array([0, 2, 6, 8])))
        assert_true(array_equal(picked.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

        # one value per level, on two levels
        picked = series.selectByIndex([1, 0], level=[0, 1])
        assert_true(array_equal(picked.values().first(), array([6, 7])))
        assert_true(array_equal(picked.index, array([[1, 0, 0], [1, 0, 1]])))

        # a list of accepted values on the second requested level
        picked = series.selectByIndex(val=[0, [2, 3]], level=[0, 2])
        assert_true(array_equal(picked.values().first(), array([4, 5])))
        assert_true(array_equal(picked.index, array([[0, 1, 2], [0, 1, 3]])))

        # filter=True: the expected result holds the entries whose level-1 value is NOT 1
        picked = series.selectByIndex(1, level=1, filter=True)
        assert_true(array_equal(picked.values().first(), array([0, 1, 6, 7])))
        assert_true(array_equal(picked.index, array([[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]])))
Example #52
0
class TestSeriesRegionMeanMethods(PySparkTestCase):
    """
    Tests for Series.meanOfRegion and Series.meanByRegions.

    The fixture is four three-element float records keyed by the 2-tuples
    (0, 0), (0, 1), (1, 0) and (1, 1). Regions are passed either as lists of
    record keys or as uint8 mask arrays; the expectations in the mask tests
    treat each distinct nonzero mask value as one region.
    """
    def setUp(self):
        super(TestSeriesRegionMeanMethods, self).setUp()
        self.dataLocal = [
            ((0, 0), array([1.0, 2.0, 3.0])),
            ((0, 1), array([2.0, 2.0, 4.0])),
            ((1, 0), array([4.0, 2.0, 1.0])),
            ((1, 1), array([3.0, 1.0, 1.0]))
        ]
        self.series = Series(self.sc.parallelize(self.dataLocal),
                             dtype=self.dataLocal[0][1].dtype,
                             index=arange(3))

    def __expectedForItems(self, itemIdxs):
        """
        Build the expectations for a region made of the given fixture records.

        Parameters
        ----------
        itemIdxs : sequence of int
            0-based positions into self.dataLocal.

        Returns
        -------
        (keys, expectedKeys, expected) where keys is the list of record keys,
        expectedKeys is the element-wise mean of those keys cast to int16 and
        packed in a tuple, and expected is the element-wise mean of the
        record values.
        """
        keys = [self.dataLocal[idx][0] for idx in itemIdxs]
        expectedKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
        expected = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
        return keys, expectedKeys, expected

    def __setup_meanByRegion(self, useMask=False):
        """
        Return (keys, expectedKeys, expected) for the region made of fixture
        records 1 and 2 (0-based). With useMask=True, keys is replaced by a
        uint8 mask array marking those same two records.
        """
        keys, expectedKeys, expected = self.__expectedForItems([1, 2])
        if useMask:
            keys = array([[0, 1], [1, 0]], dtype='uint8')
        return keys, expectedKeys, expected

    @staticmethod
    def __checkAsserts(expectedLen, expectedKeys, expected, actual):
        """Check a single (key, value) result: its length, key and value array."""
        assert_equals(expectedLen, len(actual))
        assert_equals(expectedKeys, actual[0])
        assert_true(array_equal(expected, actual[1]))

    @staticmethod
    def __checkNestedAsserts(expectedLen, expectedKeys, expected, actual):
        """Check a collected list of (key, value) results, record by record."""
        assert_equals(expectedLen, len(actual))
        for i in xrange(expectedLen):
            assert_equals(expectedKeys[i], actual[i][0])
            assert_true(array_equal(expected[i], actual[i][1]))

    def __checkReturnedSeriesAttributes(self, newSeries):
        """Check the metadata of a Series returned by meanByRegions."""
        assert_is_none(newSeries._dims)  # check that new _dims is unset
        assert_equals(self.series.dtype, newSeries._dtype)  # check that new dtype is set
        assert_true(array_equal(self.series.index, newSeries._index))  # check that new index is set
        assert_is_not_none(newSeries.dims)  # check that new dims is at least calculable (expected to be meaningless)

    def __run_tst_meanOfRegion(self, useMask):
        # NOTE(review): spelled "tst", presumably so test collectors do not
        # mistake this shared driver for a test case - confirm
        keys, expectedKeys, expected = self.__setup_meanByRegion(useMask)
        actual = self.series.meanOfRegion(keys)
        self.__checkAsserts(2, expectedKeys, expected, actual)

    def test_meanOfRegion(self):
        self.__run_tst_meanOfRegion(False)

    def test_meanOfRegionWithMask(self):
        self.__run_tst_meanOfRegion(True)

    def test_meanOfRegionErrorsOnMissing(self):
        keys = [(17, 24), (17, 25)]
        # if no records match, return None, None
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_is_none(actualKey)
        assert_is_none(actualVal)
        # if we have only a partial match but haven't turned on validation, return a sensible value
        keys = [(0, 1), (17, 25)]
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_equals((0, 1), actualKey)
        assert_true(array_equal(self.dataLocal[1][1], actualVal))
        # throw an error on a partial match when validation turned on
        assert_raises(ValueError, self.series.meanOfRegion, keys, validate=True)

    def test_meanByRegions_singleRegion(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()

        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        self.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegionsErrorsOnMissing(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()
        # add a key that matches no record; keys is a fresh list, so this is local
        keys += [(17, 25)]

        # check that we get a sensible value with validation turned off:
        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        self.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

        # throw an error on a partial match when validation turned on
        # this error will be on the workers, which propagates back to the driver
        # as something other than the ValueError that it started out life as
        assert_raises(Exception, self.series.meanByRegions([keys], validate=True).count)

    def test_meanByRegions_singleRegionWithMask(self):
        mask, expectedKeys, expected = self.__setup_meanByRegion(True)

        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        self.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegions_twoRegions(self):
        # two regions that share fixture record 1
        nestedKeys, expectedKeys, expected = [], [], []
        for itemIdxs in [(0, 1), (1, 2)]:
            keys, avgKeys, avgVals = self.__expectedForItems(itemIdxs)
            nestedKeys.append(keys)
            expectedKeys.append(avgKeys)
            expected.append(avgVals)

        actualSeries = self.series.meanByRegions(nestedKeys)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        self.__checkNestedAsserts(2, expectedKeys, expected, actual)

    def test_meanByRegions_twoRegionsWithMask(self):
        expectedKeys, expected = [], []
        # mask value 1 marks records 0 and 1, value 2 marks record 2, 0 marks none
        mask = array([[1, 1], [2, 0]], dtype='uint8')
        for itemIdxs in [(0, 1), (2, )]:
            _, avgKeys, avgVals = self.__expectedForItems(itemIdxs)
            expectedKeys.append(avgKeys)
            expected.append(avgVals)

        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        self.__checkNestedAsserts(2, expectedKeys, expected, actual)
Example #53
0
    def test_seriesStatByIndex(self):
        # Checks the per-index-group statistics of one 12-element record, both
        # through seriesStatByIndex(name) and through the named shortcut methods.
        def firstValues(series):
            return series.values().first()

        records = [((1,), arange(12))]
        groupLabels = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
        data = Series(self.sc.parallelize(records), index=groupLabels)

        # expected result per group of four consecutive values
        sums = array([6, 22, 38])
        means = array([1.5, 5.5, 9.5])
        mins = array([0, 4, 8])
        maxes = array([3, 7, 11])
        counts = array([4, 4, 4])
        medians = array([1.5, 5.5, 9.5])

        # generic entry point, statistic chosen by name
        namedStats = (
            ('sum', sums),
            ('mean', means),
            ('min', mins),
            ('max', maxes),
            ('count', counts),
            ('median', medians),
        )
        for statName, expected in namedStats:
            assert_true(array_equal(firstValues(data.seriesStatByIndex(statName)), expected))

        # dedicated shortcut methods should agree with the generic entry point
        assert_true(array_equal(firstValues(data.seriesSumByIndex()), sums))
        assert_true(array_equal(firstValues(data.seriesMeanByIndex()), means))
        assert_true(array_equal(firstValues(data.seriesMinByIndex()), mins))
        assert_true(array_equal(firstValues(data.seriesMaxByIndex()), maxes))
        assert_true(array_equal(firstValues(data.seriesCountByIndex()), counts))
        assert_true(array_equal(firstValues(data.seriesMedianByIndex()), medians))

        # switch to a three-level index; each row below is one level
        levels = [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
            [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
            [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
        ]
        data.index = array(levels).transpose()

        # grouping on levels 0 and 1 gives four (level0, level1) groups
        pairSums = array([1, 14, 13, 38])
        pairIndex = array([[0,0], [0, 1], [1, 0], [1, 1]])

        viaName = data.seriesStatByIndex('sum', level=[0, 1])
        assert_true(array_equal(firstValues(viaName), pairSums))
        assert_true(array_equal(viaName.index, pairIndex))

        viaShortcut = data.seriesSumByIndex(level=[0, 1])
        assert_true(array_equal(firstValues(viaShortcut), pairSums))
        assert_true(array_equal(viaShortcut.index, pairIndex))
Example #54
0
 def test_standardization_axis1(self):
     # center / standardize / zscore with argument 1, checked on the first record only
     # NOTE(review): both records carry key 0 - presumably unintentional, but
     # the assertions below do not look at keys; confirm before changing
     records = [(0, array([1, 2])), (0, array([3, 4]))]
     series = Series(self.sc.parallelize(records))

     centered = series.center(1).first()[1]
     assert allclose(centered, array([-1, -1]))

     standardized = series.standardize(1).first()[1]
     assert allclose(standardized, array([1, 2]))

     zscored = series.zscore(1).first()[1]
     assert allclose(zscored, array([-1, -1]))
Example #55
0
 def test_toTimeSeries(self):
     # Series.toTimeSeries() should hand back a TimeSeries instance
     from thunder.rdds.timeseries import TimeSeries
     records = [(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))]
     series = Series(self.sc.parallelize(records))
     converted = series.toTimeSeries()
     assert isinstance(converted, TimeSeries)
Example #56
0
 def test_detrend(self):
     # a single record holding a perfectly linear ramp
     ramp = array([1, 2, 3, 4, 5])
     series = Series(self.sc.parallelize([(0, ramp)]))
     detrended = series.detrend('linear')
     # removing the linear trend from linearly increasing data should leave all 0s
     residual = detrended.first()[1]
     assert allclose(residual, array([0, 0, 0, 0, 0]))
Example #57
0
 def test_between(self):
     # Series.between(0, 1) on two four-element records: the assertions below
     # expect the resulting index to be [0, 1] and the first record to be cut
     # down to its first two values.
     rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
     data = Series(rdd).between(0, 1)
     # both bounds appear in the expected index
     assert(allclose(data.index, array([0, 1])))
     assert(allclose(data.first()[1], array([4, 5])))