def test_ind_to_sub_rdd(self):
    dataLocal = map(lambda x: (x, array([1.0])), range(1, 13))
    data = Series(self.sc.parallelize(dataLocal))
    subs = data.indToSub(dims=[2, 3, 2]).keys().collect()
    assert(allclose(subs, array([(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
                                 (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)])))

def test_toTimeSeries(self):
    from thunder.rdds.timeseries import TimeSeries
    rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
    data = Series(rdd)
    ts = data.toTimeSeries()
    assert(isinstance(ts, TimeSeries))

def test_to_row_matrix(self):
    from thunder.rdds.matrices import RowMatrix
    rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
    data = Series(rdd)
    mat = data.toRowMatrix()
    assert(isinstance(mat, RowMatrix))
    assert(mat.nrows == 2)
    assert(mat.ncols == 4)

def predictAndScore(self, X, y):
    X = self._transforms.apply(X)
    joined = self._models.join(y.rdd)
    results = joined.mapValues(lambda (model, y): model.predictWithStats(X, y))
    yhat = results.mapValues(lambda v: v[0])
    stats = results.mapValues(lambda v: v[1])
    return Series(yhat), Series(stats)

def test_sub_to_ind_rdd(self):
    subs = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
            (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)]
    dataLocal = map(lambda x: (x, array([1.0])), subs)
    data = Series(self.sc.parallelize(dataLocal))
    inds = array(data.subToInd().keys().collect())
    assert(allclose(inds, array(range(1, 13))))

def test_select(self):
    rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
    data = Series(rdd, index=['label1', 'label2', 'label3', 'label4'])
    selection1 = data.select(['label1'])
    assert(allclose(selection1.first()[1], 4))
    selection1 = data.select('label1')
    assert(allclose(selection1.first()[1], 4))
    selection2 = data.select(['label1', 'label2'])
    assert(allclose(selection2.first()[1], array([4, 5])))

def test_squelch(self):
    rdd = self.sc.parallelize([(0, array([1, 2])), (0, array([3, 4]))])
    data = Series(rdd)
    squelched = data.squelch(5)
    assert(allclose(squelched.collectValuesAsArray(), [[0, 0], [0, 0]]))
    squelched = data.squelch(3)
    assert(allclose(squelched.collectValuesAsArray(), [[0, 0], [3, 4]]))
    squelched = data.squelch(1)
    assert(allclose(squelched.collectValuesAsArray(), [[1, 2], [3, 4]]))

def test_standardization_axis0(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
    data = Series(rdd, dtype='float16')
    centered = data.center(0)
    standardized = data.standardize(0)
    zscored = data.zscore(0)
    assert(allclose(centered.first()[1], array([-2, -1, 0, 1, 2]), atol=1e-3))
    assert(allclose(standardized.first()[1], array([0.70710, 1.41421, 2.12132, 2.82842, 3.53553]), atol=1e-3))
    assert(allclose(zscored.first()[1], array([-1.41421, -0.70710, 0, 0.70710, 1.41421]), atol=1e-3))

def test_standardization_axis1(self):
    rdd = self.sc.parallelize([(0, array([1, 2], dtype='float16')), (0, array([3, 4], dtype='float16'))])
    data = Series(rdd, dtype='float16')
    centered = data.center(1)
    standardized = data.standardize(1)
    zscored = data.zscore(1)
    assert(allclose(centered.first()[1], array([-1, -1]), atol=1e-3))
    assert(allclose(standardized.first()[1], array([1, 2]), atol=1e-3))
    assert(allclose(zscored.first()[1], array([-1, -1]), atol=1e-3))

def test_round_trip_rdd(self):
    subs = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
            (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2)]
    dataLocal = map(lambda x: (x, array([1.0])), subs)
    data = Series(self.sc.parallelize(dataLocal))
    start = data.keys().collect()
    stop = data.subToInd().indToSub().keys().collect()
    assert(allclose(array(start), array(stop)))

def test_normalization(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
    data = Series(rdd, dtype='float16')
    out = data.normalize('percentile')
    # check that _dtype has been set properly *before* calling first(), b/c first() will update this
    # value even if it hasn't been correctly set
    assert_equals('float16', str(out._dtype))
    vals = out.first()[1]
    assert_equals('float16', str(vals.dtype))
    assert(allclose(vals, array([-0.42105, 0.10526, 0.63157, 1.15789, 1.68421]), atol=1e-3))

def test_toImages(self):
    from thunder.rdds.images import Images
    rdd = self.sc.parallelize([((0, 0), array([1])), ((0, 1), array([2])),
                               ((1, 0), array([3])), ((1, 1), array([4]))])
    data = Series(rdd)
    imgs = data.toImages()
    assert(isinstance(imgs, Images))
    im = imgs.values().first()
    assert(allclose(im, [[1, 2], [3, 4]]))

def test_subset(self):
    rdd = self.sc.parallelize([(0, array([1, 5], dtype='float16')), (0, array([1, 10], dtype='float16')),
                               (0, array([1, 15], dtype='float16'))])
    data = Series(rdd)
    assert_equal(len(data.subset(3, stat='min', thresh=0)), 3)
    assert_array_equal(data.subset(1, stat='max', thresh=10), [[1, 15]])
    assert_array_equal(data.subset(1, stat='mean', thresh=6), [[1, 15]])
    assert_array_equal(data.subset(1, stat='std', thresh=6), [[1, 15]])
    assert_array_equal(data.subset(1, thresh=6), [[1, 15]])

def test_correlate(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
    data = Series(rdd, dtype='float16')
    sig1 = [4, 5, 6, 7, 8]
    corrData = data.correlate(sig1)
    assert_equals('float64', corrData._dtype)
    corr = corrData.values().collect()
    assert(allclose(corr[0], 1))
    sig12 = [[4, 5, 6, 7, 8], [8, 7, 6, 5, 4]]
    corrs = data.correlate(sig12).values().collect()
    assert(allclose(corrs[0], [1, -1]))

def test_query_subscripts(self):
    dataLocal = [
        ((1, 1), array([1.0, 2.0, 3.0])),
        ((2, 1), array([2.0, 2.0, 4.0])),
        ((1, 2), array([4.0, 2.0, 1.0]))
    ]
    data = Series(self.sc.parallelize(dataLocal))
    inds = array([array([1, 2]), array([3])])
    keys, values = data.query(inds)
    assert(allclose(values[0, :], array([1.5, 2., 3.5])))
    assert(allclose(values[1, :], array([4.0, 2.0, 1.0])))

def test_query_linear_singleton(self):
    dataLocal = [
        ((1,), array([1.0, 2.0, 3.0])),
        ((2,), array([2.0, 2.0, 4.0])),
        ((3,), array([4.0, 2.0, 1.0]))
    ]
    data = Series(self.sc.parallelize(dataLocal))
    inds = array([array([1, 2])])
    keys, values = data.query(inds)
    assert(allclose(values[0, :], array([1.5, 2., 3.5])))
    assert_equals(data.dtype, values[0, :].dtype)

def test_normalization_bymean(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5], dtype='float16'))])
    data = Series(rdd, dtype='float16')
    out = data.normalize('mean')
    # check that _dtype has been set properly *before* calling first(), b/c first() will update this
    # value even if it hasn't been correctly set
    assert_equals('float16', str(out._dtype))
    vals = out.first()[1]
    assert_equals('float16', str(vals.dtype))
    assert(allclose(out.first()[1], array([-0.64516, -0.32258, 0.0, 0.32258, 0.64516]), atol=1e-3))

def test_seriesStats(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5]))])
    data = Series(rdd)
    assert(allclose(data.seriesMean().first()[1], 3.0))
    assert(allclose(data.seriesSum().first()[1], 15.0))
    assert(allclose(data.seriesMedian().first()[1], 3.0))
    assert(allclose(data.seriesStdev().first()[1], 1.4142135))
    assert(allclose(data.seriesStat('mean').first()[1], 3.0))
    assert(allclose(data.seriesStats().select('mean').first()[1], 3.0))
    assert(allclose(data.seriesStats().select('count').first()[1], 5))
    assert(allclose(data.seriesPercentile(25).first()[1], 2.0))
    assert(allclose(data.seriesPercentile((25, 75)).first()[1], array([2.0, 4.0])))

def test_toRowMatrix(self):
    from thunder.rdds.matrices import RowMatrix
    rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
    data = Series(rdd)
    mat = data.toRowMatrix()
    assert(isinstance(mat, RowMatrix))
    assert(mat.nrows == 2)
    assert(mat.ncols == 4)
    # check a basic operation from superclass
    newmat = mat.applyValues(lambda x: x + 1)
    out = newmat.collectValuesAsArray()
    assert(array_equal(out, array([[5, 6, 7, 8], [9, 10, 11, 12]])))

def test_meanByFixedLength(self):
    rdd = self.sc.parallelize([((0,), array([0, 1, 2, 3, 4, 5, 6, 7], dtype='float16'))])
    data = Series(rdd)
    test1 = data.meanByFixedLength(4)
    assert(test1.keys().collect() == [(0,)])
    assert(allclose(test1.index, array([0, 1, 2, 3])))
    assert(allclose(test1.values().collect(), [[2, 3, 4, 5]]))
    test2 = data.meanByFixedLength(2)
    assert(test2.keys().collect() == [(0,)])
    assert(allclose(test2.index, array([0, 1])))
    assert(allclose(test2.values().collect(), [[3, 4]]))

def test_index_setter_getter(self):
    dataLocal = [
        ((1,), array([1.0, 2.0, 3.0])),
        ((2,), array([2.0, 2.0, 4.0])),
        ((3,), array([4.0, 2.0, 1.0]))
    ]
    data = Series(self.sc.parallelize(dataLocal))
    assert_true(array_equal(data.index, array([0, 1, 2])))
    data.index = [3, 2, 1]
    assert_true(data.index == [3, 2, 1])

    def setIndex(data, idx):
        data.index = idx

    assert_raises(ValueError, setIndex, data, 5)
    assert_raises(ValueError, setIndex, data, [1, 2])

def test_mass_univariate_classification_ttest_2d(self):
    """Simple classification problem, 2d features"""
    X = array([-1, -2, -0.1, -2, -0.1, -2.1, 1, 1.1, 1, 1, 1.1, 2])
    features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
    samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
    labels = array([1, 1, 1, 2, 2, 2])
    params = dict([('labels', labels), ('features', features), ('samples', samples)])
    clf = MassUnivariateClassifier.load(params, "ttest")

    # should match direct calculation using scipy
    # test first feature only
    data = Series(self.sc.parallelize(zip([1], [X])))
    result = clf.fit(data, [[1]]).values().collect()
    ground_truth = ttest_ind(X[features == 1][:3], X[features == 1][3:])
    assert_array_almost_equal(result[0], ground_truth[0])

    # test both features
    result = clf.fit(data, [[1, 2]]).values().collect()
    ground_truth = ttest_ind(
        vstack((X[features == 1][:3], X[features == 2][:3])).T,
        vstack((X[features == 1][3:], X[features == 2][3:])).T)
    assert_array_almost_equal(result[0][0], ground_truth[0])

def fromArrays(self, arrays):
    """Create a Series object from a sequence of numpy ndarrays resident in memory on the driver.

    The arrays will be interpreted as though each represents a single time point - effectively the
    same as if converting Images to a Series, with each array representing a volume image at a
    particular point in time. Thus in the resulting Series, the value of the record with key (0,0,0)
    will be array([arrays[0][0,0,0], arrays[1][0,0,0], ..., arrays[n][0,0,0]]).

    The dimensions of the resulting Series will be *opposite* that of the passed numpy arrays.
    Their dtype will not be changed.
    """
    # if passed a single array, cast it to a sequence of length 1
    if isinstance(arrays, ndarray):
        arrays = [arrays]

    # check that shapes of passed arrays are consistent
    shape = arrays[0].shape
    dtype = arrays[0].dtype
    for ary in arrays:
        if not ary.shape == shape:
            raise ValueError("Inconsistent array shapes: first array had shape %s, but other array has shape %s" %
                             (str(shape), str(ary.shape)))
        if not ary.dtype == dtype:
            raise ValueError("Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s" %
                             (str(dtype), str(ary.dtype)))

    # get indices so that fastest index changes first
    shapeiters = (xrange(n) for n in shape)
    keys = [idx[::-1] for idx in itertools.product(*shapeiters)]

    values = vstack([ary.ravel() for ary in arrays]).T

    dims = Dimensions.fromTuple(shape[::-1])

    return Series(self.sc.parallelize(zip(keys, values), self.minPartitions), dims=dims, dtype=str(dtype))

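# A minimal local sketch (not part of the loader above) of the value layout its docstring
# describes: each input array is one time point, so the record for a given spatial position
# holds that position's value from every array. The example arrays here are made up.
import numpy as np

a0 = np.array([[1, 2], [3, 4]])   # "volume" at time 0
a1 = np.array([[5, 6], [7, 8]])   # "volume" at time 1

# same construction as in fromArrays: stack the raveled arrays and transpose,
# so row i is the time course at spatial position i (in ravel order)
values = np.vstack([a.ravel() for a in (a0, a1)]).T
print(values[0])   # time course at position (0, 0): [1 5]
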
def test_mass_univariate_classification_gnb_2d(self):
    """Simple classification problem, 2d features"""
    X = array([-1, 1, -2, -1, -3, -2, 1, 1, 2, 1, 3, 2])
    features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
    samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
    labels = array([1, 1, 1, 2, 2, 2])
    params = dict([('labels', labels), ('features', features), ('samples', samples)])
    clf = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)
    data = Series(self.sc.parallelize(zip([1], [X])))

    # first feature predicts perfectly
    result = clf.fit(data, [[1]]).values().collect()
    assert_array_almost_equal(result[0], [1.0])

    # second feature gets one wrong
    result = clf.fit(data, [[2]]).values().collect()
    assert_array_almost_equal(result[0], [5.0 / 6.0])

    # two features together predict perfectly
    result = clf.fit(data, [[1, 2]]).values().collect()
    assert_array_almost_equal(result[0], [1.0])

    # test iteration over multiple feature sets
    result = clf.fit(data, [[1, 2], [2]]).values().collect()
    assert_array_almost_equal(result[0], [1.0, 5.0 / 6.0])

def toSeries(self, newDType="smallfloat", casting="safe"):
    from thunder.rdds.series import Series

    # returns generator of (z, y, x) array data for all z, y, x
    seriesRdd = self.rdd.flatMap(lambda kv: SimpleBlocks._toSeriesIter(kv[0], kv[1]))

    idx = arange(self._nimages) if self._nimages else None
    return Series(seriesRdd, dims=self.dims, index=idx, dtype=self.dtype).astype(newDType, casting=casting)

def fromBinary(self, dataPath, ext='bin', confFilename='conf.json', nkeys=None, nvalues=None,
               keyType=None, valueType=None, newDtype='smallfloat', casting='safe',
               maxPartitionSize='32mb'):
    """
    Load a Series object from a directory of binary files.

    Parameters
    ----------
    dataPath : string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a URI string with scheme
        (e.g. "file://", "s3n://", or "gs://"). If no scheme is present, will be interpreted as a path
        on the local filesystem. This path must be valid on all workers. dataPath may also refer to a
        single file, or to a range of files specified by a glob-style expression using a single
        wildcard character '*'.

    newDtype : dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data to be floating-point.
        Input data will be cast to the requested `newDtype` if not None - see Data `astype()` method.

    casting : 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy documentation for details.

    maxPartitionSize : str, optional, default = '32mb'
        Maximum size of partitions as Java-style memory; will indirectly control the number of partitions.
    """
    paramsObj = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
    self.__checkBinaryParametersAreSpecified(paramsObj)

    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    keyDtype = dtypeFunc(paramsObj.keytype)
    valDtype = dtypeFunc(paramsObj.valuetype)

    keySize = paramsObj.nkeys * keyDtype.itemsize
    recordSize = keySize + paramsObj.nvalues * valDtype.itemsize

    lines = self.sc.binaryRecords(dataPath, recordSize)

    get = lambda v: (tuple(int(x) for x in frombuffer(buffer(v, 0, keySize), dtype=keyDtype)),
                     frombuffer(buffer(v, keySize), dtype=valDtype))
    data = lines.map(get)

    return Series(data, dtype=str(valDtype), index=arange(paramsObj.nvalues)).astype(newDtype, casting)

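# A sketch (an assumption, not from the source) of producing data that matches the record
# layout fromBinary reads: each fixed-size record is the key packed as nkeys values of the
# key dtype followed by nvalues values of the value dtype, back to back. The conf.json field
# names are inferred from the paramsObj attributes used above (nkeys, nvalues, keytype,
# valuetype); the file names, dtypes, and sizes here are illustrative.
import json
import numpy as np

nkeys, nvalues = 3, 4
keys = np.array([[0, 0, 0], [1, 0, 0]], dtype='int16')               # keys for two records
vals = np.arange(2 * nvalues, dtype='float32').reshape(2, nvalues)   # values for two records

with open('series.bin', 'wb') as f:
    for k, v in zip(keys, vals):
        f.write(k.tobytes())   # keySize = nkeys * keyDtype.itemsize bytes
        f.write(v.tobytes())   # followed by nvalues * valDtype.itemsize bytes

with open('conf.json', 'w') as f:
    json.dump({'nkeys': nkeys, 'nvalues': nvalues,
               'keytype': 'int16', 'valuetype': 'float32'}, f)
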
def fourier(self, freq=None):
    """
    Compute statistics of a Fourier decomposition on time series data

    Parameters
    ----------
    freq : int
        Digital frequency at which to compute coherence and phase
    """
    def get(y, freq):
        y = y - mean(y)
        nframes = len(y)
        ft = fft.fft(y)
        ft = ft[0:int(fix(nframes/2))]
        ampFt = 2*abs(ft)/nframes
        amp = ampFt[freq]
        ampSum = sqrt(sum(ampFt**2))
        co = amp / ampSum
        ph = -(pi/2) - angle(ft[freq])
        if ph < 0:
            ph += pi * 2
        return array([co, ph])

    if freq >= int(fix(size(self.index)/2)):
        raise Exception('Requested frequency, %g, is too high, must be less than half the series duration' % freq)

    rdd = self.rdd.mapValues(lambda x: get(x, freq))
    return Series(rdd, index=['coherence', 'phase']).__finalize__(self)

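# A local, single-trace version of the per-record computation in fourier() above, useful for
# sanity-checking on the driver. This is an illustrative helper, not part of the class.
import numpy as np

def fourier_stats(y, freq):
    y = y - np.mean(y)
    nframes = len(y)
    ft = np.fft.fft(y)[0:int(np.fix(nframes / 2))]
    ampFt = 2 * np.abs(ft) / nframes
    co = ampFt[freq] / np.sqrt(np.sum(ampFt ** 2))   # coherence at the requested frequency
    ph = -(np.pi / 2) - np.angle(ft[freq])           # phase, wrapped into [0, 2*pi)
    if ph < 0:
        ph += np.pi * 2
    return np.array([co, ph])

# a pure sinusoid at digital frequency 1 should give coherence near 1 and phase near 0
print(fourier_stats(np.sin(np.linspace(0, 2 * np.pi, 64, endpoint=False)), 1))
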
def fromArrays(self, arrays, npartitions=None):
    """
    Create a Series object from a sequence of 1d numpy arrays on the driver.
    """
    # recast singleton
    if isinstance(arrays, ndarray):
        arrays = [arrays]

    # check shape and dtype
    shape = arrays[0].shape
    dtype = arrays[0].dtype
    for ary in arrays:
        if not ary.shape == shape:
            raise ValueError("Inconsistent array shapes: first array had shape %s, but other array has shape %s" %
                             (str(shape), str(ary.shape)))
        if not ary.dtype == dtype:
            raise ValueError("Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s" %
                             (str(dtype), str(ary.dtype)))

    # generate linear keys
    keys = map(lambda k: (k,), xrange(0, len(arrays)))

    return Series(self.sc.parallelize(zip(keys, arrays), npartitions), dtype=str(dtype))

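# Local illustration (not from the source) of the records this variant produces: each 1d array
# becomes one (key, value) pair keyed by a singleton linear index. The example arrays are made up.
import numpy as np

arrays = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])]
records = list(zip([(k,) for k in range(len(arrays))], arrays))
print(records)   # two records, keyed (0,) and (1,), each holding one of the input arrays
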
def fit(self, data, comps=None):
    """
    Fit mass univariate regression models

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        The data to fit regression models to, a collection of key-value pairs
        where the keys are identifiers and the values are one-dimensional arrays

    Returns
    -------
    result : Series
        Fitted model parameters: betas, summary statistic, and residuals
    """
    if not (isinstance(data, Series)):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    if comps is not None:
        traj = data.rdd.map(lambda (_, v): v).map(
            lambda x: outer(x, inner(self.get(x)[0] - mean(self.get(x)[0]), comps))).sum() / data.count()
        return traj
    else:
        result = Series(data.rdd.mapValues(lambda x: self.get(x)),
                        index=['betas', 'stats', 'resid']).__finalize__(data)
        return result

def coeffs(self):
    """
    Series containing the coefficients of the model.
    """
    if not hasattr(self, '_coeffs'):
        self._coeffs = Series(self._models.mapValues(lambda v: v.betas))
    return self._coeffs

def fromText(self, dataPath, nkeys=None, ext="txt", dtype='float64'):
    """
    Loads Series data from text files.

    Parameters
    ----------
    dataPath : string
        Specifies the file or files to be loaded. dataPath may be either a URI (with scheme specified)
        or a path on the local filesystem.
        If a path is passed (determined by the absence of a scheme component when attempting to parse
        as a URI), and it is not already a wildcard expression and does not end in <ext>, then it will
        be converted into a wildcard pattern by appending '/*.ext'. This conversion can be avoided by
        passing a "file://" URI.

    dtype : dtype or dtype specifier, default 'float64'
    """
    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    def parse(line, nkeys_):
        vec = [float(x) for x in line.split(' ')]
        ts = array(vec[nkeys_:], dtype=dtype)
        keys = tuple(int(x) for x in vec[:nkeys_])
        return keys, ts

    lines = self.sc.textFile(dataPath, self.minPartitions)
    data = lines.map(lambda x: parse(x, nkeys))
    return Series(data, dtype=str(dtype))

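# Local illustration (not from the source) of the line format parse() above expects:
# space-separated floats where the first nkeys entries are the integer key and the rest are
# the series values. The example line is made up.
import numpy as np

line = "1 2 0.5 1.5 2.5"
nkeys = 2
vec = [float(x) for x in line.split(' ')]
key = tuple(int(x) for x in vec[:nkeys])
values = np.array(vec[nkeys:], dtype='float64')
print(key, values)   # key (1, 2) with values [0.5, 1.5, 2.5]
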
def test_normalization_bywindow(self):
    y = array([1, 2, 3, 4, 5], dtype='float16')
    rdd = self.sc.parallelize([(0, y)])
    data = Series(rdd, dtype='float16')
    out = data.normalize('window', window=2)
    # check that _dtype has been set properly *before* calling first(), b/c first() will update this
    # value even if it hasn't been correctly set
    assert_equals('float16', str(out._dtype))
    vals = out.first()[1]
    assert_equals('float64', str(vals.dtype))
    b_true = array([1.2, 1.4, 2.4, 3.4, 4.2])
    result_true = (y - b_true) / (b_true + 0.1)
    assert(allclose(vals, result_true, atol=1e-3))

    out = data.normalize('window', window=6)
    vals = out.first()[1]
    b_true = array([1.6, 1.8, 1.8, 1.8, 2.6])
    result_true = (y - b_true) / (b_true + 0.1)
    assert(allclose(vals, result_true, atol=1e-3))

def test_seriesAggregateByIndex(self):
    dataLocal = [((1,), arange(12))]
    index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(dataLocal), index=index)

    result = data.seriesAggregateByIndex(sum)
    print result.values().first()
    assert_true(array_equal(result.values().first(), array([6, 22, 38])))
    assert_true(array_equal(result.index, array([0, 1, 2])))

    index = [
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
    ]
    data.index = array(index).T

    result = data.seriesAggregateByIndex(sum, level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

def test_selectByIndex(self):
    dataLocal = [((1,), arange(12))]
    index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(dataLocal), index=index)

    result = data.selectByIndex(1)
    assert_true(array_equal(result.values().first(), array([4, 5, 6, 7])))
    assert_true(array_equal(result.index, array([1, 1, 1, 1])))

    result = data.selectByIndex(1, squeeze=True)
    assert_true(array_equal(result.index, array([0, 1, 2, 3])))

    index = [
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
    ]
    data.index = array(index).T

    result, mask = data.selectByIndex(0, level=2, returnMask=True)
    assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
    assert_true(array_equal(result.index, array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]])))
    assert_true(array_equal(mask, array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])))

    result = data.selectByIndex(0, level=2, squeeze=True)
    assert_true(array_equal(result.values().first(), array([0, 2, 6, 8])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

    result = data.selectByIndex([1, 0], level=[0, 1])
    assert_true(array_equal(result.values().first(), array([6, 7])))
    assert_true(array_equal(result.index, array([[1, 0, 0], [1, 0, 1]])))

    result = data.selectByIndex(val=[0, [2, 3]], level=[0, 2])
    assert_true(array_equal(result.values().first(), array([4, 5])))
    assert_true(array_equal(result.index, array([[0, 1, 2], [0, 1, 3]])))

    result = data.selectByIndex(1, level=1, filter=True)
    assert_true(array_equal(result.values().first(), array([0, 1, 6, 7])))
    assert_true(array_equal(result.index, array([[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]])))

class TestSeriesRegionMeanMethods(PySparkTestCase):
    def setUp(self):
        super(TestSeriesRegionMeanMethods, self).setUp()
        self.dataLocal = [
            ((0, 0), array([1.0, 2.0, 3.0])),
            ((0, 1), array([2.0, 2.0, 4.0])),
            ((1, 0), array([4.0, 2.0, 1.0])),
            ((1, 1), array([3.0, 1.0, 1.0]))
        ]
        self.series = Series(self.sc.parallelize(self.dataLocal),
                             dtype=self.dataLocal[0][1].dtype, index=arange(3))

    def __setup_meanByRegion(self, useMask=False):
        itemIdxs = [1, 2]  # data keys for items 1 and 2 (0-based)
        keys = [self.dataLocal[idx][0] for idx in itemIdxs]
        expectedKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
        expected = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
        if useMask:
            keys = array([[0, 1], [1, 0]], dtype='uint8')
        return keys, expectedKeys, expected

    @staticmethod
    def __checkAsserts(expectedLen, expectedKeys, expected, actual):
        assert_equals(expectedLen, len(actual))
        assert_equals(expectedKeys, actual[0])
        assert_true(array_equal(expected, actual[1]))

    @staticmethod
    def __checkNestedAsserts(expectedLen, expectedKeys, expected, actual):
        assert_equals(expectedLen, len(actual))
        for i in xrange(expectedLen):
            assert_equals(expectedKeys[i], actual[i][0])
            assert_true(array_equal(expected[i], actual[i][1]))

    def __checkReturnedSeriesAttributes(self, newSeries):
        assert_true(newSeries._dims is None)  # check that new _dims is unset
        assert_equals(self.series.dtype, newSeries._dtype)  # check that new dtype is set
        assert_true(array_equal(self.series.index, newSeries._index))  # check that new index is set
        assert_is_not_none(newSeries.dims)  # check that new dims is at least calculable (expected to be meaningless)

    def __run_tst_meanOfRegion(self, useMask):
        keys, expectedKeys, expected = self.__setup_meanByRegion(useMask)
        actual = self.series.meanOfRegion(keys)
        TestSeriesRegionMeanMethods.__checkAsserts(2, expectedKeys, expected, actual)

    def test_meanOfRegion(self):
        self.__run_tst_meanOfRegion(False)

    def test_meanOfRegionWithMask(self):
        self.__run_tst_meanOfRegion(True)

    def test_meanOfRegionErrorsOnMissing(self):
        _, expectedKeys, expected = self.__setup_meanByRegion(False)

        # if no records match, return None, None
        keys = [(17, 24), (17, 25)]
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_is_none(actualKey)
        assert_is_none(actualVal)

        # if we have only a partial match but haven't turned on validation, return a sensible value
        keys = [(0, 1), (17, 25)]
        actualKey, actualVal = self.series.meanOfRegion(keys)
        assert_equals((0, 1), actualKey)
        assert_true(array_equal(self.dataLocal[1][1], actualVal))

        # throw an error on a partial match when validation turned on
        assert_raises(ValueError, self.series.meanOfRegion, keys, validate=True)

    def test_meanByRegions_singleRegion(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()
        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegionsErrorsOnMissing(self):
        keys, expectedKeys, expected = self.__setup_meanByRegion()
        keys += [(17, 25)]

        # check that we get a sensible value with validation turned off:
        actualSeries = self.series.meanByRegions([keys])
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

        # throw an error on a partial match when validation turned on
        # this error will be on the workers, which propagates back to the driver
        # as something other than the ValueError that it started out life as
        assert_raises(Exception, self.series.meanByRegions([keys], validate=True).count)

    def test_meanByRegions_singleRegionWithMask(self):
        mask, expectedKeys, expected = self.__setup_meanByRegion(True)
        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(1, [expectedKeys], [expected], actual)

    def test_meanByRegions_twoRegions(self):
        nestedKeys, expectedKeys, expected = [], [], []
        for itemIdxs in [(0, 1), (1, 2)]:
            keys = [self.dataLocal[idx][0] for idx in itemIdxs]
            nestedKeys.append(keys)
            avgKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
            expectedKeys.append(avgKeys)
            avgVals = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
            expected.append(avgVals)
        actualSeries = self.series.meanByRegions(nestedKeys)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(2, expectedKeys, expected, actual)

    def test_meanByRegions_twoRegionsWithMask(self):
        expectedKeys, expected = [], []
        mask = array([[1, 1], [2, 0]], dtype='uint8')
        for itemIdxs in [(0, 1), (2, )]:
            keys = [self.dataLocal[idx][0] for idx in itemIdxs]
            avgKeys = tuple(vstack(keys).mean(axis=0).astype('int16'))
            expectedKeys.append(avgKeys)
            avgVals = vstack([self.dataLocal[idx][1] for idx in itemIdxs]).mean(axis=0)
            expected.append(avgVals)
        actualSeries = self.series.meanByRegions(mask)
        actual = actualSeries.collect()
        self.__checkReturnedSeriesAttributes(actualSeries)
        TestSeriesRegionMeanMethods.__checkNestedAsserts(2, expectedKeys, expected, actual)

def test_seriesStatByIndex(self):
    dataLocal = [((1,), arange(12))]
    index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(dataLocal), index=index)

    assert_true(array_equal(data.seriesStatByIndex('sum').values().first(), array([6, 22, 38])))
    assert_true(array_equal(data.seriesStatByIndex('mean').values().first(), array([1.5, 5.5, 9.5])))
    assert_true(array_equal(data.seriesStatByIndex('min').values().first(), array([0, 4, 8])))
    assert_true(array_equal(data.seriesStatByIndex('max').values().first(), array([3, 7, 11])))
    assert_true(array_equal(data.seriesStatByIndex('count').values().first(), array([4, 4, 4])))
    assert_true(array_equal(data.seriesStatByIndex('median').values().first(), array([1.5, 5.5, 9.5])))

    assert_true(array_equal(data.seriesSumByIndex().values().first(), array([6, 22, 38])))
    assert_true(array_equal(data.seriesMeanByIndex().values().first(), array([1.5, 5.5, 9.5])))
    assert_true(array_equal(data.seriesMinByIndex().values().first(), array([0, 4, 8])))
    assert_true(array_equal(data.seriesMaxByIndex().values().first(), array([3, 7, 11])))
    assert_true(array_equal(data.seriesCountByIndex().values().first(), array([4, 4, 4])))
    assert_true(array_equal(data.seriesMedianByIndex().values().first(), array([1.5, 5.5, 9.5])))

    index = [
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]
    ]
    data.index = array(index).T

    result = data.seriesStatByIndex('sum', level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

    result = data.seriesSumByIndex(level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

def test_standardization_axis1(self):
    rdd = self.sc.parallelize([(0, array([1, 2])), (0, array([3, 4]))])
    data = Series(rdd)
    assert(allclose(data.center(1).first()[1], array([-1, -1])))
    assert(allclose(data.standardize(1).first()[1], array([1, 2])))
    assert(allclose(data.zscore(1).first()[1], array([-1, -1])))

def test_detrend(self):
    rdd = self.sc.parallelize([(0, array([1, 2, 3, 4, 5]))])
    data = Series(rdd).detrend('linear')
    # detrending linearly increasing data should yield all 0s
    assert(allclose(data.first()[1], array([0, 0, 0, 0, 0])))

def test_between(self):
    rdd = self.sc.parallelize([(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))])
    data = Series(rdd).between(0, 1)
    assert(allclose(data.index, array([0, 1])))
    assert(allclose(data.first()[1], array([4, 5])))