def fromNpyLocal(self, datafile, keyfile=None):
    """
    Build a Series from an array stored in the numpy save() .npy format.

    The array is read with `load` in the calling process and then handed to
    `self.sc.parallelize`, one record per entry along the first axis.

    Parameters
    ----------
    datafile: string
        Path of the .npy file holding the values. Must refer to a path
        visible to all workers, such as on NFS or a similar mounted shared
        filesystem.

    keyfile: string, optional, default None
        Path of a second .npy file holding the keys; every loaded entry is
        converted to a tuple. When not given, records are keyed by
        arange(0, data.shape[0]).

    Raises
    ------
    IOError
        If the loaded array has more than two dimensions.
    """
    values = load(datafile)
    if values.ndim > 2:
        raise IOError('Input data must be one or two dimensional')

    if keyfile:
        keys = [tuple(key) for key in load(keyfile)]
    else:
        keys = arange(0, values.shape[0])

    records = self.sc.parallelize(zip(keys, values), self.minPartitions)
    return Series(records, dtype=str(values.dtype))
def fromBinary(self, dataPath, ext='bin', confFilename='conf.json', nkeys=None, nvalues=None,
               keyType=None, valueType=None, newDtype='smallfloat', casting='safe'):
    """
    Load a Series object from a directory of flat binary files.

    Every fixed-length record is read as `nkeys` key entries of the key
    dtype followed by `nvalues` value entries of the value dtype.

    Parameters
    ----------
    dataPath: string URI or local filesystem path
        Specifies the directory or files to be loaded. May be formatted as a
        URI string with scheme (e.g. "file://", "s3n://"). If no scheme is
        present, will be interpreted as a path on the local filesystem. This
        path must be valid on all workers. May also refer to a single file,
        or to a range of files specified by a glob-style expression using a
        single wildcard character '*'.

    ext: string, optional, default 'bin'
        Passed together with `dataPath` to the path-normalization helper.

    confFilename: string, optional, default 'conf.json'
    nkeys, nvalues: int, optional
    keyType, valueType: dtype specifier, optional
        Handed to the parameter-loading helper, which resolves the record
        layout; the resolved parameters are then checked for completeness.
        (Presumably explicit arguments override the configuration file -
        confirm against the helper.)

    newDtype: dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data
        to be floating-point. Input data will be cast to the requested
        `newDtype` if not None - see Data `astype()` method.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy
        documentation for details.
    """
    params = self.__loadParametersAndDefaults(dataPath, confFilename, nkeys, nvalues, keyType, valueType)
    self.__checkBinaryParametersAreSpecified(params)
    dataPath = self.__normalizeDatafilePattern(dataPath, ext)

    keyDtype = dtypeFunc(params.keytype)
    valDtype = dtypeFunc(params.valuetype)

    # byte layout of one record: [keys | values]
    keySize = params.nkeys * keyDtype.itemsize
    recordSize = keySize + params.nvalues * valDtype.itemsize

    def parseRecord(record):
        # the Hadoop key is ignored; only the raw bytes are decoded
        _, raw = record
        keyPart = frombuffer(buffer(raw, 0, keySize), dtype=keyDtype)
        valuePart = frombuffer(buffer(raw, keySize), dtype=valDtype)
        return tuple(int(x) for x in keyPart), valuePart

    rawRecords = self.sc.newAPIHadoopFile(dataPath,
                                          'thunder.util.io.hadoop.FixedLengthBinaryInputFormat',
                                          'org.apache.hadoop.io.LongWritable',
                                          'org.apache.hadoop.io.BytesWritable',
                                          conf={'recordLength': str(recordSize)})
    parsed = rawRecords.map(parseRecord)

    series = Series(parsed, dtype=str(valDtype), index=arange(params.nvalues))
    return series.astype(newDtype, casting)
def setUp(self):
    """Create the shared regression fixtures: design matrix, responses and tolerance."""
    super(TestLinearRegression, self).setUp()

    # numerical tolerance stored for the tests of this class
    self.tol = 1E-3

    # design matrix: 10 samples x 4 regressors
    designRows = [
        [-0.4309741, 0.43440693, 0.19946369, 1.40428728],
        [0.54587086, -1.1092286, -0.27258427, 0.35205421],
        [-0.4432777, 0.40580108, 0.20938645, 0.26480389],
        [-0.53239659, -0.90966912, -0.13967252, 1.38274305],
        [0.35731376, 0.39878607, 0.07762888, 1.82299252],
        [0.36687294, -0.17079843, -0.17765573, 0.87161138],
        [0.3017848, 1.36537541, 0.91211512, -0.80570055],
        [-0.72330999, 0.36319617, 0.08986615, -0.7830115],
        [1.11477831, 0.41631623, 0.11104172, -0.90049209],
        [-1.62162968, 0.46928843, 0.62996118, 1.08668594],
    ]
    self.X = array(designRows)

    # one response value per sample
    self.y0 = array([
        4.57058016, -4.06400691, 4.25957933, 2.01583617, 0.34791879,
        -0.9113852, 3.41167194, 5.26059279, -2.35116878, 6.28263909,
    ])

    # the same responses wrapped as a single-record Series keyed by (1,)
    responseRecords = [((1, ), self.y0)]
    self.y = Series(self.sc.parallelize(responseRecords))
def predict(self, X):
    """
    Predict the responses for a design matrix with every fitted model.

    Parameters
    ----------
    X: array
        Design matrix of shape n x k, where n is the number of samples and
        k is the number of regressors. Even if an intercept term was fit,
        should NOT include a column of ones.

    Returns
    -------
    yhat: Series
        Series of predictions (each of length n)
    """
    # apply the stored transforms once, then share the result with every model
    design = self._transforms.apply(X)

    def predictWith(model):
        return model.predict(design)

    predictions = self._models.mapValues(predictWith)
    return Series(predictions)
def generate(self, k=5, npartitions=10, ndims=5, nrecords=100, noise=0.1, seed=None):
    """
    Generate records scattered around k random centers.

    Parameters
    ----------
    k: int, optional, default 5
        Number of centers, drawn with `random.randn`.

    npartitions: int, optional, default 10
        Number of partitions passed to `parallelize`.

    ndims: int, optional, default 5
        Length of every center and every record.

    nrecords: int, optional, default 100
        Number of records to generate.

    noise: float, optional, default 0.1
        Scale applied to the `random.rand` offset added to the chosen center.

    seed: optional, default None
        Passed to `random.seed` before anything is drawn.

    Returns
    -------
    The generated Series, or the pair (Series, centers) when
    `self.returnParams` is True.
    """
    random.seed(seed)
    centers = random.randn(k, ndims)

    def drawRecord(_):
        # pick a center first, then draw the offset (same draw order per record)
        chosen = centers[int(floor(random.rand(1, 1) * k))]
        return chosen + noise * random.rand(ndims)

    localRecords = [drawRecord(i) for i in range(0, nrecords)]
    series = Series(self.sc.parallelize(self.appendKeys(localRecords), npartitions))

    if self.returnParams is True:
        return series, centers
    return series
def fromStack(self, dataPath, dims, ext="stack", blockSize="150M", dtype='int16',
              newDtype='smallfloat', casting='safe', startIdx=None, stopIdx=None, recursive=False):
    """
    Load a Series object directly from binary image stack files.

    All arguments are forwarded to `_getSeriesBlocksFromStack`; its blocks
    are wrapped in a Series indexed 0 .. npoints - 1.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, specified as either a local
        filesystem path or in a URI-like format, including scheme. May
        include a single '*' wildcard character in the filename.

    dims: tuple of positive int
        Dimensions of input image data, ordered with the fastest-changing
        dimension first.

    ext: string, optional, default "stack"
        Extension required on data files to be loaded.

    blockSize: string formatted as e.g. "64M", "512k", "2G", or positive int. optional, default "150M"
        Requested size of Series partitions in bytes (or kilobytes,
        megabytes, gigabytes).

    dtype: dtype or dtype specifier, optional, default 'int16'
        Numpy dtype of input stack data.

    newDtype: dtype or dtype specifier or string 'smallfloat' or None, optional, default 'smallfloat'
        Numpy dtype of output series data. Most methods expect Series data
        to be floating-point. Input data will be cast to the requested
        `newDtype` if not None - see Data `astype()` method.

    casting: 'no'|'equiv'|'safe'|'same_kind'|'unsafe', optional, default 'safe'
        Casting method to pass on to numpy's `astype()` method; see numpy
        documentation for details.

    startIdx, stopIdx: nonnegative int. optional.
        Indices of the first and last-plus-one data file to load, relative
        to the sorted filenames matching `dataPath` and `ext`. Interpreted
        according to python slice indexing conventions.

    recursive: boolean, default False
        If true, will recursively descend directories rooted at dataPath,
        loading all files in the tree that have an extension matching 'ext'.
        Recursive loading is currently only implemented for local
        filesystems (not s3).
    """
    blocks, npoints, resolvedDtype = self._getSeriesBlocksFromStack(
        dataPath, dims, ext=ext, blockSize=blockSize, dtype=dtype,
        newDtype=newDtype, casting=casting, startIdx=startIdx,
        stopIdx=stopIdx, recursive=recursive)

    seriesIndex = arange(npoints)
    return Series(blocks, dims=dims, dtype=resolvedDtype, index=seriesIndex)
def test_standardization_axis0(self):
    """center, standardize and zscore along axis 0 keep float16 and give the expected values."""
    values = array([1, 2, 3, 4, 5], dtype='float16')
    data = Series(self.sc.parallelize([(0, values)]), dtype='float16')

    centered = data.center(0)
    standardized = data.standardize(0)
    zscored = data.zscore(0)

    # the declared dtype must survive every transformation
    for transformed in (centered, standardized, zscored):
        assert_equals('float16', transformed._dtype)

    expectations = [
        (centered, array([-2, -1, 0, 1, 2])),
        (standardized, array([0.70710, 1.41421, 2.12132, 2.82842, 3.53553])),
        (zscored, array([-1.41421, -0.70710, 0, 0.70710, 1.41421])),
    ]
    for transformed, expected in expectations:
        assert (allclose(transformed.first()[1], expected, atol=1e-3))
def test_selectByIndex(self):
    """selectByIndex on flat and multi-level indices, incl. squeeze, mask and filter options."""

    def checkSelection(selected, expectedValues, expectedIndex):
        # values are compared first, then the index; None skips the value check
        if expectedValues is not None:
            assert_true(array_equal(selected.values().first(), array(expectedValues)))
        assert_true(array_equal(selected.index, array(expectedIndex)))

    records = [((1, ), arange(12))]
    flatIndex = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(records), index=flatIndex)

    # single-level index
    checkSelection(data.selectByIndex(1), [4, 5, 6, 7], [1, 1, 1, 1])
    checkSelection(data.selectByIndex(1, squeeze=True), None, [0, 1, 2, 3])

    # switch to a three-level index (one column per level after the transpose)
    levels = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
              [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
              [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
    data.index = array(levels).T

    selected, mask = data.selectByIndex(0, level=2, returnMask=True)
    checkSelection(selected, [0, 2, 6, 8],
                   [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]])
    assert_true(array_equal(mask, array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])))

    checkSelection(data.selectByIndex(0, level=2, squeeze=True),
                   [0, 2, 6, 8], [[0, 0], [0, 1], [1, 0], [1, 1]])

    checkSelection(data.selectByIndex([1, 0], level=[0, 1]),
                   [6, 7], [[1, 0, 0], [1, 0, 1]])

    checkSelection(data.selectByIndex(val=[0, [2, 3]], level=[0, 2]),
                   [4, 5], [[0, 1, 2], [0, 1, 3]])

    checkSelection(data.selectByIndex(1, level=1, filter=True),
                   [0, 1, 6, 7],
                   [[0, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 1]])
def test_linearRegress(self):
    """Linear RegressionModel: betas, stats and residuals plus the index of each output."""
    record = (1, array([1.5, 2.3, 6.2, 5.1, 3.4, 2.1]))
    data = Series(self.sc.parallelize([record]))
    x = array([array([1, 0, 0, 0, 0, 0]), array([0, 1, 0, 0, 0, 0])])

    model = RegressionModel.load(x, "linear")
    result = model.fit(data)

    def firstValue(name):
        return result.select(name).values().collect()[0]

    # check accuracy of results
    assert (allclose(firstValue('betas'), array([-2.7, -1.9])))
    assert (allclose(firstValue('stats'), array([0.42785299])))
    assert (allclose(firstValue('resid'), array([0, 0, 2, 0.9, -0.8, -2.1])))

    # check indexing of outputs
    assert (allclose(result.select('betas').index, array([0, 1])))
    assert (allclose(result.select('resid').index, array([0, 1, 2, 3, 4, 5])))
    assert (result.select('stats').index == ['stats'])
def test_seriesAggregateByIndex(self):
    """seriesAggregateByIndex with `sum` on a flat index and on levels [0, 1] of a multi-level index."""
    dataLocal = [((1, ), arange(12))]
    index = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(dataLocal), index=index)

    # flat index: one sum per distinct index value
    # (the leftover debugging print of the result was removed)
    result = data.seriesAggregateByIndex(sum)
    assert_true(array_equal(result.values().first(), array([6, 22, 38])))
    assert_true(array_equal(result.index, array([0, 1, 2])))

    # three-level index (one column per level after the transpose),
    # aggregated over the first two levels
    index = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
             [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
             [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
    data.index = array(index).T

    result = data.seriesAggregateByIndex(sum, level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
def test_bilinearRegress(self):
    """Bilinear RegressionModel: betas, stats and residuals against reference values."""
    data = Series(
        self.sc.parallelize([(1, array([1.5, 2.3, 6.2, 5.1, 3.4, 2.1]))]))
    x1 = array([array([1, 0, 1, 0, 1, 0]), array([0, 1, 0, 1, 0, 1])])
    x2 = array([
        array([1, 1, 0, 0, 0, 0]),
        array([0, 0, 1, 1, 0, 0]),
        array([0, 0, 0, 0, 1, 1])
    ])
    model = RegressionModel.load((x1, x2), "bilinear")
    result = model.fit(data)

    tol = 1E-4  # to handle rounding errors

    assert (allclose(result.select('betas').values().collect()[0],
                     array([-3.1249, 5.6875, 0.4375]), atol=tol))
    # tol is passed as atol by keyword: the third positional argument of
    # allclose is rtol, which is not what the other two checks use
    assert (allclose(result.select('stats').values().collect()[0],
                     array([0.6735]), atol=tol))
    assert (allclose(result.select('resid').values().collect()[0],
                     array([0, -0.8666, 0, 1.9333, 0, -1.0666]), atol=tol))
def localCorr(self, neighborhood):
    """
    Correlate every signal to the average of its local neighborhood.

    For every spatial record, computes the correlation coefficient between
    that record's series and the average series of all records within a
    local neighborhood whose size is set by the neighborhood parameter. For
    data with three spatial keys, only neighborhoods in x and y are
    currently supported.

    Parameters
    ----------
    neighborhood : integer
        Size of neighborhood, describes extent in either direction, so
        total neighborhood will be 2n + 1.

    Raises
    ------
    NotImplementedError
        If the keys do not have 2 or 3 dimensions.
    """
    if len(self.dims.max) not in [2, 3]:
        raise NotImplementedError(
            'keys must have 2 or 3 dimensions to compute local correlations')

    def toAverage(total):
        return total / ((2 * neighborhood + 1)**2)

    def correlate(pair):
        return corrcoef(pair[0], pair[1])[0, 1]

    def flipKey(record):
        key, value = record
        return key[::-1], value

    # flat map to key value pairs where the key is neighborhood identifier and value is time series
    neighbors = self.mapToNeighborhood(neighborhood)

    # reduce by key to get the average time series for each neighborhood
    sums = neighbors.rdd.reduceByKey(lambda x, y: x + y)
    means = sums.mapValues(toAverage)

    # join with the original time series data to compute correlations
    joined = self.rdd.join(means)
    corr = joined.mapValues(correlate)

    # force sorting, but reverse keys for correct ordering
    output = corr.map(flipKey).sortByKey().map(flipKey)

    return Series(output, index='correlation').__finalize__(self)
def fromArraysAsImages(self, arrays):
    """
    Create a Series object from a sequence of numpy ndarrays resident in
    memory on the driver.

    The arrays are interpreted as though each represents a single time
    point - effectively the same as converting Images to a Series, with each
    array being a volume image at one point in time. In the resulting
    Series, the value of the record with key (0,0,0) is therefore
    array([arrays[0][0,0,0], arrays[1][0,0,0], ... arrays[n][0,0,0]]).

    The dimensions of the resulting Series will be *opposite* that of the
    passed numpy arrays. Their dtype will not be changed.

    Parameters
    ----------
    arrays: ndarray or sequence of ndarray
        A single ndarray is treated as a sequence of length 1.

    Raises
    ------
    ValueError
        If the arrays do not all share the shape and dtype of the first one.
    """
    # if passed a single array, cast it to a sequence of length 1
    if isinstance(arrays, ndarray):
        arrays = [arrays]

    # check that shapes and dtypes of passed arrays agree with the first one
    reference = arrays[0]
    shape, dtype = reference.shape, reference.dtype
    for ary in arrays:
        if ary.shape != shape:
            raise ValueError(
                "Inconsistent array shapes: first array had shape %s, but other array has shape %s"
                % (str(shape), str(ary.shape)))
        if not ary.dtype == dtype:
            raise ValueError(
                "Inconsistent array dtypes: first array had dtype %s, but other array has dtype %s"
                % (str(dtype), str(ary.dtype)))

    # get indices so that fastest index changes first
    axisRanges = [xrange(n) for n in shape]
    keys = [idx[::-1] for idx in itertools.product(*axisRanges)]

    # one row per position, one column per array
    flattened = [ary.ravel() for ary in arrays]
    values = vstack(flattened).T

    dims = Dimensions.fromTuple(shape[::-1])
    records = self.sc.parallelize(zip(keys, values), self.minPartitions)
    return Series(records, dims=dims, dtype=str(dtype))
def test_meanByRegions_twoRegions(self):
    """meanByRegion with two overlapping regions returns averaged keys and values per region."""
    dataLocal = [((0, 0), array([1.0, 2.0, 3.0])),
                 ((0, 1), array([2.0, 2.0, 4.0])),
                 ((1, 0), array([4.0, 2.0, 1.0])),
                 ((1, 1), array([3.0, 1.0, 1.0]))]
    series = Series(self.sc.parallelize(dataLocal))

    # each region is given by the positions of its records in dataLocal
    regions = [(0, 1), (1, 2)]
    nestedKeys = [[dataLocal[idx][0] for idx in region] for region in regions]
    expectedKeys = [tuple(vstack(keys).mean(axis=0).astype('int16'))
                    for keys in nestedKeys]
    expected = [vstack([dataLocal[idx][1] for idx in region]).mean(axis=0)
                for region in regions]

    actual = series.meanByRegion(nestedKeys).collect()

    assert_equals(2, len(actual))
    for regionIdx, (wantKey, wantValues) in enumerate(zip(expectedKeys, expected)):
        got = actual[regionIdx]
        assert_equals(wantKey, got[0])
        assert_true(array_equal(wantValues, got[1]))
def fit(self, data):
    """
    Fit a mass univariate tuning model.

    Parameters
    ----------
    data : Series or a subclass (e.g. RowMatrix)
        The data to fit tuning models to, a collection of key-value pairs
        where the keys are identifiers and the values are one-dimensional
        arrays

    Returns
    -------
    params : Series
        Result of `self.get` for each record, indexed by
        ['center', 'spread'] and finalized against `data`.

    Raises
    ------
    Exception
        If `data` is not a Series.
    """
    if not (isinstance(data, Series)):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    fitted = data.rdd.mapValues(lambda x: self.get(x))
    params = Series(fitted, index=['center', 'spread'])
    return params.__finalize__(data)
def score(self, X, y):
    """
    Computes R-squared values for a single design matrix and multiple
    responses.

    Each fitted model is joined by key with its response record and its
    `stats` method is called on the transformed design matrix.

    Parameters
    ----------
    X: array
        Design matrix of shape n x k, where n is the number of samples and
        k is the number of regressors. Even if an intercept term was fit,
        should NOT include a column of ones.

    y: Series
        Series of response variables where each record is a vector of
        length n, where n is the number of samples.

    Returns
    -------
    scores: Series
        Series of R-squared values.
    """
    design = self._transforms.apply(X)

    def statsFor(pair):
        model, response = pair
        return model.stats(design, response)

    # pair every model with the response stored under the same key
    paired = self._models.join(y.rdd)
    return Series(paired.mapValues(statsFor))
def localCorr(self, neighborhood):
    """
    Correlate every record with the mean series of its neighborhood.

    Parameters
    ----------
    neighborhood : integer
        Passed to `mapToNeighborhood`; the summed series of each
        neighborhood is divided by (2 * neighborhood + 1) ** 2.

    Returns
    -------
    Series indexed by 'correlation', sorted by reversed key and finalized
    against self.

    Raises
    ------
    NotImplementedError
        If the keys do not have 2 or 3 dimensions.
    """
    ndims = len(self.dims.max)
    if ndims not in [2, 3]:
        raise NotImplementedError(
            'keys must have 2 or 3 dimensions to compute local correlations')

    def reversedKey(record):
        k, v = record
        return k[::-1], v

    # flat map to key value pairs where the key is neighborhood identifier and value is time series
    neighbors = self.mapToNeighborhood(neighborhood)

    # reduce by key to get the average time series for each neighborhood
    totals = neighbors.rdd.reduceByKey(lambda x, y: x + y)
    means = totals.mapValues(lambda x: x / ((2 * neighborhood + 1)**2))

    # join with the original time series data and correlate each pair
    corr = self.rdd.join(means).mapValues(
        lambda x: corrcoef(x[0], x[1])[0, 1])

    # force sorting, but reverse keys for correct ordering
    flipped = corr.map(reversedKey)
    output = flipped.sortByKey().map(reversedKey)

    return Series(output, index='correlation').__finalize__(self)
def fit(self, data, featureset=None):
    """
    Run classification on each record in a data set

    Parameters
    ----------
    data: Series or a subclass (e.g. RowMatrix)
        Data to perform classification on, must be a collection of
        key-value pairs where the keys are identifiers and the values are
        one-dimensional arrays

    featureset : array, optional, default = None
        Which features to use. Only consulted when the classifier has more
        than one feature; defaults to [[self.features[0]]] in that case.
        Every entry must contain at least one of `self.features`.

    Returns
    -------
    perf : Series
        The performance of the classifier for each record

    Raises
    ------
    Exception
        If `data` is not a Series.
    AssertionError
        If an entry of `featureset` contains none of `self.features`.
    """
    if not isinstance(data, Series):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    if self.nfeatures == 1:
        def scoreRecord(x):
            return [self.get(x)]
    else:
        if featureset is None:
            featureset = [[self.features[0]]]

        # every requested subset must share at least one feature with the model
        for subset in featureset:
            matches = array([item in subset for item in self.features]).sum()
            assert matches != 0, "Feature set invalid"

        def scoreRecord(x):
            return asarray([self.get(x, subset) for subset in featureset])

    perf = data.rdd.mapValues(scoreRecord)
    return Series(perf, index='performance').__finalize__(data)
def fit(self, mat):
    """
    Calculate the non-negative matrix factorization of a Series.

    Solves R = WH subject to all entries of W, H >= 0 by alternating least
    squares: W and H are updated in turn with a least-squares step and
    negative values are clipped to 0. Iteration stops after `self.maxIter`
    rounds or once the change in H drops to `self.tol` or below.

    Parameters
    ----------
    mat : Series or a subclass (e.g. RowMatrix)
        Data to factorize, must be a collection of key-value pairs where
        the keys are identifiers and the values are one-dimensional arrays

    Returns
    -------
    self : returns an instance of self.
        On return `self.h` holds H (an array) and `self.w` holds W (a
        Series). `self.hConvergence` receives one entry per iteration;
        `self.wConvergence` (when not None) and `self.reconErr` (depending
        on `self.reconHist`) are updated as well. If `self.h0` was None it
        is set to the random initial H.

    Raises
    ------
    Exception
        If `mat` is not a Series, or `self.method` is not "als".
    ValueError
        If `self.k` is smaller than 1 or `self.h0` has negative entries.
    """
    from numpy import add, any, diag, dot, inf, maximum, outer, sqrt, apply_along_axis
    from numpy.linalg import inv, norm, pinv
    from numpy.random import rand

    if not (isinstance(mat, Series)):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    mat = mat.rdd

    # a helper function to take the Frobenius norm of two zippable RDDs
    def rddFrobeniusNorm(A, B):
        def squaredError(pair):
            (keyA, x), (keyB, y) = pair
            return sum((x - y) ** 2)
        return sqrt(A.zip(B).map(squaredError).reduce(add))

    # input checking
    k = self.k
    if k < 1:
        # message now matches the check: k == 1 is accepted
        raise ValueError("Supplied k must be at least 1.")
    m = mat.values().first().size
    if self.h0 is not None:
        if any(self.h0 < 0):
            raise ValueError("Supplied h0 contains negative entries.")

    # alternating least-squares implementation
    if self.method == "als":

        # initialize NMF and begin als algorithm
        if self.verbose:
            print("Initializing NMF")
        alsIter = 0
        hConvCurr = 100

        if self.h0 is None:
            # noinspection PyUnresolvedReferences
            self.h0 = rand(k, m)

        h = self.h0
        w = None

        # goal is to solve R = WH subject to all entries of W,H >= 0
        # by iteratively updating W and H with least squares and clipping negative values
        while (alsIter < self.maxIter) and (hConvCurr > self.tol):
            # update values on iteration
            hOld = h
            wOld = w

            # precompute pinv(H) = inv(H' x H) * H' (easy here because h is an np array)
            # the rows of H should be a basis of dimension k, so in principle we could just compute directly
            pinvH = pinv(h)

            # update W using least squares row-wise with R * pinv(H); then clip negative values to 0
            # pinvH is bound as a default argument: this RDD is evaluated lazily and is
            # kept as wOld into the next iteration, where the name pinvH is rebound
            w = mat.mapValues(lambda x, pinvH=pinvH: dot(x, pinvH))

            # clip negative values of W
            # noinspection PyUnresolvedReferences
            w = w.mapValues(lambda x: maximum(x, 0))

            # precompute inv(W' * W) to get inv_gramian_w, a np array
            # We have chosen k to be small, i.e., rank(W) = k, so W'*W is invertible
            gramianW = w.values().map(lambda x: outer(x, x)).reduce(add)
            invGramianW = inv(gramianW)

            # pseudoinverse of W is inv(W' * W) * W' = inv_gramian_w * w
            pinvW = w.mapValues(lambda x: dot(invGramianW, x))

            # update H using least squares row-wise with inv(W' * W) * W * R (same as pinv(W) * R)
            h = pinvW.values().zip(mat.values()).map(
                lambda xy: outer(xy[0], xy[1])).reduce(add)

            # clip negative values of H
            # noinspection PyUnresolvedReferences
            h = maximum(h, 0)

            # normalize the rows of H (row norms are floored at 0.001 to avoid dividing by ~0)
            # noinspection PyUnresolvedReferences
            h = dot(diag(1 / maximum(apply_along_axis(norm, 1, h), 0.001)), h)

            # estimate convergence
            hConvCurr = norm(h - hOld)
            self.hConvergence.append(hConvCurr)
            if self.wConvergence is not None:
                if wOld is not None:
                    self.wConvergence.append(rddFrobeniusNorm(w, wOld))
                else:
                    # no previous W to compare against on the first iteration
                    self.wConvergence.append(inf)

            # calculate reconstruction error
            if self.reconHist == 'all':
                recData = w.mapValues(lambda x: dot(x, h))
                self.reconErr.append(rddFrobeniusNorm(mat, recData))

            # report progress
            if self.verbose:
                print("finished als iteration %d with convergence = %.6f in H" % (alsIter, hConvCurr))

            # increment count
            alsIter += 1

        # report on convergence
        if self.verbose:
            if hConvCurr <= self.tol:
                print("Converged to specified tolerance.")
            else:
                print("Warning: reached maxiter without converging to specified tolerance.")

        # calculate reconstruction error
        if self.reconHist == 'final':
            recData = w.mapValues(lambda x: dot(x, h))
            self.reconErr = rddFrobeniusNorm(mat, recData)

        # report results
        self.h = h
        # TODO: need to propagate metadata through to this new Series object
        self.w = Series(w)

    else:
        raise Exception("Algorithm %s is not supported" % self.method)

    return self
def test_toTimeSeries(self):
    """toTimeSeries must return a TimeSeries instance."""
    from thunder.rdds.timeseries import TimeSeries

    records = [(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))]
    series = Series(self.sc.parallelize(records))

    converted = series.toTimeSeries()
    assert(isinstance(converted, TimeSeries))
def test_detrend(self):
    """Linear detrending of a linearly increasing record leaves only zeros."""
    ramp = array([1, 2, 3, 4, 5])
    series = Series(self.sc.parallelize([(0, ramp)]))

    detrended = series.detrend('linear')

    # detrending linearly increasing data should yield all 0s
    flat = array([0, 0, 0, 0, 0])
    assert (allclose(detrended.first()[1], flat))
def fit(self, mat):
    """
    Calculate the non-negative matrix factorization of a Series.

    Solves R = WH subject to all entries of W, H >= 0 by alternating least
    squares: W and H are updated in turn with a least-squares step and
    negative values are clipped to 0. Iteration stops after `self.maxiter`
    rounds or once the change in H drops to `self.tol` or below. Progress
    is printed on every iteration.

    Parameters
    ----------
    mat : Series or a subclass (e.g. RowMatrix)
        Data to factorize, must be a collection of key-value pairs where
        the keys are identifiers and the values are one-dimensional arrays

    Returns
    -------
    self : returns an instance of self.
        On return `self.h` holds H (an array) and `self.w` holds W (a
        Series). `self.h_convergence` receives one entry per iteration;
        `self.w_convergence` (when not None) and `self.recon_err`
        (depending on `self.recon_hist`) are updated as well. If `self.h0`
        was None it is set to the random initial H. When `self.method` is
        not "als" an error line is printed and self is returned unchanged.

    Raises
    ------
    Exception
        If `mat` is not a Series.
    ValueError
        If `self.k` is smaller than 1 or `self.h0` has negative entries.
    """
    import numpy as np

    if not (isinstance(mat, Series)):
        raise Exception('Input must be Series or a subclass (e.g. RowMatrix)')

    mat = mat.rdd

    # a helper function to take the Frobenius norm of two zippable RDDs
    def rddFrobeniusNorm(A, B):
        def squared_error(pair):
            (key_a, x), (key_b, y) = pair
            return sum((x - y) ** 2)
        return np.sqrt(A.zip(B).map(squared_error).reduce(np.add))

    # input checking
    k = self.k
    if k < 1:
        # message now matches the check: k == 1 is accepted
        raise ValueError("Supplied k must be at least 1.")
    m = mat.values().first().size
    if self.h0 is not None:
        if np.any(self.h0 < 0):
            raise ValueError("Supplied h0 contains negative entries.")

    # alternating least-squares implementation
    if self.method == "als":

        # initialize NMF and begin als algorithm
        print("Initializing NMF")
        als_iter = 0
        h_conv_curr = 100

        if self.h0 is None:
            # noinspection PyUnresolvedReferences
            self.h0 = np.random.rand(k, m)

        h = self.h0
        w = None

        # goal is to solve R = WH subject to all entries of W,H >= 0
        # by iteratively updating W and H with least squares and clipping negative values
        while (als_iter < self.maxiter) and (h_conv_curr > self.tol):
            # update values on iteration
            h_old = h
            w_old = w

            # precompute pinv(H) = inv(H' x H) * H' (easy here because h is an np array)
            # the rows of H should be a basis of dimension k, so in principle we could just compute directly
            pinv_h = np.linalg.pinv(h)

            # update W using least squares row-wise with R * pinv(H); then clip negative values to 0
            # pinv_h is bound as a default argument: this RDD is evaluated lazily and is
            # kept as w_old into the next iteration, where the name pinv_h is rebound
            w = mat.mapValues(lambda x, pinv_h=pinv_h: np.dot(x, pinv_h))

            # clip negative values of W
            # noinspection PyUnresolvedReferences
            w = w.mapValues(lambda x: np.maximum(x, 0))

            # precompute inv(W' * W) to get inv_gramian_w, a np array
            # We have chosen k to be small, i.e., rank(W) = k, so W'*W is invertible
            gramian_w = w.values().map(lambda x: np.outer(x, x)).reduce(np.add)
            inv_gramian_w = np.linalg.inv(gramian_w)

            # pseudoinverse of W is inv(W' * W) * W' = inv_gramian_w * w
            pinv_w = w.mapValues(lambda x: np.dot(inv_gramian_w, x))

            # update H using least squares row-wise with inv(W' * W) * W * R (same as pinv(W) * R)
            h = pinv_w.values().zip(mat.values()).map(
                lambda xy: np.outer(xy[0], xy[1])).reduce(np.add)

            # clip negative values of H
            # noinspection PyUnresolvedReferences
            h = np.maximum(h, 0)

            # normalize the rows of H (row norms are floored at 0.001 to avoid dividing by ~0)
            # noinspection PyUnresolvedReferences
            h = np.dot(np.diag(1 / np.maximum(np.linalg.norm(h, axis=1), 0.001)), h)

            # estimate convergence
            h_conv_curr = np.linalg.norm(h - h_old)
            self.h_convergence.append(h_conv_curr)
            if self.w_convergence is not None:
                if w_old is not None:
                    self.w_convergence.append(rddFrobeniusNorm(w, w_old))
                else:
                    # no previous W on the first iteration; this used to read
                    # `self.w.convergence`, which is not the convergence list
                    self.w_convergence.append(np.inf)

            # calculate reconstruction error
            if self.recon_hist == 'all':
                rec_data = w.mapValues(lambda x: np.dot(x, h))
                self.recon_err.append(rddFrobeniusNorm(mat, rec_data))

            # report progress
            print("finished als iteration %d with convergence = %.6f in H" % (als_iter, h_conv_curr))

            # increment count
            als_iter += 1

        # report on convergence
        if h_conv_curr <= self.tol:
            print("Converged to specified tolerance.")
        else:
            print("Warning: reached maxiter without converging to specified tolerance.")

        # calculate reconstruction error
        if self.recon_hist == 'final':
            rec_data = w.mapValues(lambda x: np.dot(x, h))
            self.recon_err = rddFrobeniusNorm(mat, rec_data)

        # report results
        self.h = h
        self.w = Series(w)

    else:
        print("Error: %s is not a supported algorithm." % self.method)

    return self
def get_local_corr(self, data, neighborhood, images=False):
    """
    Run localCorr on local test data.

    `data` is parallelized and wrapped as Images when `images` is true,
    otherwise as a Series that is converted with `toImages()`; the result of
    `localCorr(neighborhood=neighborhood)` on that object is returned.
    """
    rdd = self.sc.parallelize(data)
    if images:
        imgs = Images(rdd)
    else:
        imgs = Series(rdd).toImages()
    return imgs.localCorr(neighborhood=neighborhood)
def test_seriesStatByIndex(self):
    """seriesStatByIndex and the per-statistic shortcut methods, on flat and multi-level indices."""
    records = [((1, ), arange(12))]
    flatIndex = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    data = Series(self.sc.parallelize(records), index=flatIndex)

    # (statistic name, shortcut method name, expected per-group result)
    cases = [
        ('sum', 'seriesSumByIndex', array([6, 22, 38])),
        ('mean', 'seriesMeanByIndex', array([1.5, 5.5, 9.5])),
        ('min', 'seriesMinByIndex', array([0, 4, 8])),
        ('max', 'seriesMaxByIndex', array([3, 7, 11])),
        ('count', 'seriesCountByIndex', array([4, 4, 4])),
        ('median', 'seriesMedianByIndex', array([1.5, 5.5, 9.5])),
    ]

    # generic entry point first, for every statistic
    for stat, _, expected in cases:
        actual = data.seriesStatByIndex(stat).values().first()
        assert_true(array_equal(actual, expected))

    # then the dedicated shortcut of every statistic
    for _, shortcutName, expected in cases:
        actual = getattr(data, shortcutName)().values().first()
        assert_true(array_equal(actual, expected))

    # three-level index (one column per level after the transpose),
    # aggregated over the first two levels
    levels = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
              [0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
              [0, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 3]]
    data.index = array(levels).T

    result = data.seriesStatByIndex('sum', level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))

    result = data.seriesSumByIndex(level=[0, 1])
    assert_true(array_equal(result.values().first(), array([1, 14, 13, 38])))
    assert_true(array_equal(result.index, array([[0, 0], [0, 1], [1, 0], [1, 1]])))
def test_between(self):
    """between(0, 1) keeps index entries 0 and 1 and the matching values."""
    records = [(0, array([4, 5, 6, 7])), (1, array([8, 9, 10, 11]))]
    series = Series(self.sc.parallelize(records))

    window = series.between(0, 1)

    assert (allclose(window.index, array([0, 1])))
    assert (allclose(window.first()[1], array([4, 5])))