def calc(self, data):
    """Average the records that belong to each index group.

    Keys (tuples) are converted into linear indices based on
    the dimensions of the data before groups are selected.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, each array of shape (ncols,)
        Data to compute averages from

    Returns
    -------
    ts : array, shape (n, ncols)
        Row ``i`` is the average over group ``i``; rows whose
        index list is empty are left as zeros
    """
    extent = getdims(data)
    data = subtoind(data, extent.max)

    # output width is taken from the first record's value
    ncols = len(data.first()[1])
    ts = zeros((self.n, ncols))

    # one pass per group; groups without indices are skipped
    for i in range(0, self.n):
        count = len(self.inds[i])
        if count > 0:
            selected = self.select(data, i)
            total = selected.map(lambda kv: kv[1]).sum()
            ts[i, :] = total / count

    return ts
def calc(self, data):
    """Compute the average key and average value of each index group.

    Keys (tuples) are converted into linear indices based on
    the dimensions of the data before groups are selected.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, each array of shape (ncols,)
        Data to compute averages from

    Returns
    -------
    self : returns an instance of self.
        ``self.keys`` and ``self.values`` are set, one row per group;
        rows for groups with an empty index list stay zero
    """
    dims = getdims(data)
    data = subtoind(data, dims.max)

    # one column per key dimension / per value entry
    ndims = len(dims.count())
    keys = zeros((self.n, ndims))
    ncols = len(data.first()[1])
    values = zeros((self.n, ncols))

    for idx, indlist in enumerate(self.inds):
        if len(indlist) == 0:
            continue

        group = self.select(data, idx)
        values[idx, :] = group.map(lambda kv: kv[1]).sum() / len(indlist)

        # indtosub is given (index, 0) pairs; the 0 is only a filler
        # value that is discarded again right after the conversion
        pairs = [(k, 0) for k in indlist]
        subs = [sub for sub, _ in indtosub(pairs, dims.max)]
        keys[idx, :] = mean(subs, axis=0)

    self.keys = keys
    self.values = values
    return self
def calc(self, data):
    """Calculate per-group averages of both keys and values.

    Keys (tuples) are converted into linear indices based on
    the dimensions of the data before groups are selected.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, each array of shape (ncols,)
        Data to compute averages from

    Returns
    -------
    self : returns an instance of self.
        The results are stored on ``self.keys`` and ``self.values``
    """
    dims = getdims(data)
    data = subtoind(data, dims.max)

    # preallocate outputs; groups with no indices keep their zero rows
    avg_keys = zeros((self.n, len(dims.count())))
    avg_values = zeros((self.n, len(data.first()[1])))

    for group_id, members in enumerate(self.inds):
        n_members = len(members)
        if n_members > 0:
            # average value: sum the selected records, divide by group size
            records = self.select(data, group_id)
            summed = records.map(lambda pair: pair[1]).sum()
            avg_values[group_id, :] = summed / n_members

            # average key: turn each linear index back into a subscript
            # (paired with a throwaway 0 value) and take the mean
            as_pairs = [(index, 0) for index in members]
            converted = indtosub(as_pairs, dims.max)
            subscripts = [sub for sub, _ in converted]
            avg_keys[group_id, :] = mean(subscripts, axis=0)

    self.keys = avg_keys
    self.values = avg_values
    return self
def test_get_dims_array(self):
    """getdims on a local list of records reports max, count and min
    of the keys along every dimension."""
    # every subscript of a 2 x 3 x 2 grid, first dimension varying fastest
    subs = [(x, y, z)
            for z in (1, 2)
            for y in (1, 2, 3)
            for x in (1, 2)]
    data_local = [(sub, array([1.0])) for sub in subs]

    dims = getdims(data_local)

    assert allclose(dims.max, (2, 3, 2))
    assert allclose(dims.count(), (2, 3, 2))
    assert allclose(dims.min, (1, 1, 1))
def test_get_dims_array(self):
    """Check getdims against a local (non-RDD) list of records."""
    subs = [
        (1, 1, 1), (2, 1, 1), (1, 2, 1), (2, 2, 1), (1, 3, 1), (2, 3, 1),
        (1, 1, 2), (2, 1, 2), (1, 2, 2), (2, 2, 2), (1, 3, 2), (2, 3, 2),
    ]
    # keys start at 1 and span 2 x 3 x 2 positions
    expected_min = (1, 1, 1)
    expected_max = (2, 3, 2)
    expected_count = (2, 3, 2)

    records = [(key, array([1.0])) for key in subs]
    dims = getdims(records)

    assert allclose(dims.max, expected_max)
    assert allclose(dims.count(), expected_count)
    assert allclose(dims.min, expected_min)
def calc(self, data):
    """Compute the correlation between every data point and the average
    of a local neighborhood in x and y (typically time series data).

    Each record is copied to every key in a square window of half-width
    ``self.neighborhood`` around it (window positions outside the data
    bounds are clipped onto the boundary), the copies are summed per key
    and divided by the full window size ``(2 * neighborhood + 1) ** 2``,
    and each record is then correlated with the result for its own key.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs
        The data to compute correlations on; keys are indexed at
        positions 0, 1 and 2, with the window applied to positions 0 and 1

    Returns
    -------
    corr : RDD of (tuple, float) pairs
        The local correlation for each record, sorted by keys
    """

    def clip(val, mn, mx):
        """Clip a value below by mn and above by mx"""
        if val < mn:
            return mn
        if val > mx:
            return mx
        return val

    def maptoneighborhood(ind, ts, sz, mn, mx):
        """Create a list of key value pairs with multiple shifted copies
        of the time series ts over a region specified by sz
        """
        offsets = range(-sz, sz + 1)
        out = []
        for dx in offsets:
            for dy in offsets:
                newind = (clip(ind[0] + dx, mn[0], mx[0]),
                          clip(ind[1] + dy, mn[1], mx[1]),
                          ind[2])
                out.append((newind, ts))
        return out

    # get boundaries using dimension keys
    dims = getdims(data)

    # copy what the closures need into plain locals so the functions
    # handed to the RDD reference these values instead of `self` / `dims`
    sz = self.neighborhood
    lower = dims.min[0:2]
    upper = dims.max[0:2]

    # float divisor: integer-valued series would otherwise be
    # floor-divided under Python 2 division rules
    window_size = float((2 * sz + 1) ** 2)

    # flat map to key value pairs where the key is the neighborhood
    # identifier and the value is the time series
    neighbors = data.flatMap(
        lambda kv: maptoneighborhood(kv[0], kv[1], sz, lower, upper))

    # reduce by key to get the average time series for each neighborhood
    sums = neighbors.reduceByKey(lambda x, y: x + y)
    means = sums.mapValues(lambda x: x / window_size)

    # join with the original time series data to compute correlations
    result = data.join(means)

    # get correlations
    corr = result.mapValues(lambda x: corrcoef(x[0], x[1])[0, 1]).sortByKey()

    return corr
def test_get_dims_rdd(self):
    """getdims on an RDD of records reports max, count and min of the
    keys along every dimension."""
    # every subscript of a 2 x 3 x 2 grid, first dimension varying fastest
    subs = [(x, y, z)
            for z in (1, 2)
            for y in (1, 2, 3)
            for x in (1, 2)]
    data_local = [(sub, array([1.0])) for sub in subs]

    # distribute the local records through the test's Spark context
    data = self.sc.parallelize(data_local)
    dims = getdims(data)

    expected_extent = (2, 3, 2)
    assert allclose(dims.max, expected_extent)
    assert allclose(dims.count(), expected_extent)
    assert allclose(dims.min, (1, 1, 1))
def calc(self, data):
    """Calculate averages.

    Keys (tuples) are converted into linear indices based on
    their dimensions

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, each array of shape (ncols,)
        Data to compute averages from

    Returns
    -------
    ts : array, shape (n, ncols)
        Row ``i`` is the average over group ``i``; rows whose index
        list is empty are left as zeros
    """
    dims = getdims(data)
    data = subtoind(data, dims.max)

    # loop over indices, averaging time series
    ts = zeros((self.n, len(data.first()[1])))
    for i in range(0, self.n):
        count = len(self.inds[i])
        # an empty index list has nothing to average and would make the
        # division below a division by zero; keep that row at zero
        if count == 0:
            continue
        ts[i, :] = self.select(data, i).map(lambda kv: kv[1]).sum() / count

    return ts