def test_cumulative_partition():
    """Cumulative partitions must index the same leading slab as stacking the
    individual partitions, along both axis 0 and axis 1."""
    n_rows, n_cols = 1000, 1000
    data = np.random.randn(n_rows, n_cols)
    for frac in [.001, .1, .25, .5]:
        # Axis 0: each cumulative partition should equal the first rows of `data`.
        parts = array_constant_partition(data.shape, f=frac, min_size=1)
        for idx, cum in enumerate(cumulative_partition(parts)):
            expected = data[:int((idx + 1) * n_rows * frac), :]
            stacked = np.vstack([data[piece, :] for piece in parts[:idx + 1]])
            np.testing.assert_array_equal(expected, stacked)
            np.testing.assert_array_equal(expected, data[cum, :])
        # Axis 1: same check against the leading columns of `data`.
        parts = array_constant_partition(data.shape, f=frac, min_size=1, axis=1)
        for idx, cum in enumerate(cumulative_partition(parts)):
            expected = data[:, 0:int((idx + 1) * n_cols * frac)]
            stacked = np.hstack([data[:, piece] for piece in parts[:idx + 1]])
            np.testing.assert_array_equal(expected, stacked)
            np.testing.assert_array_equal(expected, data[:, cum])
def test_array_constant_partition_sizes():
    """Partition count must reflect both the fraction `f` and the `min_size`
    floor, on either axis of a 10x20 array."""
    shape = np.random.rand(10, 20).shape
    # Axis 0 (10 rows): f=.1 gives 10 parts; min_size=5 collapses them to 2.
    assert len(array_constant_partition(shape, f=.1, min_size=1)) == 10
    assert len(array_constant_partition(shape, f=.1, min_size=5)) == 2
    # Axis 1 (20 cols): f=.1 gives 10 parts; min_size=5 collapses them to 4.
    assert len(array_constant_partition(shape, f=.1, axis=1, min_size=1)) == 10
    assert len(array_constant_partition(shape, f=.1, axis=1, min_size=5)) == 4
def test_array_constant_partition_array():
    """Each partition must select the same rows/columns as the equivalent
    contiguous slice of the array."""
    n_rows, n_cols = 1000, 2000
    data = np.random.rand(n_rows, n_cols)
    for frac in [0.01, .1, .25, .5]:
        # Row partitions (default axis 0).
        for idx, part in enumerate(array_constant_partition(data.shape, f=frac)):
            lo, hi = int(idx * frac * n_rows), int((idx + 1) * (frac * n_rows))
            np.testing.assert_array_equal(data[lo:hi], data[part, :])
        # Column partitions (axis 1).
        for idx, part in enumerate(array_constant_partition(data.shape, f=frac, axis=1)):
            lo, hi = int(idx * frac * n_cols), int((idx + 1) * (frac * n_cols))
            np.testing.assert_array_equal(data[:, lo:hi], data[:, part])
def test_array_constant_partition_bad_size():
    """Fractions above .5 and non-2D shapes must be rejected with ValueError;
    f=.5 itself is accepted."""
    # Boundary case: exactly half is valid and must not raise.
    array_constant_partition((10, 10), f=.5)
    with pytest.raises(ValueError):
        array_constant_partition((10, 10), f=.51)
    with pytest.raises(ValueError):
        array_constant_partition((10, 10, 10), f=.51)
def svd(self, array: LargeArrayType, verbose: bool = True, return_history: bool = False, **kwargs):
    """Run the SVD by sweeping over cumulative row-partitions of `array`.

    Works through growing row prefixes of the array with a `_vPowerMethod`
    per partition, warm-starting each pass from the previous subspace
    estimate `x`, then finishes with one full-array pass.
    (NOTE(review): this looks like an incremental/streaming power-method
    SVD — confirm against the _vPowerMethod implementation.)

    Parameters
    ----------
    array : LargeArrayType
        Input data; converted to a dask array before partitioning.
    verbose : bool
        When True, show a tqdm progress bar over the partition sweep.
    return_history : bool
        When True, return the full history object instead of only the
        final iterate.
    **kwargs
        Accepted but unused here; per-pass `_PM.svd` calls use a fixed
        option dict.

    Returns
    -------
    The history object if `return_history`, otherwise
    `self.history.iter['last_value']` (the final power-method result).
    """
    self._reset_history()
    self.history.time['start'] = time.time()
    self.array = da.array(array)
    # Resolve the symbolic scaling factor: 'n' -> row count, 'p' -> column
    # count, None -> disabled (False). Any other value is left as-is.
    if self.factor == 'n':
        self.factor = self.array.shape[0]
    elif self.factor == 'p':
        self.factor = self.array.shape[1]
    elif self.factor is None:
        self.factor = False
    # Subspace width: requested rank plus oversampling buffer. Partitions
    # are forced to hold at least this many rows (min_size=vec_t).
    vec_t = self.k + self.buffer
    partitions = array_constant_partition(self.array.shape,
                                          f=self.f,
                                          min_size=vec_t)
    # Convert disjoint row blocks into cumulative prefixes, so each sweep
    # step sees all rows processed so far.
    partitions = cumulative_partition(partitions)
    sub_array = self.array[partitions[0], :]
    # Initial subspace guess from the first partition: either a warm start
    # via row sampling, or a random normal start.
    if self.sub_svd_start == 'warm':
        x = sub_svd_init(
            sub_array,
            k=vec_t,
            warm_start_row_factor=self.init_row_sampling_factor,
            log=0)
    else:
        x = rnormal_start(sub_array, k=vec_t, log=0)
    # Project the start into the row space of the first partition.
    x = sub_array.T.dot(x)
    # Sweep every cumulative partition except the last (the final pass
    # below covers the whole array).
    for part in tqdm(partitions[:-1], disable=not verbose):
        _PM = _vPowerMethod(v_start=x,
                            k=self.k,
                            buffer=self.buffer,
                            max_iter=self.max_iter,
                            factor=self.factor,
                            scoring_method=self.scoring_method,
                            tol=self.tol,
                            warn=self.warn)
        x = _PM.svd(self.array[part, :], verbose=False, **{
            'mask_nan': False,
            'transpose': False
        })
        # Record this pass's result in the running history.
        self.history.iter['last_value'] = _PM.history.iter['last_value']
        self.history.iter['sub_svd'].append(
            self.history.iter['last_value'])
        # Optional perturbation: blend the iterate with column-norm-scaled
        # Gaussian noise, weighted by self.lmbd. (NOTE(review): presumably
        # a regularization/exploration term — confirm intent.)
        if self.lmbd:
            c_norms = np.linalg.norm(x, 2, axis=0)
            x *= (1 - self.lmbd)
            x += (self.lmbd * c_norms / np.sqrt(x.shape[0])) * da.random.normal(size=x.shape)
        # When V-subspace scoring is enabled, keep the latest V and S
        # iterates from this pass.
        if 'v-subspace' in self.scoring_method:
            self.history.iter['V'].append(_PM.history.iter['V'][-1])
            self.history.iter['S'].append(_PM.history.iter['S'][-1])
        self.history.acc['sub_svd_acc'].append(_PM.history.acc)
        self.history.time['iter'].append(_PM.history.time)
    # Final pass over the full array (full_svd=True), warm-started from
    # the last partition's subspace.
    _PM = _vPowerMethod(v_start=x,
                        k=self.k,
                        buffer=self.buffer,
                        max_iter=self.max_iter,
                        factor=self.factor,
                        scoring_method=self.scoring_method,
                        tol=self.tol,
                        full_svd=True,
                        warn=self.warn)
    _PM.svd(self.array, verbose=False, **{
        'mask_nan': False,
        'transpose': False
    })
    self.history.iter['last_value'] = _PM.history.iter['last_value']
    self.history.iter['sub_svd'].append(self.history.iter['last_value'])
    if return_history:
        return self.history
    else:
        return self.history.iter['last_value']