def test_factorize_converges(self):
    """Check that ``sfa`` converges on the 'factorized' data set.

    With a loose tolerance (0.1) and a budget of 1000 iterations the run
    must stop before the budget is used up, and the last recorded
    changes in factors and coefficients must both be below the
    tolerance.
    """
    # Penalty vector handed straight to ``Factorization.sfa``.
    # NOTE(review): presumably interleaved (l1, l2) per data type, as
    # built by ``SFA._penalties_to_array`` -- confirm.
    lambdas = np.array([1e-12, 1e-3, 1e-12, 1e-3], np.dtype('float64'))
    max_iter = 1000
    max_diff = 0.1
    d = self.data['factorized']
    f = Factorization(d, self.n_features_split, self.n_factors)
    # Last positional argument switches monitoring on so that the
    # returned object carries the per-iteration traces checked below.
    monitor = f.sfa(max_diff, max_iter, lambdas, True)
    # unittest assertions instead of bare ``assert``: they survive
    # ``python -O`` and report both operands on failure.
    self.assertLess(monitor.n_iter, max_iter)
    self.assertLess(monitor.max_diff_factors[-1], max_diff)
    self.assertLess(monitor.max_diff_coefficients[-1], max_diff)
    # Same memory bounds => the factorization references the input
    # buffer rather than a copy of it.
    self.assertEqual(np.byte_bounds(d), np.byte_bounds(f.data))
    self.check_factorization_type_and_shape(f)
def test_factorize_gets_close(self):
    """Check that the fitted model roughly reconstructs its input.

    Fits the 'factorized' data set with a tight tolerance, then
    compares the data with the product of the (rescaled) coefficients
    and the factors, allowing an absolute deviation of 1.
    """
    ridge = 0.1
    penalties = np.array([0.01, ridge, 0.02, ridge], np.dtype('float64'))
    source = self.data['factorized']
    fact = Factorization(source, self.n_features_split, self.n_factors)
    # Positional arguments: tolerance, iteration budget, penalties.
    fact.sfa(1e-6, 1000, penalties)

    # The factorization must still point at the caller's buffer.
    self.assertEqual(np.byte_bounds(source), np.byte_bounds(fact.data))
    self.check_factorization_type_and_shape(fact)

    # NOTE(review): the (1 + ridge) factor presumably undoes the
    # shrinkage caused by the l2 penalty -- confirm.
    rescaled = fact.coefficients * (1 + ridge)
    reconstructed = np.dot(rescaled, fact.factors).T
    np.testing.assert_allclose(source, reconstructed, atol=1)
def test_factorize(self):
    """Run a short ``sfa`` fit on every data set in ``self.data``.

    Each data set is fitted in its own sub-test with at most 10
    iterations. The test checks that the iteration count respects the
    budget, that the last recorded change is non-negative, and that a
    run which did not reach the tolerance used the full budget.
    """
    # One (0.01, 1) penalty pair per entry of ``n_features_split``.
    lambdas = np.array([0.01, 1] * len(self.n_features_split),
                       np.dtype('float64'))
    max_iter = 10
    max_diff = 1e-6
    for d_name, d in self.data.items():
        with self.subTest(data_name=d_name):
            f = Factorization(d, self.n_features_split, self.n_factors)
            monitor = f.sfa(max_diff, max_iter, lambdas, True)
            diff = max(monitor.max_diff_factors[-1],
                       monitor.max_diff_coefficients[-1])
            # unittest assertions instead of bare ``assert``: they are
            # not stripped by ``python -O`` and show the offending
            # values when they fail.
            self.assertLess(monitor.n_iter, max_iter + 1)
            self.assertGreaterEqual(diff, 0.0)
            if diff > max_diff:
                # Not converged: the loop must have run out of budget.
                self.assertEqual(monitor.n_iter, max_iter)
            # Same memory bounds => no copy of the input was made.
            self.assertEqual(np.byte_bounds(d), np.byte_bounds(f.data))
            self.check_factorization_type_and_shape(f)
def fit(self, data, n_factors, l1=0, l2=0, max_iter=5000, eps=1e-6,
        do_monitor=False):
    r"""Fit coefficients from data.

    Runs an EM algorithm to find the best fit.

    Arguments:
        data: Input data to find factors in. Anything that is not a
            `DataMatrix` is passed to the `DataMatrix` constructor, and
            anything that is not a `~sfamd.StackedDataMatrix` is
            wrapped in a single-element one.
        n_factors: Number of factors to estimate.
        l1: :math:`\ell_1` penalties, possibly per data type (lasso).
        l2: :math:`\ell_2` penalties, possibly per data type (ridge).
        max_iter: Maximum number of EM iterations to perform.
        eps: Convergence criterion. If the estimated coefficients don't
            change more than this, the algorithm stops.
        do_monitor: Forwarded to ``Factorization.sfa``; its return
            value is stored in ``self.monitor``.

    Raises:
        ValueError: If any data type has fewer features than
            ``n_factors``, or if the penalties do not match the number
            of data types.
    """
    if not isinstance(data, DataMatrix):
        data = DataMatrix(data)
    if not isinstance(data, StackedDataMatrix):
        data = StackedDataMatrix([data])
    # NOTE(review): the check accepts n == n_factors although the
    # message says "higher than" -- confirm which one is intended.
    if any(n < n_factors for n in data.dt_n_features):
        raise ValueError("Number of features in each data type needs to be "
                         "higher than the number of factors")
    lambdas = SFA._penalties_to_array(l1, l2, data.n_dt)
    self._data = data
    # Native-endian float64 in Fortran (column-major) order, as handed
    # to the Factorization backend.
    d = np.require(data.dataW, '=f8', 'F')
    nfs = np.asarray(data.dt_n_features, np_size_t)
    self._factorization = Factorization(d, nfs, n_factors)
    self.monitor = self._factorization.sfa(eps, max_iter, lambdas,
                                           do_monitor)
class SFA:
    """Sparse factor analysis of multiple data types.

    This class is an implementation of the sparse factor analysis method
    that finds common factors of features in data. Groups of features can
    be from different data types. Sparsity and residual variance can be
    different per data type.
    """

    def __init__(self):
        # Everything is populated by ``fit``; start from a defined state
        # so the attributes exist on an unfitted model.
        self._factorization = None
        self._data = None
        self.monitor = None

    @staticmethod
    def _penalties_to_array(l1, l2, n_dt):
        """Convert l1 and l2 penalties in various formats to one array.

        A single value is repeated to be the same for all data types.
        Lists of numbers are converted to arrays and should have a length
        of ``n_dt``.

        Arguments:
            l1 (float or list of float or ndarray): l1 penalty(s)
            l2 (float or list of float or ndarray): l2 penalty(s)
            n_dt (int): Number of data types to extend array to if just
                one l1 or l2 penalty is given.

        Returns:
            A float64 array of length ``2 * n_dt`` with the penalties
            interleaved per data type:
            ``[l1[0], l2[0], l1[1], l2[1], ...]``.

        Raises:
            ValueError: If ``l1`` and ``l2`` are not both of shape
                ``(n_dt,)`` after conversion.
        """
        if isinstance(l1, numbers.Real):
            l1 = [float(l1)] * n_dt
        if isinstance(l2, numbers.Real):
            l2 = [float(l2)] * n_dt
        l1 = np.array(l1, dtype='f8')
        l2 = np.array(l2, dtype='f8')
        if not l1.shape == l2.shape == (n_dt, ):
            raise ValueError("Number of data types not consistent among "
                             "parameters.")
        # Interleave with strided assignment: even slots hold l1, odd
        # slots hold l2.
        lambdas = np.empty(2 * n_dt, dtype='f8')
        lambdas[0::2] = l1
        lambdas[1::2] = l2
        return lambdas

    def fit(self, data, n_factors, l1=0, l2=0, max_iter=5000, eps=1e-6,
            do_monitor=False):
        r"""Fit coefficients from data.

        Runs an EM algorithm to find the best fit.

        Arguments:
            data: Input data to find factors in. Anything that is not a
                `DataMatrix` is passed to the `DataMatrix` constructor,
                and anything that is not a `~sfamd.StackedDataMatrix` is
                wrapped in a single-element one.
            n_factors: Number of factors to estimate.
            l1: :math:`\ell_1` penalties, possibly per data type (lasso).
            l2: :math:`\ell_2` penalties, possibly per data type (ridge).
            max_iter: Maximum number of EM iterations to perform.
            eps: Convergence criterion. If the estimated coefficients
                don't change more than this, the algorithm stops.
            do_monitor: Forwarded to ``Factorization.sfa``; its return
                value is stored in ``self.monitor``.

        Raises:
            ValueError: If any data type has fewer features than
                ``n_factors``, or if the penalties do not match the
                number of data types.
        """
        if not isinstance(data, DataMatrix):
            data = DataMatrix(data)
        if not isinstance(data, StackedDataMatrix):
            data = StackedDataMatrix([data])
        # NOTE(review): the check accepts n == n_factors although the
        # message says "higher than" -- confirm which one is intended.
        if any(n < n_factors for n in data.dt_n_features):
            raise ValueError("Number of features in each data type needs "
                             "to be "
                             "higher than the number of factors")
        lambdas = SFA._penalties_to_array(l1, l2, data.n_dt)
        self._data = data
        # Native-endian float64 in Fortran (column-major) order, as
        # handed to the Factorization backend.
        d = np.require(data.dataW, '=f8', 'F')
        nfs = np.asarray(data.dt_n_features, np_size_t)
        self._factorization = Factorization(d, nfs, n_factors)
        self.monitor = self._factorization.sfa(eps, max_iter, lambdas,
                                               do_monitor)

    @property
    def coefficients(self):
        """Coefficients of this model, per data type."""
        # One row-block of the coefficient matrix per slice of the
        # stacked data.
        return [self._factorization.coefficients[s, :]
                for s in self._data.slices]

    @property
    def reconstruction_error(self):
        """Mean squared reconstruction error of the fitted data.

        Squared differences between ``dataW.T`` and the product of
        coefficients and factors are summed along axis 0 and the sums
        are then averaged.
        """
        B = self._factorization.coefficients
        Z = self._factorization.factors
        rec = np.dot(B, Z)
        err = np.sum((self._data.dataW.T - rec)**2, 0)
        return float(np.mean(err))

    def fit_transform(self, data, n_factors, l1=0, l2=0, max_iter=5000,
                      eps=1e-6):
        """Fit the model and return the transposed factor matrix.

        Takes the same arguments as `fit` (without monitoring).
        """
        self.fit(data, n_factors, l1, l2, max_iter, eps)
        return self._factorization.factors.T

    def transform(self, data: "DataMatrix") -> np.ndarray:
        """Gives best estimate of factors for data given coefficients.

        Computes ``X Psi^-1 B (B^T Psi^-1 B + I)^-1`` with ``X`` the
        ``dataW`` of ``data``, ``B`` the fitted coefficients and ``Psi``
        the fitted residual variances.

        Arguments:
            data: Data to determine factors for.

        Returns:
            The estimated factors, one row per row of ``data.dataW``.
        """
        Psi_inv = np.reciprocal(self._factorization.residual_var)
        B = self._factorization.coefficients
        # Scale every row of B by the matching inverse residual variance.
        Psi_inv_B = Psi_inv[:, np.newaxis] * B
        btb = (Psi_inv_B.T).dot(B)
        btbi = np.linalg.inv(btb + np.identity(btb.shape[0]))
        eq1 = np.dot(Psi_inv_B, btbi)
        Z = np.dot(data.dataW, eq1)
        return Z

    def monitored_fit(self, data, n_factors, l1=0, l2=0, max_iter=5000,
                      eps=1e-6):
        """Fit with monitoring enabled and return the traces as a dict.

        Takes the same arguments as `fit`; see `_monitor_as_dict` for
        the keys of the result.
        """
        self.fit(data, n_factors, l1, l2, max_iter, eps, True)
        return self._monitor_as_dict()

    def _monitor_as_dict(self):
        """Copy the traces stored on ``self.monitor`` into a dict of lists."""
        fields = (
            'iteration',
            'max_diff_factors',
            'max_diff_coefficients',
            'reconstruction_error',
            'explained_variance',
        )
        return {name: list(getattr(self.monitor, name)) for name in fields}
def test_init(self):
    """Check construction of a ``Factorization`` for every data set.

    For each entry of ``self.data`` a sub-test verifies that the new
    object occupies the same memory bounds as the input array and that
    it passes the shared type and shape checks.
    """
    for label, matrix in self.data.items():
        with self.subTest(data_name=label):
            fact = Factorization(matrix, self.n_features_split,
                                 self.n_factors)
            expected_bounds = np.byte_bounds(matrix)
            actual_bounds = np.byte_bounds(fact.data)
            # Equal bounds mean the input buffer is referenced, not
            # copied.
            self.assertEqual(expected_bounds, actual_bounds)
            self.check_factorization_type_and_shape(fact)