def monotonicity_prox(tensor, decreasing=False): """ This function projects each column of the input array on the set of arrays so that x[1] <= x[2] <= ... <= x[n] (decreasing=False) or x[1] >= x[2] >= ... >= x[n] (decreasing=True) is satisfied columnwise. Parameters ---------- tensor : ndarray decreasing : If it is True, function returns columnwise monotone decreasing tensor. Otherwise, returned array will be monotone increasing. Default: True Returns ------- ndarray A tensor of which columns' are monotonic. References ---------- .. [1]: G. Chierchia, E. Chouzenoux, P. L. Combettes, and J.-C. Pesquet "The Proximity Operator Repository. User's guide" """ if tl.ndim(tensor) == 1: tensor = tl.reshape(tensor, [tl.shape(tensor)[0], 1]) elif tl.ndim(tensor) > 2: raise ValueError( "Monotonicity prox doesn't support an input which has more than 2 dimensions." ) tensor_mon = tl.copy(tensor) if decreasing: tensor_mon = tl.flip(tensor_mon, axis=0) row, column = tl.shape(tensor_mon) cum_sum = tl.cumsum(tensor_mon, axis=0) for j in range(column): assisted_tensor = tl.zeros([row, row]) for i in range(row): if i == 0: assisted_tensor = tl.index_update( assisted_tensor, tl.index[i, i:], cum_sum[i:, j] / tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor))) else: assisted_tensor = tl.index_update( assisted_tensor, tl.index[i, i:], (cum_sum[i:, j] - cum_sum[i - 1, j]) / tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor))) tensor_mon = tl.index_update(tensor_mon, tl.index[:, j], tl.max(assisted_tensor, axis=0)) for i in reversed(range(row - 1)): if tensor_mon[i, j] > tensor_mon[i + 1, j]: tensor_mon = tl.index_update(tensor_mon, tl.index[i, j], tensor_mon[i + 1, j]) if decreasing: tensor_mon = tl.flip(tensor_mon, axis=0) return tensor_mon
def test_tucker(monkeypatch): """Test for the Tucker decomposition""" rng = tl.check_random_state(1234) tol_norm_2 = 10e-3 tol_max_abs = 10e-1 tensor = tl.tensor(rng.random_sample((3, 4, 3))) core, factors = tucker(tensor, rank=None, n_iter_max=200, verbose=True) reconstructed_tensor = tucker_to_tensor((core, factors)) norm_rec = tl.norm(reconstructed_tensor, 2) norm_tensor = tl.norm(tensor, 2) assert((norm_rec - norm_tensor)/norm_rec < tol_norm_2) # Test the max abs difference between the reconstruction and the tensor assert(tl.max(tl.abs(reconstructed_tensor - tensor)) < tol_max_abs) # Test the shape of the core and factors ranks = [2, 3, 1] core, factors = tucker(tensor, rank=ranks, n_iter_max=100, verbose=1) for i, rank in enumerate(ranks): assert_equal(factors[i].shape, (tensor.shape[i], ranks[i]), err_msg="factors[{}].shape={}, expected {}".format( i, factors[i].shape, (tensor.shape[i], ranks[i]))) assert_equal(tl.shape(core)[i], rank, err_msg="Core.shape[{}]={}, " "expected {}".format(i, core.shape[i], rank)) # try fixing the core factors_init = [tl.copy(f) for f in factors] _, factors = tucker(tensor, rank=ranks, init=(core, factors), fixed_factors=[1], n_iter_max=100, verbose=1) assert_array_equal(factors[1], factors_init[1]) # Random and SVD init should converge to a similar solution tol_norm_2 = 10e-1 tol_max_abs = 10e-1 core_svd, factors_svd = tucker(tensor, rank=[3, 4, 3], n_iter_max=200, init='svd', verbose=1) core_random, factors_random = tucker(tensor, rank=[3, 4, 3], n_iter_max=200, init='random', random_state=1234) rec_svd = tucker_to_tensor((core_svd, factors_svd)) rec_random = tucker_to_tensor((core_random, factors_random)) error = tl.norm(rec_svd - rec_random, 2) error /= tl.norm(rec_svd, 2) assert_(error < tol_norm_2, 'norm 2 of difference between svd and random init too high') assert_(tl.max(tl.abs(rec_svd - rec_random)) < tol_max_abs, 'abs norm of difference between svd and random init too high') assert_class_wrapper_correctly_passes_arguments(monkeypatch, tucker, Tucker, ignore_args={}, rank=3)
def hard_thresholding(tensor, number_of_non_zero): """ Proximal operator of the l0 ``norm'' Keeps greater "number_of_non_zero" elements untouched and sets other elements to zero. Parameters ---------- tensor : ndarray number_of_non_zero : int Returns ------- ndarray Thresholded tensor on which the operator has been applied """ tensor_vec = tl.copy(tl.tensor_to_vec(tensor)) sorted_indices = tl.argsort(tl.argsort(tl.abs(tensor_vec), axis=0, descending=True), axis=0) return tl.reshape( tl.where(sorted_indices < number_of_non_zero, tensor_vec, tl.tensor(0, **tl.context(tensor_vec))), tensor.shape)
def parafac(tensor, rank, n_iter_max=100, init='svd', svd='numpy_svd',\ normalize_factors=False, orthogonalise=False,\ tol=1e-8, random_state=None,\ verbose=0, return_errors=False,\ sparsity = None,\ l2_reg = 0, mask=None,\ cvg_criterion = 'abs_rec_error',\ fixed_modes = [], svd_mask_repeats=5, linesearch = False): """CANDECOMP/PARAFAC decomposition via alternating least squares (ALS) Computes a rank-`rank` decomposition of `tensor` [1]_ such that:: tensor = [|weights; factors[0], ..., factors[-1] |]. Parameters ---------- tensor : ndarray rank : int Number of components. n_iter_max : int Maximum number of iteration init : {'svd', 'random'}, optional Type of factor matrix initialization. See `initialize_factors`. svd : str, default is 'numpy_svd' function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS normalize_factors : if True, aggregate the weights of each factor in a 1D-tensor of shape (rank, ), which will contain the norms of the factors tol : float, optional (Default: 1e-6) Relative reconstruction error tolerance. The algorithm is considered to have found the global minimum when the reconstruction error is less than `tol`. random_state : {None, int, np.random.RandomState} verbose : int, optional Level of verbosity return_errors : bool, optional Activate return of iteration errors mask : ndarray array of booleans with the same shape as ``tensor`` should be 0 where the values are missing and 1 everywhere else. Note: if tensor is sparse, then mask should also be sparse with a fill value of 1 (or True). Allows for missing values [2]_ cvg_criterion : {'abs_rec_error', 'rec_error'}, optional Stopping criterion for ALS, works if `tol` is not None. If 'rec_error', ALS stops at current iteration if ``(previous rec_error - current rec_error) < tol``. If 'abs_rec_error', ALS terminates when `|previous rec_error - current rec_error| < tol`. sparsity : float or int If `sparsity` is not None, we approximate tensor as a sum of low_rank_component and sparse_component, where low_rank_component = cp_to_tensor((weights, factors)). `sparsity` denotes desired fraction or number of non-zero elements in the sparse_component of the `tensor`. fixed_modes : list, default is [] A list of modes for which the initial value is not modified. The last mode cannot be fixed due to error computation. svd_mask_repeats: int If using a tensor with masked values, this initializes using SVD multiple times to remove the effect of these missing values on the initialization. linesearch : bool, default is False Whether to perform line search as proposed by Bro [3]. Returns ------- CPTensor : (weight, factors) * weights : 1D array of shape (rank, ) * all ones if normalize_factors is False (default) * weights of the (normalized) factors otherwise * factors : List of factors of the CP decomposition element `i` is of shape ``(tensor.shape[i], rank)`` * sparse_component : nD array of shape tensor.shape. Returns only if `sparsity` is not None. errors : list A list of reconstruction errors at each iteration of the algorithms. References ---------- .. [1] T.G.Kolda and B.W.Bader, "Tensor Decompositions and Applications", SIAM REVIEW, vol. 51, n. 3, pp. 455-500, 2009. .. [2] Tomasi, Giorgio, and Rasmus Bro. "PARAFAC and missing values." Chemometrics and Intelligent Laboratory Systems 75.2 (2005): 163-180. .. [3] R. Bro, "Multi-Way Analysis in the Food Industry: Models, Algorithms, and Applications", PhD., University of Amsterdam, 1998 """ rank = validate_cp_rank(tl.shape(tensor), rank=rank) if orthogonalise and not isinstance(orthogonalise, int): orthogonalise = n_iter_max if linesearch: acc_pow = 2.0 # Extrapolate to the iteration^(1/acc_pow) ahead acc_fail = 0 # How many times acceleration have failed max_fail = 4 # Increase acc_pow with one after max_fail failure weights, factors = initialize_cp(tensor, rank, init=init, svd=svd, random_state=random_state, normalize_factors=normalize_factors) if mask is not None and init == "svd": for _ in range(svd_mask_repeats): tensor = tensor*mask + tl.cp_to_tensor((weights, factors), mask=1-mask) weights, factors = initialize_cp(tensor, rank, init=init, svd=svd, random_state=random_state, normalize_factors=normalize_factors) rec_errors = [] norm_tensor = tl.norm(tensor, 2) Id = tl.eye(rank, **tl.context(tensor))*l2_reg if tl.ndim(tensor)-1 in fixed_modes: warnings.warn('You asked for fixing the last mode, which is not supported.\n The last mode will not be fixed. Consider using tl.moveaxis()') fixed_modes.remove(tl.ndim(tensor)-1) modes_list = [mode for mode in range(tl.ndim(tensor)) if mode not in fixed_modes] if sparsity: sparse_component = tl.zeros_like(tensor) if isinstance(sparsity, float): sparsity = int(sparsity * np.prod(tensor.shape)) else: sparsity = int(sparsity) for iteration in range(n_iter_max): if orthogonalise and iteration <= orthogonalise: factors = [tl.qr(f)[0] if min(tl.shape(f)) >= rank else f for i, f in enumerate(factors)] if linesearch and iteration % 2 == 0: factors_last = [tl.copy(f) for f in factors] weights_last = tl.copy(weights) if verbose > 1: print("Starting iteration", iteration + 1) for mode in modes_list: if verbose > 1: print("Mode", mode, "of", tl.ndim(tensor)) pseudo_inverse = tl.tensor(np.ones((rank, rank)), **tl.context(tensor)) for i, factor in enumerate(factors): if i != mode: pseudo_inverse = pseudo_inverse*tl.dot(tl.conj(tl.transpose(factor)), factor) pseudo_inverse += Id if not iteration and weights is not None: # Take into account init weights mttkrp = unfolding_dot_khatri_rao(tensor, (weights, factors), mode) else: mttkrp = unfolding_dot_khatri_rao(tensor, (None, factors), mode) factor = tl.transpose(tl.solve(tl.conj(tl.transpose(pseudo_inverse)), tl.transpose(mttkrp))) if normalize_factors: scales = tl.norm(factor, 2, axis=0) weights = tl.where(scales==0, tl.ones(tl.shape(scales), **tl.context(factor)), scales) factor = factor / tl.reshape(weights, (1, -1)) factors[mode] = factor # Will we be performing a line search iteration if linesearch and iteration % 2 == 0 and iteration > 5: line_iter = True else: line_iter = False # Calculate the current unnormalized error if we need it if (tol or return_errors) and line_iter is False: unnorml_rec_error, tensor, norm_tensor = error_calc(tensor, norm_tensor, weights, factors, sparsity, mask, mttkrp) else: if mask is not None: tensor = tensor*mask + tl.cp_to_tensor((weights, factors), mask=1-mask) # Start line search if requested. if line_iter is True: jump = iteration ** (1.0 / acc_pow) new_weights = weights_last + (weights - weights_last) * jump new_factors = [factors_last[ii] + (factors[ii] - factors_last[ii])*jump for ii in range(tl.ndim(tensor))] new_rec_error, new_tensor, new_norm_tensor = error_calc(tensor, norm_tensor, new_weights, new_factors, sparsity, mask) if (new_rec_error / new_norm_tensor) < rec_errors[-1]: factors, weights = new_factors, new_weights tensor, norm_tensor = new_tensor, new_norm_tensor unnorml_rec_error = new_rec_error acc_fail = 0 if verbose: print("Accepted line search jump of {}.".format(jump)) else: unnorml_rec_error, tensor, norm_tensor = error_calc(tensor, norm_tensor, weights, factors, sparsity, mask, mttkrp) acc_fail += 1 if verbose: print("Line search failed for jump of {}.".format(jump)) if acc_fail == max_fail: acc_pow += 1.0 acc_fail = 0 if verbose: print("Reducing acceleration.") rec_error = unnorml_rec_error / norm_tensor rec_errors.append(rec_error) if tol: if iteration >= 1: rec_error_decrease = rec_errors[-2] - rec_errors[-1] if verbose: print("iteration {}, reconstruction error: {}, decrease = {}, unnormalized = {}".format(iteration, rec_error, rec_error_decrease, unnorml_rec_error)) if cvg_criterion == 'abs_rec_error': stop_flag = abs(rec_error_decrease) < tol elif cvg_criterion == 'rec_error': stop_flag = rec_error_decrease < tol else: raise TypeError("Unknown convergence criterion") if stop_flag: if verbose: print("PARAFAC converged after {} iterations".format(iteration)) break else: if verbose: print('reconstruction error={}'.format(rec_errors[-1])) cp_tensor = CPTensor((weights, factors)) if sparsity: sparse_component = sparsify_tensor(tensor -\ cp_to_tensor((weights, factors)),\ sparsity) cp_tensor = (cp_tensor, sparse_component) if return_errors: return cp_tensor, rec_errors else: return cp_tensor
def fista(UtM, UtU, x=None, n_iter_max=100, non_negative=True, sparsity_coef=0, lr=None, tol=10e-8): """ Fast Iterative Shrinkage Thresholding Algorithm (FISTA) Computes an approximate (nonnegative) solution for Ux=M linear system. Parameters ---------- UtM : ndarray Pre-computed product of the transposed of U and M UtU : ndarray Pre-computed product of the transposed of U and U x : init Default: None n_iter_max : int Maximum number of iteration Default: 100 non_negative : bool, default is False if True, result will be non-negative lr : float learning rate Default : None sparsity_coef : float or None tol : float stopping criterion Returns ------- x : approximate solution such that Ux = M Notes ----- We solve the following problem :math: `1/2 ||m - Ux ||_2^2 + \\lambda |x|_1` Reference ---------- [1] : Beck, A., & Teboulle, M. (2009). A fast iterative shrinkage-thresholding algorithm for linear inverse problems. SIAM journal on imaging sciences, 2(1), 183-202. """ if sparsity_coef is None: sparsity_coef = 0 if x is None: x = tl.zeros(tl.shape(UtM), **tl.context(UtM)) if lr is None: lr = 1 / (tl.partial_svd(UtU)[1][0]) # Parameters momentum_old = tl.tensor(1.0) norm_0 = 0.0 x_update = tl.copy(x) for iteration in range(n_iter_max): if isinstance(UtU, list): x_gradient = -UtM + tl.tenalg.multi_mode_dot( x_update, UtU, transpose=False) + sparsity_coef else: x_gradient = -UtM + tl.dot(UtU, x_update) + sparsity_coef if non_negative is True: x_gradient = tl.where(lr * x_gradient < x_update, x_gradient, x_update / lr) x_new = x_update - lr * x_gradient momentum = (1 + tl.sqrt(1 + 4 * momentum_old**2)) / 2 x_update = x_new + ((momentum_old - 1) / momentum) * (x_new - x) momentum_old = momentum x = tl.copy(x_new) norm = tl.norm(lr * x_gradient) if iteration == 1: norm_0 = norm if norm < tol * norm_0: break return x
def unimodality_prox(tensor): """ This function projects each column of the input array on the set of arrays so that x[1] <= x[2] <= x[j] >= x[j+1]... >= x[n] is satisfied columnwise. Parameters ---------- tensor : ndarray Returns ------- ndarray A tensor of which columns' distribution are unimodal. References ---------- .. [1]: Bro, R., & Sidiropoulos, N. D. (1998). Least squares algorithms under unimodality and non‐negativity constraints. Journal of Chemometrics: A Journal of the Chemometrics Society, 12(4), 223-247. """ if tl.ndim(tensor) == 1: tensor = tl.vec_to_tensor(tensor, [tl.shape(tensor)[0], 1]) elif tl.ndim(tensor) > 2: raise ValueError( "Unimodality prox doesn't support an input which has more than 2 dimensions." ) tensor_unimodal = tl.copy(tensor) monotone_increasing = tl.tensor(monotonicity_prox(tensor), **tl.context(tensor)) monotone_decreasing = tl.tensor(monotonicity_prox(tensor, decreasing=True), **tl.context(tensor)) # Next line finds mutual peak points values = tl.tensor( tl.to_numpy((tensor - monotone_decreasing >= 0)) * tl.to_numpy( (tensor - monotone_increasing >= 0)), **tl.context(tensor)) sum_inc = tl.where(values == 1, tl.cumsum(tl.abs(tensor - monotone_increasing), axis=0), tl.tensor(0, **tl.context(tensor))) sum_inc = tl.where(values == 1, sum_inc - tl.abs(tensor - monotone_increasing), tl.tensor(0, **tl.context(tensor))) sum_dec = tl.where( tl.flip(values, axis=0) == 1, tl.cumsum(tl.abs( tl.flip(tensor, axis=0) - tl.flip(monotone_decreasing, axis=0)), axis=0), tl.tensor(0, **tl.context(tensor))) sum_dec = tl.where( tl.flip(values, axis=0) == 1, sum_dec - tl.abs(tl.flip(tensor, axis=0) - tl.flip(monotone_decreasing, axis=0)), tl.tensor(0, **tl.context(tensor))) difference = tl.where(values == 1, sum_inc + tl.flip(sum_dec, axis=0), tl.max(sum_inc + tl.flip(sum_dec, axis=0))) min_indice = tl.argmin(tl.tensor(difference), axis=0) for i in range(len(min_indice)): tensor_unimodal = tl.index_update( tensor_unimodal, tl.index[:int(min_indice[i]), i], monotone_increasing[:int(min_indice[i]), i]) tensor_unimodal = tl.index_update( tensor_unimodal, tl.index[int(min_indice[i] + 1):, i], monotone_decreasing[int(min_indice[i] + 1):, i]) return tensor_unimodal
def admm(UtM, UtU, x, dual_var, n_iter_max=100, n_const=None, order=None, non_negative=None, l1_reg=None, l2_reg=None, l2_square_reg=None, unimodality=None, normalize=None, simplex=None, normalized_sparsity=None, soft_sparsity=None, smoothness=None, monotonicity=None, hard_sparsity=None, tol=1e-4): """ Alternating direction method of multipliers (ADMM) algorithm to minimize a quadratic function under convex constraints. Parameters ---------- UtM: ndarray Pre-computed product of the transposed of U and M. UtU: ndarray Pre-computed product of the transposed of U and U. x: init Default: None dual_var : ndarray Dual variable to update x n_iter_max : int Maximum number of iteration Default: 100 n_const : int Default : None order : int Default : None non_negative : bool or dictionary This constraint is clipping negative values to '0'. If it is True non-negative constraint is applied to all modes. l1_reg : float or list or dictionary, optional l2_reg : float or list or dictionary, optional l2_square : float or list or dictionary, optional unimodality : bool or dictionary, optional If it is True unimodality constraint is applied to all modes. normalize : bool or dictionary, optional This constraint divides all the values by maximum value of the input array. If it is True normalize constraint is applied to all modes. simplex : float or list or dictionary, optional normalized_sparsity : float or list or dictionary, optional soft_sparsity : float or list or dictionary, optional smoothness : float or list or dictionary, optional monotonicity : bool or dictionary, optional hard_sparsity : float or list or dictionary, optional tol : float Returns ------- x : Updated ndarray x_split : Updated ndarray dual_var : Updated ndarray Notes ----- ADMM solves the convex optimization problem :math:`\\min_ f(x) + g(z)` where :math: A(x_split) + Bx = c. Following updates are iterated to solve the problem:: .. math:: \\begin{equation} x_split = argmin_(x_split) f(x_split) + (rho/2)||A(x_split) + Bx - c||_2^2 x = argmin_x g(x) + (rho/2)||A(x_split) + Bx - c||_2^2 dual_var = dual_var + (Ax + B(x_split) - c) \\end{equation} where rho is a constant defined by the user. Let us define a least square problem such as :math:`\\||Ux - M||^2 + r(x)`. ADMM can be adapted to this least square problem as following:: .. math:: \\begin{equation} x_split = (UtU + rho\times I)\times(UtM + rho\times(x + dual_var)^T) x = argmin r(x) + (rho/2)||x - x_split^T + dual_var||_2^2 dual_var = dual_var + x - x_split^T \\end{equation} where r is the regularization operator. Here, x can be updated by using proximity operator of :math:`x_split^T - dual_var`. References ---------- .. [1] Huang, Kejun, Nicholas D. Sidiropoulos, and Athanasios P. Liavas. "A flexible and efficient algorithmic framework for constrained matrix and tensor factorization." IEEE Transactions on Signal Processing 64.19 (2016): 5052-5065. """ rho = tl.trace(UtU) / tl.shape(x)[1] for iteration in range(n_iter_max): x_old = tl.copy(x) x_split = tl.solve(tl.transpose(UtU + rho * tl.eye(tl.shape(UtU)[1])), tl.transpose(UtM + rho * (x + dual_var))) x = proximal_operator(tl.transpose(x_split) - dual_var, non_negative=non_negative, l1_reg=l1_reg, l2_reg=l2_reg, l2_square_reg=l2_square_reg, unimodality=unimodality, normalize=normalize, simplex=simplex, normalized_sparsity=normalized_sparsity, soft_sparsity=soft_sparsity, smoothness=smoothness, monotonicity=monotonicity, hard_sparsity=hard_sparsity, n_const=n_const, order=order) if n_const is None: x = tl.transpose(tl.solve(tl.transpose(UtU), tl.transpose(UtM))) return x, x_split, dual_var dual_var = dual_var + x - tl.transpose(x_split) dual_residual = x - tl.transpose(x_split) primal_residual = x - x_old if tl.norm(dual_residual) < tol * tl.norm(x) and tl.norm( primal_residual) < tol * tl.norm(dual_var): break return x, x_split, dual_var
def one_ntd_step(tensor, ranks, in_core, in_factors, norm_tensor, sparsity_coefficients, fixed_modes, normalize, mode_core_norm, alpha=0.5, delta=0.01): """ One pass of Hierarchical Alternating Least Squares update along all modes, and gradient update on the core, which decreases reconstruction error in Nonnegative Tucker Decomposition. Update the factors by solving a least squares problem per mode, as described in [1]. Note that the unfolding order is the one described in [2], which is different from [1]. This function is strictly superior to a least squares solver ran on the matricized problems min_X ||Y - AX||_F^2 since A is structured as a Kronecker product of the other factors/core. Tensors are manipulated with the tensorly toolbox [3]. Parameters ---------- unfolded_tensors: list of array The spectrogram tensor, unfolded according to all its modes. ranks: list of integers Ranks for eac factor of the decomposition. in_core : tensorly tensor Current estimates of the core in_factors: list of array Current estimates for the factors of this NTD. The value of factor[update_mode] will be updated using a least squares update. The values in in_factors are not modified. norm_tensor : float The Frobenius norm of the input tensor sparsity_coefficients: list of float (as much as the number of modes + 1 for the core) The sparsity coefficients on each factor and on the core respectively. fixed_modes: list of integers (between 0 and the number of modes + 1 for the core) Has to be set not to update a factor, taken in the order of modes and lastly on the core. normalize: list of boolean (as much as the number of modes + 1 for the core) A boolean whereas the factors need to be normalized. The normalization is a l_2 normalization on each of the rank components (For the factors, each column will be normalized, ie each atom of the dimension of the current rank). mode_core_norm: integer or None The mode on which normalize the core, or None if normalization shouldn't be enforced. Will only be useful if the last element of the previous "normalise" argument is set to True. Indexes of the modes start at 0. Default: None alpha : positive float Ratio between outer computations and inner loops. Typically set to 0.5 or 1. Set to +inf in the deterministic mode, as it depends on runtime. Default: 0.5 delta : float in [0,1] Early stop criterion, while err_k > delta*err_0. Set small for almost exact nnls solution, or larger (e.g. 1e-2) for inner loops of a NTD computation. Default: 0.01 Returns ------- core: tensorly tensor The core tensor linking the factors of the decomposition factors: list of factors An array containing all the factors computed with the NTD cost_fct_val: The value of the cost function at this step, normalized by the squared norm of the original tensor. References ---------- [1] Tamara G Kolda and Brett W Bader. "Tensor decompositions and applications", SIAM review 51.3 (2009), pp. 455{500. [2] Jeremy E Cohen. "About notations in multiway array processing", arXiv preprint arXiv:1511.01306, (2015). [3] J. Kossai et al. "TensorLy: Tensor Learning in Python", arxiv preprint (2018) """ # Avoiding errors for fixed_value in fixed_modes: sparsity_coefficients[fixed_value] = None # Copy core = in_core.copy() factors = in_factors.copy() # Generating the mode update sequence modes_list = [mode for mode in range(tl.ndim(tensor)) if mode not in fixed_modes] for mode in modes_list: #unfolded_core = tl.base.unfold(core, mode) tic = time.time() # UtU # First, element-wise products # some computations could be reused but the gain is small. elemprod = factors.copy() for i, factor in enumerate(factors): if i != mode: elemprod[i] = tl.dot(tl.conj(tl.transpose(factor)), factor) # Second, the multiway product with core G temp = tl.tenalg.multi_mode_dot(core, elemprod, skip=mode) # this line can be computed with tensor contractions con_modes = [i for i in range(tl.ndim(tensor)) if i != mode] UtU = tl.tenalg.contract(temp, con_modes, core, con_modes) #UtU = unfold(temp, mode)@tl.transpose(unfold(core, mode)) # UtM # First, the contraction of data with other factors temp = tl.tenalg.multi_mode_dot(tensor, factors, skip=mode, transpose = True) # again, computable by tensor contractions #MtU = unfold(temp, mode)@tl.transpose(unfold(core, mode)) MtU = tl.tenalg.contract(temp, con_modes, core, con_modes) UtM = tl.transpose(MtU) # Computing the Kronekcer product #kron = tl.tenalg.kronecker(factors, skip_matrix = mode, reverse = False) #kron_core = tl.dot(kron, tl.transpose(unfolded_core)) #rhs = tl.dot(unfolded_tensors[mode], kron_core) # Maybe suboptimal #cross = tl.dot(tl.transpose(kron_core), kron_core) timer = time.time() - tic # Call the hals resolution with nnls, optimizing the current mode factors[mode] = tl.transpose(nnls.hals_nnls_acc(UtM, UtU, tl.transpose(factors[mode]), maxiter=100, atime=timer, alpha=alpha, delta=delta, sparsity_coefficient = sparsity_coefficients[mode], normalize = normalize[mode])[0]) #refolded_tensor = tl.base.fold(unfolded_tensors[0], 0, tensor_shape) # Core update #all_MtX = tl.tenalg.multi_mode_dot(tensor, factors, transpose = True) # better implementation: reuse the computation of temp ! # Also reuse elemprod form last update all_MtX = tl.tenalg.mode_dot(temp, tl.transpose(factors[modes_list[-1]]), modes_list[-1]) all_MtM = tl.copy(elemprod) all_MtM[modes_list[-1]] = factors[modes_list[-1]].T@factors[modes_list[-1]] #all_MtM = np.array([fac.T@fac for fac in factors]) # Projected gradient gradient_step = 1 #print(f"factors[modes_list[-1]]: {factors[modes_list[-1]]}") #print(f"all_MtM: {all_MtM}") for MtM in all_MtM: #print(f"MtM: {MtM}") gradient_step *= 1/(scipy.sparse.linalg.svds(MtM, k=1)[1][0]) gradient_step = round(gradient_step, 6) # Heurisitc, to avoid consecutive imprecision cnt = 1 upd_0 = 0 upd = 1 if sparsity_coefficients[-1] is None: sparse = 0 else: sparse = sparsity_coefficients[-1] # TODO: dynamic stopping criterion # Maybe: try fast gradient instead of gradient while cnt <= 300 and upd>= delta * upd_0: gradient = - all_MtX + tl.tenalg.multi_mode_dot(core, all_MtM, transpose = False) + sparse * tl.ones(core.shape) # Proposition of reformulation for error computations delta_core = np.minimum(gradient_step*gradient, core) core = core - delta_core upd = tl.norm(delta_core) if cnt == 1: upd_0 = upd cnt += 1 if normalize[-1]: unfolded_core = tl.unfold(core, mode_core_norm) for idx_mat in range(unfolded_core.shape[0]): if tl.norm(unfolded_core[idx_mat]) != 0: unfolded_core[idx_mat] = unfolded_core[idx_mat] / tl.norm(unfolded_core[idx_mat], 2) core = tl.fold(unfolded_core, mode_core_norm, core.shape) # Adding the l1 norm value to the reconstruction error sparsity_error = 0 for index, sparse in enumerate(sparsity_coefficients): if sparse: if index < len(factors): sparsity_error += 2 * (sparse * np.linalg.norm(factors[index], ord=1)) elif index == len(factors): sparsity_error += 2 * (sparse * tl.norm(core, 1)) else: raise NotImplementedError("TODEBUG: Too many sparsity coefficients, should have been raised before.") rec_error = norm_tensor ** 2 - 2*tl.tenalg.inner(all_MtX, core) + tl.tenalg.inner(tl.tenalg.multi_mode_dot(core, all_MtM, transpose = False), core) cost_fct_val = (rec_error + sparsity_error) / (norm_tensor ** 2) #exhaustive_rec_error = (tl.norm(tensor - tl.tenalg.multi_mode_dot(core, factors, transpose = False), 2) + sparsity_error) / norm_tensor #print("diff: " + str(rec_error - exhaustive_rec_error)) #print("max" + str(np.amax(factors[2]))) return core, factors, cost_fct_val # exhaustive_rec_error
def gcp(X, R, type='normal', func=None, grad=None, lower=None,\ opt='lbfgsb', mask=None, maxiters=1000, \ init='random', printitn=10, state=None, factr=1e7, pgtol=1e-4, \ fsamp=None, gsamp=None, oversample=1.1, sampler='uniform', \ fsampler=None, rate=1e-3, decay=0.1, maxfails=1, epciters=1000, \ festtol=-math.inf, beta1=0.9, beta2=0.999, epsilon=1e-8): """Generalized CANDECOMP/PARAFAC (GCP) decomposition via all-at-once optimization (OPT) [1] Computes a rank-'R' decomposition of 'tensor' such that:: tensor = [|weights; factors[0], ..., factors[-1] |]. GCP-OPT allows the use of a variety of statistically motivated loss functions suited to the data held in a tensor (i.e. continuous, discrete, binary, etc) Parameters ---------- X : ndarray Tensor to factorize **COMING SOON** Sparse tensor support R : int Rank of decomposition (Number of components). type : str, Type of objective function used Options include: 'normal' or 'gaussian' - Gaussian for real-valued data (DEFAULT) 'binary' or 'bernoulli-odds' - Bernoulli w/ odds link for binary data 'bernoulli-logit' - Bernoulli w/ logit link for binary data 'count' or 'poisson' - Poisson for count data 'poisson-log' - Poisson w/ log link for count data 'rayleigh' - Rayleigh distribution for real-valued data 'gamma' - Gamma distribution for non-negative real-valued data **COMING SOON**: 'huber (DELTA) - Similar to Gaussian, for real-valued data 'negative-binomial (r)' - Negative binomial for count data 'beta (BETA)' - Beta divergence for non-negative real-valued data 'user-specified' - Customized objective function provided by user func: lambda function User specified custom objective function, eg. lambda x, m: (m-x)**2 grad: lambda function User specified custom gradient function, eg. lambda x, m: 2*(m-x) lower: 0 or -inf Lower bound for custom objective/gradient opt : str Optimization method Options include: 'lbfgsb' - Bound-constrained limited-memory BFGS 'sgd' - Stochastic gradient descent (SGD) **COMING SOON** 'adam' - Momentum-based SGD method 'adagrad' - Adaptive gradient algorithm, well suited for sparse data If 'tensor' is dense, all 4 options can be used, 'lbfgsb' by default. **COMING SOON** - Sparse format support If 'tensor' is sparse, only 'sgd', 'adam' and 'adagrad' can be used, 'adam' by default. Each method has specific parameters, see documentation mask : ndarray Specifies a mask, 0's for missing/incomplete entries, 1's elsewhere, with the same shape as 'tensor'. **COMING SOON** - Missing/incomplete data simulation. maxiters : int Maximum number of outer iterations, 1000 by default. init : {'random', 'svd', cptensor} Initialization for factor matrices, 'random' by default. Options include: 'random' - random initialization from a uniform distribution on [0,1) 'svd' - initialize the `m`th factor matrix using the `rank` left singular vectors of the `m`th unfolding of the input tensor. cptensor - initialization provided by user. NOTE: weights are pulled in the last factor and then the weights are set to "1" for the output tensor. Initializations all result in a cptensor where the weights are one. printitn : int Print every n iterations; 0 for no printing, 10 by default. state : {None, int, np.random.RandomState} Seed for reproducable random number generation factr : float (L-BFGS-B parameter) Tolerance on the change of objective values. Defaults to 1e7. pgtol : float (L-BFGS-B parameter) Projected gradient tolerance. Defaults to 1e-5 sampler : {uniform, stratified, semi-stratified} Type of sampling to use for stochastic gradient (SGD/ADAM/ADAGRAD). Defaults to 'uniform' for dense tensors. Defaults to 'stratified' for sparse tensors. Options include: 'uniform' - Uniform random sampling **COMING SOON** 'stratified' - Stratified sampling, targets sparse data. Zero and nonzero values sampled separately. 'semi-stratified' - Similar to stratified sampling, but is more computationally efficient (See papers referenced). gsamp : int Number of samples for stochastic gradient (SGD/ADAM/ADAGRAD parameter). Generally set to be O(sum(shape)*R). **COMING SOON** For stratified or semi-stratified, this may be two numbers: - the number of nnz samples - the number of zero samples. If only one number is specified, then this value is used for both nnzs and zeros (total number of samples is 2x specified value in this case). fsampler : {'uniform', 'stratified', custom} Type of sampling for estimating objective function (SGD/ADAM/ADAGRAD parameter). Options include: 'uniform' - Uniform random sampling **COMING SOON** 'stratified' - Stratified sampling, targets sparse data. Zero and nonzero values sampled separately. custom - User-defined sampler (lambda function). Custom option is primarily useful in reusing sampled elements across multiple tests. fsamp : int (SGD/ADAM/ADAGRAD parameter) Number of samples to estimate objective function. This should generally be somewhat large since we want this sample to generate a reliable estimate of the true function value. oversample : float (Stratified sampling parameter) Factor to oversample when implicitly sampling zeros in the sparse case. Defaults to 1.1. Only adjust for very small tensors. rate : float (SGD/ADAM parameter) Initial learning rate. Defaults to 1e-3. decay : float (SGD/ADAM parameter) Amount to decrease learning rate when progress stagnates, i.e. no change in objective function between epochs. Defaults to 0.1. maxfails : int (SGD/ADAM parameter) Number of times to decrease the learning rate. Defaults to 1, may be set to zero. epciters : int (SGD/ADAM parameter) Iterations per epoch. Defaults to 1000. festtol : float (SGD/ADAM parameter) Quit estimation of function if it goes below this level. Defaults to -inf. beta1 : float (ADAM parameter) - generally doesn't need to be changed Defaults to 0.9 beta2 : float (ADAM parameter) - generally doesn't need to be changed Defaults to 0.999 epsilon : float (ADAM parameter) - generally doesn't need to be changed Defaults to 1e-8 Returns ------- Mfin : CPTensor Canonical polyadic decomposition of input tensor X Reference --------- [1] D. Hong, T. G. Kolda, J. A. Duersch, Generalized Canonical Polyadic Tensor Decomposition, SIAM Review, 62:133-163, 2020, https://doi.org/10.1137/18M1203626 [2] T. G. Kolda, D. Hong, Stochastic Gradients for Large-Scale Tensor Decomposition. SIAM J. Mathematics of Data Science, 2:1066-1095, 2020, https://doi.org/10.1137/19m1266265 """ # Timer - Setup (outside optimization) start_setup0 = time.perf_counter() # Initial setup nd = tl.ndim(X) sz = tl.shape(X) tsz = X.size X_context = tl.context(X) vecsz = 0 for i in range(nd): # tsz *= sz[i] vecsz += sz[i] vecsz *= R W = mask # Random set-up if state is not None: state = tl.check_random_state(state) # capture stats(nnzs, zeros, missing) nnonnzeros = 0 X = tl.tensor_to_vec(X) for i in X: if i > 0: nnonnzeros += 1 X = tl.reshape(X, sz) nzeros = tsz - nnonnzeros nmissing = 0 if W is not None: W = tl.tensor_to_vec(W) for i in range(tl.shape(W)[0]): if W[i] > 0: nmissing += 1 # TODO: is this right?? W = tl.reshape(W, sz) # Dictionary for storing important information regarding the decomposition problem info = {} info['tsz'] = tsz info[ 'nmissing'] = 0 # TODO: revisit once missing value functionality incorporated info['nnonnzeros'] = nnonnzeros info[ 'nzeros'] = nzeros # TODO: revisit once missing value functionality incorporated # Set up function, gradient, and bounds fh, gh, lb = validate_type(type, X) info['type'] = type info['fh'] = fh info['gh'] = gh info['lb'] = lb # initialize CP-tensor and make a copy to work with so as to have the starting guess M0 = initialize_cp(X, R, init=init, random_state=state) wghts0 = tl.copy(M0[0]) fcts0 = [] for i in range(nd): f = tl.copy(M0[1][i]) fcts0.append(f) M = CPTensor((wghts0, fcts0)) # Lambda weights are assumed to be all ones throughout, check initial guess satisfies assumption if not tl.all(M[0]): print("Initialization of CP tensor has failed (lambda weight(s) != 1.") sys.exit(1) # check optimization method if validate_opt(opt): print("Choose optimization method from: {lbfgsb, sgd}") sys.exit(1) use_stoc = False if opt != 'lbfgsb': use_stoc = True info['opt'] = opt # set up for stochastic optimization (e.g. sgd, adam, adagrad) if use_stoc: # set up fsampler, gsampler ---> uniform sampling only for now # TODO : add stratified, semi-stratified and user-specified sampling options if not sampler == "uniform": print( "Only uniform sampling currently supported for stochastic optimization." ) sys.exit(1) fsampler_type = sampler gsampler_type = sampler # setup fsampler f_samp = fsamp if f_samp == None: upper = np.maximum(math.ceil(tsz / 10), 10 ^ 6) f_samp = np.minimum(upper, tsz) # set up lambda function/function handle for uniform sampling fsampler = lambda: tl_sample_uniform(X, f_samp) fsampler_str = "{} with {} samples".format(fsampler_type, f_samp) # setup gsampler g_samp = gsamp if g_samp == None: upper = np.maximum(1000, math.ceil(10 * tsz / maxiters)) g_samp = np.minimum(upper, tsz) # setup lambda function/function handle for uniform sampling gsampler = lambda: tl_sample_uniform(X, g_samp) gsampler_str = "{} with {} samples".format(gsampler_type, g_samp) # capture the info info['fsampler'] = fsampler_str info['gsampler'] = gsampler_str info['fsamp'] = f_samp info['gsamp'] = g_samp time_setup0 = time.perf_counter() - start_setup0 # Welcome message if printitn > 0: print("GCP-OPT-{} (Generalized CP Tensor Decomposition)".format(opt)) print("------------------------------------------------") print("Tensor size:\t\t\t\t{} ({} total entries)".format(sz, tsz)) if nmissing > 0: print("Missing entries: {} ({})".format(nmissing, 100 * nmissing / tsz)) print("Generalized function type:\t{}".format(type)) print("Objective function:\t\t\t{}".format( inspect.getsource(fh).strip())) print("Gradient function:\t\t\t{}".format( inspect.getsource(gh).strip())) print("Lower bound of factors:\t\t{}".format(lb)) print("Optimization method:\t\t{}".format(opt)) if use_stoc: print("Max iterations (epochs): {}".format(maxiters)) print("Iterations per epoch: {}".format(epciters)) print("Learning rate / decay / maxfails: {} {} {}".format( rate, decay, maxfails)) print("Function Sampler: {}".format(fsampler_str)) print("Gradient Sampler: {}".format(gsampler_str)) else: print("Max iterations:\t\t\t\t{}".format(maxiters)) print("Projected gradient tol:\t\t{}\n".format(pgtol)) # Make like a zombie and start decomposing Mfin = None # L-BFGS-B optimization if opt == 'lbfgsb': # Timer - Setup (inside optimization) start_setup1 = time.perf_counter() # set up bounds for l-bfgs-b if lb = 0 bounds = None if lb == 0: lb = tl.zeros(tsz) ub = math.inf * tl.ones(tsz) fcn = lambda x: tl_gcp_fg(vec2factors(x, sz, R, X_context), X, fh, gh) m = factors2vec(M[1]) # capture params for l-bfgs-b lbfgsb_params = {} lbfgsb_params['x0'] = factors2vec(M0.factors) lbfgsb_params['printEvery'] = printitn lbfgsb_params['maxIts'] = maxiters lbfgsb_params['maxTotalIts'] = maxiters * 10 lbfgsb_params['factr'] = factr lbfgsb_params['pgtol'] = pgtol time_setup1 = time.perf_counter() - start_setup1 if printitn > 0: print("Begin main loop") # Timer - Main operation start_main = time.perf_counter() x, f, info_dict = fmin_l_bfgs_b(fcn, m, approx_grad=False, bounds=None, \ pgtol=pgtol, factr=factr, maxiter=maxiters) time_main = time.perf_counter() - start_main # capture info info['fcn'] = fcn info['lbfgsbopts'] = lbfgsb_params info['lbfgsbout'] = info_dict info['finalf'] = f if printitn > 0: print("\nFinal objective: {}".format(f)) print("Setup time: {}".format(time_setup0 + time_setup1)) print("Main loop time: {}".format(time_main)) print("Outer iterations:" ) # TODO: access this value (see manpage for fmin_l_bfgs_b) print("Total iterations: {}".format(info_dict['nit'])) print("L-BFGS-B exit message: {} ({})".format( info_dict['task'], info_dict['warnflag'])) Mfin = vec2factors(x, sz, R, X_context) # Stochastic optimization else: # Timer - Setup (inside optimization) start_setup1 = time.perf_counter() if opt == "adam" or opt == "adagrad": print("{} not currently supported".format(opt)) sys.exit(1) # prepare for sgd # initialize moments m = [] v = [] # Extract samples for estimating function value (i.e. call fsampler), these never change fsubs, fvals, fwgts = fsampler() # Compute initial estimated function value fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False, False, False) # Set up loop variables nfails = 0 titers = 0 M_weights = tl.copy(M[0]) M_factors = [] for k in range(nd): M_factors.append(tl.copy(M[1][k])) Msave = CPTensor( (M_weights, M_factors)) # save a copy of the initial model msave = m vsave = v fest_prev = fest[0] # Tracing the progress in function value by epoch fest_trace = tl.zeros(maxiters + 1) step_trace = tl.zeros(maxiters + 1) time_trace = tl.zeros(maxiters + 1) fest_trace[0] = fest[0] # Print status if printitn > 0: print("Begin main loop") print("Initial f-est: {}".format(fest[0])) time_setup1 = time.perf_counter() - start_setup1 start_main = time.perf_counter() time_trace[0] = time.perf_counter() - start_setup0 # Main loop - outer iteration for nepoch in range(maxiters): step = (decay**nfails) * rate # Main loop - inner iteration for iter in range(epciters): # Tracking iterations titers = titers + 1 # Select subset for stochastic gradient (i.e. call gsampler) gsubs, gvals, gwts = gsampler() # Compute gradients for each mode Gest = tl_gcp_fg_est(M, fh, gh, gsubs, gvals, gwts, False, True, False, False) # Check for inf gradient for g in Gest[0]: g_max = tl.max(g) g_min = tl.min(g) if math.isinf(g_max) or math.isinf(g_min): print( "Infinite gradient encountered! (epoch = {}, iter = {})" .format(nepoch, iter)) # TODO : add functionality for ADAM and ADAGRAD optimization # Take gradient step for k in range(nd): M.factors[k] = M.factors[k] - step * Gest[0][k] # Estimate objective (i.e. call tl_gcp_fg_est) fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False, False, False) # Save trace (fest & step) fest_trace[nepoch + 1] = fest[0] step_trace[nepoch + 1] = step # Check convergence condition failed_epoch = False if fest[0] > fest_prev: failed_epoch = True if failed_epoch: nfails += 1 festtol_test = False if fest[0] < festtol: festtol_test = True # Reporting if printitn > 0 and (nepoch % printitn == 0 or failed_epoch or festtol_test): print("Epoch {}: f-est = {}, step = {}".format( nepoch, fest[0], step), end='') if failed_epoch: print( ", nfails = {} (resetting to solution from last epoch)" .format(nfails)) print("") # Rectify failed epoch or save current solution if failed_epoch: M = Msave m = msave v = vsave fest[0] = fest_prev titers = titers - epciters else: Msave = CPTensor((tl.copy(M.weights), tl.copy(M.factors))) msave = m vsave = v fest_prev = fest[0] time_trace[nepoch] = time.perf_counter() - start_setup0 if (nfails > maxfails) or festtol_test: break Mfin = M time_main = time.perf_counter() - start_main # capture info info['fest_trace'] = fest_trace info['step_trace'] = step_trace info['time_trace'] = time_trace info['nepoch'] = nepoch # Report end of main loop if printitn > 0: print("End Main Loop") print("") print("Final f-east: {}".format(fest[0])) print("Setup time: {0:0.6f}".format(time_setup0 + time_setup1)) print("Main loop time: {0:0.6f}".format(time_main)) print("Total iterations: {}".format(nepoch * epciters)) # Wrap up / capture remaining info info['mainTime'] = time_main info['setupTime0'] = time_setup0 info['setupTime1'] = time_setup1 info['setupTime'] = time_setup0 + time_setup1 return Mfin