Example #1
def test_tensor_product():
    """Test tensor_dot"""
    rng = random.check_random_state(1234)

    X = tl.tensor(rng.random_sample((4, 5, 6)))
    Y = tl.tensor(rng.random_sample((3, 4, 7)))
    tdot = tl.tensor_to_vec(tensor_dot(X, Y))
    true_dot = tl.tensor_to_vec(
        tenalg.outer([tl.tensor_to_vec(X),
                      tl.tensor_to_vec(Y)]))
    testing.assert_array_almost_equal(tdot, true_dot)
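The identity exercised here, assuming (as the test asserts) that `tensor_dot(X, Y)` is the outer/tensor product of its arguments, is, in index form,

$$(X \otimes Y)_{i_1 i_2 i_3\, j_1 j_2 j_3} = X_{i_1 i_2 i_3}\, Y_{j_1 j_2 j_3},$$

so flattening the order-6 result in row-major order gives the same vector as flattening the outer product of vec(X) and vec(Y), which is exactly what `assert_array_almost_equal` checks.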
Example #2
def _e_step(Xt, Wt, T, L, N, U, B, z0, psi0, sgm0, sgmO, sgmR, sgmV):
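    # E-step of an EM procedure: a Kalman filter (forward pass) followed by a
    # Rauch-Tung-Striebel smoother (backward pass) over the vectorized latent
    # states, after which the smoothed sufficient statistics E[z_t], Cov[z_t],
    # Cov[z_t, z_{t-1}], E[z_t z_t^T] and E[z_t z_{t-1}^T] are assembled.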
    # matricize U and B
    matU = kronecker(U, reverse=True)
    matB = kronecker(B, reverse=True)
    # workspace
    Lp = np.prod(L)
    Np = np.prod(N)
    P = np.zeros((T, Lp, Lp))
    J = np.zeros((T, Lp, Lp))
    mu_ = np.zeros((T, Lp))
    psi = np.zeros((T, Lp, Lp))
    mu_h = np.zeros((T, Lp))
    psih = np.zeros((T, Lp, Lp))
    # forward algorithm
    for t in trange(T, desc='forward'):
        ot = tensor_to_vec(
            Wt[t])  # boolean mask of the observed entries of X[t]
        xt = tensor_to_vec(Xt[t])[ot]
        lt = sum(ot)  # number of observed entries
        Ht = matU[ot, :]
        if t == 0:
            # Kalman gain at t = 0, with isotropic initial covariance sgm0 * I
            K = sgm0 * Ht.T @ pinv(sgm0 * Ht @ Ht.T + sgmR * np.eye(lt))
            psi[0] = sgm0 * (np.eye(Lp) - K @ Ht)
            mu_[0] = z0 + K @ (xt - Ht @ z0)
        else:
            P[t - 1] = matB @ psi[t - 1] @ matB.T + sgmO * np.eye(Lp)
            K = P[t - 1] @ Ht.T @ pinv(Ht @ P[t - 1] @ Ht.T +
                                       sgmR * np.eye(lt))
            mu_[t] = matB @ mu_[t - 1] + K @ (xt - Ht @ matB @ mu_[t - 1])
            psi[t] = (np.eye(Lp) - K @ Ht) @ P[t - 1]

    # backward
    mu_h[-1] = mu_[-1]
    psih[-1] = psi[-1]
    for t in tqdm(list(reversed(range(T - 1))), desc='backward'):
        J[t] = psi[t] @ matB.T @ pinv(P[t])
        mu_h[t] = mu_[t] + J[t] @ (mu_h[t + 1] - matB @ mu_[t])
        psih[t] = psi[t] + J[t] @ (psih[t + 1] - P[t]) @ J[t].T

    # compute expectations
    ztt = np.zeros((T, Lp, Lp))
    zt_ = np.zeros((T, Lp, Lp))
    cov_zt_ = np.zeros((T, Lp, Lp))
    for t in trange(T, desc='compute expectations'):
        if t > 0:
            cov_zt_[t] = psih[t] @ J[t - 1].T
            zt_[t] = cov_zt_[t] + np.outer(mu_h[t], mu_h[t - 1])
        ztt[t] = psih[t] + np.outer(mu_h[t], mu_h[t])
    zt = mu_h
    cov_ztt = psih
    return zt, cov_ztt, cov_zt_, ztt, zt_
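For reference, the forward pass above is a standard Kalman filter and the backward pass a Rauch-Tung-Striebel smoother. In the code's notation (matB is the transition matrix B, Ht the rows of matU selected by the observation mask, sgmO and sgmR the transition and observation noise variances), the recursions are

$$P_{t-1} = B\,\psi_{t-1}\,B^\top + \sigma_O I, \qquad K_t = P_{t-1} H_t^\top \bigl(H_t P_{t-1} H_t^\top + \sigma_R I\bigr)^{-1},$$
$$\mu_t = B\mu_{t-1} + K_t\bigl(x_t - H_t B \mu_{t-1}\bigr), \qquad \psi_t = (I - K_t H_t)\,P_{t-1},$$
$$J_t = \psi_t B^\top P_t^{-1}, \qquad \hat\mu_t = \mu_t + J_t(\hat\mu_{t+1} - B\mu_t), \qquad \hat\psi_t = \psi_t + J_t(\hat\psi_{t+1} - P_t)\,J_t^\top.$$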
Example #3
def set_tensor_vectorization(x):
    vectorization = []

    for sample in x:
        vectorization.append(tl.tensor_to_vec(sample))

    return np.asarray(vectorization)
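A minimal usage sketch (shapes and variable names are illustrative, not taken from the original code base; NumPy backend assumed): given a batch of tensors stacked along the first axis, the function returns a 2-D matrix with one vectorized sample per row.

import numpy as np

x = np.random.rand(10, 4, 5, 6)        # 10 samples, each a 4 x 5 x 6 tensor
design = set_tensor_vectorization(x)   # one vectorized sample per row
assert design.shape == (10, 4 * 5 * 6)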
Example #4
def test_active_set_nnls():
    """Test for active_set_nnls operator"""
    a = T.tensor(np.random.rand(20, 10))
    true_res = T.tensor(np.random.rand(10, 1))
    b = T.dot(a, true_res)
    atb = T.dot(T.transpose(a), b)
    ata = T.dot(T.transpose(a), a)
    x_as = active_set_nnls(tensor_to_vec(atb), ata)
    x_as = T.reshape(x_as, T.shape(atb))
    assert_array_almost_equal(true_res, x_as, decimal=2)
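The test relies on the normal-equations form of nonnegative least squares: given the precomputed quantities $A^\top A$ (`ata`) and $A^\top b$ (`atb`), solving

$$\min_{x \ge 0}\ \tfrac{1}{2}\,x^\top (A^\top A)\,x - (A^\top b)^\top x$$

has the same minimizer as $\min_{x \ge 0} \lVert Ax - b\rVert_2^2$; since $b = A\,x_{\text{true}}$ with $x_{\text{true}} \ge 0$, the active-set solver should recover `true_res` up to the requested precision.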
Example #5
def test_batched_tensor_product():
    """Test batched-tensor_dot

    Notes
    -----
    At the time of writing, MXNet doesn't support transpose 
    for tensors of order higher than 6
    """
    rng = random.check_random_state(1234)
    batch_size = 3

    X = tl.tensor(rng.random_sample((batch_size, 4, 5, 6)))
    Y = tl.tensor(rng.random_sample((batch_size, 3, 7)))
    tdot = tl.unfold(batched_tensor_dot(X, Y), 0)
    for i in range(batch_size):
        true_dot = tl.tensor_to_vec(
            tenalg.outer([tl.tensor_to_vec(X[i]),
                          tl.tensor_to_vec(Y[i])]))
        testing.assert_array_almost_equal(tdot[i], true_dot)
Example #6
def factors2vec(factors):
    """Wrapper function detailed in Appendix C [1]
    Stacks the column vectors of a set of matrices into a single vecto

    Parameters
    ---------
    factors : list of ndarrays
        Factor matrices or Gradient wrt factor gradient

    Returns
    -------
    vec : ndarry
        column-wise vectorization of a list of matrices
    """
    vec = None
    for factor in factors:
        if vec is None:
            vec = tl.tensor_to_vec(tl.transpose(factor))
        else:
            vec = tl.concatenate([vec, tl.tensor_to_vec(tl.transpose(factor))])
    return vec
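A small illustration of the stacking convention (shapes and names are made up for the example; NumPy backend assumed): because each factor is transposed before vectorization, the result is the column-wise (Fortran-order) flattening of every factor, concatenated mode by mode.

import numpy as np

U0 = np.arange(6.0).reshape(3, 2)   # 3 x 2 factor matrix
U1 = np.arange(8.0).reshape(4, 2)   # 4 x 2 factor matrix
v = factors2vec([U0, U1])           # length 3*2 + 4*2 = 14
# the first block is U0 stacked column by column
np.testing.assert_allclose(v[:6], U0.ravel(order='F'))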
Example #7
def tr_to_vec(factors):
    """Returns the tensor defined by its TR format ('factors') into
       its vectorized format

    Parameters
    ----------
    factors: list of 3D-arrays
              TR factors

    Returns
    -------
    1-D array
    vectorized format of tensor defined by 'factors'
    """
    return tl.tensor_to_vec(tr_to_tensor(factors))
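For context, the tensor-ring (TR) format that `tr_to_tensor` reconstructs expresses each entry of the tensor as the trace of a product of factor slices,

$$\mathcal{T}_{i_1 i_2 \cdots i_N} = \operatorname{Tr}\!\left(G^{(1)}[:, i_1, :]\; G^{(2)}[:, i_2, :] \cdots G^{(N)}[:, i_N, :]\right),$$

so `tr_to_vec` simply materializes the full tensor and flattens it with `tensor_to_vec`.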
Example #8
def tt_matrix_to_vec(tt_matrix):
    """Returns the tensor defined by its TT-Matrix format ('factors') into
       its vectorized format

    Parameters
    ----------
    factors : list of 3D-arrays
        TT factors

    Returns
    -------
    1-D array
        format of tensor defined by 'factors'
    """
    return tl.tensor_to_vec(tt_matrix_to_tensor(tt_matrix))
Example #9
def plot_coef(coef_img, img_name, thre_rate=0.01):
    coef = coef_img.get_data()
    coef_vec = tl.tensor_to_vec(coef)
    # selection = SelectPercentile(f_classif, percentile=thre_rate)
    n_voxel_th = int(coef_vec.shape[0] * thre_rate)
    top_voxel_idx = (abs(coef_vec)).argsort()[::-1][:n_voxel_th]
    thre = abs(coef_vec[top_voxel_idx[-1]])  # magnitude of the n-th largest coefficient
    # coef_to_plot = np.zeros(coef.shape[0])
    # coef_to_plot[top_voxel_idx] = coef[top_voxel_idx]
    # thre = np.amax(abs(coef)) * thre_rate # highest absolute value times threshold rate
    # coef_img = nib.Nifti1Image(coef, maskimg.affine)
    # plotting.plot_stat_map(coef_img, threshold=thre, output_file='%s.png'%img_name, cut_coords=(0, 15, 55))
    plotting.plot_stat_map(coef_img,
                           threshold=thre,
                           output_file='%s_.pdf' % img_name,
                           display_mode='x',
                           vmax=0.0004,
                           cut_coords=range(0, 1, 1),
                           colorbar=False)
Example #10
def hard_thresholding(tensor, number_of_non_zero):
    """
    Proximal operator of the l0 "norm".
    Keeps the `number_of_non_zero` largest-magnitude elements untouched and sets all other elements to zero.

    Parameters
    ----------
    tensor : ndarray
    number_of_non_zero : int

    Returns
    -------
    ndarray
          Thresholded tensor on which the operator has been applied
    """
    tensor_vec = tl.copy(tl.tensor_to_vec(tensor))
    sorted_indices = tl.argsort(tl.argsort(tl.abs(tensor_vec),
                                           axis=0,
                                           descending=True),
                                axis=0)
    return tl.reshape(
        tl.where(sorted_indices < number_of_non_zero, tensor_vec,
                 tl.tensor(0, **tl.context(tensor_vec))), tensor.shape)
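A quick illustration with arbitrary values (assuming a backend whose `tl.argsort` supports the `descending` flag, as the implementation above already requires):

import numpy as np
import tensorly as tl

t = tl.tensor(np.array([[0.5, -3.0], [2.0, 0.1]]))
# keeps the two largest-magnitude entries (-3.0 and 2.0) and zeroes the rest
print(hard_thresholding(t, 2))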
Example #11
def one_ntf_step(unfolded_tensors,
                 rank,
                 in_factors,
                 norm_tensor,
                 update_rule,
                 beta,
                 sparsity_coefficients,
                 fixed_modes,
                 normalize,
                 alpha=0.5,
                 delta=0.01):
    """
    One pass of Hierarchical Alternating Least Squares update along all modes

    Update the factors by solving a least squares problem per mode (in hals), as described in [1],
    or using the Multiplicative Update for the entire factors [2].

    Note that the unfolding order is the one described in [3], which is different from [1].

    Parameters
    ----------
    unfolded_tensors: list of array
        The spectrogram tensor, unfolded according to all its modes.
    in_factors: list of array
        Current estimates of the factors of the PARAFAC decomposition of
        the tensor. Each factor whose mode is not in `fixed_modes` is updated
        (a least squares update for "hals", a multiplicative update for "mu").
        The values in in_factors are not modified.
    rank: int
        Rank of the decomposition.
    norm_tensor : float
        The Frobenius norm of the input tensor
    update_rule: string "hals" | "mu"
        The chosen update rule.
        HALS performs the optimization with the Euclidean norm;
        MU performs the optimization using the $\beta$-divergence loss,
        which generalizes the Euclidean norm and the Kullback-Leibler and
        Itakura-Saito divergences.
        The chosen beta-divergence is specified with the parameter `beta`.
    beta: float
        The beta parameter for the beta-divergence.
        2 - Euclidean norm
        1 - Kullback-Leibler divergence
        0 - Itakura-Saito divergence
    sparsity_coefficients : list of floats
        Sparsity coefficients for every mode.
    fixed_modes : list of integers
        Indices of modes that are not updated
    normalize : list of booleans (one per mode)
        Whether the factors of each mode need to be normalized.
        The normalization is an l_2 normalization on each of the rank components
        (columnwise)
    alpha : positive float
        Ratio between outer computations and inner loops. Typically set to
        0.5 or 1.
        Default: 0.5
    delta : float in [0,1]
        Early stop criterion, while err_k > delta*err_0. Set small for
        almost exact nnls solution, or larger (e.g. 1e-2) for inner loops
        of a PARAFAC computation.
        Default: 0.01

    Returns
    -------
    factors : list of arrays
        The updated factors of the PARAFAC decomposition
    cost_fct_val : float
        The value of the cost function at this step,
        normalized by the squared norm of the original tensor.
        
    References
    ----------
    [1] Tamara G Kolda and Brett W Bader. "Tensor decompositions and applications",
    SIAM review 51.3 (2009), pp. 455-500.
    
    [2] Févotte, C., & Idier, J. (2011). 
    Algorithms for nonnegative matrix factorization with the β-divergence. 
    Neural computation, 23(9), 2421-2456.

    [3] Jeremy E Cohen. "About notations in multiway array processing",
    arXiv preprint arXiv:1511.01306, (2015).
    """

    if update_rule not in ["hals", "mu"]:
        raise err.InvalidArgumentValue(
            f"Invalid update rule: {update_rule}") from None
    if update_rule == "hals" and beta != 2:
        raise err.InvalidArgumentValue(
            f"HALS is only valid for the Frobenius norm, which corresponds to the beta-divergence with beta = 2. Here, beta was set to {beta}. To compute NMF with this value of beta, please use the 'mu' update_rule."
        ) from None

    # Avoiding errors
    for fixed_value in fixed_modes:
        sparsity_coefficients[fixed_value] = None

    # Copy
    factors = in_factors.copy()

    # Generating the mode update sequence
    gen = [
        mode for mode in range(len(unfolded_tensors))
        if mode not in fixed_modes
    ]

    for mode in gen:
        if update_rule == "hals":
            tic = time.time()

            # Computing Hadamard of cross-products
            cross = tl.tensor(tl.ones((rank, rank)))  #, **tl.context(tensor))
            for i, factor in enumerate(factors):
                if i != mode:
                    cross *= tl.dot(tl.transpose(factor), factor)

            # Computing the Khatri Rao product
            krao = tl.tenalg.khatri_rao(factors, skip_matrix=mode)
            rhs = tl.dot(unfolded_tensors[mode], krao)

            timer = time.time() - tic

            # Call the hals resolution with nnls, optimizing the current mode
            factors[mode] = tl.transpose(
                nnls.hals_nnls_acc(
                    tl.transpose(rhs),
                    cross,
                    tl.transpose(factors[mode]),
                    maxiter=100,
                    atime=timer,
                    alpha=alpha,
                    delta=delta,
                    sparsity_coefficient=sparsity_coefficients[mode],
                    normalize=normalize[mode])[0])

        elif update_rule == "mu":
            krao = tl.tenalg.khatri_rao(factors, skip_matrix=mode)
            factors[mode] = mu.mu_betadivmin(factors[mode], krao.T,
                                             unfolded_tensors[mode], beta)

    # Adding the l1 norm value to the reconstruction error
    sparsity_error = 0
    for index, sparse in enumerate(sparsity_coefficients):
        if sparse:
            sparsity_error += 2 * (sparse *
                                   np.linalg.norm(factors[index], ord=1))

    if update_rule == "hals":
        # error computation (improved using precomputed quantities)
        rec_error = norm_tensor**2 - 2 * tl.dot(
            tl.tensor_to_vec(factors[mode]), tl.tensor_to_vec(rhs)) + tl.norm(
                tl.dot(factors[mode], tl.transpose(krao)), 2)**2

    elif update_rule == "mu":
        rec_error = beta_div.beta_divergence(unfolded_tensors[mode],
                                             factors[mode] @ krao.T, beta)

    cost_fct_val = (rec_error + sparsity_error) / (norm_tensor**2)

    return factors, cost_fct_val
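For reference, the elementwise $\beta$-divergence minimized by the "mu" branch (whose special cases are quoted in the docstring) is the standard one from [2]:

$$
d_\beta(x \mid y) =
\begin{cases}
\dfrac{1}{\beta(\beta-1)}\bigl(x^\beta + (\beta-1)\,y^\beta - \beta\,x\,y^{\beta-1}\bigr), & \beta \notin \{0, 1\},\\[6pt]
x \log\dfrac{x}{y} - x + y, & \beta = 1 \ \text{(Kullback-Leibler)},\\[6pt]
\dfrac{x}{y} - \log\dfrac{x}{y} - 1, & \beta = 0 \ \text{(Itakura-Saito)},
\end{cases}
$$

with $\beta = 2$ giving half the squared Euclidean distance.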
Example #12
def tl_gcp_fg(M, X, f, g, W=None, computeF=True, computeG=True, vectorG=True):
    """Loss function and gradient for generalized CP decomposition.
    (Analogous to tt_gcp_fg.m)

    Parameters
    ----------
    M : CPTensor
    X : ndarray
        Dense tensor
    f : Function handle
        elementwise loss of the form f(x,m)
    g : Function handle
        Elementwise intermediate gradient of the form g(x,m)
    W : ndarray
        Weight tensor, 1's for known values, 0's for missing values. The function/gradient is only computed w.r.t.
        the known values. Setting W to None indicates no missing data.
    computeF : boolean
        Include computation of the loss function. Default is true.
    computeG : boolean
        Include computation of the gradient.
    vectorG : boolean
        Reshape gradient matrices into a single vector.

    Returns
    -------
    F : scalar
        Loss function value
    G : ndarray(s)
        If vectorG = False, G is a list of matrices where G[k] is the same size as the k-th factor matrix
        Otherwise, G is the gradient in vector form
    """
    # setup
    Mfull = M.to_tensor()
    Mv = tl.tensor_to_vec(Mfull)
    Xv = tl.tensor_to_vec(X)

    F = None
    G = []
    # calculate loss
    if computeF:
        Fvec = f(Xv, Mv)
        if W is not None:
            # TODO handle applying weight tensor, probably need to vec it then elementwise product
            pass
        F = Fvec.sum()
    # calculate gradient
    if computeG:

        Y = g(Xv, Mv)
        Y = tl.tensor(tl.reshape(Y, tl.shape(X)))

        if W is not None:
            # TODO handle applying weight tensor, probably need to vec it then elementwise product
            pass

        for i in range(len(M[1])):
            mGrad = tl.unfolding_dot_khatri_rao(Y, M, i)
            G.append(mGrad)
        if vectorG:
            G = factors2vec(G)

    return F, G
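A minimal call sketch with the Gaussian loss and gradient that the `gcp` docstring below gives as examples (random data, purely illustrative; assumes tensorly's `random_cp` is available for building a CPTensor):

import numpy as np
import tensorly as tl
from tensorly.random import random_cp

X = tl.tensor(np.random.rand(4, 5, 6))
M = random_cp((4, 5, 6), rank=3)
f = lambda x, m: (m - x) ** 2      # elementwise Gaussian loss
g = lambda x, m: 2 * (m - x)       # its elementwise gradient
F, G = tl_gcp_fg(M, X, f, g, vectorG=True)
print(F, G.shape)                  # scalar loss and a gradient vector of length (4 + 5 + 6) * 3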
Example #13
def gcp(X, R, type='normal', func=None, grad=None, lower=None,\
        opt='lbfgsb', mask=None, maxiters=1000, \
        init='random', printitn=10, state=None, factr=1e7, pgtol=1e-4, \
        fsamp=None, gsamp=None, oversample=1.1, sampler='uniform', \
        fsampler=None, rate=1e-3, decay=0.1, maxfails=1, epciters=1000, \
        festtol=-math.inf, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Generalized CANDECOMP/PARAFAC (GCP) decomposition via all-at-once optimization (OPT) [1]
    Computes a rank-'R' decomposition of 'tensor' such that::

      tensor = [|weights; factors[0], ..., factors[-1] |].

    GCP-OPT allows the use of a variety of statistically motivated loss functions
    suited to the data held in a tensor (i.e. continuous, discrete, binary, etc)

    Parameters
    ----------
    X : ndarray
        Tensor to factorize
        **COMING SOON**
        Sparse tensor support
    R : int
        Rank of decomposition (Number of components).
    type : str,
        Type of objective function used
        Options include:
            'normal' or 'gaussian'          - Gaussian for real-valued data (DEFAULT)
            'binary' or 'bernoulli-odds'    - Bernoulli w/ odds link for binary data
            'bernoulli-logit'               - Bernoulli w/ logit link for binary data
            'count' or 'poisson'            - Poisson for count data
            'poisson-log'                   - Poisson w/ log link for count data
            'rayleigh'                      - Rayleigh distribution for real-valued data
            'gamma'                         - Gamma distribution for non-negative real-valued data
        **COMING SOON**:
            'huber (DELTA)'                 - Similar to Gaussian, for real-valued data
            'negative-binomial (r)'         - Negative binomial for count data
            'beta (BETA)'                   - Beta divergence for non-negative real-valued data
            'user-specified'                - Customized objective function provided by user
    func: lambda function
        User specified custom objective function, eg. lambda x, m: (m-x)**2
    grad: lambda function
        User specified custom gradient function, eg. lambda x, m: 2*(m-x)
    lower: 0 or -inf
        Lower bound for custom objective/gradient
    opt : str
        Optimization method
        Options include:
            'lbfgsb'    - Bound-constrained limited-memory BFGS
            'sgd'       - Stochastic gradient descent (SGD)
            **COMING SOON**
            'adam'      - Momentum-based SGD method
            'adagrad'   - Adaptive gradient algorithm, well suited for sparse data
        If 'tensor' is dense, all 4 options can be used, 'lbfgsb' by default.
        **COMING SOON** - Sparse format support
        If 'tensor' is sparse, only 'sgd', 'adam' and 'adagrad' can be used, 'adam' by
        default.
        Each method has specific parameters, see documentation
    mask : ndarray
        Specifies a mask, 0's for missing/incomplete entries, 1's elsewhere, with
        the same shape as 'tensor'.
        **COMING SOON** - Missing/incomplete data simulation.
    maxiters : int
        Maximum number of outer iterations, 1000 by default.
    init : {'random', 'svd', cptensor}
        Initialization for factor matrices, 'random' by default.
        Options include:
            'random'    - random initialization from a uniform distribution on [0,1)
            'svd'       - initialize the `m`th factor matrix using the `rank` left
                          singular vectors of the `m`th unfolding of the input tensor.
            cptensor    - initialization provided by user.  NOTE: weights are pulled
                          in the last factor and then the weights are set to "1" for
                          the output tensor.
        Initializations all result in a cptensor where the weights are one.
    printitn : int
        Print every n iterations; 0 for no printing, 10 by default.
    state : {None, int, np.random.RandomState}
        Seed for reproducible random number generation
    factr : float
        (L-BFGS-B parameter)
        Tolerance on the change of objective values. Defaults to 1e7.
    pgtol : float
        (L-BFGS-B parameter)
        Projected gradient tolerance. Defaults to 1e-4
    sampler : {uniform, stratified, semi-stratified}
        Type of sampling to use for stochastic gradient (SGD/ADAM/ADAGRAD).
        Defaults to 'uniform' for dense tensors.
        Defaults to 'stratified' for sparse tensors.
        Options include:
            'uniform'           - Uniform random sampling
            **COMING SOON**
            'stratified'        - Stratified sampling, targets sparse data. Zero and
                                  nonzero values sampled separately.
            'semi-stratified'   - Similar to stratified sampling, but is more
                                  computationally efficient (See papers referenced).
    gsamp : int
        Number of samples for stochastic gradient (SGD/ADAM/ADAGRAD parameter).
        Generally set to be O(sum(shape)*R).
        **COMING SOON**
        For stratified or semi-stratified, this may be two numbers:
            - the number of nnz samples
            - the number of zero samples.
        If only one number is specified, then this value is used for both nnzs and
        zeros (total number of samples is 2x specified value in this case).
    fsampler : {'uniform', 'stratified', custom}
        Type of sampling for estimating objective function (SGD/ADAM/ADAGRAD parameter).
        Options include:
            'uniform'       - Uniform random sampling
            **COMING SOON**
            'stratified'    - Stratified sampling, targets sparse data. Zero and
                              nonzero values sampled separately.
            custom          - User-defined sampler (lambda function). Custom option
                              is primarily useful in reusing sampled elements across
                              multiple tests.
    fsamp : int
        (SGD/ADAM/ADAGRAD parameter)
        Number of samples to estimate objective function.
        This should generally be somewhat large since we want this sample to generate a
        reliable estimate of the true function value.
    oversample : float
        (Stratified sampling parameter)
        Factor to oversample when implicitly sampling zeros in the sparse case.
        Defaults to 1.1. Only adjust for very small tensors.
    rate : float
        (SGD/ADAM parameter)
        Initial learning rate. Defaults to 1e-3.
    decay : float
        (SGD/ADAM parameter)
        Amount to decrease learning rate when progress stagnates, i.e. no change in
        objective function between epochs.  Defaults to 0.1.
    maxfails : int
        (SGD/ADAM parameter)
        Number of times to decrease the learning rate.
        Defaults to 1, may be set to zero.
    epciters : int
        (SGD/ADAM parameter)
        Iterations per epoch. Defaults to 1000.
    festtol : float
        (SGD/ADAM parameter)
        Quit estimation of function if it goes below this level.
        Defaults to -inf.
    beta1 : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 0.9
    beta2 : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 0.999
    epsilon : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 1e-8

    Returns
    -------
    Mfin : CPTensor
        Canonical polyadic decomposition of input tensor X

    References
    ----------
    [1] D. Hong, T. G. Kolda, J. A. Duersch, Generalized Canonical
        Polyadic Tensor Decomposition, SIAM Review, 62:133-163, 2020,
        https://doi.org/10.1137/18M1203626
    [2] T. G. Kolda, D. Hong, Stochastic Gradients for Large-Scale Tensor
        Decomposition. SIAM J. Mathematics of Data Science, 2:1066-1095,
        2020, https://doi.org/10.1137/19m1266265

    """
    # Timer - Setup (outside optimization)
    start_setup0 = time.perf_counter()

    # Initial setup
    nd = tl.ndim(X)
    sz = tl.shape(X)
    tsz = X.size
    X_context = tl.context(X)
    vecsz = 0
    for i in range(nd):
        # tsz *= sz[i]
        vecsz += sz[i]
    vecsz *= R
    W = mask

    # Random set-up
    if state is not None:
        state = tl.check_random_state(state)

    # capture stats(nnzs, zeros, missing)
    nnonnzeros = 0
    X = tl.tensor_to_vec(X)
    for i in X:
        if i != 0:
            nnonnzeros += 1
    X = tl.reshape(X, sz)
    nzeros = tsz - nnonnzeros
    nmissing = 0
    if W is not None:
        W = tl.tensor_to_vec(W)
        for i in range(tl.shape(W)[0]):
            if W[i] == 0: nmissing += 1  # a mask value of 0 marks a missing entry
        W = tl.reshape(W, sz)

    # Dictionary for storing important information regarding the decomposition problem
    info = {}
    info['tsz'] = tsz
    info['nmissing'] = 0  # TODO: revisit once missing value functionality incorporated
    info['nnonnzeros'] = nnonnzeros
    info['nzeros'] = nzeros  # TODO: revisit once missing value functionality incorporated

    # Set up function, gradient, and bounds
    fh, gh, lb = validate_type(type, X)
    info['type'] = type
    info['fh'] = fh
    info['gh'] = gh
    info['lb'] = lb

    # initialize CP-tensor and make a copy to work with so as to have the starting guess
    M0 = initialize_cp(X, R, init=init, random_state=state)
    wghts0 = tl.copy(M0[0])
    fcts0 = []
    for i in range(nd):
        f = tl.copy(M0[1][i])
        fcts0.append(f)
    M = CPTensor((wghts0, fcts0))

    # Lambda weights are assumed to be all ones throughout, check initial guess satisfies assumption
    if not tl.all(M[0]):
        print("Initialization of CP tensor has failed (lambda weight(s) != 1).")
        sys.exit(1)

    # check optimization method
    if validate_opt(opt):
        print("Choose optimization method from: {lbfgsb, sgd}")
        sys.exit(1)
    use_stoc = False
    if opt != 'lbfgsb':
        use_stoc = True
    info['opt'] = opt

    # set up for stochastic optimization (e.g. sgd, adam, adagrad)
    if use_stoc:
        # set up fsampler, gsampler ---> uniform sampling only for now
        # TODO : add stratified, semi-stratified and user-specified sampling options
        if not sampler == "uniform":
            print(
                "Only uniform sampling currently supported for stochastic optimization."
            )
            sys.exit(1)
        fsampler_type = sampler
        gsampler_type = sampler

        # setup fsampler
        f_samp = fsamp
        if f_samp is None:
            upper = np.maximum(math.ceil(tsz / 10), 10**6)
            f_samp = np.minimum(upper, tsz)

        # set up lambda function/function handle for uniform sampling
        fsampler = lambda: tl_sample_uniform(X, f_samp)
        fsampler_str = "{} with {} samples".format(fsampler_type, f_samp)

        # setup gsampler
        g_samp = gsamp
        if g_samp is None:
            upper = np.maximum(1000, math.ceil(10 * tsz / maxiters))
            g_samp = np.minimum(upper, tsz)

        # setup lambda function/function handle for uniform sampling
        gsampler = lambda: tl_sample_uniform(X, g_samp)
        gsampler_str = "{} with {} samples".format(gsampler_type, g_samp)

        # capture the info
        info['fsampler'] = fsampler_str
        info['gsampler'] = gsampler_str
        info['fsamp'] = f_samp
        info['gsamp'] = g_samp

    time_setup0 = time.perf_counter() - start_setup0

    # Welcome message
    if printitn > 0:
        print("GCP-OPT-{} (Generalized CP Tensor Decomposition)".format(opt))
        print("------------------------------------------------")
        print("Tensor size:\t\t\t\t{} ({} total entries)".format(sz, tsz))
        if nmissing > 0:
            print("Missing entries: {} ({})".format(nmissing,
                                                    100 * nmissing / tsz))
        print("Generalized function type:\t{}".format(type))
        print("Objective function:\t\t\t{}".format(
            inspect.getsource(fh).strip()))
        print("Gradient function:\t\t\t{}".format(
            inspect.getsource(gh).strip()))
        print("Lower bound of factors:\t\t{}".format(lb))
        print("Optimization method:\t\t{}".format(opt))
        if use_stoc:
            print("Max iterations (epochs): {}".format(maxiters))
            print("Iterations per epoch: {}".format(epciters))
            print("Learning rate / decay / maxfails: {} {} {}".format(
                rate, decay, maxfails))
            print("Function Sampler: {}".format(fsampler_str))
            print("Gradient Sampler: {}".format(gsampler_str))
        else:
            print("Max iterations:\t\t\t\t{}".format(maxiters))
            print("Projected gradient tol:\t\t{}\n".format(pgtol))

    # Make like a zombie and start decomposing
    Mfin = None
    # L-BFGS-B optimization
    if opt == 'lbfgsb':
        # Timer - Setup (inside optimization)
        start_setup1 = time.perf_counter()

        # set up bound constraints for l-bfgs-b when the loss requires lb = 0
        bounds = None
        if lb == 0:
            # nonnegativity on every entry of the vectorized factors
            bounds = [(0.0, None)] * vecsz
        fcn = lambda x: tl_gcp_fg(vec2factors(x, sz, R, X_context), X, fh, gh)
        m = factors2vec(M[1])

        # capture params for l-bfgs-b
        lbfgsb_params = {}
        lbfgsb_params['x0'] = factors2vec(M0.factors)
        lbfgsb_params['printEvery'] = printitn
        lbfgsb_params['maxIts'] = maxiters
        lbfgsb_params['maxTotalIts'] = maxiters * 10
        lbfgsb_params['factr'] = factr
        lbfgsb_params['pgtol'] = pgtol

        time_setup1 = time.perf_counter() - start_setup1

        if printitn > 0:
            print("Begin main loop")

        # Timer - Main operation
        start_main = time.perf_counter()
        x, f, info_dict = fmin_l_bfgs_b(fcn, m, approx_grad=False, bounds=bounds, \
                                        pgtol=pgtol, factr=factr, maxiter=maxiters)
        time_main = time.perf_counter() - start_main

        # capture info
        info['fcn'] = fcn
        info['lbfgsbopts'] = lbfgsb_params
        info['lbfgsbout'] = info_dict
        info['finalf'] = f

        if printitn > 0:
            print("\nFinal objective: {}".format(f))
            print("Setup time: {}".format(time_setup0 + time_setup1))
            print("Main loop time: {}".format(time_main))
            print("Outer iterations:"
                  )  # TODO: access this value (see manpage for fmin_l_bfgs_b)
            print("Total iterations: {}".format(info_dict['nit']))
            print("L-BFGS-B exit message: {} ({})".format(
                info_dict['task'], info_dict['warnflag']))
        Mfin = vec2factors(x, sz, R, X_context)

    # Stochastic optimization
    else:
        # Timer - Setup (inside optimization)
        start_setup1 = time.perf_counter()
        if opt == "adam" or opt == "adagrad":
            print("{} not currently supported".format(opt))
            sys.exit(1)
        # prepare for sgd
        # initialize moments
        m = []
        v = []

        # Extract samples for estimating function value (i.e. call fsampler), these never change
        fsubs, fvals, fwgts = fsampler()

        # Compute initial estimated function value
        fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False,
                             False, False)

        # Set up loop variables
        nfails = 0
        titers = 0

        M_weights = tl.copy(M[0])
        M_factors = []
        for k in range(nd):
            M_factors.append(tl.copy(M[1][k]))
        Msave = CPTensor(
            (M_weights, M_factors))  # save a copy of the initial model
        msave = m
        vsave = v
        fest_prev = fest[0]

        # Tracing the progress in function value by epoch
        fest_trace = tl.zeros(maxiters + 1)
        step_trace = tl.zeros(maxiters + 1)
        time_trace = tl.zeros(maxiters + 1)
        fest_trace[0] = fest[0]

        # Print status
        if printitn > 0:
            print("Begin main loop")
            print("Initial f-est: {}".format(fest[0]))

        time_setup1 = time.perf_counter() - start_setup1
        start_main = time.perf_counter()
        time_trace[0] = time.perf_counter() - start_setup0

        # Main loop - outer iteration
        for nepoch in range(maxiters):
            step = (decay**nfails) * rate
            # Main loop - inner iteration
            for iter in range(epciters):
                # Tracking iterations
                titers = titers + 1

                # Select subset for stochastic gradient (i.e. call gsampler)
                gsubs, gvals, gwts = gsampler()

                # Compute gradients for each mode
                Gest = tl_gcp_fg_est(M, fh, gh, gsubs, gvals, gwts, False,
                                     True, False, False)

                # Check for inf gradient
                for g in Gest[0]:
                    g_max = tl.max(g)
                    g_min = tl.min(g)
                    if math.isinf(g_max) or math.isinf(g_min):
                        print(
                            "Infinite gradient encountered! (epoch = {}, iter = {})"
                            .format(nepoch, iter))

                # TODO : add functionality for ADAM and ADAGRAD optimization
                # Take gradient step
                for k in range(nd):
                    M.factors[k] = M.factors[k] - step * Gest[0][k]

            # Estimate objective (i.e. call tl_gcp_fg_est)
            fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False,
                                 False, False)

            # Save trace (fest & step)
            fest_trace[nepoch + 1] = fest[0]
            step_trace[nepoch + 1] = step

            # Check convergence condition
            failed_epoch = False
            if fest[0] > fest_prev:
                failed_epoch = True
            if failed_epoch:
                nfails += 1
            festtol_test = False
            if fest[0] < festtol:
                festtol_test = True

            # Reporting
            if printitn > 0 and (nepoch % printitn == 0 or failed_epoch
                                 or festtol_test):
                print("Epoch {}: f-est = {}, step = {}".format(
                    nepoch, fest[0], step),
                      end='')
                if failed_epoch:
                    print(
                        ", nfails = {} (resetting to solution from last epoch)"
                        .format(nfails))
                print("")

            # Rectify failed epoch or save current solution
            if failed_epoch:
                M = Msave
                m = msave
                v = vsave
                fest[0] = fest_prev
                titers = titers - epciters
            else:
                Msave = CPTensor((tl.copy(M.weights), tl.copy(M.factors)))
                msave = m
                vsave = v
                fest_prev = fest[0]

            time_trace[nepoch + 1] = time.perf_counter() - start_setup0

            if (nfails > maxfails) or festtol_test:
                break
        Mfin = M
        time_main = time.perf_counter() - start_main

        # capture info
        info['fest_trace'] = fest_trace
        info['step_trace'] = step_trace
        info['time_trace'] = time_trace
        info['nepoch'] = nepoch

        # Report end of main loop
        if printitn > 0:
            print("End Main Loop")
            print("")
            print("Final f-est: {}".format(fest[0]))
            print("Setup time: {0:0.6f}".format(time_setup0 + time_setup1))
            print("Main loop time: {0:0.6f}".format(time_main))
            print("Total iterations: {}".format(nepoch * epciters))
    # Wrap up / capture remaining info
    info['mainTime'] = time_main
    info['setupTime0'] = time_setup0
    info['setupTime1'] = time_setup1
    info['setupTime'] = time_setup0 + time_setup1

    return Mfin
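A hedged end-to-end sketch of calling this wrapper (shapes, rank and options are arbitrary; it assumes the NumPy backend and that the helpers used above, such as `vec2factors`, return a CPTensor, as their use here suggests):

import numpy as np
import tensorly as tl

# rank-3 GCP of a random nonnegative tensor with the default Gaussian objective,
# fitted with bound-constrained L-BFGS-B
X = tl.tensor(np.random.rand(10, 12, 14))
M = gcp(X, 3, type='normal', opt='lbfgsb', maxiters=200, printitn=10, state=42)
print([tl.shape(f) for f in M.factors])   # expected: [(10, 3), (12, 3), (14, 3)]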