Example #1
    def log(self, x):
        self.tmp1.assign(x)

        self.tmp1.mult(-1)
        cm.exp(self.tmp1, target=self.tmp1)
        self.tmp1.add(1)
        cm.pow(self.tmp1, -1)
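The sequence above computes the logistic sigmoid 1/(1 + exp(-x)) in place on the GPU buffer self.tmp1, with cm.pow(..., -1) acting as an elementwise reciprocal. A minimal CPU sketch of the same steps, assuming the cudamat calls behave like their NumPy counterparts (x here is just a sample input):

import numpy as np

x = np.linspace(-3.0, 3.0, 7)          # sample input, for illustration only
tmp = x.copy()                         # self.tmp1.assign(x)
tmp *= -1                              # self.tmp1.mult(-1)
tmp = np.exp(tmp)                      # cm.exp(self.tmp1, target=self.tmp1)
tmp += 1                               # self.tmp1.add(1)
tmp **= -1                             # cm.pow(self.tmp1, -1)
assert np.allclose(tmp, 1.0 / (1.0 + np.exp(-x)))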
Example #2
    def costAndGrad(self,data,labels):
        
        batchSize = data.shape[1]
        self.setViews(batchSize)
        
        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in self.stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            if i <= len(self.layerSizes):
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)
        self.deltasC.assign(cm.CUDAMatrix(deltas))

        if skip:
            return cost,self.grad,skip

        # back prop
        nl = len(self.layerSizes)
        i = nl 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(self.stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=self.grad[i][0])
            deltasIn.sum(axis=1,target=self.grad[i][1])

            # compute next layer deltas
            if i > 0:
                self.hActs[i].sign(target=self.tmpGrad)
                cm.dot(w.T,deltasIn,target=deltasOut)
                deltasOut.mult(self.tmpGrad)

            if i == nl:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
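The "Subtract max activation" and "Softmax" steps above form a numerically stable, column-wise softmax: the per-column maximum is subtracted before exponentiating, and cm.pow(self.rowVec, -1.0) turns the column sums into reciprocals so the normalization reduces to a single mult_by_row. A rough NumPy equivalent of just those lines (acts is a hypothetical stand-in for self.hActs[-1]):

import numpy as np

acts = np.random.randn(5, 3)                          # one column per frame
probs = acts - acts.max(axis=0, keepdims=True)        # add_row_mult(self.rowVec, -1.0)
probs = np.exp(probs)                                 # cm.exp(self.probs)
row_vec = probs.sum(axis=0, keepdims=True) ** -1.0    # cm.pow(self.rowVec, -1.0)
probs = probs * row_vec                               # self.probs.mult_by_row(self.rowVec)
assert np.allclose(probs.sum(axis=0), 1.0)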
Example #3
File: da.py Project: HelenLiGit/POT
def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False):
    """
    Compute the pairwise Euclidean distance between matrices a and b.


    Parameters
    ----------
    a : np.ndarray (n, f)
        first matrix
    b : np.ndarray (m, f)
        second matrix
    returnAsGPU : boolean, optional (default False)
        if True, return the cudamat matrix still on the GPU, else return an np.ndarray
    squared : boolean, optional (default False)
        if True, return the squared Euclidean distance matrix


    Returns
    -------
    c : (n x m) np.ndarray or cudamat.CUDAMatrix
        pairwise Euclidean distance matrix
    """
    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
    # First compute the squared Euclidean distance in c_GPU, then return its
    # square root. At each cell [i,j] of c, we want to have
    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that
    # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c:
    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2).

    a_GPU = cudamat.CUDAMatrix(a)
    b_GPU = cudamat.CUDAMatrix(b)

    # Multiply a by b transpose to obtain in each cell [i,j] of c the
    # value sum{k in range(f)} ( a[i,k]b[j,k] )
    c_GPU = cudamat.dot(a_GPU, b_GPU.transpose())
    # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] )
    c_GPU.mult(-2)

    # Compute the vectors of the sum of squared elements.
    a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1)
    b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1)

    # Add the vectors to each column (respectively, each row) of c.
    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] )
    c_GPU.add_col_vec(a_GPU)
    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2)
    c_GPU.add_row_vec(b_GPU.transpose())

    if not squared:
        c_GPU = cudamat.sqrt(c_GPU)

    if returnAsGPU:
        return c_GPU
    else:
        return c_GPU.asarray()
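The comment block above relies on the expansion ||a_i - b_j||^2 = ||a_i||^2 - 2 a_i.b_j + ||b_j||^2, which lets the whole distance matrix be assembled from one matrix product plus a column-vector and a row-vector addition. A small NumPy check of that identity (shapes chosen arbitrarily):

import numpy as np

n, m, f = 4, 3, 5
a = np.random.rand(n, f)
b = np.random.rand(m, f)

c = -2 * a.dot(b.T)                     # -2 a_i . b_j
c += (a ** 2).sum(axis=1)[:, None]      # + ||a_i||^2, added to every column
c += (b ** 2).sum(axis=1)[None, :]      # + ||b_j||^2, added to every row
brute = ((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=2)
assert np.allclose(c, brute)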
Example #4
def sinkhorn_lpl1_mm(a,
                     labels_a,
                     b,
                     M_GPU,
                     reg,
                     eta=0.1,
                     numItermax=10,
                     numInnerItermax=200,
                     stopInnerThr=1e-9,
                     verbose=False,
                     log=False):
    p = 0.5
    epsilon = 1e-3
    Nfin = len(b)

    indices_labels = []
    classes = np.unique(labels_a)
    for c in classes:
        idxc, = np.where(labels_a == c)
        indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1)))

    Mreg_GPU = cudamat.empty(M_GPU.shape)
    W_GPU = cudamat.empty(M_GPU.shape).assign(0)

    for cpt in range(numItermax):
        Mreg_GPU.assign(M_GPU)
        Mreg_GPU.add_mult(W_GPU, eta)
        transp_GPU = sinkhorn(a,
                              b,
                              Mreg_GPU,
                              reg,
                              numItermax=numInnerItermax,
                              stopThr=stopInnerThr,
                              returnAsGPU=True)
        # the transport has been computed. Check if classes are really
        # separated
        W_GPU.assign(1)
        W_GPU = W_GPU.transpose()
        for (i, c) in enumerate(classes):
            (_, nbRow) = indices_labels[i].shape
            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
            cudamat.pow(majs_GPU, (p - 1))
            majs_GPU.mult(p)

            tmpC_GPU.assign(0)
            tmpC_GPU.add_col_vec(majs_GPU)
            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)

        W_GPU = W_GPU.transpose()

    return transp_GPU.asarray()
Example #5
def project_words_gpu(projection_matrix, similarity_matrix, kernel_name,
                      hyperparam):
    import cudamat as cm
    if kernel_name == "poly":
        k = cm.pow(cm.CUDAMatrix(similarity_matrix), hyperparam)
    elif kernel_name == 'rbf':
        k = cm.exp((cm.pow(cm.CUDAMatrix(1 - similarity_matrix),
                           2)).mult(-hyperparam))
    else:
        raise NotImplementedError(f'{kernel_name} not yet implemented for GPU')

    return cm.dot(k, cm.CUDAMatrix(projection_matrix)).asarray()
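For reference, the two branches above are a polynomial kernel s**d and an RBF-style kernel exp(-gamma * (1 - s)**2) applied to the similarity matrix, followed by a single projection. A hedged CPU sketch of the same computation in NumPy (project_words_cpu is an illustrative name, not part of the original code):

import numpy as np

def project_words_cpu(projection_matrix, similarity_matrix, kernel_name, hyperparam):
    if kernel_name == "poly":
        k = similarity_matrix ** hyperparam
    elif kernel_name == "rbf":
        k = np.exp(-hyperparam * (1.0 - similarity_matrix) ** 2)
    else:
        raise NotImplementedError(f"{kernel_name} not yet implemented")
    return k.dot(projection_matrix)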
Example #6
def test_pow():
    m = 256
    n = 128
    a = np.array(np.random.randn(m, n)*20, dtype=np.float32, order='F')
    b = np.array(np.random.rand(m, n), dtype=np.float32, order='F')
    p = 2

    c = a**p

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)
    cm.pow(m1, p, target = m2)
    cm.pow(m1, p)

    m1.copy_to_host()
    m2.copy_to_host()

    assert np.max(np.abs(c - m1.numpy_array)) < 10**-3, "Error in cudamat.pow exceeded threshold"
    assert np.max(np.abs(c - m2.numpy_array)) < 10**-3, "Error in cudamat.pow exceeded threshold"
Example #7
    def Train(self,ref):
        #ref is the vector of all desired outputs at the given time instant
        #compute the error vector
        e = self.trainingError(ref)
        max_lambda = 0.9999
        min_lambda = 0.999
        #regularization
        mu = 1e-8
        #holder = cm.CUDAMatrix(self.P.asarray())

        for saida in range(self.n_out):
            #regularization step
            #cm.dot(self.P,self.P,target = holder)
            #holder.mult(mu)
            #self.P.subtract(holder)
            #end regularization step
            self.sigma_e = (1.0 - 1.0/(self.K_a * self.neu)) * self.sigma_e + (1.0 - (1.0 - 1.0/(self.K_a * self.neu))) * e[saida]**2
            self.sigma_q = (cm.pow(cm.dot(cm.dot(self.a.T,self.P),self.a),2).mult((1.0 - (1.0 - 1.0/(self.K_a * self.neu)))).add((1.0 - 1.0/(self.K_a * self.neu)) * float(self.sigma_q))).asarray()
            self.sigma_v = (1.0 - 1.0/(self.K_b * self.neu)) * self.sigma_v + (1.0 - (1.0 - 1.0/(self.K_b * self.neu))) * e[saida]**2
            self.forget_aux = (np.sqrt(self.sigma_q) * np.sqrt(self.sigma_v))/(1e-8 + abs(np.sqrt(self.sigma_e) - np.sqrt(self.sigma_v)))
            self.forget = np.atleast_2d(np.min([self.forget_aux,max_lambda]))
            #Transpose the respective output view.
            Theta = self.Wro.asarray()[saida,:]
            Theta = Theta.reshape([self.neu,1])
            Theta = cm.CUDAMatrix(Theta)

            #MQR (recursive least squares) equations
            #the P equation step by step
            A = cm.dot(self.P,self.a)
            B = cm.dot(A,self.a.T)
            C = cm.dot(B,self.P)
            D = cm.dot(cm.dot(self.a.T,self.P),self.a).add(np.asscalar(self.forget))

            self.P.subtract(C.divide(np.asscalar(D.asarray())))
            self.P.divide(np.asscalar(self.forget))
            #final update


            #error calculation
            Theta.subtract(cm.dot(self.P,self.a).mult(np.asscalar(e[saida])))

            Theta = Theta.reshape([1,self.neu])


            self.Wro.copy_to_host()
            self.Wro.numpy_array[saida,:] = Theta.asarray()
            self.Wro.copy_to_device()
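The "MQR equations" block looks like a recursive least squares (RLS) update with a variable forgetting factor: P is shrunk by the usual rank-one correction, rescaled by the forgetting factor, and the output weights are then corrected by the gain times the error. A compact NumPy restatement of just that step, assuming a is a column vector; P, theta, err and lam are illustrative stand-ins for self.P, Theta, e[saida] and self.forget:

import numpy as np

def rls_step(P, theta, a, err, lam):
    Pa = P.dot(a)                                   # A = P a
    denom = float(a.T.dot(P).dot(a)) + lam          # D = a^T P a + lambda
    P = (P - Pa.dot(a.T).dot(P) / denom) / lam      # P <- (P - P a a^T P / D) / lambda
    theta = theta - P.dot(a) * err                  # same sign convention as above
    return P, theta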
Example #8
    def update(self, lr):
        if self.use_momentum:
            self.weights_update.mult(self.momentum)
            self.weights_update.subtract_mult(self.weights_grad, lr)
            self.weights.add(self.weights_update)

            if self.use_bias:
                self.biases_update.mult(self.momentum)
                self.biases_update.subtract_mult(self.biases_grad, lr)
                self.biases.add(self.biases_update)
        elif self.use_rmsprop:
            self.weights_rmsprop_cache.mult(self.rmsprop_dr)
            cm.pow(self.weights_grad, self.weights_grad_square)
            self.weights_grad_square.mult(1.0 - self.rmsprop_dr)
            self.weights_rmsprop_cache.add(self.weights_grad_square)
            self.weights_rmsprop_cache.add(1e-8)
            cm.sqrt(self.weights_rmsprop_cache)
            self.weights_grad.mult(lr).divide(self.weights_rmsprop_cache)
            self.weights.subtract(self.weights_grad)

            self.biases_rmsprop_cache.mult(self.rmsprop_dr)
            cm.pow(self.biases_grad, self.biases_grad_square)
            self.biases_grad_square.mult(1.0 - self.rmsprop_dr)
            self.biases_rmsprop_cache.add(self.biases_grad_square)
            self.biases_rmsprop_cache.add(1e-8)
            cm.sqrt(self.biases_rmsprop_cache)
            self.biases_grad.mult(lr).divide(self.biases_rmsprop_cache)
            self.biases.subtract(self.biases_grad)
        else:
            self.weights.subtract_mult(self.weights_grad, lr)
            if self.use_bias:
                self.biases.subtract_mult(self.biases_grad, lr)

        # Max-norm regularization.
        if self.use_max_norm:
            cm.pow(self.weights, 2, self.weights_square)
            self.weights_square.sum(0, self.weights_factor)
            cm.sqrt(self.weights_factor, self.weights_factor)

            # Avoid zero weight mags.
            self.weights_factor.add(1e-8)
            self.weights_factor.reciprocal().mult(self.max_norm_c)

            # Keep only scaling factors less than 1.0.
            self.weights_factor.less_than(1.0, self.weights_factor_mask)
            self.weights_factor.mult(self.weights_factor_mask)

            # Change 0.0 entry to 1.0.
            self.weights_factor_mask.less_than(1.0)
            self.weights_factor.add(self.weights_factor_mask)

            # Down scale over sized weights.
            self.weights.mult_by_row(self.weights_factor)
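The RMSProp branch above maintains a decayed running average of squared gradients and divides the learning-rate-scaled gradient by its epsilon-stabilized square root. The textbook rule this block appears to follow, written with plain NumPy arrays (all names are illustrative; note that the cudamat version folds epsilon and the square root into the cache buffer itself):

import numpy as np

def rmsprop_step(w, grad, cache, lr, decay=0.9, eps=1e-8):
    cache = decay * cache + (1.0 - decay) * grad ** 2   # running average of squared gradients
    w = w - lr * grad / np.sqrt(cache + eps)            # scale the step by the RMS of past gradients
    return w, cache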
Example #9
File: da.py Project: HelenLiGit/POT
def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
                     numInnerItermax=200, stopInnerThr=1e-9,
                     verbose=False, log=False):
    """
    Solve the entropic regularization optimal transport problem with nonconvex group lasso regularization

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1= b

             \gamma\geq 0
    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`   where  :math:`\mathcal{I}_c` are the indices of samples from class c in the source domain.
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the generalised conditional gradient as proposed in  [5]_ [7]_


    Parameters
    ----------
    a : np.ndarray (ns,)
        sample weights in the source domain
    labels_a : np.ndarray (ns,)
        labels of samples in the source domain
    b : np.ndarray (nt,)
        sample weights in the target domain
    M_GPU : cudamat.CUDAMatrix (ns,nt)
        loss matrix
    reg : float
        Regularization term for entropic regularization >0
    eta : float, optional
        Regularization term  for group lasso regularization >0
    numItermax : int, optional
        Max number of iterations
    numInnerItermax : int, optional
        Max number of iterations (inner sinkhorn solver)
    stopInnerThr : float, optional
        Stop threshold on error (inner sinkhorn solver) (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True


    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
    log : dict
        log dictionary, returned only if log==True in parameters


    References
    ----------

    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
    .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.

    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.bregman.sinkhorn : Entropic regularized OT
    ot.optim.cg : General regularized OT

    """
    p = 0.5
    epsilon = 1e-3
    Nfin = len(b)

    indices_labels = []
    classes = np.unique(labels_a)
    for c in classes:
        idxc, = np.where(labels_a == c)
        indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1)))

    Mreg_GPU = cudamat.empty(M_GPU.shape)
    W_GPU = cudamat.empty(M_GPU.shape).assign(0)

    for cpt in range(numItermax):
        Mreg_GPU.assign(M_GPU)
        Mreg_GPU.add_mult(W_GPU, eta)
        transp_GPU = sinkhorn(a, b, Mreg_GPU, reg, numItermax=numInnerItermax,
                              stopThr=stopInnerThr, returnAsGPU=True)
        # the transport has been computed. Check if classes are really
        # separated
        W_GPU.assign(1)
        W_GPU = W_GPU.transpose()
        for (i, c) in enumerate(classes):
            (_, nbRow) = indices_labels[i].shape
            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
            cudamat.pow(majs_GPU, (p - 1))
            majs_GPU.mult(p)

            tmpC_GPU.assign(0)
            tmpC_GPU.add_col_vec(majs_GPU)
            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)

        W_GPU = W_GPU.transpose()

    return transp_GPU.asarray()
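Inside the outer loop, W_GPU ends up holding what is effectively the (sub)gradient of the nonconvex group-lasso term: for every source sample of class c, each entry of its row becomes p * (L1 mass of class c on that target column + epsilon)**(p - 1). A rough NumPy restatement of that per-class loop (transp and labels_a stand in for the GPU buffers above):

import numpy as np

def group_lasso_weights(transp, labels_a, p=0.5, epsilon=1e-3):
    W = np.zeros_like(transp)
    for c in np.unique(labels_a):
        idx = np.where(labels_a == c)[0]
        majs = transp[idx, :].sum(axis=0) + epsilon     # class-c mass per target column
        W[idx, :] = p * majs[None, :] ** (p - 1)
    return W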
Example #10
    def mult_with_derivative(self, target, activated_z):
        cm.pow(activated_z, 2, activated_z)
        activated_z.mult(-1).add(1)
        target.mult(activated_z)
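This helper multiplies an incoming gradient by the derivative of tanh evaluated from the already-activated values, using the identity d/dz tanh(z) = 1 - tanh(z)^2 (so it assumes activated_z holds tanh outputs). A quick NumPy check of that identity, mirroring the in-place sequence above:

import numpy as np

z = np.linspace(-2.0, 2.0, 9)
activated = np.tanh(z)
deriv = -(activated ** 2) + 1                 # pow(.., 2), then mult(-1).add(1)
assert np.allclose(deriv, 1.0 / np.cosh(z) ** 2)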
Example #11
def sinkhorn_lpl1_mm(a,
                     labels_a,
                     b,
                     M_GPU,
                     reg,
                     eta=0.1,
                     numItermax=10,
                     numInnerItermax=200,
                     stopInnerThr=1e-9,
                     verbose=False,
                     log=False):
    """
    Solve the entropic regularization optimal transport problem with nonconvex group lasso regularization

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1= b

             \gamma\geq 0
    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`   where  :math:`\mathcal{I}_c` are the indices of samples from class c in the source domain.
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the generalised conditional gradient as proposed in  [5]_ [7]_


    Parameters
    ----------
    a : np.ndarray (ns,)
        sample weights in the source domain
    labels_a : np.ndarray (ns,)
        labels of samples in the source domain
    b : np.ndarray (nt,)
        sample weights in the target domain
    M_GPU : cudamat.CUDAMatrix (ns,nt)
        loss matrix
    reg : float
        Regularization term for entropic regularization >0
    eta : float, optional
        Regularization term  for group lasso regularization >0
    numItermax : int, optional
        Max number of iterations
    numInnerItermax : int, optional
        Max number of iterations (inner sinkhorn solver)
    stopInnerThr : float, optional
        Stop threshold on error (inner sinkhorn solver) (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True


    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
    log : dict
        log dictionary, returned only if log==True in parameters


    References
    ----------

    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
    .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.

    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.bregman.sinkhorn : Entropic regularized OT
    ot.optim.cg : General regularized OT

    """
    p = 0.5
    epsilon = 1e-3
    Nfin = len(b)

    indices_labels = []
    classes = np.unique(labels_a)
    for c in classes:
        idxc, = np.where(labels_a == c)
        indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1)))

    Mreg_GPU = cudamat.empty(M_GPU.shape)
    W_GPU = cudamat.empty(M_GPU.shape).assign(0)

    for cpt in range(numItermax):
        Mreg_GPU.assign(M_GPU)
        Mreg_GPU.add_mult(W_GPU, eta)
        transp_GPU = sinkhorn(a,
                              b,
                              Mreg_GPU,
                              reg,
                              numItermax=numInnerItermax,
                              stopThr=stopInnerThr,
                              returnAsGPU=True)
        # the transport has been computed. Check if classes are really
        # separated
        W_GPU.assign(1)
        W_GPU = W_GPU.transpose()
        for (i, c) in enumerate(classes):
            (_, nbRow) = indices_labels[i].shape
            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
            cudamat.pow(majs_GPU, (p - 1))
            majs_GPU.mult(p)

            tmpC_GPU.assign(0)
            tmpC_GPU.add_col_vec(majs_GPU)
            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)

        W_GPU = W_GPU.transpose()

    return transp_GPU.asarray()
Example #12
    def costAndGrad(self, data, labels=None, sentence=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf, _ = self.stack[-2]
            wtb, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf, _ = self.grad[-2]
                dwtb, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop #TODO copy to device here
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0, self.maxAct, col=0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
                for t in xrange(1, T):
                    cm.mvdot_col_slice(wtf,
                                       self.hActsFor,
                                       t - 1,
                                       self.hActsFor,
                                       t,
                                       beta=1.0)
                    self.hActsFor.minmax(0.0, self.maxAct, col=t)
                    cm.mvdot_col_slice(wtb,
                                       self.hActsBack,
                                       T - t,
                                       self.hActsBack,
                                       T - t - 1,
                                       beta=1.0)
                    self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
                self.hActsFor.add(self.hActsBack, target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm()**2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
                self.hActsBack.within(0.0,
                                      self.maxAct,
                                      target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
                self.deltasBack.mult_slice(0, self.tmpGradBack, 0)

                for t in xrange(1, T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,
                                       self.deltasFor,
                                       T - t,
                                       self.deltasFor,
                                       T - t - 1,
                                       beta=1.0)
                    cm.mvdot_col_slice(wtb.T,
                                       self.deltasBack,
                                       t - 1,
                                       self.deltasBack,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor,
                                              T - t - 1)
                    self.deltasBack.mult_slice(t, self.tmpGradBack, t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1, T),
                       self.hActsFor.get_col_slice(0, T - 1).T,
                       target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                       self.hActsBack.get_col_slice(1, T).T,
                       target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack, target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost, self.grad, skip
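Judging from how it is used here, cm.mvdot_col_slice(wtf, h, t - 1, h, t, beta=1.0) adds wtf times column t-1 into column t (beta=1.0 keeps the existing feed-forward activation), and minmax clips the result to [0, maxAct]; running this once forward and once backward and summing the two gives the bidirectional recurrent activations. A NumPy sketch of just the forward recurrence (acts, wtf and max_act stand in for self.hActs[i], wtf and self.maxAct):

import numpy as np

def forward_recurrence(acts, wtf, max_act):
    h = acts.copy()                             # self.hActsFor.assign(self.hActs[i])
    T = h.shape[1]                              # one column per time step
    h[:, 0] = np.clip(h[:, 0], 0.0, max_act)    # minmax(0.0, maxAct, col=0)
    for t in range(1, T):
        h[:, t] += wtf.dot(h[:, t - 1])         # mvdot_col_slice(wtf, ..., beta=1.0)
        h[:, t] = np.clip(h[:, t], 0.0, max_act)
    return h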
Example #13
    def costAndGrad(self,data,labels=None, sentence=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf,_ = self.stack[-2]
            wtb,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf,_ = self.grad[-2]
                dwtb,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop #TODO copy to device here 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0,self.maxAct,col=0)
                self.hActsBack.minmax(0.0,self.maxAct,col=T-1)
                for t in xrange(1,T):
                    cm.mvdot_col_slice(wtf,self.hActsFor,t-1,self.hActsFor,t,beta=1.0)
                    self.hActsFor.minmax(0.0,self.maxAct,col=t)
                    cm.mvdot_col_slice(wtb,self.hActsBack,T-t,self.hActsBack,T-t-1,beta=1.0)
                    self.hActsBack.minmax(0.0,self.maxAct,col=T-t-1)
                self.hActsFor.add(self.hActsBack,target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0,self.maxAct,target=self.tmpGradFor)
                self.hActsBack.within(0.0,self.maxAct,target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T-1,self.tmpGradFor,T-1)
                self.deltasBack.mult_slice(0,self.tmpGradBack,0)

                for t in xrange(1,T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,self.deltasFor,T-t,
                                       self.deltasFor,T-t-1,beta=1.0)
                    cm.mvdot_col_slice(wtb.T,self.deltasBack,t-1,
                                       self.deltasBack,t,beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T-t-1,self.tmpGradFor,T-t-1)
                    self.deltasBack.mult_slice(t,self.tmpGradBack,t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1,T),
                        self.hActsFor.get_col_slice(0,T-1).T,target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0,T-1),
                        self.hActsBack.get_col_slice(1,T).T,target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack,target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost,self.grad,skip
Example #14
    def costAndGrad(self, data, labels=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                    cm.mvdot_col_slice(wt,
                                       self.hActs[i],
                                       t - 1,
                                       self.hActs[i],
                                       t,
                                       beta=1.0)
                self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T - 1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,
                                       self.deltaTemp,
                                       t,
                                       deltasOut,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    self.deltaTemp.set_single_col(t - 1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                cm.mvdot_col_slice(wt.T,
                                   self.deltaTemp,
                                   0,
                                   deltasOut,
                                   0,
                                   beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
Example #15
    def costAndGrad(self,data,labels=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1,T):
                    self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                    cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
                self.hActs[i].minmax(0.0,self.maxAct,col=T-1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T-1,0,-1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t,self.tmpGrad,t) 
                    self.deltaTemp.set_single_col(t-1,deltasOut,t)

 
                # Accumulate temporal gradient
                cm.dot(self.deltaTemp,self.hActs[i].T,
                        target=dwt)

                cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
                deltasOut.mult_slice(0,self.tmpGrad,0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
Example #16
def matrix_factorization_clustering(X_aux, k, l, norm=False, num_iters=100):
    cm.cublas_init()

    m, n = X_aux.shape
    U = cm.CUDAMatrix(np.random.rand(m, k))
    S = cm.CUDAMatrix(np.random.rand(k, l))
    V = cm.CUDAMatrix(np.random.rand(n, l))

    X = cm.CUDAMatrix(X_aux)

    # if norm:
    #     X = Normalizer().fit_transform(X)

    XV = cm.CUDAMatrix(np.random.rand(m, l))
    XVSt = cm.CUDAMatrix(np.random.rand(m, k))
    US = cm.CUDAMatrix(np.random.rand(m, l))
    USVt = cm.CUDAMatrix(np.random.rand(m, n))
    USVtXt = cm.CUDAMatrix(np.random.rand(m, m))
    USVtXtU = cm.CUDAMatrix(np.random.rand(m, k))
    U_aux = cm.CUDAMatrix(np.random.rand(m, k))

    XtUS = cm.CUDAMatrix(np.random.rand(m, l))
    VSt = cm.CUDAMatrix(np.random.rand(n, k))
    VStUt = cm.CUDAMatrix(np.random.rand(n, m))
    UtX = cm.CUDAMatrix(np.random.rand(k, n))
    VStUtXV = cm.CUDAMatrix(np.random.rand(n, l))
    V_aux = cm.CUDAMatrix(np.random.rand(n, l))

    UtXV = cm.CUDAMatrix(np.random.rand(k, l))
    UtUS = cm.CUDAMatrix(np.random.rand(k, l))
    UtUSVt = cm.CUDAMatrix(np.random.rand(k, n))
    UtUSVtV = cm.CUDAMatrix(np.random.rand(k, l))
    S_aux = cm.CUDAMatrix(np.random.rand(k, l))

    error_best = np.inf
    error = np.inf

    for i in range(num_iters):
        # compute U
        cm.dot(X, V, target=XV)
        cm.dot(XV, S.T, target=XVSt)

        if i == 0:
            cm.dot(U, S, target=US)
            cm.dot(US, V.T, target=USVt)
        cm.dot(USVt, X.T, target=USVtXt)
        cm.dot(USVtXt, U, target=USVtXtU)

        cm.divide(XVSt, USVtXtU, U_aux)
        cm.mult(U, U_aux, U)

        # compute V
        cm.dot(U, S, target=US)
        cm.dot(X.T, US, target=XtUS)
        cm.dot(V, S.T, target=VSt)
        cm.dot(VSt, U.T, target=VStUt)
        cm.dot(VStUt, XV, target=VStUtXV)

        cm.divide(XtUS, VStUtXV, target=V_aux)
        cm.mult(V, V_aux, V)

        # compute S
        cm.dot(U.T, X, target=UtX)
        cm.dot(UtX, V, target=UtXV)

        cm.dot(U.T, US, target=UtUS)
        cm.dot(UtUS, V.T, UtUSVt)
        cm.dot(UtUSVt, V, target=UtUSVtV)

        cm.divide(UtXV, UtUSVtV, target=S_aux)
        cm.mult(S, S_aux, target=S)

        error_ant = error

        cm.dot(U, S, target=US)
        cm.dot(US, V.T, target=USVt)
        error = cm.sum(cm.pow(cm.subtract(X, USVt), 2), axis=0)

        if error < error_best:
            U_best_cm = U
            S_best_cm = S
            V_best_cm = V
            error_best = error

        if np.abs(error - error_ant) <= 0.000001:
            break

        U_best = U_best_cm.asarray()
        S_best = S_best_cm.asarray()
        V_best = V_best_cm.asarray()

    Du = np.diag(np.ones(m).dot(U_best))
    Dv = np.diag(np.ones(n).dot(V_best))

    U_norm = U_best.dot(np.diag(S_best.dot(Dv).dot(np.ones(l))))
    V_norm = V_best.dot(np.diag(np.ones(k).dot(Du).dot(S_best)))

    rows_ind = np.argmax(U_best, axis=1)
    cols_ind = np.argmax(V_best, axis=1)

    cm.shutdown()

    return U_norm, S_best, V_norm, rows_ind, cols_ind, error_best
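The loop above applies multiplicative updates for a nonnegative tri-factorization X ~ U S V^T: each factor is scaled elementwise by a ratio whose numerator involves X and whose denominator involves the current reconstruction, which keeps the factors nonnegative. A hedged NumPy sketch of the U update under that reading of the code (the small eps guard is an addition, not in the original):

import numpy as np

def update_U(X, U, S, V, eps=1e-12):
    numer = X.dot(V).dot(S.T)                    # XV, then XVSt
    denom = U.dot(S).dot(V.T).dot(X.T).dot(U)    # USVt, USVtXt, USVtXtU
    return U * numer / (denom + eps)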