def columnNorms(mat, tempMat, result):
    assert mat.shape == tempMat.shape
    assert result.shape == (1, mat.shape[1])
    #cm.pow(mat, 2, target = tempMat)
    mat.mult(mat, target=tempMat)
    tempMat.sum(axis=0, target=result)
    cm.sqrt(result)
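# A quick usage sketch for columnNorms (not from the original source): it
# assumes cudamat has been initialized with cm.cublas_init(), and checks the
# column norms against NumPy on the host.
import numpy as np
import cudamat as cm

cm.cublas_init()
a = np.random.rand(64, 32).astype(np.float32)
mat = cm.CUDAMatrix(a)
tempMat = cm.empty(mat.shape)
result = cm.empty((1, mat.shape[1]))
columnNorms(mat, tempMat, result)
assert np.allclose(result.asarray(), np.linalg.norm(a, axis=0)[np.newaxis, :], atol=1e-4)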
def acceleration(self):
    # this sets self.hActProbs and self.normalizedVisMB and self.sqColLens
    self.hidActProbs(vis=self.negVis)
    cm.dot(self.factToHid, self.hActProbs, target=self.tempFactMB)
    self.tempFactMB.mult(-1)
    self.tempFactMB.mult(self.factResponses)
    cm.dot(self.visToFact, self.tempFactMB, target=self.normalizedAccel)

    # rename some things to be like Marc'Aurelio's code:
    normcoeff = self.tempRow2
    lengthsq = self.tempRow

    # these next few lines repeat some work, but it is too confusing to
    # cache all this stuff at the moment
    self.sqColLens.mult(1.0 / self.numVis, target=lengthsq)
    lengthsq.add(small)  # self.tempRow is what Marc'Aurelio calls lengthsq
    cm.sqrt(lengthsq, target=normcoeff)
    normcoeff.mult(lengthsq)  # now self.tempRow2 has what Marc'Aurelio calls normcoeff
    normcoeff.reciprocal()

    self.normalizedAccel.mult(self.negVis, target=self.tempVisMB)
    self.tempVisMB.sum(axis=0, target=self.tempRow3)  # this tempRow stuff is getting absurd
    self.tempRow3.mult(-1.0 / self.numVis)
    self.negVis.mult_by_row(self.tempRow3, target=self.tempVisMB)
    self.normalizedAccel.mult_by_row(lengthsq, target=self.accel)
    self.accel.add(self.tempVisMB)
    self.accel.mult_by_row(normcoeff)

    # quadratic in v term contribution to gradient
    self.accel.add(self.negVis)
    # all parts before this point have a 2 show up because of differentiation
    self.accel.mult(2)

    # vis bias contribution
    self.accel.add_col_mult(self.visBias, -1)
def normalizeInputData(vis, tempVis, sqColLens, normalizer, normalizedVis):
    """
    Our input is vis and our outputs are sqColLens, normalizer, and
    normalizedVis. We clobber tempVis.
    """
    numVis, mbsz = vis.shape
    assert sqColLens.shape == (1, mbsz)
    assert sqColLens.shape == normalizer.shape
    assert tempVis.shape == vis.shape == normalizedVis.shape
    vis.mult(vis, target=tempVis)
    tempVis.sum(axis=0, target=sqColLens)
    sqColLens.mult(1.0 / numVis, target=normalizer)
    normalizer.add(small)
    cm.sqrt(normalizer)
    normalizer.reciprocal()
    vis.mult_by_row(normalizer, target=normalizedVis)
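# A usage sketch for normalizeInputData (not part of the original source). It
# assumes the module-level constant `small` that the function reads, and
# verifies the column normalization v / sqrt(mean(v^2) + small) against NumPy.
import numpy as np
import cudamat as cm

cm.cublas_init()
small = 0.5  # assumed module-level constant

v = np.random.randn(100, 16).astype(np.float32)
vis = cm.CUDAMatrix(v)
tempVis = cm.empty(vis.shape)
sqColLens = cm.empty((1, 16))
normalizer = cm.empty((1, 16))
normalizedVis = cm.empty(vis.shape)
normalizeInputData(vis, tempVis, sqColLens, normalizer, normalizedVis)

expected = v / np.sqrt(np.sum(v * v, axis=0) / v.shape[0] + small)
assert np.allclose(normalizedVis.asarray(), expected, atol=1e-4)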
def test_sqrt():
    m = 256
    n = 128
    a = np.array(np.random.rand(m, n) * 20, dtype=np.float32, order='F')
    b = np.array(np.random.rand(m, n), dtype=np.float32, order='F')

    c = np.sqrt(a)

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)
    cm.sqrt(m1, target=m2)
    cm.sqrt(m1)

    m1.copy_to_host()
    m2.copy_to_host()

    assert np.max(np.abs(c - m1.numpy_array)) < 10**-4, "Error in cudamat.sqrt exceeded threshold"
    assert np.max(np.abs(c - m2.numpy_array)) < 10**-4, "Error in cudamat.sqrt exceeded threshold"
def compute_energy_mcRBM_visual(self, data, normdata, energy, VF, FH, bias_cov,
                                bias_vis, w_mean, bias_mean, t1, t2, t6, feat,
                                featsq, feat_mean, length, lengthsq, normcoeff,
                                small, num_vis):
    # normalize input data vectors
    data.mult(data, target=t6)         # DxP (nr input dims x nr samples)
    t6.sum(axis=0, target=lengthsq)    # 1xP
    lengthsq.mult(0.5, target=energy)  # energy of quadratic regularization term
    lengthsq.mult(1. / num_vis)        # normalize by number of components (like std)
    lengthsq.add(small)                # small prevents division by 0
    cmt.sqrt(lengthsq, target=length)
    length.reciprocal(target=normcoeff)           # 1xP
    data.mult_by_row(normcoeff, target=normdata)  # normalized data

    ## potential
    # covariance contribution
    cmt.dot(VF.T, normdata, target=feat)  # HxP (nr factors x nr samples)
    feat.mult(feat, target=featsq)        # HxP
    cmt.dot(FH.T, featsq, target=t1)      # OxP (nr cov hiddens x nr samples)
    t1.mult(-0.5)
    t1.add_col_vec(bias_cov)  # OxP
    cmt.exp(t1)               # OxP
    t1.add(1, target=t2)      # OxP
    cmt.log(t2)
    t2.mult(-1)
    energy.add_sums(t2, axis=0)

    # mean contribution
    cmt.dot(w_mean.T, data, target=feat_mean)  # HxP (nr mean hiddens x nr samples)
    feat_mean.add_col_vec(bias_mean)           # HxP
    cmt.exp(feat_mean)
    feat_mean.add(1)
    cmt.log(feat_mean)
    feat_mean.mult(-1)
    energy.add_sums(feat_mean, axis=0)

    # visible bias term
    data.mult_by_col(bias_vis, target=t6)
    t6.mult(-1)                  # DxP
    energy.add_sums(t6, axis=0)  # 1xP

    # kinetic
    data.mult(data, target=t6)
    energy.add_sums(t6, axis=0, mult=.5)
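# A plain-NumPy sketch of the same energy (an illustration, not part of the
# original source), useful for spot-checking a few columns of the GPU result.
# All arguments are host arrays with the shapes noted above (bias_cov is Ox1,
# bias_vis is Dx1, bias_mean is Hx1); the return value has shape (P,).
def mcrbm_energy_numpy(data, VF, FH, bias_cov, bias_vis, w_mean, bias_mean, small, num_vis):
    lengthsq = (data ** 2).sum(axis=0)
    energy = 0.5 * lengthsq                                # quadratic regularization term
    normdata = data / np.sqrt(lengthsq / num_vis + small)  # same normalization as above
    t1 = -0.5 * np.dot(FH.T, np.dot(VF.T, normdata) ** 2) + bias_cov
    energy -= np.log(1 + np.exp(t1)).sum(axis=0)           # covariance hiddens
    fm = np.dot(w_mean.T, data) + bias_mean
    energy -= np.log(1 + np.exp(fm)).sum(axis=0)           # mean hiddens
    energy -= (data * bias_vis).sum(axis=0)                # visible bias term
    energy += 0.5 * (data ** 2).sum(axis=0)                # kinetic term
    return energy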
def run(self, iterations):
    for i in range(0, iterations):
        # F = XG(G.T G)^-1
        cm.dot(self.G_gpu.T, self.G_gpu, target=self.GTG_gpu)
        try:
            self.GTGpinv_gpu = cm.CUDAMatrix(np.linalg.inv(self.GTG_gpu.asarray()))
        except LinAlgError:
            self.GTGpinv_gpu = cm.CUDAMatrix(np.linalg.pinv(self.GTG_gpu.asarray()))
        cm.dot(self.X_gpu, self.G_gpu, target=self.XG_gpu)
        cm.dot(self.XG_gpu, self.GTGpinv_gpu, target=self.F_gpu)

        # preparation and calculation of the matrix separations
        cm.dot(self.X_gpu.T, self.F_gpu, target=self.XTF_gpu)
        cm.dot(self.F_gpu.T, self.F_gpu, target=self.FTF_gpu)
        self.XTF_gpu.greater_than(0, target=self.XTFgreater_gpu)
        self.XTF_gpu.mult(self.XTFgreater_gpu, target=self.XTFpos_gpu)
        self.XTFpos_gpu.subtract(self.XTF_gpu, target=self.XTFneg_gpu)
        self.FTF_gpu.greater_than(0, target=self.FTFgreater_gpu)
        self.FTF_gpu.mult(self.FTFgreater_gpu, target=self.FTFpos_gpu)
        self.FTFpos_gpu.subtract(self.FTF_gpu, target=self.FTFneg_gpu)

        # compute the G update
        cm.dot(self.G_gpu, self.FTFpos_gpu, target=self.GFTFpos_gpu)
        cm.dot(self.G_gpu, self.FTFneg_gpu, target=self.GFTFneg_gpu)
        self.XTFpos_gpu.add(self.GFTFneg_gpu)
        self.XTFneg_gpu.add(self.GFTFpos_gpu)
        self.XTFpos_gpu.add_scalar(10**-9)
        self.XTFneg_gpu.add_scalar(10**-9)
        self.XTFpos_gpu.divide(self.XTFneg_gpu)
        cm.sqrt(self.XTFpos_gpu)
        self.G_gpu.mult(self.XTFpos_gpu)

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print "NMF converged after %i iterations" % i
            break
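# For reference, a CPU sketch of the multiplicative G update performed above
# (the semi-NMF rule of Ding, Li & Jordan); not from the original source.
# pos/neg split a matrix into its positive and negative parts, and eps plays
# the role of the 10**-9 guard.
import numpy as np

def seminmf_g_update(X, F, G, eps=1e-9):
    pos = lambda M: (np.abs(M) + M) / 2
    neg = lambda M: (np.abs(M) - M) / 2
    XTF = X.T.dot(F)
    FTF = F.T.dot(F)
    num = pos(XTF) + G.dot(neg(FTF)) + eps
    den = neg(XTF) + G.dot(pos(FTF)) + eps
    return G * np.sqrt(num / den)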
def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False):
    """
    Compute the pairwise euclidean distance between matrices a and b.

    Parameters
    ----------
    a : np.ndarray (n, f)
        first matrix
    b : np.ndarray (m, f)
        second matrix
    returnAsGPU : boolean, optional (default False)
        if True, returns a cudamat matrix still on the GPU, else an np.ndarray
    squared : boolean, optional (default False)
        if True, return the squared euclidean distance matrix

    Returns
    -------
    c : (n x m) np.ndarray or cudamat.CUDAMatrix
        pairwise euclidean distance matrix
    """
    # a has shape (n, f) and b has shape (m, f); return a matrix c of shape
    # (n, m). First compute the squared euclidean distance in c_GPU, then
    # return its square root. In each cell [i, j] of c we want
    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). Since (a-b)^2 = a^2 - 2ab + b^2,
    # each cell of c should hold
    # sum{k in range(f)} ( a[i,k]^2 - 2 a[i,k] b[j,k] + b[j,k]^2 ).
    a_GPU = cudamat.CUDAMatrix(a)
    b_GPU = cudamat.CUDAMatrix(b)

    # Multiply a by b transpose to obtain in each cell [i, j] of c the value
    # sum{k in range(f)} ( a[i,k] b[j,k] )
    c_GPU = cudamat.dot(a_GPU, b_GPU.transpose())
    # multiply by -2 to have sum{k in range(f)} ( -2 a[i,k] b[j,k] )
    c_GPU.mult(-2)

    # Compute the vectors of the sums of squared elements.
    a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1)
    b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1)

    # Add the vectors to each column (respectively each row) of c.
    # sum{k in range(f)} ( a[i,k]^2 - 2 a[i,k] b[j,k] )
    c_GPU.add_col_vec(a_GPU)
    # sum{k in range(f)} ( a[i,k]^2 - 2 a[i,k] b[j,k] + b[j,k]^2 )
    c_GPU.add_row_vec(b_GPU.transpose())

    if not squared:
        c_GPU = cudamat.sqrt(c_GPU)

    if returnAsGPU:
        return c_GPU
    else:
        return c_GPU.asarray()
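# A usage sketch for pairwiseEuclideanGPU (not from the original source),
# checked against scipy's cdist; it assumes cudamat has been initialized.
import numpy as np
import cudamat
from scipy.spatial.distance import cdist

cudamat.cublas_init()
a = np.random.rand(50, 8).astype(np.float32)
b = np.random.rand(30, 8).astype(np.float32)
d_gpu = pairwiseEuclideanGPU(a, b)  # (50, 30) np.ndarray
assert np.allclose(d_gpu, cdist(a, b), atol=1e-3)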
def compute_gradient_mcRBM(self, data, normdata, VF, FH, bias_cov, bias_vis,
                           w_mean, bias_mean, t1, t2, t3, t4, t6, feat, featsq,
                           feat_mean, gradient, normgradient, length, lengthsq,
                           normcoeff, small, num_vis):
    # normalize input data
    data.mult(data, target=t6)       # DxP
    t6.sum(axis=0, target=lengthsq)  # 1xP
    lengthsq.mult(1. / num_vis)      # normalize by number of components (like std)
    lengthsq.add(small)
    cmt.sqrt(lengthsq, target=length)
    length.reciprocal(target=normcoeff)           # 1xP
    data.mult_by_row(normcoeff, target=normdata)  # normalized data

    cmt.dot(VF.T, normdata, target=feat)  # HxP
    feat.mult(feat, target=featsq)        # HxP
    cmt.dot(FH.T, featsq, target=t1)      # OxP
    t1.mult(-.5)
    t1.add_col_vec(bias_cov)              # OxP
    t1.apply_sigmoid(target=t2)           # OxP
    cmt.dot(FH, t2, target=t3)            # HxP
    t3.mult(feat)
    cmt.dot(VF, t3, target=normgradient)  # VxP

    # final bprop through normalization
    length.mult(lengthsq, target=normcoeff)
    normcoeff.reciprocal()                    # 1xP
    normgradient.mult(data, target=gradient)  # VxP
    gradient.sum(axis=0, target=t4)           # 1xP
    t4.mult(-1. / num_vis)
    data.mult_by_row(t4, target=gradient)
    normgradient.mult_by_row(lengthsq, target=t6)
    gradient.add(t6)
    gradient.mult_by_row(normcoeff)

    # add quadratic term gradient
    gradient.add(data)
    # add visible bias term
    gradient.add_col_mult(bias_vis, -1)

    # add MEAN contribution to gradient
    cmt.dot(w_mean.T, data, target=feat_mean)  # HxP
    feat_mean.add_col_vec(bias_mean)           # HxP
    feat_mean.apply_sigmoid()                  # HxP
    gradient.subtract_dot(w_mean, feat_mean)   # VxP
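# The "final bprop through normalization" block above is the chain rule
# through x = v / sqrt(mean(v^2) + small). A CPU sketch of just that step
# (an illustration, not part of the original source): given g = dE/dx, it
# returns dE/dv, matching gradient = (g*s - v*sum(g*v)/num_vis) / s^(3/2)
# with s = mean(v^2) + small, which is what the code calls lengthsq.
import numpy as np

def bprop_through_norm(g, v, small, num_vis):
    s = (v * v).sum(axis=0) / num_vis + small
    inner = (g * v).sum(axis=0) / num_vis
    return (g * s - v * inner) / s ** 1.5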
def run(self, iterations):
    for i in range(0, iterations):
        cm.dot(self.XTXneg_gpu, self.W_gpu, target=self.XTXnegW_gpu)
        cm.dot(self.XTXpos_gpu, self.W_gpu, target=self.XTXposW_gpu)

        # Update G
        cm.dot(self.G_gpu, self.W_gpu.T, target=self.GWT_gpu)
        # G *= np.sqrt((XTXposW + np.dot(GWT, XTXnegW))
        #              / (XTXnegW + np.dot(GWT, XTXposW)))
        cm.dot(self.GWT_gpu, self.XTXnegW_gpu, target=self.update1_gpu)
        cm.dot(self.GWT_gpu, self.XTXposW_gpu, target=self.update2_gpu)
        self.update1_gpu.add(self.XTXposW_gpu)
        self.update2_gpu.add(self.XTXnegW_gpu)
        self.update2_gpu.add_scalar(10**-9)
        self.update1_gpu.divide(self.update2_gpu)
        cm.sqrt(self.update1_gpu)
        self.G_gpu.mult(self.update1_gpu)

        # Update W
        cm.dot(self.G_gpu.T, self.G_gpu, target=self.GTG_gpu)
        # W *= np.sqrt((np.dot(XTXpos, G) + np.dot(XTXnegW, GTG))
        #              / (np.dot(XTXneg, G) + np.dot(XTXposW, GTG)))
        cm.dot(self.XTXpos_gpu, self.G_gpu, target=self.XTXposG_gpu)
        cm.dot(self.XTXneg_gpu, self.G_gpu, target=self.XTXnegG_gpu)
        cm.dot(self.XTXnegW_gpu, self.GTG_gpu, target=self.update1_gpu)
        cm.dot(self.XTXposW_gpu, self.GTG_gpu, target=self.update2_gpu)
        self.update1_gpu.add(self.XTXposG_gpu)
        self.update2_gpu.add(self.XTXnegG_gpu)
        self.update2_gpu.add_scalar(10**-9)
        self.update1_gpu.divide(self.update2_gpu)
        cm.sqrt(self.update1_gpu)
        self.W_gpu.mult(self.update1_gpu)

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print "NMF converged after %i iterations" % i
            break
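# A CPU sketch of one iteration of the convex-NMF updates above (after Ding,
# Li & Jordan); not from the original source. pos/neg split XTX into its
# positive and negative parts, and eps mirrors the 10**-9 denominator guard.
import numpy as np

def convexnmf_step(XTX, G, W, eps=1e-9):
    pos = lambda M: (np.abs(M) + M) / 2
    neg = lambda M: (np.abs(M) - M) / 2
    XTXposW, XTXnegW = pos(XTX).dot(W), neg(XTX).dot(W)
    GWT = G.dot(W.T)
    G = G * np.sqrt((XTXposW + GWT.dot(XTXnegW)) /
                    (XTXnegW + GWT.dot(XTXposW) + eps))
    GTG = G.T.dot(G)
    W = W * np.sqrt((pos(XTX).dot(G) + XTXnegW.dot(GTG)) /
                    (neg(XTX).dot(G) + XTXposW.dot(GTG) + eps))
    return G, W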
def normolize(feat):
    # L2-normalize a 4096-dimensional feature vector on the GPU
    #feat_temp = np.vstack((feat, feat))
    feat = np.reshape(feat, (4096, 1))
    a = cm.CUDAMatrix(feat)
    c = cm.dot(a.T, a)
    c = cm.sqrt(c)
    c = c.asarray()
    feat = feat / c[0]
    '''
    for index, item in enumerate(feat):
        feat[index, :] = item / (c[index][index])
    '''
    return feat
def update(self, lr):
    if self.use_momentum:
        self.weights_update.mult(self.momentum)
        self.weights_update.subtract_mult(self.weights_grad, lr)
        self.weights.add(self.weights_update)
        if self.use_bias:
            self.biases_update.mult(self.momentum)
            self.biases_update.subtract_mult(self.biases_grad, lr)
            self.biases.add(self.biases_update)
    elif self.use_rmsprop:
        # cache = dr * cache + (1 - dr) * grad^2
        # (the exponent 2 was missing in the original cm.pow calls)
        self.weights_rmsprop_cache.mult(self.rmsprop_dr)
        cm.pow(self.weights_grad, 2, self.weights_grad_square)
        self.weights_grad_square.mult(1.0 - self.rmsprop_dr)
        self.weights_rmsprop_cache.add(self.weights_grad_square)
        # w -= lr * grad / sqrt(cache + 1e-8); take the sqrt into the scratch
        # buffer so the running cache itself is not clobbered
        self.weights_rmsprop_cache.add(1e-8)
        cm.sqrt(self.weights_rmsprop_cache, self.weights_grad_square)
        self.weights_grad.mult(lr).divide(self.weights_grad_square)
        self.weights.subtract(self.weights_grad)

        self.biases_rmsprop_cache.mult(self.rmsprop_dr)
        cm.pow(self.biases_grad, 2, self.biases_grad_square)
        self.biases_grad_square.mult(1.0 - self.rmsprop_dr)
        self.biases_rmsprop_cache.add(self.biases_grad_square)
        self.biases_rmsprop_cache.add(1e-8)
        cm.sqrt(self.biases_rmsprop_cache, self.biases_grad_square)
        self.biases_grad.mult(lr).divide(self.biases_grad_square)
        self.biases.subtract(self.biases_grad)
    else:
        self.weights.subtract_mult(self.weights_grad, lr)
        if self.use_bias:
            self.biases.subtract_mult(self.biases_grad, lr)

    # Max-norm regularization.
    if self.use_max_norm:
        cm.pow(self.weights, 2, self.weights_square)
        self.weights_square.sum(0, self.weights_factor)
        cm.sqrt(self.weights_factor, self.weights_factor)
        # Avoid zero weight magnitudes.
        self.weights_factor.add(1e-8)
        self.weights_factor.reciprocal().mult(self.max_norm_c)
        # Keep only factors less than 1.0 (columns whose norm exceeds max_norm_c).
        self.weights_factor.less_than(1.0, self.weights_factor_mask)
        self.weights_factor.mult(self.weights_factor_mask)
        # Change the zeroed entries to 1.0 so in-bound columns are unchanged.
        self.weights_factor_mask.less_than(1.0)
        self.weights_factor.add(self.weights_factor_mask)
        # Scale down the over-sized weight columns.
        self.weights.mult_by_row(self.weights_factor)
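# The RMSProp branch above implements the standard rule
# cache = dr*cache + (1-dr)*grad^2; w -= lr*grad/sqrt(cache + 1e-8).
# A minimal NumPy sketch of the same step (illustration only):
import numpy as np

def rmsprop_step(w, grad, cache, lr, decay_rate):
    cache = decay_rate * cache + (1.0 - decay_rate) * grad ** 2
    w = w - lr * grad / np.sqrt(cache + 1e-8)
    return w, cache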
def optimization(M_u_u, M_u_f, M_t_f, L_u, L_t, S_u_u, S_u_u_D, S_t_t, S_t_t_D,
                 alpha, beta, k, loss, num_step):
    m = M_u_u.shape[0]
    w = M_u_f.shape[1]
    n = M_t_f.shape[0]

    # random samples from a uniform distribution over [0, 1)
    U = np.random.rand(m, k)
    # int8() would reduce the precision of the float numbers, don't do it
    # U = np.int8(U)
    V = np.random.rand(n, k)
    W = np.random.rand(w, k)
    H1 = np.random.rand(k, k)
    H2 = np.random.rand(k, k)
    H3 = np.random.rand(k, k)

    M_u_u = cm.CUDAMatrix(M_u_u)
    #print(M_t_f)
    #print(np.sum(np.sum(M_t_f)))
    M_u_f = cm.CUDAMatrix(M_u_f)
    M_t_f = cm.CUDAMatrix(M_t_f)
    U = cm.CUDAMatrix(U)
    V = cm.CUDAMatrix(V)
    W = cm.CUDAMatrix(W)
    H1 = cm.CUDAMatrix(H1)
    H2 = cm.CUDAMatrix(H2)
    H3 = cm.CUDAMatrix(H3)
    L_u = cm.CUDAMatrix(L_u)
    L_t = cm.CUDAMatrix(L_t)
    S_u_u = cm.CUDAMatrix(S_u_u)
    S_u_u_D = cm.CUDAMatrix(S_u_u_D)
    S_t_t = cm.CUDAMatrix(S_t_t)
    S_t_t_D = cm.CUDAMatrix(S_t_t_D)

    pvalue = 0.00000000000001
    step = 0
    maxU = U.asarray()
    maxPurity = per.dealWith(maxU)

    while step < num_step:
        # M_t_f is ok now (it didn't change along the process)
        # M_t_f_n = M_t_f.asarray()
        # print(np.sum(np.sum(M_t_f_n)))
        # debug prints of the current factors:
        # print(M_u_u.asarray()); print(M_u_f.asarray())
        # print(U.asarray()); print(V.asarray()); print(W.asarray())
        # print(H1.asarray()); print(H2.asarray()); print(H3.asarray())
        t = targetFunction(M_u_u, M_u_f, M_t_f, L_u, L_t, U, V, W, H1, H2, H3,
                           alpha, beta).asarray()
        print('loss: ' + str(t[0][0]))
        if t <= loss:
            break
        # print(S_u_u.asarray())
        # print(cm.dot(S_u_u, U).asarray())
        # print(L_u.asarray())
        # print(manyDot([U.transpose(), L_u, U]).asarray())
        # print(cm.dot(S_u_u_D, U).asarray())

        # update U
        up = (manyDot([M_u_u, U, H1.transpose()])
              .add(manyDot([M_u_f, W, H3.transpose()]))
              .add(cm.dot(S_u_u, U).mult(alpha)))
        psaiU = (manyDot([U.transpose(), M_u_u, U, H1.transpose()])
                 .add(manyDot([U.transpose(), M_u_f, W, H3.transpose()]))
                 .subtract(manyDot([H1, U.transpose(), U, H1.transpose()]))
                 .subtract(manyDot([H3, W.transpose(), W, H3.transpose()]))
                 .subtract(manyDot([U.transpose(), L_u, U]).mult(alpha)))
        down = (manyDot([U, H1, U.transpose(), U, H1.transpose()])
                .add(manyDot([U, H3, W.transpose(), W, H3.transpose()]))
                .add(cm.dot(S_u_u_D, U).mult(alpha))
                .add(cm.dot(U, psaiU)))
        # add a tiny value so the divide never hits an exact zero
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        # both multiply/divide and sqrt are element-wise
        up.divide(down)
        # clamp negative entries to zero on the CPU
        up_cpu = up.asarray()
        up.free_device_memory()
        up_cpu[up_cpu < 0] = 0
        up = cm.CUDAMatrix(up_cpu)
        U.mult(cm.sqrt(up))
        up.free_device_memory()
        psaiU.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()
        #print(M_u_u.asarray())

        # update V
        up = manyDot([M_t_f, W, H2.transpose()]).add(cm.dot(S_t_t, V).mult(beta))
        psaiV = (manyDot([V.transpose(), M_t_f, W, H2.transpose()])
                 .subtract(manyDot([H2, W.transpose(), W, H2.transpose()]))
                 .subtract(manyDot([V.transpose(), L_t, V]).mult(beta)))
        down = (manyDot([V, H2, W.transpose(), W, H2.transpose()])
                .add(cm.dot(S_t_t_D, V).mult(beta))
                .add(cm.dot(V, psaiV)))
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        # print(down.asarray())
        # print(V.asarray())
        up.divide(down)
        up_cpu = up.asarray()
        up.free_device_memory()
        up_cpu[up_cpu < 0] = 0
        up = cm.CUDAMatrix(up_cpu)
        V.mult(cm.sqrt(up))
        #print(V.asarray())
        up.free_device_memory()
        psaiV.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()

        # update W
        up = manyDot([M_t_f.transpose(), V, H2]).add(manyDot([M_u_f.transpose(), U, H3]))
        down = (manyDot([W, H2.transpose(), V.transpose(), V, H2])
                .add(manyDot([W, H3.transpose(), U.transpose(), U, H3])))
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        W.mult(cm.sqrt(up.divide(down)))
        up.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()

        # update H1
        up = manyDot([U.transpose(), M_u_u, U])
        down = manyDot([U.transpose(), U, H1, U.transpose(), U])
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        #print(H1)
        H1.mult(cm.sqrt(up.divide(down)))
        #print(H1)
        up.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()

        # update H2
        up = manyDot([V.transpose(), M_t_f, W])
        down = manyDot([V.transpose(), V, H2, W.transpose(), W])
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        H2.mult(cm.sqrt(up.divide(down)))
        up.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()

        # update H3
        up = manyDot([U.transpose(), M_u_f, W])
        down = manyDot([U.transpose(), U, H3, W.transpose(), W])
        size = down.shape
        plus = cm.CUDAMatrix(np.ones(size) * pvalue)
        down.add(plus)
        H3.mult(cm.sqrt(up.divide(down)))
        up.free_device_memory()
        down.free_device_memory()
        plus.free_device_memory()

        step = step + 1
        print('step: ' + str(step))
        purity = per.dealWith(U.asarray())
        if purity > maxPurity:
            #print('ex')
            maxPurity = purity
            U_c = U.copy()
            maxU = U_c.asarray()
            U_c.free_device_memory()
            # a plain maxU = U.asarray() would keep tracking U on the GPU

    t = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    #print(t)
    np.save('hashtag/U/U-' + t.replace(' ', '-') + '.npy', maxU)
    print('Max Purity during this process: ' + str(maxPurity))
def NMFsemi(X, r, iterations=1000, G=None, niter_test_conv=10, stop_threshold=40):
    n = np.size(X, 0)
    m = np.size(X, 1)

    if G is None:
        G = np.random.random((m, r)).astype(np.float32)
    elif G.strides[1] > G.strides[0]:
        # check whether the data array is contiguous and not the transpose of
        # some other array, which would be hard to process (strides problem)
        G = G.copy()

    # allocate the matrices on the GPU
    G_gpu = cm.CUDAMatrix(G)
    F_gpu = cm.empty((n, r))
    X_gpu = cm.CUDAMatrix(X)
    GTG_gpu = cm.empty((r, r))
    GTGpinv_gpu = cm.empty((r, r))
    XG_gpu = cm.empty((n, r))
    XTF_gpu = cm.empty((m, r))
    FTF_gpu = cm.empty((r, r))
    XTFgreater_gpu = cm.empty((m, r))
    FTFgreater_gpu = cm.empty((r, r))
    XTFpos_gpu = cm.empty((m, r))
    XTFneg_gpu = cm.empty((m, r))
    FTFpos_gpu = cm.empty((r, r))
    FTFneg_gpu = cm.empty((r, r))
    GFTFneg_gpu = cm.empty((m, r))
    GFTFpos_gpu = cm.empty((m, r))

    const = 0
    oldExposures = np.argmax(G, axis=0)

    for i in range(iterations):
        # F = XG(G.T G)^-1
        cm.dot(G_gpu.T, G_gpu, target=GTG_gpu)
        try:
            GTGpinv_gpu = cm.CUDAMatrix(np.linalg.inv(GTG_gpu.asarray()))
        except LinAlgError:
            GTGpinv_gpu = cm.CUDAMatrix(np.linalg.pinv(GTG_gpu.asarray()))
        cm.dot(X_gpu, G_gpu, target=XG_gpu)
        cm.dot(XG_gpu, GTGpinv_gpu, target=F_gpu)

        # preparation and calculation of the matrix separations
        cm.dot(X_gpu.T, F_gpu, target=XTF_gpu)
        cm.dot(F_gpu.T, F_gpu, target=FTF_gpu)
        XTF_gpu.greater_than(0, target=XTFgreater_gpu)
        XTF_gpu.mult(XTFgreater_gpu, target=XTFpos_gpu)
        XTFpos_gpu.subtract(XTF_gpu, target=XTFneg_gpu)
        FTF_gpu.greater_than(0, target=FTFgreater_gpu)
        FTF_gpu.mult(FTFgreater_gpu, target=FTFpos_gpu)
        FTFpos_gpu.subtract(FTF_gpu, target=FTFneg_gpu)

        # compute the G update
        cm.dot(G_gpu, FTFpos_gpu, target=GFTFpos_gpu)
        cm.dot(G_gpu, FTFneg_gpu, target=GFTFneg_gpu)
        XTFpos_gpu.add(GFTFneg_gpu)
        XTFneg_gpu.add(GFTFpos_gpu)
        XTFpos_gpu.divide(XTFneg_gpu)
        cm.sqrt(XTFpos_gpu)
        G_gpu.mult(XTFpos_gpu)

        # test for convergence
        if i % niter_test_conv == 0:
            newExpo = np.argmax(G_gpu.asarray(), axis=0)
            if (oldExposures != newExpo).any():
                oldExposures = newExpo
                const = 0
            else:
                const += 1
                if const == stop_threshold:
                    print "NMF converged after %i iterations" % i
                    break

    return F_gpu.asarray(), G_gpu.asarray().T
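# A usage sketch for NMFsemi (not from the original source); it assumes an
# initialized cudamat context. F has shape (n, r) and G comes back as (r, m),
# so F.dot(G) reconstructs X.
import numpy as np
import cudamat as cm

cm.cublas_init()
X = np.random.rand(200, 50).astype(np.float32)
F, G = NMFsemi(X, 5)
print(np.linalg.norm(X - F.dot(G)))  # reconstruction error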
def NMFconvex(X, r, iterations=1000, G=None, niter_test_conv=10, stop_threshold=40):
    n = np.size(X, 0)
    m = np.size(X, 1)

    if G is None:
        # TODO: implement k-means initialization
        G = np.random.random((m, r)).astype(np.float32)
        W = np.random.random((m, r)).astype(np.float32)
    else:
        G += 0.2
        Wi = np.dot(G, np.linalg.inv(np.dot(G.T, G)))
        Wipos = (np.abs(Wi) + Wi) / 2
        W = Wipos + 0.2 * np.sum(np.abs(Wi)) / Wi.size

    G_gpu = cm.CUDAMatrix(G)
    W_gpu = cm.CUDAMatrix(W)
    X_gpu = cm.CUDAMatrix(X)

    XTX_gpu = cm.dot(X_gpu.T, X_gpu)
    XTXpos_gpu = cm.empty((m, m))
    XTX_gpu.greater_than(0, target=XTXpos_gpu)
    XTXpos_gpu.mult(XTX_gpu)
    XTXneg_gpu = cm.empty((m, m))
    XTXpos_gpu.subtract(XTX_gpu, target=XTXneg_gpu)

    XTXnegW_gpu = cm.empty((m, r))
    XTXposW_gpu = cm.empty((m, r))
    GWT_gpu = cm.empty((m, m))
    update1_gpu = cm.empty((m, r))
    update2_gpu = cm.empty((m, r))
    GTG_gpu = cm.empty((r, r))
    XTXnegG_gpu = cm.empty((m, r))
    XTXposG_gpu = cm.empty((m, r))

    const = 0
    oldExposures = np.argmax(G, axis=1)

    for i in range(0, iterations):
        cm.dot(XTXneg_gpu, W_gpu, target=XTXnegW_gpu)
        cm.dot(XTXpos_gpu, W_gpu, target=XTXposW_gpu)

        # Update G
        cm.dot(G_gpu, W_gpu.T, target=GWT_gpu)
        # G *= np.sqrt((XTXposW + np.dot(GWT, XTXnegW))
        #              / (XTXnegW + np.dot(GWT, XTXposW)))
        cm.dot(GWT_gpu, XTXnegW_gpu, target=update1_gpu)
        cm.dot(GWT_gpu, XTXposW_gpu, target=update2_gpu)
        update1_gpu.add(XTXposW_gpu)
        update2_gpu.add(XTXnegW_gpu)
        update1_gpu.divide(update2_gpu)
        cm.sqrt(update1_gpu)
        G_gpu.mult(update1_gpu)

        # Update W
        cm.dot(G_gpu.T, G_gpu, target=GTG_gpu)
        # W *= np.sqrt((np.dot(XTXpos, G) + np.dot(XTXnegW, GTG))
        #              / (np.dot(XTXneg, G) + np.dot(XTXposW, GTG)))
        cm.dot(XTXpos_gpu, G_gpu, target=XTXposG_gpu)
        cm.dot(XTXneg_gpu, G_gpu, target=XTXnegG_gpu)
        cm.dot(XTXnegW_gpu, GTG_gpu, target=update1_gpu)
        cm.dot(XTXposW_gpu, GTG_gpu, target=update2_gpu)
        update1_gpu.add(XTXposG_gpu)
        update2_gpu.add(XTXnegG_gpu)
        update1_gpu.divide(update2_gpu)
        cm.sqrt(update1_gpu)
        W_gpu.mult(update1_gpu)

        if i % niter_test_conv == 0:
            newExpo = np.argmax(G_gpu.asarray(), axis=1)
            if (oldExposures != newExpo).any():
                oldExposures = newExpo
                const = 0
            else:
                const += 1
                if const == stop_threshold:
                    print "NMF converged after %i iterations" % i
                    break

    return cm.dot(X_gpu, W_gpu).asarray(), G_gpu.asarray().T
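# The same spot-check for NMFconvex (not from the original source): the first
# return value is XW, already the (n, r) factor, and G comes back as (r, m).
X = np.random.rand(200, 50).astype(np.float32)
F, G = NMFconvex(X, 5)
print(np.linalg.norm(X - F.dot(G)))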
import cudamat as cm
import numpy as np

cm.cuda_set_device(0)
cm.cublas_init()

t = np.load('/home/scw4750/frelam_20161027/get_feature/data/feature_0w-5w.npy')
t.dtype = '<f'
feat = t[0:40000]
print t

a = cm.CUDAMatrix(feat)
c = cm.dot(a, a.T)
e = cm.sqrt(c)
e = e.asarray()
#e.dtype = 'float'
print len(e)

dioa = None
for index, item in enumerate(e):
    if dioa is None:
        temp = np.array(item[index])
        dioa = np.copy(temp)
    else:
        temp = np.array(item[index])
        dioa = np.vstack((dioa, temp))

feat = t[40000:50000]
a = cm.CUDAMatrix(feat)
c = cm.dot(a, a.T)
e_2 = cm.sqrt(c)
e_2 = e_2.asarray()
print len(e_2)

for index, item in enumerate(e_2):
    # the source breaks off here; presumably the diagonal is stacked onto
    # dioa exactly as in the loop above
    temp = np.array(item[index])
    dioa = np.vstack((dioa, temp))
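# Note (not from the original source): the loops above only keep the diagonal
# of the Gram matrix, i.e. the per-row L2 norms. The same column vector can be
# computed directly, without forming the full 40000 x 40000 product:
norms = np.sqrt((feat * feat).sum(axis=1, keepdims=True))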
def train(self):
    '''
    Main train function : modified version of the original train function.

    Additions :
        GPU selection (useful for multi-GPU machines)
        Saving the sum of the squares of the data for post-processing
        Visible data are saved
        Data samples are permuted for training
        Weights are saved every 100 training epochs
        Training energy is visualized every 100 training epochs

    NOTE : the annealed learning rate used in the initial code is NOT used here!
    '''
    #plt.ion()
    f1 = plt.figure()
    ax1 = f1.add_subplot(111)
    #ax2 = f1.add_subplot(122)
    #plt.show()

    cmt.cuda_set_device(self.gpuId)
    cmt.cublas_init()
    cmt.CUDAMatrix.init_random(1)

    np.random.seed(self.npRandSeed)
    prng = RandomState(self.npRandState)

    ################################################################
    ##################### CHANGE PATH ##############################
    # Move to current experiment path:
    os.chdir(self.saveDir)
    # Get current path:
    os.getcwd()

    self.plotsDir = 'plots'
    #self.probabilitiesDir = 'p_all'
    if not os.path.isdir(self.plotsDir):
        os.makedirs(self.plotsDir)
    if not os.path.isdir(self.plotsDir + '/energy'):
        os.makedirs(self.plotsDir + '/energy')
    #if not os.path.isdir(self.probabilitiesDir):
    #    os.makedirs(self.probabilitiesDir)
    if not os.path.isdir('weights'):
        os.makedirs('weights')

    d = self.d.astype(np.float32)
    print("visible size: ", d.shape)

    dsq = np.square(d)
    lsq = np.sum(dsq, axis=0)
    with open('lsqComplete.pkl', 'wb') as pklFile:
        cPickle.dump(lsq, pklFile)
    del dsq, lsq

    # Save visible data :
    visData = d
    np.savez('visData.npz', data=d, obsKeys=self.obsKeys, epochTime=self.epochTime)

    with open('visData.txt', 'w') as f:
        f.write("\n Dataset : %s" % (self.dataFilename))
        f.write("\n visData size: %s " % str(visData.shape))
        f.write("\n visData type: %s " % str(visData.dtype))
        f.write("\n \n visData Range: %s " % str(np.max(visData, axis=0) - np.min(visData, axis=0)))
        f.write("\n \n visData min: %s " % str(np.min(visData, axis=0)))
        f.write("\n \n visData max: %s " % str(np.max(visData, axis=0)))
        f.write("\n \n visData mean: %s " % str(np.mean(visData, axis=0)))
        f.write("\n \n visData std: %s " % str(np.std(visData, axis=0)))

    del visData  # if not needed for computing the latent states

    permIdx = prng.permutation(d.shape[0])
    d = d[permIdx, :]

    # subsetting train and test datasets
    #trainPerc = 0.7
    #trainSampNum = int(np.ceil(trainPerc*d.shape[0]))
    #trainSampNum = int(np.floor(trainSampNum/self.batch_size)*self.batch_size)
    #testSampNum = int(d.shape[0]-trainSampNum-1)

    # The test dataset is not used at the moment; it can be used as a
    # validation set to check for overfitting. To use it, uncomment all the
    # variables with 'test' in their name.
    #~ d_test = d[trainSampNum+1:,:]
    #d = d[:trainSampNum,:]
    #obsKeys = self.obsKeys[:trainSampNum]

    totnumcases = d.shape[0]
    num_vis = d.shape[1]
    num_batches = int(totnumcases / self.batch_size)
    print("num_batches: ", num_batches)

    dev_dat = cmt.CUDAMatrix(d.T)  # VxP
    #~ test_dat = cmt.CUDAMatrix(d_test.T)

    del d, self.d, self.epochTime, self.obsKeys

    # training parameters (as in the original code by Ranzato)
    epsilon = self.epsilon
    epsilonVF = 2 * epsilon
    epsilonFH = 0.02 * epsilon
    epsilonb = 0.02 * epsilon
    epsilonw_mean = 0.2 * epsilon
    epsilonb_mean = 0.1 * epsilon
    weightcost_final = self.weightcost_final

    # HMC setting
    hmc_step_nr = self.hmc_step_nr
    hmc_step = 0.01
    hmc_target_ave_rej = self.hmc_target_ave_rej
    hmc_ave_rej = hmc_target_ave_rej

    # initialize weights
    VF = cmt.CUDAMatrix(np.array(0.02 * prng.randn(num_vis, self.num_fac), dtype=np.float32, order='F'))  # VxH
    if self.apply_mask == 0:
        FH = cmt.CUDAMatrix(np.array(np.eye(self.num_fac, self.num_hid_cov), dtype=np.float32, order='F'))  # HxO
    else:
        # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example
        dd = loadmat('your_FHinit_mask_file.mat')
        FH = cmt.CUDAMatrix(np.array(dd["FH"], dtype=np.float32, order='F'))
    bias_cov = cmt.CUDAMatrix(np.array(2.0 * np.ones((self.num_hid_cov, 1)), dtype=np.float32, order='F'))
    bias_vis = cmt.CUDAMatrix(np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F'))
    w_mean = cmt.CUDAMatrix(np.array(0.05 * prng.randn(num_vis, self.num_hid_mean), dtype=np.float32, order='F'))  # VxH
    bias_mean = cmt.CUDAMatrix(np.array(-2.0 * np.ones((self.num_hid_mean, 1)), dtype=np.float32, order='F'))

    # initialize variables to store derivatives
    VFinc = cmt.CUDAMatrix(np.array(np.zeros((num_vis, self.num_fac)), dtype=np.float32, order='F'))
    FHinc = cmt.CUDAMatrix(np.array(np.zeros((self.num_fac, self.num_hid_cov)), dtype=np.float32, order='F'))
    bias_covinc = cmt.CUDAMatrix(np.array(np.zeros((self.num_hid_cov, 1)), dtype=np.float32, order='F'))
    bias_visinc = cmt.CUDAMatrix(np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F'))
    w_meaninc = cmt.CUDAMatrix(np.array(np.zeros((num_vis, self.num_hid_mean)), dtype=np.float32, order='F'))
    bias_meaninc = cmt.CUDAMatrix(np.array(np.zeros((self.num_hid_mean, 1)), dtype=np.float32, order='F'))

    # initialize temporary storage
    data = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F'))       # VxP
    normdata = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F'))   # VxP
    negdataini = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP
    feat = cmt.CUDAMatrix(np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F'))
    featsq = cmt.CUDAMatrix(np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F'))
    negdata = cmt.CUDAMatrix(np.array(prng.randn(num_vis, self.batch_size), dtype=np.float32, order='F'))
    old_energy = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))
    new_energy = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))
    energy = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))
    gradient = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F'))     # VxP
    normgradient = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP
    thresh = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))
    feat_mean = cmt.CUDAMatrix(np.array(np.empty((self.num_hid_mean, self.batch_size)), dtype=np.float32, order='F'))
    vel = cmt.CUDAMatrix(np.array(prng.randn(num_vis, self.batch_size), dtype=np.float32, order='F'))
    length = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))    # 1xP
    lengthsq = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F'))  # 1xP
    normcoeff = cmt.CUDAMatrix(np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) # 1xP

    # commented to avoid computing the energy on test data
    #~ data_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F')) # Vxtest_batch
    #~ normdata_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F')) # Vxtest_batch
    #~ length_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch
    #~ lengthsq_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch
    #~ normcoeff_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch
    #~ vel_test = cmt.CUDAMatrix( np.array(prng.randn(num_vis, testSampNum), dtype=np.float32, order='F'))
    #~ feat_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F'))
    #~ featsq_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F'))
    #~ feat_mean_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_mean, testSampNum)), dtype=np.float32, order='F'))
    #~ energy_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F'))

    if self.apply_mask == 1:
        # this is used to constrain very large FH matrices, only allowing
        # values to change in a neighborhood
        dd = loadmat('your_FHinit_mask_file.mat')
        mask = cmt.CUDAMatrix(np.array(dd["mask"], dtype=np.float32, order='F'))
    normVF = 1
    small = 0.5

    # other temporary vars
    t1 = cmt.CUDAMatrix(np.array(np.empty((self.num_hid_cov, self.batch_size)), dtype=np.float32, order='F'))
    t2 = cmt.CUDAMatrix(np.array(np.empty((self.num_hid_cov, self.batch_size)), dtype=np.float32, order='F'))
    t3 = cmt.CUDAMatrix(np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F'))
    t4 = cmt.CUDAMatrix(np.array(np.empty((1, self.batch_size)), dtype=np.float32, order='F'))
    t5 = cmt.CUDAMatrix(np.array(np.empty((1, 1)), dtype=np.float32, order='F'))
    t6 = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F'))
    t7 = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F'))
    t8 = cmt.CUDAMatrix(np.array(np.empty((num_vis, self.num_fac)), dtype=np.float32, order='F'))
    t9 = cmt.CUDAMatrix(np.array(np.zeros((self.num_fac, self.num_hid_cov)), dtype=np.float32, order='F'))
    t10 = cmt.CUDAMatrix(np.array(np.empty((1, self.num_fac)), dtype=np.float32, order='F'))
    t11 = cmt.CUDAMatrix(np.array(np.empty((1, self.num_hid_cov)), dtype=np.float32, order='F'))

    # commented to avoid computing the energy on test data
    #~ t1_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, testSampNum)), dtype=np.float32, order='F'))
    #~ t2_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, testSampNum)), dtype=np.float32, order='F'))
    #~ t3_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F'))
    #~ t4_test = cmt.CUDAMatrix( np.array(np.empty((1, testSampNum)), dtype=np.float32, order='F'))
    #~ t5_test = cmt.CUDAMatrix( np.array(np.empty((1, 1)), dtype=np.float32, order='F'))
    #~ t6_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F'))

    meanEnergy = np.zeros(self.num_epochs)
    minEnergy = np.zeros(self.num_epochs)
    maxEnergy = np.zeros(self.num_epochs)
    #~ meanEnergy_test = np.zeros(self.num_epochs)
    #~ minEnergy_test = np.zeros(self.num_epochs)
    #~ maxEnergy_test = np.zeros(self.num_epochs)

    # start training
    for epoch in range(self.num_epochs):
        print "Epoch " + str(epoch)

        # anneal learning rates as found in the original code -
        # uncomment if you wish to use annealing!
        #~ epsilonVFc = epsilonVF/max(1,epoch/20)
        #~ epsilonFHc = epsilonFH/max(1,epoch/20)
        #~ epsilonbc = epsilonb/max(1,epoch/20)
        #~ epsilonw_meanc = epsilonw_mean/max(1,epoch/20)
        #~ epsilonb_meanc = epsilonb_mean/max(1,epoch/20)
        # no annealing is used in our experiments because learning
        # was stopping too early
        epsilonVFc = epsilonVF
        epsilonFHc = epsilonFH
        epsilonbc = epsilonb
        epsilonw_meanc = epsilonw_mean
        epsilonb_meanc = epsilonb_mean

        weightcost = weightcost_final
        if epoch <= self.startFH:
            epsilonFHc = 0
        if epoch <= self.startwd:
            weightcost = 0

        # commented to avoid computing the energy on test data
        #~ data_test = test_dat
        #~ data_test.mult(data_test, target = t6_test) # DxP
        #~ t6_test.sum(axis = 0, target = lengthsq_test) # 1xP
        #~ lengthsq_test.mult(1./num_vis) # normalize by number of components (like std)
        #~ lengthsq_test.add(small) # small avoids division by 0
        #~ cmt.sqrt(lengthsq_test, target = length_test)
        #~ length_test.reciprocal(target = normcoeff_test) # 1xP
        #~ data_test.mult_by_row(normcoeff_test, target = normdata_test) # normalized data

        for batch in range(num_batches):
            # get current minibatch
            data = dev_dat.slice(batch * self.batch_size, (batch + 1) * self.batch_size)  # DxP (nr dims x nr samples)

            # normalize input data
            data.mult(data, target=t6)       # DxP
            t6.sum(axis=0, target=lengthsq)  # 1xP
            lengthsq.mult(1. / num_vis)      # normalize by number of components (like std)
            lengthsq.add(small)              # small avoids division by 0
            cmt.sqrt(lengthsq, target=length)
            length.reciprocal(target=normcoeff)           # 1xP
            data.mult_by_row(normcoeff, target=normdata)  # normalized data

            ## compute positive sample derivatives
            # covariance part
            cmt.dot(VF.T, normdata, target=feat)   # HxP (nr facs x nr samples)
            feat.mult(feat, target=featsq)         # HxP
            cmt.dot(FH.T, featsq, target=t1)       # OxP (nr cov hiddens x nr samples)
            t1.mult(-0.5)
            t1.add_col_vec(bias_cov)               # OxP
            t1.apply_sigmoid(target=t2)            # OxP
            cmt.dot(featsq, t2.T, target=FHinc)    # HxO
            cmt.dot(FH, t2, target=t3)             # HxP
            t3.mult(feat)
            cmt.dot(normdata, t3.T, target=VFinc)  # VxH
            t2.sum(axis=1, target=bias_covinc)
            bias_covinc.mult(-1)

            # visible bias
            data.sum(axis=1, target=bias_visinc)
            bias_visinc.mult(-1)

            # mean part
            cmt.dot(w_mean.T, data, target=feat_mean)  # HxP (nr mean hiddens x nr samples)
            feat_mean.add_col_vec(bias_mean)           # HxP
            feat_mean.apply_sigmoid()                  # HxP
            feat_mean.mult(-1)
            cmt.dot(data, feat_mean.T, target=w_meaninc)
            feat_mean.sum(axis=1, target=bias_meaninc)

            # HMC sampling: draw an approximate sample from the model
            if self.doPCD == 0:
                # CD-1 (set negative data to current training samples)
                hmc_step, hmc_ave_rej = self.draw_HMC_samples(
                    data, negdata, normdata, vel, gradient, normgradient,
                    new_energy, old_energy, VF, FH, bias_cov, bias_vis, w_mean,
                    bias_mean, hmc_step, hmc_step_nr, hmc_ave_rej,
                    hmc_target_ave_rej, t1, t2, t3, t4, t5, t6, t7, thresh,
                    feat, featsq, self.batch_size, feat_mean, length, lengthsq,
                    normcoeff, small, num_vis)
            else:
                # PCD-1 (use previous negative data as starting point for chain)
                negdataini.assign(negdata)
                hmc_step, hmc_ave_rej = self.draw_HMC_samples(
                    negdataini, negdata, normdata, vel, gradient, normgradient,
                    new_energy, old_energy, VF, FH, bias_cov, bias_vis, w_mean,
                    bias_mean, hmc_step, hmc_step_nr, hmc_ave_rej,
                    hmc_target_ave_rej, t1, t2, t3, t4, t5, t6, t7, thresh,
                    feat, featsq, self.batch_size, feat_mean, length, lengthsq,
                    normcoeff, small, num_vis)

            # compute derivatives at the negative samples
            # normalize input data
            negdata.mult(negdata, target=t6)  # DxP
            t6.sum(axis=0, target=lengthsq)   # 1xP
            lengthsq.mult(1. / num_vis)       # normalize by number of components (like std)
            lengthsq.add(small)
            cmt.sqrt(lengthsq, target=length)
            length.reciprocal(target=normcoeff)              # 1xP
            negdata.mult_by_row(normcoeff, target=normdata)  # normalized data

            # covariance part
            cmt.dot(VF.T, normdata, target=feat)  # HxP
            feat.mult(feat, target=featsq)        # HxP
            cmt.dot(FH.T, featsq, target=t1)      # OxP
            t1.mult(-0.5)
            t1.add_col_vec(bias_cov)              # OxP
            t1.apply_sigmoid(target=t2)           # OxP
            FHinc.subtract_dot(featsq, t2.T)      # HxO
            FHinc.mult(0.5)
            cmt.dot(FH, t2, target=t3)            # HxP
            t3.mult(feat)
            VFinc.subtract_dot(normdata, t3.T)    # VxH
            bias_covinc.add_sums(t2, axis=1)

            # visible bias
            bias_visinc.add_sums(negdata, axis=1)

            # mean part
            cmt.dot(w_mean.T, negdata, target=feat_mean)  # HxP
            feat_mean.add_col_vec(bias_mean)              # HxP
            feat_mean.apply_sigmoid()                     # HxP
            w_meaninc.add_dot(negdata, feat_mean.T)
            bias_meaninc.add_sums(feat_mean, axis=1)

            # update parameters
            VFinc.add_mult(VF.sign(), weightcost)  # L1 regularization
            VF.add_mult(VFinc, -epsilonVFc / self.batch_size)
            # normalize columns of VF: normalize by running average of their norm
            VF.mult(VF, target=t8)
            t8.sum(axis=0, target=t10)
            cmt.sqrt(t10)
            t10.sum(axis=1, target=t5)
            t5.copy_to_host()
            normVF = .95 * normVF + (.05 / self.num_fac) * t5.numpy_array[0, 0]  # estimate norm
            t10.reciprocal()
            VF.mult_by_row(t10)
            VF.mult(normVF)
            bias_cov.add_mult(bias_covinc, -epsilonbc / self.batch_size)
            bias_vis.add_mult(bias_visinc, -epsilonbc / self.batch_size)

            if epoch > self.startFH:
                FHinc.add_mult(FH.sign(), weightcost)              # L1 regularization
                FH.add_mult(FHinc, -epsilonFHc / self.batch_size)  # update
                # set to 0 negative entries in FH
                FH.greater_than(0, target=t9)
                FH.mult(t9)
                if self.apply_mask == 1:
                    FH.mult(mask)
                # normalize columns of FH: L1 norm set to 1 in each column
                FH.sum(axis=0, target=t11)
                t11.reciprocal()
                FH.mult_by_row(t11)

            w_meaninc.add_mult(w_mean.sign(), weightcost)
            w_mean.add_mult(w_meaninc, -epsilonw_meanc / self.batch_size)
            bias_mean.add_mult(bias_meaninc, -epsilonb_meanc / self.batch_size)

            if self.verbose == 1:
                print "VF: " + '%3.2e' % VF.euclid_norm() \
                    + ", DVF: " + '%3.2e' % (VFinc.euclid_norm() * (epsilonVFc / self.batch_size)) \
                    + ", FH: " + '%3.2e' % FH.euclid_norm() \
                    + ", DFH: " + '%3.2e' % (FHinc.euclid_norm() * (epsilonFHc / self.batch_size)) \
                    + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm() \
                    + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm() * (epsilonbc / self.batch_size)) \
                    + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm() \
                    + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm() * (epsilonbc / self.batch_size)) \
                    + ", wm: " + '%3.2e' % w_mean.euclid_norm() \
                    + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm() * (epsilonw_meanc / self.batch_size)) \
                    + ", bm: " + '%3.2e' % bias_mean.euclid_norm() \
                    + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm() * (epsilonb_meanc / self.batch_size)) \
                    + ", step: " + '%3.2e' % hmc_step \
                    + ", rej: " + '%3.2e' % hmc_ave_rej
                with open('terminal.txt', 'a') as f:
                    f.write('\n' + "epoch: %s" % str(epoch)
                            + ", VF: " + '%3.2e' % VF.euclid_norm()
                            + ", DVF: " + '%3.2e' % (VFinc.euclid_norm() * (epsilonVFc / self.batch_size))
                            + ", FH: " + '%3.2e' % FH.euclid_norm()
                            + ", DFH: " + '%3.2e' % (FHinc.euclid_norm() * (epsilonFHc / self.batch_size))
                            + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm()
                            + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm() * (epsilonbc / self.batch_size))
                            + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm()
                            + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm() * (epsilonbc / self.batch_size))
                            + ", wm: " + '%3.2e' % w_mean.euclid_norm()
                            + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm() * (epsilonw_meanc / self.batch_size))
                            + ", bm: " + '%3.2e' % bias_mean.euclid_norm()
                            + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm() * (epsilonb_meanc / self.batch_size))
                            + ", step: " + '%3.2e' % hmc_step
                            + ", rej: " + '%3.2e' % hmc_ave_rej)
                sys.stdout.flush()

        # compute the energy on the training data
        self.compute_energy_mcRBM_visual(data, normdata, energy, VF, FH,
                                         bias_cov, bias_vis, w_mean, bias_mean,
                                         t1, t2, t6, feat, featsq, feat_mean,
                                         length, lengthsq, normcoeff, small,
                                         num_vis)
        energy.copy_to_host()
        meanEnergy[epoch] = np.mean(energy.numpy_array)
        minEnergy[epoch] = np.min(energy.numpy_array)
        maxEnergy[epoch] = np.max(energy.numpy_array)

        # commented to avoid computing the energy on test data
        #~ self.compute_energy_mcRBM_visual(data_test,normdata_test,energy_test,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1_test,t2_test,t6_test,feat_test,featsq_test,feat_mean_test,length_test,lengthsq_test,normcoeff_test,small,num_vis)
        #~ energy_test.copy_to_host()
        #~ meanEnergy_test[epoch] = np.mean(energy_test.numpy_array)
        #~ minEnergy_test[epoch] = np.min(energy_test.numpy_array)
        #~ maxEnergy_test[epoch] = np.max(energy_test.numpy_array)

        ax1.cla()
        ax1.plot(range(epoch), meanEnergy[0:epoch])
        ax1.plot(range(epoch), maxEnergy[0:epoch])
        ax1.plot(range(epoch), minEnergy[0:epoch])
        if np.mod(epoch, 100) == 0:
            #f1.savefig(output_folder + str(epoch)+'_'+'fig.png')
            f1.savefig(self.plotsDir + '/energy/energyAt_%s.png' % str(epoch))

        # back-up every once in a while
        if np.mod(epoch, 100) == 0:
            VF.copy_to_host()
            FH.copy_to_host()
            bias_cov.copy_to_host()
            w_mean.copy_to_host()
            bias_mean.copy_to_host()
            bias_vis.copy_to_host()
            savemat("./weights/ws_temp%s" % str(epoch), {
                'VF': VF.numpy_array, 'FH': FH.numpy_array,
                'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,
                'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array,
                'epoch': epoch})
            # uncomment if computing the energy in order to store its evolution throughout training
            #~ savemat(self.refDir + '/' + "training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy,'meanEnergy_test':meanEnergy_test,'maxEnergy': maxEnergy, 'maxEnergy_test': maxEnergy_test, 'minEnergy': minEnergy, 'minEnergy_test': minEnergy_test, 'epoch':epoch})
            #savemat("training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy, 'maxEnergy': maxEnergy, 'minEnergy': minEnergy, 'epoch':epoch})

        # in order to stop the training gracefully, create an empty file
        # named 'stop_now' in the folder containing the experiment
        # configuration file
        if os.path.isfile('stop_now'):
            break

    # final back-up
    VF.copy_to_host()
    FH.copy_to_host()
    bias_cov.copy_to_host()
    bias_vis.copy_to_host()
    w_mean.copy_to_host()
    bias_mean.copy_to_host()
    savemat("ws_fac%s" % str(self.num_fac) + "_cov%s" % str(self.num_hid_cov)
            + "_mean%s" % str(self.num_hid_mean), {
        'VF': VF.numpy_array, 'FH': FH.numpy_array,
        'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,
        'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array,
        'epoch': epoch})

    # uncomment if computing the energy in order to store its evolution throughout training
    #~ savemat(self.refDir + '/' + "training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy,'meanEnergy_test':meanEnergy_test,'maxEnergy': maxEnergy, 'maxEnergy_test': maxEnergy_test, 'minEnergy': minEnergy, 'minEnergy_test': minEnergy_test, 'epoch':epoch})
    savemat("training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov)
            + "_mean" + str(self.num_hid_mean), {
        'meanEnergy': meanEnergy, 'maxEnergy': maxEnergy,
        'minEnergy': minEnergy, 'epoch': epoch})

    # Compute states if desired:
    # normalise data for covariance hidden:
    #dsq = np.square(visData)
    #lsq = np.sum(dsq, axis=0)
    #lsq /= visData.shape[1]
    #lsq += np.spacing(1)
    #l = np.sqrt(lsq)
    #normD = visData/l
    #logisticArg_c = (-0.5*np.dot(FH.numpy_array.T, np.square(np.dot(VF.numpy_array.T, normD.T))) + bias_cov.numpy_array).T
    #p_hc = logisticFunc(logisticArg_c)
    #logisticArg_m = np.dot(visData, w_mean.numpy_array) + bias_mean.numpy_array.T
    #p_hm = logisticFunc(logisticArg_m)
    #p_all = np.concatenate((p_hc, p_hm), axis=1)
    #savemat(self.probabilitiesDir + '/pAll_%i.mat' % epoch, mdict={'p_all':p_all})

    with open('done', 'w') as doneFile:
        doneFile.write(datetime.strftime(datetime.now(), '%d/%m/%Y %H:%M:%S'))