Code Example #1
def ijk_lu_decomposer_opt_gpu(M):
    """In-place Doolittle LU decomposition (ijk ordering) on the GPU.

    Expects a square, C-contiguous float64 matrix M. The returned matrix
    packs the unit lower-triangular factor L strictly below the diagonal
    and the upper-triangular factor U on and above it.
    """
    m = M.shape[0]
    n = M.shape[1]  # the routine assumes a square matrix (m == n)

    import pycuda.autoinit  # creates the CUDA context on import
    import pycuda.gpuarray as gpuarray
    from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot

    # Copy the input to device memory and create a cuBLAS handle.
    N_gpu = gpuarray.to_gpu(M)
    h = cublasCreate()

    for i in range(n):
        for j in range(n):
            # CPU equivalent: N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j])
            # The row slice is contiguous (increment 1); the column slice
            # is strided by the row length n.
            k = min(i, j)
            N_gpu[i, j] -= cublasDdot(h, N_gpu[i, :k].size,
                                      N_gpu[i, :k].gpudata, 1,
                                      N_gpu[:k, j].gpudata, n)
            if j < i:
                # Scale entries below the diagonal by the pivot to form
                # the L factor (no pivoting is performed).
                N_gpu[i, j] /= N_gpu[j, j]
                # Alternative using cublasDscal (commented out in the original):
                # cublasDscal(h, N_gpu[i, j].size,
                #             1.0 / np.float64(N_gpu[j, j].get()),
                #             N_gpu[i, j].gpudata, 1)

    # Move the packed LU factors from GPU back to CPU memory.
    N = N_gpu.get()
    cublasDestroy(h)

    return N
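
A quick correctness check for the routine above, assuming a working PyCUDA/scikit-cuda installation and a CUDA device; this usage sketch is not part of the original project:

import numpy as np

n = 4
M = np.random.rand(n, n).astype(np.float64)  # square, C-contiguous float64 input
N = ijk_lu_decomposer_opt_gpu(M)             # the host matrix is copied to the GPU and factorized there

L = np.tril(N, -1) + np.eye(n)               # unit lower-triangular factor
U = np.triu(N)                               # upper-triangular factor
print(np.allclose(L @ U, M))                 # expect True (no pivoting, random M)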
Code Example #2
File: test_cublas.py  Project: lvaleriu/scikit-cuda
def test_cublasDdot(self):
    # double-precision dot product on the GPU, checked against numpy.dot
    x = np.random.rand(5).astype(np.float64)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float64)
    y_gpu = gpuarray.to_gpu(y)
    result = cublas.cublasDdot(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1,
                               y_gpu.gpudata, 1)
    assert np.allclose(result, np.dot(x, y))
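
For quick experimentation outside the test class, the same call can be made with a locally created handle. A minimal standalone sketch, assuming PyCUDA and scikit-cuda are installed and a CUDA device is available:

import numpy as np
import pycuda.autoinit  # initializes the CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import cublas

x = np.random.rand(5).astype(np.float64)
y = np.random.rand(5).astype(np.float64)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

h = cublas.cublasCreate()  # create a cuBLAS handle (instead of a test fixture)
try:
    # increments of 1 because both device vectors are contiguous
    d = cublas.cublasDdot(h, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
finally:
    cublas.cublasDestroy(h)

print(np.allclose(d, np.dot(x, y)))  # expect True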
Code Example #3
def forward(self, vector1, vector2):
    # Legacy-style autograd Function forward: computes the dot product of
    # two CUDA vectors directly through cuBLAS on the current stream.
    with torch.cuda.device_of(vector1):
        output = vector1.new(1)
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()
        cublas.cublasSetStream(handle, stream)
        if isinstance(vector1, torch.cuda.FloatTensor):
            # single precision
            result = cublas.cublasSdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        elif isinstance(vector1, torch.cuda.DoubleTensor):
            # double precision
            result = cublas.cublasDdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        else:
            raise TypeError('expected a cuda FloatTensor or DoubleTensor')
        output = output.fill_(float(result))
    self.save_for_backward(vector1, vector2)
    return output
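
Only forward is shown above; a backward consistent with it, written in the same legacy autograd style, would scale the saved vectors by the incoming gradient. A sketch, not the project's actual code:

def backward(self, grad_output):
    # gradients of v1 . v2:  d/dv1 = v2,  d/dv2 = v1
    vector1, vector2 = self.saved_tensors
    return grad_output * vector2, grad_output * vector1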
Code Example #4
    def run(self,
            ERR_BOUND=None,
            err_iter=None,
            time_iter=None,
            SILENCE=False,
            DEBUG=False):
        # initialize
        self.DEBUG = DEBUG
        if isinstance(ERR_BOUND, float):
            IS_BOUNDED = True
        else:
            IS_BOUNDED = False

        if isinstance(err_iter, np.ndarray):
            self.ERR_RCD = True
        else:
            self.ERR_RCD = False

        if isinstance(time_iter, np.ndarray):
            self.TIME_RCD = True
        else:
            self.TIME_RCD = False

        self.x.fill(0)
        for i in range(self.BLOCK):
            self.x_block[i].fill(0)
            self.x_block_gpu[i].fill(0)
            self.Ax_gpu[i].fill(0)
        self.Ax.fill(0)
        self.x_gpu.fill(0)

        b_k_gpu = gpuarray.empty_like(self.b_gpu)
        rx_gpu = gpuarray.empty_like(self.x_block_gpu[0])
        soft_t_gpu = gpuarray.empty_like(self.x_block_gpu[0])
        Bx_gpu = gpuarray.empty_like(self.x_block_gpu[0])
        s11_gpu = gpuarray.zeros((self.idx_m, 1), np.float64)
        s13_gpu = gpuarray.zeros((self.idx_n, 1), np.float64)
        s23_gpu = gpuarray.zeros((self.idx_m, 1), np.float64)
        d_d_gpu = gpuarray.zeros((self.idx_n, 1), np.float64)
        d_ATA_gpu = [gpuarray.to_gpu(self.d_ATA[i]) for i in range(self.BLOCK)]
        d_ATA_rec_gpu = [
            gpuarray.to_gpu(1 / self.d_ATA[i]) for i in range(self.BLOCK)
        ]
        block_Cnt = 0

        time_s = 0
        start = time.time()
        if self.TIME_RCD:
            time_iter[0] = 0

        for t in range(self.ITER_MAX):
            # select mth block
            m = self.index_get(t)
            '''
            result_s11 = np.sum(self.Ax, axis=0) - self.b
            self.s11_gpu.set(result_s11)
            self._zmvG(self.h, self.s13_gpu, 1,
                       self.gpu_cal.A_b_gpu[m], self.s11_gpu)
            self.s13_gpu.get(self.result_s13)
            rx = np.multiply(self.d_ATA[m], self.x_block[m]) -\
                self.result_s13
            soft_t = soft_thresholding(rx, self.mu)
            Bx = np.multiply(self.d_ATA_rec[m], soft_t)
            # result_s21 = Bx_p - x_p
            descent_D = Bx-self.x_block[m]
            self.d_d_gpu.set(descent_D)
            self._zmvG(self.h, self.s23_gpu, 1,
                       self.gpu_cal.A_b_cw_gpu[m], self.d_d_gpu)
            self.s23_gpu.get(self.result_s23)
            # result_s23 = self._mv(m, descent_D)
            r_1 = np.transpose(result_s11) @ self.result_s23 +\
                self.mu*(np.linalg.norm(Bx, ord=1) -
                         np.linalg.norm(self.x_block[m], ord=1))
            r_2 = np.transpose(self.result_s23) @ self.result_s23
            if r_2 == 0.0:
                print('r_2 is ZERO, could not divide ZERO!')
            else:
                r = np.float64(element_proj(-r_1/r_2, 0, 1))
            # x(t+1) = x(t)+r(Bx(t)-x(t))
            self.x_block[m] += r*descent_D
            # Ax(t+1)
            self.Ax[m] += r*self.result_s23
            '''

            # '''
            # begin pure gpu calculation
            self.fun_b_k(b_k_gpu, m)
            self._zaxpy(self.h, s11_gpu, -1, b_k_gpu, self.Ax_gpu[m])
            self._zmvG(self.h, s13_gpu, 1, self.gpu_cal.A_b_gpu[m],
                       cublas._CUBLAS_OP['N'], s11_gpu)
            # s14
            d_ATA_gpu[m]._elwise_multiply(self.x_block_gpu[m], rx_gpu)
            self._axpy(self.h, -1, s13_gpu, rx_gpu)
            self.zsoft_t(soft_t_gpu, rx_gpu, self.mu)
            # s15
            d_ATA_rec_gpu[m]._elwise_multiply(soft_t_gpu, Bx_gpu)
            self._zaxpy(self.h, d_d_gpu, -1, self.x_block_gpu[m], Bx_gpu)
            self._zmvG(self.h, s23_gpu, 1, self.gpu_cal.A_b_gpu[m],
                       cublas._CUBLAS_OP['T'], d_d_gpu)

            # stepsize
            # r_1g = self.r1_get(self.h, s11_gpu, s23_gpu,
            #                    Bx_gpu, self.x_block_gpu[m])
            temp_1 = cublas.cublasDdot(self.h, s11_gpu.size, s11_gpu.gpudata,
                                       1, s23_gpu.gpudata, 1)
            start_s = time.time()
            temp_2 = self.mu * (
                cublas.cublasDasum(self.h, Bx_gpu.size, Bx_gpu.gpudata, 1) -
                cublas.cublasDasum(self.h, self.x_block_gpu[m].size,
                                   self.x_block_gpu[m].gpudata, 1))
            time_s += time.time() - start_s
            r_1g = temp_1 + temp_2
            r_2g = np.square(self._l2norm(self.h, s23_gpu))
            if r_2g == 0.0:
                # r_g is not updated here: the value from the previous
                # iteration is reused (undefined on the first iteration)
                print('r_2 is ZERO, could not divide ZERO!')
            else:
                # project the step size onto [0, 1]
                r_g = np.float64(element_proj(-r_1g / r_2g, 0, 1))

            # self.debug(result_s13, self.x_block[m], self.x, t, m, r)
            # self.err_record(err_iter, result_s13, self.x_block[m], t)

            # if IS_BOUNDED:
            #     if not (self.DEBUG & self.ERR_RCD):
            #         self.error = error_crit(
            #             result_s13, self.x_block[m], self.mu)
            #     if self.error < ERR_BOUND:
            #         block_Cnt += 1
            #     if self.BLOCK - 1 == m:
            #         if block_Cnt == self.BLOCK:
            #             break
            #         else:
            #             block_Cnt = 0

            self._axpy(self.h, r_g, d_d_gpu, self.x_block_gpu[m])
            self._axpy(self.h, r_g, s23_gpu, self.Ax_gpu[m])

            # print(np.allclose(self.x_block_gpu[m].get(),
            #                   self.x_block[m]))
            # '''
            self.time_record(time_iter, t, start)
            # print("matrix@vector:", time_mul,
            #       "s, matrix.T@vector:", time_mul_t)

        if self.TIME_RCD:
            t_elapsed = time_iter[t]
        else:
            t_elapsed = time.time() - start

        self.rlt_display(SILENCE, t_elapsed, t)
        self.x = np.vstack(self.x_block)
        if not SILENCE:
            print(self.descript + ': ' + str(time_s) + ' s.')

        return t_elapsed
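
The helpers soft_thresholding and element_proj used above are not shown in this snippet. Plausible NumPy definitions consistent with how they are called (an assumption, not the project's actual code):

import numpy as np

def soft_thresholding(x, mu):
    # elementwise soft-thresholding: sign(x) * max(|x| - mu, 0)
    return np.sign(x) * np.maximum(np.abs(x) - mu, 0.0)

def element_proj(x, lower, upper):
    # project (clip) the input onto the interval [lower, upper]
    return np.clip(x, lower, upper)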
Code Example #5
def r1_get(self, handle, s11_gpu, s23_gpu, Bx_gpu, x_block_gpu):
    # r_1 = s11 . s23 + mu * (||Bx||_1 - ||x_block||_1), evaluated on the GPU
    return (cublas.cublasDdot(handle, s11_gpu.size, s11_gpu.gpudata, 1,
                              s23_gpu.gpudata, 1)
            + self.mu * (self._l1norm(handle, Bx_gpu)
                         - self._l1norm(handle, x_block_gpu)))
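
For reference, the same quantity on the host with NumPy, matching the commented-out CPU path in Code Example #4; a sketch that assumes the device arrays have already been copied back with .get():

import numpy as np

def r1_get_cpu(s11, s23, Bx, x_block, mu):
    # r_1 = s11 . s23 + mu * (||Bx||_1 - ||x_block||_1)
    return s11.ravel().dot(s23.ravel()) + mu * (
        np.linalg.norm(Bx.ravel(), 1) - np.linalg.norm(x_block.ravel(), 1))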