def ijk_lu_decomposer_opt_gpu(M):
    m = M.shape[0]
    n = M.shape[1]
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot  # , cublasDscal
    import skcuda.misc as misc
    # import skcuda.linalg as linalg
    # linalg.init()
    N_gpu = gpuarray.to_gpu(M)
    h = cublasCreate()
    for i in range(0, n):
        for j in range(0, n):
            # N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j])
            N_gpu[i, j] -= cublasDdot(h, N_gpu[i, :min(i, j)].size,
                                      N_gpu[i, :min(i, j)].gpudata, 1,
                                      N_gpu[:min(i, j), j].gpudata, n)
            if j < i:
                N_gpu[i, j] /= N_gpu[j, j]
                # cublasDscal(h, N_gpu[i,j].size, 1.0/np.float64(N_gpu[j,j].get()),
                #             N_gpu[i,j].gpudata, 1)
    # Move from GPU to CPU
    N = N_gpu.get()
    cublasDestroy(h)
    return N
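A minimal driver for the routine above, not part of the original code, assuming PyCUDA and scikit-cuda are installed. The function performs a Doolittle-style in-place LU factorization without pivoting, so a diagonally dominant test matrix is used; L is the strict lower triangle with an implicit unit diagonal and U is the upper triangle.

import numpy as np

# Illustrative usage sketch: factor a small float64 matrix and check L @ U == A.
A = np.random.rand(4, 4) + 4 * np.eye(4)   # diagonally dominant, safe without pivoting
LU = ijk_lu_decomposer_opt_gpu(A)

L = np.tril(LU, k=-1) + np.eye(4)          # unit lower-triangular factor
U = np.triu(LU)                            # upper-triangular factor
print(np.allclose(L @ U, A))               # should print True (up to floating-point error)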
def test_cublasDdot(self):
    x = np.random.rand(5).astype(np.float64)
    x_gpu = gpuarray.to_gpu(x)
    y = np.random.rand(5).astype(np.float64)
    y_gpu = gpuarray.to_gpu(y)
    result = cublas.cublasDdot(self.cublas_handle, x_gpu.size,
                               x_gpu.gpudata, 1, y_gpu.gpudata, 1)
    assert np.allclose(result, np.dot(x, y))
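The same check as a standalone sketch outside the test fixture, assuming pycuda and scikit-cuda are available; a local cuBLAS handle replaces the fixture's self.cublas_handle.

import numpy as np
import pycuda.autoinit                 # creates a CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import cublas

h = cublas.cublasCreate()
x = np.random.rand(5).astype(np.float64)
y = np.random.rand(5).astype(np.float64)
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

# cublasDdot(handle, n, x_ptr, incx, y_ptr, incy) returns the scalar on the host
result = cublas.cublasDdot(h, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1)
print(np.allclose(result, x @ y))      # should print True
cublas.cublasDestroy(h)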
def forward(self, vector1, vector2):
    with torch.cuda.device_of(vector1):
        output = vector1.new(1)
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()
        cublas.cublasSetStream(handle, stream)
        if isinstance(vector1, torch.cuda.FloatTensor):
            result = cublas.cublasSdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        elif isinstance(vector1, torch.cuda.DoubleTensor):
            result = cublas.cublasDdot(handle, vector1.numel(),
                                       vector1.data_ptr(), 1,
                                       vector2.data_ptr(), 1)
        output = output.fill_(float(result))
    self.save_for_backward(vector1, vector2)
    return output
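The forward above follows the legacy torch.autograd.Function style (self.save_for_backward, explicit torch.cuda tensor types). A hedged sketch of a matching backward, using the standard dot-product gradients rather than anything taken from the original source:

def backward(self, grad_output):
    # d(v1 . v2)/dv1 = v2 and d(v1 . v2)/dv2 = v1, each scaled by the
    # incoming scalar gradient (grad_output is a 1-element tensor).
    vector1, vector2 = self.saved_tensors
    grad1 = vector2 * float(grad_output[0])
    grad2 = vector1 * float(grad_output[0])
    return grad1, grad2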
def run(self, ERR_BOUND=None, err_iter=None, time_iter=None,
        SILENCE=False, DEBUG=False):
    # initialize
    self.DEBUG = DEBUG
    if isinstance(ERR_BOUND, float):
        IS_BOUNDED = True
    else:
        IS_BOUNDED = False
    if isinstance(err_iter, np.ndarray):
        self.ERR_RCD = True
    else:
        self.ERR_RCD = False
    if isinstance(time_iter, np.ndarray):
        self.TIME_RCD = True
    else:
        self.TIME_RCD = False

    self.x.fill(0)
    for i in range(self.BLOCK):
        self.x_block[i].fill(0)
        self.x_block_gpu[i].fill(0)
        self.Ax_gpu[i].fill(0)
    self.Ax.fill(0)
    self.x_gpu.fill(0)

    b_k_gpu = gpuarray.empty_like(self.b_gpu)
    rx_gpu = gpuarray.empty_like(self.x_block_gpu[0])
    soft_t_gpu = gpuarray.empty_like(self.x_block_gpu[0])
    Bx_gpu = gpuarray.empty_like(self.x_block_gpu[0])
    s11_gpu = gpuarray.zeros((self.idx_m, 1), np.float64)
    s13_gpu = gpuarray.zeros((self.idx_n, 1), np.float64)
    s23_gpu = gpuarray.zeros((self.idx_m, 1), np.float64)
    d_d_gpu = gpuarray.zeros((self.idx_n, 1), np.float64)
    d_ATA_gpu = [gpuarray.to_gpu(self.d_ATA[i]) for i in range(self.BLOCK)]
    d_ATA_rec_gpu = [
        gpuarray.to_gpu(1 / self.d_ATA[i]) for i in range(self.BLOCK)
    ]

    block_Cnt = 0
    time_s = 0
    start = time.time()
    if self.TIME_RCD:
        time_iter[0] = 0

    for t in range(self.ITER_MAX):
        # select mth block
        m = self.index_get(t)
        '''
        result_s11 = np.sum(self.Ax, axis=0) - self.b
        self.s11_gpu.set(result_s11)
        self._zmvG(self.h, self.s13_gpu, 1,
                   self.gpu_cal.A_b_gpu[m], self.s11_gpu)
        self.s13_gpu.get(self.result_s13)
        rx = np.multiply(self.d_ATA[m], self.x_block[m]) -\
            self.result_s13
        soft_t = soft_thresholding(rx, self.mu)
        Bx = np.multiply(self.d_ATA_rec[m], soft_t)
        # result_s21 = Bx_p - x_p
        descent_D = Bx-self.x_block[m]
        self.d_d_gpu.set(descent_D)
        self._zmvG(self.h, self.s23_gpu, 1,
                   self.gpu_cal.A_b_cw_gpu[m], self.d_d_gpu)
        self.s23_gpu.get(self.result_s23)
        # result_s23 = self._mv(m, descent_D)
        r_1 = np.transpose(result_s11) @ self.result_s23 +\
            self.mu*(np.linalg.norm(Bx, ord=1) -
                     np.linalg.norm(self.x_block[m], ord=1))
        r_2 = np.transpose(self.result_s23) @ self.result_s23
        if r_2 == 0.0:
            print('r_2 is ZERO, could not divide ZERO!')
        else:
            r = np.float64(element_proj(-r_1/r_2, 0, 1))
        # x(t+1) = x(t)+r(Bx(t)-x(t))
        self.x_block[m] += r*descent_D
        # Ax(t+1)
        self.Ax[m] += r*self.result_s23
        '''
        # '''
        # begin pure gpu calculation
        self.fun_b_k(b_k_gpu, m)
        self._zaxpy(self.h, s11_gpu, -1, b_k_gpu, self.Ax_gpu[m])
        self._zmvG(self.h, s13_gpu, 1, self.gpu_cal.A_b_gpu[m],
                   cublas._CUBLAS_OP['N'], s11_gpu)
        # s14
        d_ATA_gpu[m]._elwise_multiply(self.x_block_gpu[m], rx_gpu)
        self._axpy(self.h, -1, s13_gpu, rx_gpu)
        self.zsoft_t(soft_t_gpu, rx_gpu, self.mu)
        # s15
        d_ATA_rec_gpu[m]._elwise_multiply(soft_t_gpu, Bx_gpu)
        self._zaxpy(self.h, d_d_gpu, -1, self.x_block_gpu[m], Bx_gpu)
        self._zmvG(self.h, s23_gpu, 1, self.gpu_cal.A_b_gpu[m],
                   cublas._CUBLAS_OP['T'], d_d_gpu)
        # stepsize
        # r_1g = self.r1_get(self.h, s11_gpu, s23_gpu,
        #                    Bx_gpu, self.x_block_gpu[m])
        temp_1 = cublas.cublasDdot(self.h, s11_gpu.size,
                                   s11_gpu.gpudata, 1,
                                   s23_gpu.gpudata, 1)
        start_s = time.time()
        temp_2 = self.mu * (
            cublas.cublasDasum(self.h, Bx_gpu.size, Bx_gpu.gpudata, 1) -
            cublas.cublasDasum(self.h, self.x_block_gpu[m].size,
                               self.x_block_gpu[m].gpudata, 1))
        time_s += time.time() - start_s
        r_1g = temp_1 + temp_2
        r_2g = np.square(self._l2norm(self.h, s23_gpu))
        if r_2g == 0.0:
            print('r_2 is ZERO, could not divide ZERO!')
        else:
            r_g = np.float64(element_proj(-r_1g / r_2g, 0, 1))
        # self.debug(result_s13, self.x_block[m], self.x, t, m, r)
        # self.err_record(err_iter, result_s13, self.x_block[m], t)
        # if IS_BOUNDED:
        #     if not (self.DEBUG & self.ERR_RCD):
        #         self.error = error_crit(
        #             result_s13, self.x_block[m], self.mu)
        #     if self.error < ERR_BOUND:
        #         block_Cnt += 1
        #     if self.BLOCK - 1 == m:
        #         if block_Cnt == self.BLOCK:
        #             break
        #         else:
        #             block_Cnt = 0
        self._axpy(self.h, r_g, d_d_gpu, self.x_block_gpu[m])
        self._axpy(self.h, r_g, s23_gpu, self.Ax_gpu[m])
        # print(np.allclose(self.x_block_gpu[m].get(),
        #                   self.x_block[m]))
        # '''
        self.time_record(time_iter, t, start)
        # print("matrix@vector:", time_mul,
        #       "s, matrix.T@vector:", time_mul_t)

        if self.TIME_RCD:
            t_elapsed = time_iter[t]
        else:
            t_elapsed = time.time() - start
        self.rlt_display(SILENCE, t_elapsed, t)

    self.x = np.vstack(self.x_block)
    if not SILENCE:
        print(self.descript + ': ' + str(time_s) + ' s.')
    return t_elapsed
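To make the step-size numerator in the loop above easier to follow, here is an illustrative standalone sketch that reproduces temp_1 and temp_2 on toy data and checks them against plain NumPy; mu and the vector sizes are made up for the example, and cublasDasum is simply the L1 norm of a real vector.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from skcuda import cublas

mu = 0.1                                   # made-up regularization weight
h = cublas.cublasCreate()

s11 = np.random.rand(8)                    # residual-like vector
s23 = np.random.rand(8)                    # A @ descent-direction
Bx = np.random.randn(6)                    # candidate block update
xb = np.random.randn(6)                    # current block iterate
s11_gpu, s23_gpu = gpuarray.to_gpu(s11), gpuarray.to_gpu(s23)
Bx_gpu, xb_gpu = gpuarray.to_gpu(Bx), gpuarray.to_gpu(xb)

# r_1g = <s11, s23> + mu * (||Bx||_1 - ||x_block||_1), all on the GPU
temp_1 = cublas.cublasDdot(h, s11_gpu.size, s11_gpu.gpudata, 1, s23_gpu.gpudata, 1)
temp_2 = mu * (cublas.cublasDasum(h, Bx_gpu.size, Bx_gpu.gpudata, 1) -
               cublas.cublasDasum(h, xb_gpu.size, xb_gpu.gpudata, 1))
r_1g = temp_1 + temp_2

# CPU reference of the same quantity
r_1_ref = s11 @ s23 + mu * (np.abs(Bx).sum() - np.abs(xb).sum())
print(np.allclose(r_1g, r_1_ref))          # should print True
cublas.cublasDestroy(h)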
def r1_get(self, handle, s11_gpu, s23_gpu, Bx_gpu, x_block_gpu):
    return cublas.cublasDdot(handle, s11_gpu.size,
                             s11_gpu.gpudata, 1,
                             s23_gpu.gpudata, 1) +\
        self.mu*(self._l1norm(handle, Bx_gpu) -
                 self._l1norm(handle, x_block_gpu))
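r1_get and the run loop above call self._l1norm and self._l2norm, which are not shown in this section. A plausible minimal sketch of such helper methods, assuming float64 pycuda gpuarrays with unit stride (hypothetical implementations, not the project's own code):

from skcuda import cublas

def _l1norm(self, handle, x_gpu):
    # sum of absolute values via cuBLAS DASUM
    return cublas.cublasDasum(handle, x_gpu.size, x_gpu.gpudata, 1)

def _l2norm(self, handle, x_gpu):
    # Euclidean norm via cuBLAS DNRM2
    return cublas.cublasDnrm2(handle, x_gpu.size, x_gpu.gpudata, 1)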