def idd_lssolve(queue, m, n, a, krank):
    """Solve the trailing columns of ``a`` against its leading triangle.

    For each of the ``n - krank`` columns at index ``krank`` and beyond,
    ``blas.trsv`` is called with the leading ``krank x krank`` block of ``a``
    (with ``lower=False``) and the first ``krank`` entries of that column.
    Afterwards the ``moveup`` kernel from ``id_kerns.cl`` is launched over a
    ``krank x (n - krank)`` global range.

    Parameters
    ----------
    queue
        OpenCL command queue; every BLAS call and the kernel are enqueued
        on it, and its context is used to build the program.
    m
        Not used by this routine; kept so the signature stays unchanged.
    n
        Column count handed to the kernel as ``int32``.
    a
        Device matrix that supports 2-D slicing and exposes ``.data``
        (presumably a ``pyopencl.array.Array`` -- confirm with callers).
        NOTE(review): assumed to be updated in place by ``blas.trsv`` and
        ``moveup``; neither implementation is visible here.
    krank
        Size of the leading triangular block.

    Returns
    -------
    None
    """
    n_rhs = n - krank
    if n_rhs == 0:
        # Nothing to solve and nothing to move.  Returning here also avoids
        # enqueueing a kernel with a zero-sized global range, which OpenCL
        # implementations older than 2.1 reject as an invalid work size.
        return

    tri = a[:krank, :krank]
    for j in range(n_rhs):
        blas.trsv(queue, tri, a[:krank, krank + j], lower=False)

    ctx = queue.get_info(cl.command_queue_info.CONTEXT)
    prg = cl.Program(ctx, util.get_source('id_kerns.cl')).build()
    # NOTE(review): `moveup` presumably compacts the solved columns inside
    # the buffer of `a` -- see id_kerns.cl to confirm the output layout.
    prg.moveup(queue, [krank, n_rhs], None,
               a.data, np.int32(krank), np.int32(n))
def iddr_id(queue, m, n, a, krank, lst, rnorms):
    """Drive the three stages of the rank-``krank`` ID on the device.

    The stages run in this order, all on ``queue``:

    1. ``iddr_qrpiv`` on ``a``, with ``lst`` and ``rnorms`` passed as its
       index and scratch/norm arguments.
    2. The ``rnorm`` kernel from ``id_kerns.cl`` over a ``[krank, 1]``
       global range, given the buffers of ``a`` and ``rnorms`` plus ``n``.
    3. ``idd_lssolve`` on ``a``.

    Parameters
    ----------
    queue
        OpenCL command queue; its context is used to build the program.
    m, n
        Matrix dimensions forwarded unchanged to the helper routines.
    a
        Device matrix exposing ``.data``.
    krank
        Requested rank.
    lst
        Device array forwarded to ``iddr_qrpiv`` as its ``ind`` argument.
    rnorms
        Device array exposing ``.data``; forwarded to ``iddr_qrpiv`` as its
        ``ss`` argument and then handed to the ``rnorm`` kernel.

    Returns
    -------
    None
    """
    context = queue.get_info(cl.command_queue_info.CONTEXT)

    # Stage 1: pivoted QR sweep.
    iddr_qrpiv(queue, m, n, a, krank, lst, rnorms)

    # Stage 2: `rnorm` kernel, one work-item row per retained rank.
    kernel_source = util.get_source('id_kerns.cl')
    program = cl.Program(context, kernel_source).build()
    global_size = [krank, 1]
    program.rnorm(queue, global_size, None,
                  a.data, rnorms.data, np.int32(n))

    # Stage 3: triangular solves for the remaining columns.
    idd_lssolve(queue, m, n, a, krank)
def iddr_qrpiv(queue, m, n, a, krank, ind, ss):
    """Run up to ``krank`` steps of a column-pivoted QR sweep on the device.

    A zero-filled work matrix ``r`` with the shape and dtype of ``a`` is
    allocated, ``ind`` is reset to ``0 .. len(ind) - 1``, and the kernels of
    ``qr_kerns.cl`` are driven for ``min(m, n, krank)`` steps.  Each step
    swaps in the current pivot column (``swap_col``), processes the
    remaining columns (``proj_rm``), refreshes ``ss`` for them (``ss``), and
    picks the next pivot as the argmax of the refreshed tail of ``ss``.
    Finally the first ``krank`` rows of ``r`` are copied into ``a``.

    Parameters
    ----------
    queue
        OpenCL command queue used for every transfer and kernel launch.
    m, n
        Row and column counts.  ``m`` is also used as the work-group size
        along the first dimension, so it must not exceed what the device
        allows for a single work-group.
    a
        Device matrix exposing ``.shape``, ``.dtype`` and ``.data``; its
        first ``krank`` rows are overwritten on return.
    krank
        Maximum number of pivot steps / rows of ``r`` copied back.
    ind
        Device index array; overwritten with ``arange`` and then passed to
        ``swap_col`` (presumably to record the column permutation --
        confirm in qr_kerns.cl).
    ss
        Device array holding the per-column score used for pivoting
        (presumably squared column norms -- confirm in qr_kerns.cl).

    Returns
    -------
    None
    """
    r = cl_array.Array(queue, a.shape, a.dtype)
    r.set(np.zeros(a.shape, a.dtype))
    ind.set(np.arange(ind.shape[0], dtype=ind.dtype))

    ctx = queue.get_info(cl.command_queue_info.CONTEXT)
    qr_prg = cl.Program(ctx, util.get_source('qr_kerns.cl')).build()

    # Three per-work-group scratch buffers, each sized for m float64 values.
    # NOTE(review): the size is tied to float64 rather than a.dtype; this
    # looks like the kernels expect doubles -- confirm.
    scratch_bytes = m * np.dtype('float64').itemsize
    ss_l = cl.LocalMemory(scratch_bytes)
    qk = cl.LocalMemory(scratch_bytes)
    aj_qk = cl.LocalMemory(scratch_bytes)

    n_i32 = np.int32(n)
    column_group = [m, 1]

    # Initial scores over all n columns and the first pivot.
    qr_prg.ss(queue, [m, n], column_group,
              a.data, ss.data, ss_l, np.int32(0), n_i32)
    kpiv = util.argmax(queue, ss)

    for k in range(min(m, n, krank)):
        qr_prg.swap_col(queue, [m, 1], column_group,
                        a.data, ss.data, r.data, ind.data,
                        np.int32(k), np.int32(kpiv), n_i32)

        remaining = n - (k + 1)
        if remaining == 0:
            # Last column was just pivoted in: there is nothing left to
            # process or rescore.  Stopping here avoids zero-sized kernel
            # launches (rejected by OpenCL before 2.1) and an argmax over
            # an empty slice whose result would never be used.
            break

        qr_prg.proj_rm(queue, [m, remaining], column_group,
                       a.data, r.data, ss.data, qk, ss_l, aj_qk,
                       np.int32(k), n_i32)
        qr_prg.ss(queue, [m, remaining], column_group,
                  a.data, ss.data, ss_l, np.int32(k + 1), n_i32)
        # argmax is relative to the slice, so shift back to a column index.
        kpiv = util.argmax(queue, ss[k + 1:]) + k + 1

    # copy r into a for output
    a[:krank, :] = r[:krank, :]