def batch_get_sol_params(x_nd, K_nn, bend_coefs, rot_coef=np.r_[1e-4, 1e-4, 1e-1]):
    """Build one projection and one offset matrix per bending coefficient on the GPU.

    For every ``b`` in ``bend_coefs`` the products ``N (N'HN)^-1 N' Q'``
    (projection) and ``N (N'HN)^-1 N' F`` (offset) are formed with batched
    GPU matrix products; only the small ``N'HN`` inverse is taken on the host.
    All operands are single precision.

    Args:
        x_nd: (n, d) array of points.
        K_nn: kernel matrix written into the trailing n x n block of each H.
        bend_coefs: sequence of bending coefficients; one H is built per entry.
        rot_coef: 1-d sequence placed (via ``np.diag``) on the d x d block of
            each H and, negated, on the matching rows of F.

    Returns:
        ``(proj_mats, offset_mats)``, each being the value returned by
        ``m_dot_batch`` (unpacked elsewhere in this function as an
        ``(arrays, pointers)`` pair).
    """
    n, d = x_nd.shape
    dim = 1 + d + n
    n_coefs = len(bend_coefs)

    # Per-coefficient regulariser: b * K on the kernel block, rot_coef on the
    # linear block.  Q'Q is added to these on the device further down.
    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((dim, dim), np.float32)
        cur_offset[d+1:, d+1:] = b * K_nn
        cur_offset[1:d+1, 1:d+1] = np.diag(rot_coef)
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    # N holds the left singular vectors of A' beyond the first A.shape[0]
    # columns, i.e. a basis of the null space of A when A has full row rank.
    A = np.r_[np.zeros((d+1, d+1)), np.c_[np.ones((n, 1)), x_nd]].T
    n_cnts = A.shape[0]
    _u, _s, _vh = np.linalg.svd(A.T)
    # astype also turns the column slice into a fresh, compact array.
    N = _u[:, n_cnts:].astype(np.float32)

    F = np.zeros((dim, d), np.float32)
    F[1:d+1, :d] -= np.diag(rot_coef)

    Q = np.c_[np.ones((n, 1)), x_nd, K_nn].astype(np.float32)

    # Q, F and N do not depend on the bending coefficient, so every batch
    # entry refers to the same device array.
    Q_gpu = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu] * n_coefs
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu] * n_coefs
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu] * n_coefs
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)

    # Batched Q'Q with H as the output operand.
    # NOTE(review): presumably this accumulates into H (H += Q'Q) -- confirm
    # against dot_batch_nocheck's default scaling of the output.
    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa='T')

    # N'HN
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))

    # Invert on the host; .copy() returns a C-ordered array for the upload.
    iH_arr = [scipy.linalg.inv(NHN.get()).copy() for NHN in NHN_arr_gpu]
    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    proj_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'),
                            (iH_arr_gpu, iH_ptr_gpu, 'N'),
                            (N_arr_gpu, N_ptr_gpu, 'T'),
                            (Q_arr_gpu, Q_ptr_gpu, 'T'))

    offset_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu, N_ptr_gpu, 'T'),
                              (F_arr_gpu, F_ptr_gpu, 'N'))

    return proj_mats, offset_mats
def get_solver(self, x_na, K_nn, bend_coefs, rot_coef=np.r_[1e-4, 1e-4, 1e-1]):
    """Set up a ``TPSSolver`` for ``x_na`` and install it as ``self.cur_solver``.

    Host-side matrices (Q, N, QN, NR) are computed with NumPy; N and QN are
    copied into slices of this object's GPU buffers, and ``N' O_b N`` is
    computed on the GPU for every bending coefficient ``b``.

    Args:
        x_na: (n, d) array of points; ``n`` must not exceed ``self.max_N``.
        K_nn: kernel matrix used for the trailing n x n block of each O_b.
        bend_coefs: bending coefficients; at most ``len(self.bend_coefs)``.
        rot_coef: 1-d sequence placed on the diagonal of the linear block.

    Returns:
        The newly created ``TPSSolver`` (also stored in ``self.cur_solver``).
    """
    n, d = x_na.shape
    assert len(bend_coefs) <= len(self.bend_coefs)
    assert n <= self.max_N

    # Flag the solver handed out previously as no longer valid
    # (presumably because the GPU buffers below are reused -- confirm).
    previous = self.cur_solver
    if previous is not None:
        previous.valid = False

    m = n + d + 1  # side length of the full (offset + linear + kernel) system

    Q = np.c_[np.ones((n, 1)), x_na, K_nn]
    A = np.r_[np.zeros((d + 1, d + 1)), np.c_[np.ones((n, 1)), x_na]].T
    R = np.zeros((m, d))
    R[1:d + 1, :d] = np.diag(rot_coef)

    # Columns of U past the first A.shape[0] ones; copied so the slice
    # becomes its own compact array.
    n_cnts = A.shape[0]
    left_vecs, _s, _vh = np.linalg.svd(A.T)
    N = left_vecs[:, n_cnts:].copy()

    N_gpu = self.N_gpu[:m * n].reshape(m, n)
    N_gpu.set_async(N)

    QN = Q.dot(N)
    QN_gpu = self.QN_gpu[:n * n].reshape(n, n)
    QN_gpu.set_async(QN)

    WQN_gpu = self.WQN_gpu[:n * n].reshape(n, n)
    NHN_gpu = self.NHN_gpu[:n * n].reshape(n, n)

    NR = N.T.dot(R)

    O_gpu, ON_gpu, NON_gpu = [], [], []
    for i, b in enumerate(bend_coefs):
        O_b = np.zeros((m, m), np.float64)
        O_b[d + 1:, d + 1:] += b * K_nn
        O_b[1:d + 1, 1:d + 1] += np.diag(rot_coef)

        # The i-th consecutive chunk of each flat buffer belongs to coefficient i.
        o_view = self.O_gpu[i * m * m:(i + 1) * m * m].reshape(m, m)
        o_view.set(O_b)
        O_gpu.append(o_view)

        ON_gpu.append(self.ON_gpu[i * n * m:(i + 1) * n * m].reshape(m, n))
        NON_gpu.append(self.NON_gpu[i * n * n:(i + 1) * n * n].reshape(n, n))

    # N is shared by every batch entry.
    N_arr_gpu = [N_gpu] * len(bend_coefs)

    O_ptrs = get_gpu_ptrs(O_gpu)
    ON_ptrs = get_gpu_ptrs(ON_gpu)
    NON_ptrs = get_gpu_ptrs(NON_gpu)
    N_ptrs = get_gpu_ptrs(N_arr_gpu)

    # ON = O N, then NON = N' (O N); both calls pass b=0.
    dot_batch_nocheck(O_gpu, N_arr_gpu, ON_gpu,
                      O_ptrs, N_ptrs, ON_ptrs,
                      b=0)
    dot_batch_nocheck(N_arr_gpu, ON_gpu, NON_gpu,
                      N_ptrs, ON_ptrs, NON_ptrs,
                      transa='T', b=0)

    # Key both the device arrays and their host copies by bending coefficient.
    NON_gpu = dict(zip(bend_coefs, NON_gpu))
    NON = {}
    for coef, non in NON_gpu.iteritems():
        NON[coef] = non.get_async()

    self.cur_solver = TPSSolver(bend_coefs, N, QN, NON, NR, x_na, K_nn, rot_coef,
                                QN_gpu, WQN_gpu, NON_gpu, NHN_gpu)
    return self.cur_solver
def check_transform_pts(ctx, i = 0):
    """Debug check: compare the batched GPU point transform of problem ``i`` with NumPy.

    Runs the two batched products that write ``ctx.pts_w`` (points times
    ``lin_dd``, then kernel times ``w_nd``), and compares problem ``i``'s
    result against ``x.dot(lin) + trans + k.dot(w)`` computed on the host.
    On a mismatch it redoes the kernel product in isolation, prints whether
    the retry and a single (non-batched) GPU product agree, asserts that the
    two GPU results match, and waits for keyboard input.

    Args:
        ctx: batch context providing the GPU arrays and pointer arrays used below.
        i: index of the problem within the batch to check.
    """
    import scikits.cuda.linalg as la

    n = ctx.dims[i]

    # Host copies of problem i, trimmed to its first n rows.
    w_nd = ctx.w_nd[i].get()[:n]
    lin_dd = ctx.lin_dd[i].get()
    trans_d = ctx.trans_d[i].get()
    k_nn = ctx.kernels[i].get()[:n, :n].reshape(n, n).copy()
    x_nd = ctx.pts[i].get()[:n]
    xw_nd = ctx.pts_w[i].get()[:n]

    # Stand-alone device copies; only _k_gpu and _w_gpu are read again below.
    _k_gpu = gpuarray.to_gpu(k_nn)
    _x_gpu = gpuarray.to_gpu(x_nd)
    _lin_gpu = gpuarray.to_gpu(lin_dd)
    _trans_gpu = gpuarray.to_gpu(trans_d)
    _w_gpu = gpuarray.to_gpu(w_nd)

    # Affine part.
    # NOTE(review): fill_mat presumably seeds pts_w with the translation rows
    # so that the product below adds x.dot(lin) on top -- confirm.
    fill_mat(ctx.pt_w_ptrs, ctx.trans_d_ptrs, ctx.dims_gpu, ctx.N)
    dot_batch_nocheck(ctx.pts, ctx.lin_dd, ctx.pts_w,
                      ctx.pt_ptrs, ctx.lin_dd_ptrs, ctx.pt_w_ptrs)
    xw_nd = ctx.pts_w[i].get()[:n]
    cpu_xw_nd = np.dot(x_nd, lin_dd) + trans_d[None, :]

    # Kernel part, written to the same output.
    dot_batch_nocheck(ctx.kernels, ctx.w_nd, ctx.pts_w,
                      ctx.kernel_ptrs, ctx.w_nd_ptrs, ctx.pt_w_ptrs)
    xw_nd = ctx.pts_w[i].get()[:n]
    cpu_xw_nd = cpu_xw_nd + np.dot(k_nn, w_nd)

    if np.allclose(xw_nd, cpu_xw_nd):
        return

    # Mismatch: isolate the kernel product as a batch of one and compare it
    # with both the host product and a non-batched GPU product.
    print("k dot w_nd is difference on cpu and gpu")
    k_dot_w = np.dot(k_nn, w_nd)
    k_gpu = [gpuarray.to_gpu(k_nn)]
    w_gpu = [gpuarray.to_gpu(w_nd)]
    res_gpu = [gpuarray.zeros((n, DATA_DIM), np.float32)]
    k_ptrs = get_gpu_ptrs(k_gpu)
    w_ptrs = get_gpu_ptrs(w_gpu)
    res_ptrs = get_gpu_ptrs(res_gpu)
    dot_batch_nocheck(k_gpu, w_gpu, res_gpu, k_ptrs, w_ptrs, res_ptrs)
    res = res_gpu[0].get()

    single_gpu = la.dot(_k_gpu, _w_gpu)
    print("retry success {}".format(np.allclose(res, k_dot_w)))
    print("gpu success {}".format(np.allclose(single_gpu.get(), res)))
    assert np.allclose(single_gpu.get(), res)
    raw_input("go?")
def update_ptrs(self):
    """Rebuild every GPU pointer array from the current GPU arrays.

    Also reallocates the scratch buffers used for bend/warp cost
    computations, re-uploads ``self.dims`` as int32, and sets
    ``self.ptrs_valid`` to True.
    """
    def refresh(pairs):
        # pairs: (pointer attribute, source array-list attribute)
        for ptr_name, src_name in pairs:
            setattr(self, ptr_name, get_gpu_ptrs(getattr(self, src_name)))

    refresh((
        ('tps_param_ptrs', 'tps_params'),
        ('trans_d_ptrs', 'trans_d'),
        ('lin_dd_ptrs', 'lin_dd'),
        ('w_nd_ptrs', 'w_nd'),
    ))

    # Projection/offset matrices are stored per bending coefficient.
    for b in self.bend_coefs:
        self.proj_mat_ptrs[b] = get_gpu_ptrs(self.proj_mats[b])
        self.offset_mat_ptrs[b] = get_gpu_ptrs(self.offset_mats[b])

    refresh((
        ('pt_ptrs', 'pts'),
        ('pt_t_pt_ptrs', 'pt_t_pt'),
        ('kernel_ptrs', 'kernels'),
    ))

    # All N entries point at the first identity matrix.
    shared_ident = [self.ident_mats[0] for _ in range(self.N)]
    self.ident_ptrs = get_gpu_ptrs(shared_ident)

    refresh((
        ('pt_w_ptrs', 'pts_w'),
        ('pt_t_ptrs', 'pts_t'),
        ('corr_cm_ptrs', 'corr_cm'),
        ('corr_rm_ptrs', 'corr_rm'),
        ('r_coef_ptrs', 'r_coefs'),
        ('c_coef_rn_ptrs', 'c_coefs_rn'),
        ('c_coef_cn_ptrs', 'c_coefs_cn'),
    ))

    ## temporary space used for bend/warp cost computations
    self.warp_err = gpuarray.zeros((self.N, MAX_CLD_SIZE), np.float32)
    self.bend_res_mat = gpuarray.zeros((DATA_DIM * self.N, DATA_DIM), np.float32)
    # One DATA_DIM-row slice of bend_res_mat per problem.
    self.bend_res = [self.bend_res_mat[k * DATA_DIM:(k + 1) * DATA_DIM]
                     for k in range(self.N)]
    self.bend_res_ptrs = get_gpu_ptrs(self.bend_res)

    self.dims_gpu = gpuarray.to_gpu(np.array(self.dims, dtype=np.int32))
    self.ptrs_valid = True
def batch_get_sol_params(x_nd, K_nn, bend_coefs, rot_coef):
    """Build one projection and one offset matrix per bending coefficient on the GPU.

    For every ``b`` in ``bend_coefs`` the products ``N (N'HN)^-1 N' Q'``
    (projection) and ``N (N'HN)^-1 N' F`` (offset) are formed with batched
    GPU matrix products; only the small ``N'HN`` inverse is taken on the host.
    All operands are double precision.

    Args:
        x_nd: (n, d) array of points.
        K_nn: kernel matrix written into the trailing n x n block of each H.
        bend_coefs: sequence of bending coefficients; one H is built per entry.
        rot_coef: 1-d sequence placed (via ``np.diag``) on the d x d block of
            each H and on the matching rows of F.

    Returns:
        ``(proj_mats, offset_mats)``, each being the value returned by
        ``m_dot_batch`` (unpacked elsewhere in this function as an
        ``(arrays, pointers)`` pair).
    """
    n, d = x_nd.shape
    dim = 1 + d + n
    n_coefs = len(bend_coefs)

    # Per-coefficient regulariser: b * K on the kernel block, rot_coef on the
    # linear block.  Q'Q is added to these on the device further down.
    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((dim, dim), np.float64)
        cur_offset[d + 1:, d + 1:] = b * K_nn
        cur_offset[1:d + 1, 1:d + 1] = np.diag(rot_coef)
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    # N holds the left singular vectors of A' beyond the first A.shape[0]
    # columns, i.e. a basis of the null space of A when A has full row rank.
    A = np.r_[np.zeros((d + 1, d + 1)), np.c_[np.ones((n, 1)), x_nd]].T
    n_cnts = A.shape[0]
    _u, _s, _vh = np.linalg.svd(A.T)
    # astype always copies here, turning the column slice into a fresh,
    # compact array before it is uploaded.
    N = _u[:, n_cnts:].astype(np.float64)

    F = np.zeros((dim, d), np.float64)
    F[1:d + 1, :d] += np.diag(rot_coef)

    Q = np.c_[np.ones((n, 1)), x_nd, K_nn].astype(np.float64)

    # Q, F and N do not depend on the bending coefficient, so every batch
    # entry refers to the same device array.
    Q_gpu = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu] * n_coefs
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu] * n_coefs
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu] * n_coefs
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)

    # Batched Q'Q with H as the output operand.
    # NOTE(review): presumably this accumulates into H (H += Q'Q) -- confirm
    # against dot_batch_nocheck's default scaling of the output.
    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa='T')

    # N'HN
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))

    # Invert on the host; .copy() returns a C-ordered array for the upload.
    iH_arr = [scipy.linalg.inv(NHN.get()).copy() for NHN in NHN_arr_gpu]
    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    proj_mats = m_dot_batch(
        (N_arr_gpu, N_ptr_gpu, 'N'),
        (iH_arr_gpu, iH_ptr_gpu, 'N'),
        (N_arr_gpu, N_ptr_gpu, 'T'),
        (Q_arr_gpu, Q_ptr_gpu, 'T'))

    offset_mats = m_dot_batch(
        (N_arr_gpu, N_ptr_gpu, 'N'),
        (iH_arr_gpu, iH_ptr_gpu, 'N'),
        (N_arr_gpu, N_ptr_gpu, 'T'),
        (F_arr_gpu, F_ptr_gpu, 'N'))

    return proj_mats, offset_mats
def test_batch_get_sol_params(f, bend_coefs, rot_coef, atol=1e-7, index=0):
    """Check the batched GPU solution-parameter pipeline against NumPy/SciPy.

    Repeats the steps of ``batch_get_sol_params`` on the GPU and, after each
    stage (``N'HN``, its inverse, the projection matrices, the offset
    matrices), asserts agreement with a host-side reference for every
    bending coefficient.

    Args:
        f: mapping of segments (indexed via ``f.items()``); the chosen entry
            must provide ``['inv']['DS_SIZE_<DS_SIZE>']`` with datasets
            ``'scaled_cloud_xyz'`` and ``'scaled_K_nn'``.
        bend_coefs: sequence of bending coefficients to test.
        rot_coef: scalar, or 1-d sequence with one entry per point dimension.
        atol: absolute tolerance passed to ``np.allclose``.
        index: position of the segment in ``f`` to load.

    Raises:
        AssertionError: if any GPU stage disagrees with the reference.
    """
    # list(...) so that positional indexing also works when items() is a view.
    seg_info = list(f.items())[index][1]
    inv_group = seg_info['inv']
    ds_key = 'DS_SIZE_{}'.format(DS_SIZE)
    x_nd = inv_group[ds_key]['scaled_cloud_xyz'][:]
    K_nn = inv_group[ds_key]['scaled_K_nn'][:]

    n, d = x_nd.shape
    dim = 1 + d + n
    n_coefs = len(bend_coefs)

    # Normalise rot_coef once so the GPU inputs and the host reference are
    # built from the same diagonal (np.diag rejects a bare scalar).
    rot_coefs = np.ones(d) * rot_coef if np.isscalar(rot_coef) else rot_coef
    rot_diag = np.diag(rot_coefs)

    # ---- GPU inputs ------------------------------------------------------
    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((dim, dim), np.float64)
        cur_offset[d + 1:, d + 1:] = b * K_nn
        cur_offset[1:d + 1, 1:d + 1] = rot_diag
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    A = np.r_[np.zeros((d + 1, d + 1)), np.c_[np.ones((n, 1)), x_nd]].T
    n_cnts = A.shape[0]
    _u, _s, _vh = np.linalg.svd(A.T)
    # astype also turns the column slice into a fresh, compact array.
    N = _u[:, n_cnts:].astype(np.float64)

    F = np.zeros((dim, d), np.float64)
    F[1:d + 1, :d] += rot_diag

    Q = np.c_[np.ones((n, 1)), x_nd, K_nn].astype(np.float64)

    Q_gpu = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu] * n_coefs
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu] * n_coefs
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu] * n_coefs
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)

    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa='T')

    # ---- host reference for H -------------------------------------------
    QTQ = Q.T.dot(Q)
    H_list = []
    for bend_coef in bend_coefs:
        # Work on a copy: += on QTQ itself would accumulate the terms of all
        # coefficients into one shared array.
        H = QTQ.copy()
        H[d + 1:, d + 1:] += bend_coef * K_nn
        H[1:d + 1, 1:d + 1] += rot_diag
        H_list.append(H)

    # ---- N'HN --------------------------------------------------------------
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))

    NHN_list = [N.T.dot(H.dot(N)) for H in H_list]
    for i, NHN in enumerate(NHN_list):
        assert (np.allclose(NHN, NHN_arr_gpu[i].get(), atol=atol))

    # ---- inverse -----------------------------------------------------------
    iH_arr = [scipy.linalg.inv(NHN.get()).copy() for NHN in NHN_arr_gpu]
    h_inv_list = [scipy.linalg.inv(NHN) for NHN in NHN_list]
    assert (np.allclose(iH_arr, h_inv_list, atol=atol))

    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    # ---- projection matrices ---------------------------------------------
    proj_mats = m_dot_batch(
        (N_arr_gpu, N_ptr_gpu, 'N'),
        (iH_arr_gpu, iH_ptr_gpu, 'N'),
        (N_arr_gpu, N_ptr_gpu, 'T'),
        (Q_arr_gpu, Q_ptr_gpu, 'T'))

    proj_mats_list = [N.dot(h_inv.dot(N.T.dot(Q.T))) for h_inv in h_inv_list]
    # Compare coefficient by coefficient; ``index`` selects a segment of
    # ``f`` and is not a position in the per-coefficient result list.
    for expected, gpu_mat in zip(proj_mats_list, proj_mats[0]):
        assert (np.allclose(expected, gpu_mat.get(), atol=atol))

    # ---- offset matrices -------------------------------------------------
    offset_mats = m_dot_batch(
        (N_arr_gpu, N_ptr_gpu, 'N'),
        (iH_arr_gpu, iH_ptr_gpu, 'N'),
        (N_arr_gpu, N_ptr_gpu, 'T'),
        (F_arr_gpu, F_ptr_gpu, 'N'))

    offset_mats_list = [N.dot(h_inv.dot(N.T.dot(F))) for h_inv in h_inv_list]
    for expected, gpu_mat in zip(offset_mats_list, offset_mats[0]):
        assert (np.allclose(expected, gpu_mat.get(), atol=atol))
def test_batch_get_sol_params(f, bend_coefs, rot_coef, atol=1e-7, index=0):
    """Validate the batched GPU solution parameters against a host reference.

    The GPU side repeats the steps of ``batch_get_sol_params``; after each
    stage (``N'HN``, its inverse, projection matrices, offset matrices) the
    result for every bending coefficient is compared with NumPy/SciPy.

    Args:
        f: mapping of segments (indexed via ``f.items()``); the chosen entry
            must provide ``['inv']['DS_SIZE_<DS_SIZE>']`` with datasets
            ``'scaled_cloud_xyz'`` and ``'scaled_K_nn'``.
        bend_coefs: sequence of bending coefficients to test.
        rot_coef: scalar, or 1-d sequence with one entry per point dimension.
        atol: absolute tolerance passed to ``np.allclose``.
        index: position of the segment in ``f`` to load.

    Raises:
        AssertionError: if any GPU stage disagrees with the reference.
    """
    # list(...) so that positional indexing also works when items() is a view.
    seg_info = list(f.items())[index][1]
    inv_group = seg_info['inv']
    ds_key = 'DS_SIZE_{}'.format(DS_SIZE)
    x_nd = inv_group[ds_key]['scaled_cloud_xyz'][:]
    K_nn = inv_group[ds_key]['scaled_K_nn'][:]

    n, d = x_nd.shape
    dim = 1 + d + n
    n_coefs = len(bend_coefs)

    # One normalised diagonal shared by the GPU inputs and the reference;
    # np.diag would reject a bare scalar rot_coef.
    rot_coefs = np.ones(d) * rot_coef if np.isscalar(rot_coef) else rot_coef
    rot_diag = np.diag(rot_coefs)

    # GPU inputs: per-coefficient offset matrices that Q'Q is added to below.
    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((dim, dim), np.float64)
        cur_offset[d+1:, d+1:] = b * K_nn
        cur_offset[1:d+1, 1:d+1] = rot_diag
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    A = np.r_[np.zeros((d+1, d+1)), np.c_[np.ones((n, 1)), x_nd]].T
    n_cnts = A.shape[0]
    _u, _s, _vh = np.linalg.svd(A.T)
    # astype also turns the column slice into a fresh, compact array.
    N = _u[:, n_cnts:].astype(np.float64)

    F = np.zeros((dim, d), np.float64)
    F[1:d+1, :d] += rot_diag

    Q = np.c_[np.ones((n, 1)), x_nd, K_nn].astype(np.float64)

    Q_gpu = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu] * n_coefs
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu] * n_coefs
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu] * n_coefs
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)

    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa='T')

    # Host reference for H.
    QTQ = Q.T.dot(Q)
    H_list = []
    for bend_coef in bend_coefs:
        # Copy first: in-place += on QTQ would leave every list entry
        # pointing at one array holding the sum over all coefficients.
        H = QTQ.copy()
        H[d+1:, d+1:] += bend_coef * K_nn
        H[1:d+1, 1:d+1] += rot_diag
        H_list.append(H)

    # N'HN
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))

    NHN_list = [N.T.dot(H.dot(N)) for H in H_list]
    for i, NHN in enumerate(NHN_list):
        assert(np.allclose(NHN, NHN_arr_gpu[i].get(), atol=atol))

    # Inverse of N'HN, computed on the host for both sides of the comparison.
    iH_arr = [scipy.linalg.inv(NHN.get()).copy() for NHN in NHN_arr_gpu]
    h_inv_list = [scipy.linalg.inv(NHN) for NHN in NHN_list]
    assert(np.allclose(iH_arr, h_inv_list, atol=atol))

    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    # Projection matrices.
    proj_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'),
                            (iH_arr_gpu, iH_ptr_gpu, 'N'),
                            (N_arr_gpu, N_ptr_gpu, 'T'),
                            (Q_arr_gpu, Q_ptr_gpu, 'T'))

    proj_mats_list = [N.dot(h_inv.dot(N.T.dot(Q.T))) for h_inv in h_inv_list]
    # Pair results up by bending coefficient; ``index`` picks a segment of
    # ``f`` and must not be used to index the per-coefficient results.
    for expected, gpu_mat in zip(proj_mats_list, proj_mats[0]):
        assert(np.allclose(expected, gpu_mat.get(), atol=atol))

    # Offset matrices.
    offset_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu, N_ptr_gpu, 'T'),
                              (F_arr_gpu, F_ptr_gpu, 'N'))

    offset_mats_list = [N.dot(h_inv.dot(N.T.dot(F))) for h_inv in h_inv_list]
    for expected, gpu_mat in zip(offset_mats_list, offset_mats[0]):
        assert(np.allclose(expected, gpu_mat.get(), atol=atol))