def better_rec(self, w, model, s=1, weights=1, damp_z=1):
    """Blend a camera-frame model with 2D measurements and rotate it back.

    Every matrix in ``model`` is left-multiplied by ``self.cam``.  The first
    two rows of each result are replaced by the weighted average of the
    projected rows (weight ``s``) and the measurements ``w`` (weight
    ``weights``), the third row is scaled by ``damp_z``, and the stack is
    finally left-multiplied by ``self.cam.T``.

    Args:
        w: measurements, broadcastable against the first two rows of the
            projected model.
        model: stack of matrices, one per frame.
        s: weight given to the projected model rows.
        weights: weight given to the measurements.
        damp_z: multiplier applied to the third row of the projection.

    Returns:
        The blended stack after multiplication by ``self.cam.T`` (a single
        array; the original docstring's "a, r and scale" did not match the
        code).
    """
    # np.matmul broadcasts over the leading axis just like the old
    # numpy.core.umath_tests.matrix_multiply gufunc, but is public API.
    proj = np.matmul(self.cam[np.newaxis], model)
    proj[:, :2] = (proj[:, :2] * s + w * weights) / (s + weights)
    proj[:, 2] *= damp_z
    return np.matmul(self.cam.T[np.newaxis], proj)
def _map_params_to_P_zero(params, params_type, initial, params_slice, filler,
                          boo, cholesky_of_P_zero, square_root_filters):
    """Write the P_zero entries of ``params`` into ``initial``.

    ``filler`` is scratch space: it is zeroed, the selected parameters are
    scattered into the positions flagged by ``boo``, and the result is
    converted between the covariance and transposed-Cholesky form depending
    on how the parameters are stored and on ``square_root_filters``.
    """
    # scatter the relevant parameters into the scratch array
    filler[:] = 0
    filler[boo] = params[params_slice]

    params_hold_cholesky = (
        params_type == 'short' or cholesky_of_P_zero is True)

    if params_hold_cholesky:
        if square_root_filters is False:
            # transposed Cholesky factor -> covariance matrix
            filler = matrix_multiply(
                np.transpose(filler, axes=(0, 2, 1)), filler)
    else:
        # Only the upper triangle of each symmetric covariance matrix was
        # filled above; mirror it into the lower triangle.
        for mat in filler:
            mat += (mat - np.diag(np.diagonal(mat))).T

        if square_root_filters is True:
            # covariance matrix -> transposed Cholesky factor
            filler = np.transpose(cholesky(filler), axes=(0, 2, 1))

    if square_root_filters is False:
        initial[:] = filler
    else:
        initial[:, :, 1:, 1:] = filler
def test_gufunc_new_axis(self):
    """A 2-D operand must broadcast against a stack like an explicit tile."""

    @guvectorize([void(float64[:, :], float64[:, :], float64[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    stack = np.random.randn(10, 3, 3)
    single = np.random.randn(3, 3)
    expected = ut.matrix_multiply(stack, single)

    # implicit broadcasting of the 2-D operand
    broadcast_result = matmulcore(stack, single)
    np.testing.assert_allclose(expected, broadcast_result)

    # same computation with the operand tiled explicitly
    tiled_result = matmulcore(stack, np.tile(single, (10, 1, 1)))
    np.testing.assert_allclose(expected, tiled_result)
def test_gufunc_auto_transfer(self):
    """Call the CUDA gufunc with one host array and one device array."""

    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    matmulcore.max_blocksize = 512

    batch = 2
    host_a = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    host_b = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    # only the second operand lives on the device
    device_b = cuda.to_device(host_b)
    result = matmulcore(host_a, device_b).copy_to_host()

    expected = ut.matrix_multiply(host_a, host_b)
    self.assertTrue(np.allclose(result, expected))
def test_gufunc(self):
    """Compare a CUDA matmul gufunc with the reference on 1001 matrices."""

    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    matmulcore.max_blocksize = 512

    # odd on purpose, to exercise thread/block division in CUDA
    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = matmulcore(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def test_gufunc_stream(self):
    """Run the gufunc on a CUDA stream, time it, and check the result."""
    # odd on purpose, to exercise thread/block division in CUDA
    matrix_ct = 1001
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(matrix_ct, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(matrix_ct, 4, 5)

    # device side: transfers, kernel and copy-back all share one stream
    started = time()
    stream = cuda.stream()
    d_lhs = cuda.to_device(lhs, stream)
    d_rhs = cuda.to_device(rhs, stream)
    d_out = cuda.device_array(shape=(matrix_ct, 2, 5), dtype=lhs.dtype,
                              stream=stream)
    d_out = gufunc(d_lhs, d_rhs, out=d_out, stream=stream)
    result = d_out.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - started

    # host reference
    started = time()
    expected = ut.matrix_multiply(lhs, rhs)
    tcpu = time() - started

    stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(result, expected))
def get_gradient_by_agent(self, beta, data, depm):
    """Return the per-observation gradient ``(depm - p) @ data``.

    Utilities and probabilities are computed through ``self.upc_sequence``
    using ``self.resources``; the residual ``depm - p`` of each observation
    is then multiplied with that observation's data matrix.

    Args:
        beta: coefficients handed to ``compute_utilities``.
        data: 3-D array unpacked as ``(nobs, alts, nvars)``.
        depm: array the probabilities are subtracted from; indexed as 2-D.

    Returns:
        Array of shape ``(nobs, nvars)``.  Unlike the previous ``squeeze``
        based version, the shape is kept when ``nobs`` or ``nvars`` is 1,
        matching the original summation form
        ``(d[..., newaxis] * data).sum(axis=1)``.
    """
    import numpy as np

    # Unpacking also enforces that data is 3-dimensional.
    nobs, alts, nvars = data.shape
    self.upc_sequence.compute_utilities(data, beta, self.resources)
    p = self.upc_sequence.compute_probabilities(self.resources)
    d = depm - p
    # one row vector per observation times that observation's data matrix
    g = np.matmul(d[:, np.newaxis, :], data)
    # Drop only the axis inserted above; squeeze() would also remove
    # legitimate length-1 observation/variable axes.
    return g[:, 0, :]
def check_matmul_gufunc(self, gufunc):
    """Assert that ``gufunc`` agrees with the reference matrix multiply."""
    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def check_matmul_gufunc(self, gufunc):
    """Assert ``gufunc`` matches the reference within explicit tolerances."""
    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    np.testing.assert_allclose(result, expected, rtol=1e-5, atol=1e-8)
def build_and_rot_model(a, e, s0, r):
    """Build the model and rotate it by the identified rotation matrices.

    The rotations come from ``Prob3dPose.upgrade_r(r.T)`` with the last two
    axes swapped; the model comes from ``Prob3dPose.build_model(a, e, s0)``.

    Returns:
        The stack of rotated models.
    """
    # Function-scope import: np.matmul replaces matrix_multiply from
    # numpy.core.umath_tests, which is not available in current NumPy.
    import numpy as np

    r2 = Prob3dPose.upgrade_r(r.T).transpose((0, 2, 1))
    mod = Prob3dPose.build_model(a, e, s0)
    return np.matmul(r2, mod)
def transform_points_with_homography(H, _xys):
    """Apply a homography to a set of points.

    Args:
        H (ndarray[float64_t, ndim=2]): homography/perspective matrix
        _xys (ndarray[ndim=2]): (N x 2) array

    Returns:
        The points after lifting to homogeneous coordinates, multiplying by
        ``H`` and dropping the homogeneous coordinate again.
    """
    homogeneous = add_homogenous_coordinate(_xys)
    transformed = matrix_multiply(H, homogeneous)
    return remove_homogenous_coordinate(transformed)
def test_gufunc_hidim(self):
    """Check the gufunc with two leading loop dimensions (4 x 25)."""
    gufunc = _get_matmulcore_gufunc(max_blocksize=512)

    # 100 matrices laid out as a 4 x 25 grid
    matrix_ct = 100
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(4, 25, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(4, 25, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def x_to_y(matFn, date, vecs, reverse=False):
    """Transform an (n, 3) array of vectors with a date-dependent matrix.

    ``matFn`` is called with ``date2es(date)`` and must return the matrix to
    apply; with ``reverse=True`` its transpose is applied instead.  The
    result has the same shape as ``vecs``.
    """
    vecs = np.asarray(vecs)
    assert vecs.ndim == 2
    assert vecs.shape[1] == 3

    mat = matFn(date2es(date))
    if reverse:
        mat = mat.T

    # treat every vector as a column so the matrix applies to each of them
    columns = vecs[..., np.newaxis]
    return matrix_multiply(mat, columns).reshape(vecs.shape)
def compare_matrix_multiply_results(self, tp):
    """Check ``umt.matrix_multiply`` against a sum-of-products reference.

    Two random ``(2, 3, 4)`` arrays of dtype ``tp`` are transposed and
    sliced in every combination produced by the helpers below; each pair
    with matching core dimensions and broadcastable loop dimension is
    multiplied and compared.  The test also verifies that every operand was
    a view (has a ``base``), i.e. that non-contiguous inputs were exercised.
    """
    d1 = np.array(rand(2, 3, 4), dtype=tp)
    d2 = np.array(rand(2, 3, 4), dtype=tp)
    msg = "matrix multiply on type %s" % d1.dtype.name

    def permute_n(n):
        # all orderings of range(n), as a tuple of lists
        if n == 1:
            return ([0], )
        ret = ()
        base = permute_n(n - 1)
        for perm in base:
            for i in range(n):
                new = perm + [n - 1]
                new[n - 1] = new[i]
                new[i] = n - 1
                ret += (new, )
        return ret

    def slice_n(n):
        # every combination of "full axis" / "first element only" per axis
        if n == 0:
            return ((), )
        ret = ()
        base = slice_n(n - 1)
        for sl in base:
            ret += (sl + (slice(None), ), )
            ret += (sl + (slice(0, 1), ), )
        return ret

    def broadcastable(s1, s2):
        return s1 == s2 or s1 == 1 or s2 == 1

    permute_3 = permute_n(3)
    slice_3 = slice_n(3) + ((slice(None, None, -1), ) * 3, )

    ref = True
    for p1 in permute_3:
        for p2 in permute_3:
            for s1 in slice_3:
                for s2 in slice_3:
                    a1 = d1.transpose(p1)[s1]
                    a2 = d2.transpose(p2)[s2]
                    # identity test: `!= None` would compare elementwise
                    # when base is an ndarray
                    ref = ref and a1.base is not None
                    ref = ref and a2.base is not None
                    # Core dimensions must match exactly; only the loop
                    # dimension may broadcast.
                    if (a1.shape[-1] == a2.shape[-2] and
                            broadcastable(a1.shape[0], a2.shape[0])):
                        assert_array_almost_equal(
                            umt.matrix_multiply(a1, a2),
                            np.sum(a2[..., np.newaxis].swapaxes(-3, -1) *
                                   a1[..., np.newaxis, :], axis=-1),
                            err_msg=msg + ' %s %s' % (str(a1.shape),
                                                      str(a2.shape)))

    assert_equal(ref, True, err_msg="reference check")
def x_to_y(matFn, date, vecs, reverse=False):
    """Apply the matrix returned by ``matFn`` for ``date`` to 3-vectors.

    ``vecs`` is converted to an array and must be 2-D with three columns.
    ``matFn`` receives ``date2es(date)``; its result is transposed first
    when ``reverse`` is true.  The output keeps the shape of ``vecs``.
    """
    vecs = np.asarray(vecs)
    assert vecs.ndim == 2
    assert vecs.shape[1] == 3

    ephemeris_time = date2es(date)
    forward = matFn(ephemeris_time)
    mat = forward.T if reverse else forward

    rotated = matrix_multiply(mat, vecs[..., np.newaxis])
    return rotated.reshape(vecs.shape)
def test_gufunc_adjust_blocksize(self):
    """Lower ``gufunc.max_blocksize`` to 32 and re-check the result."""
    gufunc.max_blocksize = 32

    # odd on purpose, to exercise thread/block division in CUDA
    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def calculate_wij(X, mu, pi):
    """Return normalised weights and the unnormalised kernel values.

    ``mu`` is reshaped to ``(-1, 1, 3)`` and subtracted from ``X`` with an
    extra axis inserted, giving one difference vector per (sample, centre)
    pair.  ``cov`` holds ``exp(-0.5 * squared_norm)`` of those differences;
    ``wij`` is ``cov * pi`` normalised so that each row sums to one.

    Returns:
        Tuple ``(wij, cov)``, both of shape ``(sub.shape[0], sub.shape[1])``.
    """
    centres = np.reshape(mu, [-1, 1, 3])
    sub = X[:, None, :, :] - centres[None, :, :, :]
    n_rows, n_cols = sub.shape[0], sub.shape[1]

    # one row vector per pair; row @ row.T is its squared norm
    rows = np.reshape(sub, [n_rows * n_cols, 1, -1])
    squared_norm = matrix_multiply(rows, rows.transpose([0, 2, 1]))
    cov = np.exp(-(1 / 2.) * squared_norm)
    cov = np.reshape(cov, [n_rows, n_cols])

    numerator = cov * pi
    row_totals = np.sum(cov * pi, axis=1)
    wij = numerator / row_totals[:, None]
    return wij, cov
def compare_matrix_multiply_results(self, tp):
    """Compare ``umt.matrix_multiply`` with an explicit sum of products.

    Random ``(2, 3, 4)`` operands of dtype ``tp`` are run through every
    axis permutation and slicing pattern generated below.  Pairs whose core
    dimensions agree and whose leading dimension broadcasts are multiplied
    and checked; finally the test asserts that all operands were views.
    """
    d1 = np.array(np.random.rand(2, 3, 4), dtype=tp)
    d2 = np.array(np.random.rand(2, 3, 4), dtype=tp)
    msg = "matrix multiply on type %s" % d1.dtype.name

    def permute_n(n):
        # orderings of range(n): extend each shorter ordering with n-1 and
        # swap that new entry into every position
        if n == 1:
            return ([0],)
        perms = ()
        for shorter in permute_n(n - 1):
            for pos in range(n):
                candidate = shorter + [n - 1]
                candidate[n - 1] = candidate[pos]
                candidate[pos] = n - 1
                perms += (candidate,)
        return perms

    def slice_n(n):
        # per axis: either the whole axis or only its first element
        if n == 0:
            return ((),)
        combos = ()
        for prefix in slice_n(n - 1):
            combos += (prefix + (slice(None),),)
            combos += (prefix + (slice(0, 1),),)
        return combos

    def broadcastable(s1, s2):
        return s1 == s2 or s1 == 1 or s2 == 1

    permute_3 = permute_n(3)
    reversed_all = (slice(None, None, -1),) * 3
    slice_3 = slice_n(3) + (reversed_all,)

    ref = True
    for p1 in permute_3:
        for p2 in permute_3:
            for s1 in slice_3:
                for s2 in slice_3:
                    a1 = d1.transpose(p1)[s1]
                    a2 = d2.transpose(p2)[s2]
                    ref = (ref and a1.base is not None
                           and a2.base is not None)
                    core_match = a1.shape[-1] == a2.shape[-2]
                    if core_match and broadcastable(a1.shape[0],
                                                    a2.shape[0]):
                        expected = np.sum(
                            a2[..., np.newaxis].swapaxes(-3, -1) *
                            a1[..., np.newaxis, :], axis=-1)
                        assert_array_almost_equal(
                            umt.matrix_multiply(a1, a2),
                            expected,
                            err_msg=msg + ' %s %s' % (str(a1.shape),
                                                      str(a2.shape)))

    assert_equal(ref, True, err_msg="reference check")
def average(self):
    """Return the average of a one-dimensional set of quaternions.

    The sum of the outer products ``q q^T`` over all quaternions is
    diagonalised; the eigenvector for which ``sum(1 - (q_i . v)**2)`` is
    smallest is returned wrapped in ``Quaternions``.

    Raises:
        NotImplementedError: if ``self.shape`` is not one-dimensional.
    """
    if len(self.shape) != 1:
        raise NotImplementedError('Cannot average multi-dimensionsal Quaternions')

    # Outer product of each quaternion with itself.  The contracted
    # dimension has length 1, so a broadcast product is exactly what
    # numpy.core.umath_tests.matrix_multiply computed, without relying on
    # that removed private module.
    outer = self.qs[:, :, np.newaxis] * self.qs[:, np.newaxis, :]
    system = outer.sum(axis=0)
    _, v = np.linalg.eigh(system)
    # dot product of every quaternion with every eigenvector
    qiT_dot_qref = (self.qs[:, :, np.newaxis] * v[np.newaxis, :, :]).sum(axis=1)
    best = np.argmin((1. - qiT_dot_qref**2).sum(axis=0))
    return Quaternions(v[:, best])
def test_gufunc(self):
    """Build a CPU gufunc from ``matmulcore`` and compare with the reference."""
    builder = GUVectorize(matmulcore, '(m,n),(n,p)->(m,p)', target='cpu')
    builder.add(argtypes=[float32[:, :], float32[:, :], float32[:, :]])
    gufunc = builder.build_ufunc()

    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def test_gufunc_small(self):
    """Check the gufunc on a batch of only two matrices."""
    gufunc = _get_matmulcore_gufunc(max_blocksize=512)

    batch = 2
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def backward(self, downstream_gradient):
    """Propagate a gradient through the saved probabilities.

    For every row ``p`` of the saved tensor the Jacobian
    ``p[:, None] * (I - p[None, :])`` is built and multiplied with the
    matching row of ``downstream_gradient``.

    Args:
        downstream_gradient: gradient with the same shape as the saved
            probabilities.

    Returns:
        Array with the same shape as ``downstream_gradient``, including
        when the batch holds a single row.
    """
    probs, = self.saved_tensors
    n_shape = probs.shape[-1]
    jacobian = probs[..., :, np.newaxis] * (np.eye(n_shape) - probs[..., np.newaxis, :])
    # Each gradient row becomes a column vector so the batched product is
    # jacobian[i] @ dL[i].  The jacobian is used without a transpose, as in
    # the original code.
    product = np.matmul(jacobian, downstream_gradient[..., np.newaxis])
    # Remove only the trailing column axis; squeeze() would also collapse a
    # batch of size one.
    return product[..., 0]
def gmm(k, xs, tol=1e-6, max_iter=200):
    """Vectorized version of GMM. Faster than above but still rough.

    Means and initial labels come from ``initialization.kmeanspp``; the
    covariances start as identity matrices.  EM iterations run until the
    log likelihood changes by less than ``tol`` or ``max_iter`` is reached.

    Returns:
        For each row of ``xs``, the index of the component with the largest
        responsibility.
    """
    n, p = xs.shape
    mus, z = initialization.kmeanspp(k, xs, ret='both')
    pis = np.array([len(np.where(z == i)[0]) / n for i in np.unique(z)])
    sigmas = np.array([np.eye(p)] * k)

    ll_old = 0
    for _ in range(max_iter):
        # E-step, ws are responsibilities
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = pis[j] * multivariate_normal(mus[j], sigmas[j]).pdf(xs)
        ws /= ws.sum(0)

        # M-step
        pis = ws.sum(axis=1)
        pis /= n

        mus = np.dot(ws, xs)
        mus /= ws.sum(1)[:, None]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = xs - mus[j, :]
            # per-sample outer product; the contracted dimension has
            # length 1, so broadcasting gives the same values as a batched
            # matrix product
            outer = ys[:, :, None] * ys[:, None, :]
            sigmas[j] = (ws[j, :, None, None] * outer).sum(axis=0)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # update complete log likelihood
        ll_new = 0
        for pi, mu, sigma in zip(pis, mus, sigmas):
            ll_new += pi * multivariate_normal(mu, sigma).pdf(xs)
        ll_new = np.log(ll_new).sum()

        # convergence test
        if np.abs(ll_new - ll_old) < tol:
            break
        ll_old = ll_new

    z = ws.T
    labels = np.argmax(z, axis=1)
    return labels
def test_gufunc_new_axis(self):
    """A 2-D operand must broadcast against a stack like an explicit tile."""
    gufunc = _get_matmulcore_gufunc(dtype=float64)

    stack = np.random.randn(10, 3, 3)
    single = np.random.randn(3, 3)
    expected = ut.matrix_multiply(stack, single)

    # implicit broadcasting of the 2-D operand
    broadcast_result = gufunc(stack, single)
    np.testing.assert_allclose(expected, broadcast_result)

    # same computation with the operand tiled explicitly
    tiled_result = gufunc(stack, np.tile(single, (10, 1, 1)))
    np.testing.assert_allclose(expected, tiled_result)
def gmm(k, xs, tol=1e-6, max_iter=200):
    """Vectorized version of GMM. Faster than above but still rough.

    Starts from ``initialization.kmeanspp`` (means and labels) with identity
    covariances and alternates E- and M-steps until the log likelihood
    moves by less than ``tol`` or ``max_iter`` iterations have run.

    Returns:
        The index of the most responsible component for every row of ``xs``.
    """
    n, p = xs.shape
    mus, z = initialization.kmeanspp(k, xs, ret='both')
    pis = np.array([len(np.where(z==i)[0])/n for i in np.unique(z)])
    sigmas = np.array([np.eye(p)]*k)

    ll_old = 0
    for _ in range(max_iter):
        # E-step, ws are responsibilities
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j, :] = pis[j]*multivariate_normal(mus[j], sigmas[j]).pdf(xs)
        ws /= ws.sum(0)

        # M-step
        pis = ws.sum(axis=1)
        pis /= n

        mus = np.dot(ws, xs)
        mus /= ws.sum(1)[:, None]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            ys = xs - mus[j, :]
            # outer product of every centred sample with itself; with a
            # contracted dimension of 1 this equals the batched matrix
            # product used before
            outer = ys[:, :, None]*ys[:, None, :]
            sigmas[j] = (ws[j, :, None, None]*outer).sum(axis=0)
        sigmas /= ws.sum(axis=1)[:, None, None]

        # update complete log likelihood
        ll_new = 0
        for pi, mu, sigma in zip(pis, mus, sigmas):
            ll_new += pi*multivariate_normal(mu, sigma).pdf(xs)
        ll_new = np.log(ll_new).sum()

        # convergence test
        if np.abs(ll_new - ll_old) < tol:
            break
        ll_old = ll_new

    z = ws.T
    labels = np.argmax(z, axis=1)
    return labels
def test_gufunc(self):
    """Build the gufunc for ``self.target`` and compare with the reference."""
    signature = (float32[:, :], float32[:, :], float32[:, :])
    builder = GUVectorize(matmulcore, '(m,n),(n,p)->(m,p)',
                          target=self.target)
    builder.add(signature)
    gufunc = builder.build_ufunc()

    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def flux_down(self, fluxDownTop, emission=None):
    '''Compute downwelling radiative flux at interfaces between layers.

    Inputs:
        fluxDownTop: flux down at top
        emission: emission from atmospheric levels (N)
                  defaults to zero if not given

    Returns:
        vector of downwelling radiative flux between levels (N+1)
        element 0 is the flux down to the surface.
    '''
    if emission is None:
        emission = np.zeros_like(self.absorptivity)
    # emission from every level followed by the flux entering at the top
    top = np.atleast_1d(fluxDownTop)
    sources = np.concatenate((emission, top), axis=-1)
    # matrix-vector product along the last axes, then drop unit axes
    column = sources[..., np.newaxis]
    return np.squeeze(matrix_multiply(self.Tdown, column))
def test_gufunc_auto_transfer(self):
    """Call the gufunc with one host array and one device array."""
    gufunc = _get_matmulcore_gufunc(max_blocksize=512)

    batch = 2
    host_a = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    host_b = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    # only the second operand lives on the device
    device_b = cuda.to_device(host_b)
    result = gufunc(host_a, device_b).copy_to_host()

    expected = ut.matrix_multiply(host_a, host_b)
    self.assertTrue(np.allclose(result, expected))
def test_cpu_guvectorize(self):
    """Wrap ``matmulcore`` with ``guvectorize`` for the CPU target and check it."""
    signatures = [void(float32[:,:], float32[:,:], float32[:,:])]
    decorate = guvectorize(signatures, '(m,n),(n,p)->(m,p)', target='cpu')
    gufunc = decorate(matmulcore)

    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(result, expected))
def flux_up(self, fluxUpBottom, emission=None):
    '''Compute upwelling radiative flux at interfaces between layers.

    Inputs:
        fluxUpBottom: flux up from bottom
        emission: emission from atmospheric levels (N)
                  defaults to zero if not given

    Returns:
        vector of upwelling radiative flux between levels (N+1)
        element N is the flux up to space.
    '''
    if emission is None:
        emission = np.zeros_like(self.absorptivity)
    # flux entering at the bottom followed by emission from every level
    bottom = np.atleast_1d(fluxUpBottom)
    sources = np.concatenate((bottom, emission), axis=-1)
    # matrix-vector product along the last axes, then drop unit axes
    column = sources[..., np.newaxis]
    return np.squeeze(matrix_multiply(self.Tup, column))
def flux_up(self, fluxUpBottom, emission=None):
    '''Compute upwelling radiative flux at interfaces between layers.

    Inputs:
        fluxUpBottom: flux up from bottom
        emission: emission from atmospheric levels (N)
                  defaults to zero if not given

    Returns:
        vector of upwelling radiative flux between levels (N+1)
        element N is the flux up to space.
    '''
    level_emission = emission
    if level_emission is None:
        level_emission = np.zeros_like(self.absorptivity)
    # source vector: bottom boundary flux first, then the level emissions
    E = np.concatenate((np.atleast_1d(fluxUpBottom), level_emission),
                       axis=-1)
    # dot product (matrix multiplication) along last axes
    flux = matrix_multiply(self.Tup, E[..., np.newaxis])
    return np.squeeze(flux)
def test_gufunc_array_expressions():
    """Build a gufunc from ``array_expr_gufunc`` and require an exact match."""
    builder = GUVectorize(array_expr_gufunc, '(m,n),(n,p)->(m,p)')
    builder.add(argtypes=[float_[:, :], float_[:, :], float_[:, :]])
    gufunc = builder.build_ufunc()

    batch = 10
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)

    if not (result != expected).any():
        return
    # show both arrays before failing
    print(result)
    print(expected)
    raise ValueError
def transform_params_for_P_zero(params_for_P_zero, filler, boo,
                                estimate_cholesky_of_P_zero, direction):
    """Convert P_zero parameters between their short and long form.

    ``filler`` is zeroed and used as scratch space.  When the Cholesky
    factor itself is estimated the parameters are returned untouched.
    Otherwise they are scattered into ``filler`` at the positions flagged
    by ``boo`` and either multiplied out (``'short_to_long'``) or
    symmetrised and Cholesky-factorised (any other direction); the entries
    at ``boo`` of the result are returned.
    """
    filler[:] = 0
    if estimate_cholesky_of_P_zero is True:
        return params_for_P_zero

    filler[boo] = params_for_P_zero

    if direction == 'short_to_long':
        transposed = np.transpose(filler, axes=(0, 2, 1))
        product = matrix_multiply(transposed, filler)
        return product[boo]

    # mirror the filled triangle so every matrix is symmetric
    for mat in filler:
        mat += (mat - np.diag(np.diagonal(mat))).T
    factor_t = np.transpose(cholesky(filler), axes=(0, 2, 1))
    return factor_t[boo]
def test_gufunc_array_expressions():
    """Check the array-expression gufunc for exact equality with the reference."""
    builder = GUVectorize(array_expr_gufunc, '(m,n),(n,p)->(m,p)')
    builder.add(argtypes=[float_[:,:], float_[:,:], float_[:,:]])
    gufunc = builder.build_ufunc()

    matrix_ct = 10
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(matrix_ct, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(matrix_ct, 4, 5)

    actual = gufunc(lhs, rhs)
    gold = ut.matrix_multiply(lhs, rhs)

    differs = (actual != gold).any()
    if differs:
        # dump both arrays to help diagnose the mismatch
        print(actual)
        print(gold)
        raise ValueError
def test_gufunc_hidim(self):
    """Time and check the gufunc with two leading loop dimensions (4 x 25)."""
    # 100 matrices laid out as a 4 x 25 grid
    matrix_ct = 100
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(4, 25, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(4, 25, 4, 5)

    started = time()
    result = gufunc(lhs, rhs)
    tcuda = time() - started

    started = time()
    expected = ut.matrix_multiply(lhs, rhs)
    tcpu = time() - started

    non_stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(result, expected))
def transform_params_for_P_zero(params_for_P_zero, filler, boo,
                                estimate_cholesky_of_P_zero, direction):
    """Map P_zero parameters from one parametrisation to the other.

    The scratch array ``filler`` is reset to zero first.  If the Cholesky
    factor is what gets estimated, the input is returned as is.  For
    ``direction == 'short_to_long'`` the parameters are placed in
    ``filler`` (mask ``boo``) and ``filler^T filler`` is formed; for any
    other direction the matrices are symmetrised and replaced by their
    transposed Cholesky factor.  The masked entries are returned.
    """
    filler[:] = 0

    if estimate_cholesky_of_P_zero is True:
        return params_for_P_zero

    to_long = direction == 'short_to_long'
    filler[boo] = params_for_P_zero

    if to_long:
        result = matrix_multiply(
            np.transpose(filler, axes=(0, 2, 1)), filler)
    else:
        # copy the filled triangle across the diagonal
        for idx in range(len(filler)):
            off_diagonal = filler[idx] - np.diag(np.diagonal(filler[idx]))
            filler[idx] += off_diagonal.T
        result = np.transpose(cholesky(filler), axes=(0, 2, 1))

    return result[boo]
def kpts_matrix(kpts):
    """Return the ``invV``, ``V`` and conic ``Z`` matrices of the keypoints.

    The keypoints are given in invA format:
        invV = perdoch.invA
        V    = perdoch.A
        Z    = perdoch.E

    Returns:
        Tuple ``(invV, V, Z)`` where ``invV`` comes from ``kpts_to_invV``,
        ``V`` is the list of its matrix inverses and ``Z`` is the stack of
        ``V.T.dot(V)`` with shape ``(len(kpts), 3, 3)``.
    """
    nKp = len(kpts)
    # invert into V
    invV = kpts_to_invV(kpts)
    V = [np.linalg.inv(v) for v in invV]
    assert len(V) == (nKp)
    # transform into conic matrix Z = (V.T).dot(V)
    # A list is built explicitly: on Python 3 map() is lazy and array()
    # would wrap the iterator in a 0-d object array.
    Vt = array([np.transpose(v) for v in V])
    Z = matrix_multiply(Vt, V)
    assert Z.shape == (nKp, 3, 3)
    return invV, V, Z
def _test_gufunc(backend, target):
    """Build a gufunc from ``matmulcore`` and compare it with the reference.

    ``backend`` and ``target`` are accepted but not used by this body.
    """
    builder = GUVectorize(matmulcore, '(m,n),(n,p)->(m,p)')
    builder.add(argtypes=[f4[:,:], f4[:,:], f4[:,:]])
    gufunc = builder.build_ufunc()

    # odd on purpose, to exercise thread/block division in CUDA
    batch = 1001
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    result = gufunc(lhs, rhs)
    expected = ut.matrix_multiply(lhs, rhs)

    assert np.allclose(result, expected)
def _test_gufunc(backend, target):
    """Compare a gufunc built from ``matmulcore`` with the reference product.

    Neither ``backend`` nor ``target`` is read inside this function.
    """
    core_type = f4[:, :]
    builder = GUVectorize(matmulcore, '(m,n),(n,p)->(m,p)')
    builder.add(argtypes=[core_type, core_type, core_type])
    gufunc = builder.build_ufunc()

    # odd on purpose, to exercise thread/block division in CUDA
    matrix_ct = 1001
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(matrix_ct, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(matrix_ct, 4, 5)

    actual = gufunc(lhs, rhs)
    gold = ut.matrix_multiply(lhs, rhs)

    assert np.allclose(actual, gold)
def reestimate_a(w, e, s0, rot, camera_r=False, Lambda=False):
    """Reestimate a from rest shape, rotation, and basis coefficients
    as a least squares problem.

    The solution minimises
        ||W_i - P.dot(camera_r).dot(Mat(rot_i)).dot(s + a_i.e)||^2_2
        + Lambda**2.dot(a**2)
    for each frame i, where **2 is the elementwise square.

    Args:
        w: measurements; first axis is frames, last axis is points.
        e: basis shapes; first axis is the number of basis elements.
        s0: rest shape.
        rot: rotations, passed transposed to ``upgrade_r``.
        camera_r: camera matrix, or ``False`` to defer to
            ``reestimate_a_old``.
        Lambda: regularisation weights, or ``False`` for none.

    Returns:
        ``(a, residue)``: coefficients of shape ``(frames, basis)`` and the
        per-frame least-squares residual (NaN where ``lstsq`` reports
        none).  With ``camera_r=False`` the result of ``reestimate_a_old``
        is returned unchanged.
    """
    if camera_r is False:
        return reestimate_a_old(w, e, s0, rot)

    basis = e.shape[0]
    frames = w.shape[0]
    points = w.shape[-1]

    # co-ordinate frame is xzy
    # write P as short for the projection P.dot(camera_r).dot(Mat(rot_i))
    mat_r = upgrade_r(rot.T).transpose(0, 2, 1)
    P = matrix_multiply(camera_r[np.newaxis, :2], mat_r)

    # For each frame project the shape and subtract that from the
    # measurement matrix
    res = w - P.dot(s0)
    res = res.reshape(frames, points * 2)
    if Lambda is not False:
        res = np.hstack((res, np.zeros((frames, basis))))

    # compute the rotated e basis for each frame
    # output is frames, basis, 2, points
    # input is frames, 2, 3 + basis, 3, points
    re = np.einsum('ilk,jkp', P, e).reshape(frames, basis, 2 * points)
    if Lambda is not False:
        # append the regularisation block; allocated only when needed
        re2 = np.empty((frames, basis, 2 * points + basis))
        re2[:, :, :2 * points] = re
        re2[:, :, 2 * points:] = np.diag(Lambda)
        re = re2

    # Now solve for ||res - a.dot(re)||^2_2
    a = np.full((frames, basis), np.nan)
    residue = np.empty(frames)
    for i in range(frames):
        a[i], b, _, _ = np.linalg.lstsq(re[i].T, res[i])
        # lstsq returns an empty residual array for rank-deficient or
        # under-determined systems; record NaN instead of failing.
        residue[i] = b[0] if np.size(b) else np.nan
    return a, residue
def test_gufunc(self, target='cpu'):
    """Build the gufunc for ``self.target`` and compare with the reference.

    The ``target`` argument is not read; the build uses ``self.target``.
    """
    arg_types = (float32[:, :], float32[:, :], float32[:, :])
    builder = GUVectorize(matmulcore, '(m,n),(n,p)->(m,p)',
                          target=self.target)
    builder.add(arg_types)
    gufunc = builder.build_ufunc()

    matrix_ct = 1001
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(matrix_ct, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(matrix_ct, 4, 5)

    actual = gufunc(lhs, rhs)
    gold = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(actual, gold))
def test_cpu_guvectorize(self):
    """Apply ``guvectorize`` with the CPU target to ``matmulcore`` and check it."""
    core = float32[:, :]
    gufunc = guvectorize([void(core, core, core)],
                         '(m,n),(n,p)->(m,p)',
                         target='cpu')(matmulcore)

    matrix_ct = 1001
    lhs = np.arange(matrix_ct * 2 * 4,
                    dtype=np.float32).reshape(matrix_ct, 2, 4)
    rhs = np.arange(matrix_ct * 4 * 5,
                    dtype=np.float32).reshape(matrix_ct, 4, 5)

    actual = gufunc(lhs, rhs)
    gold = ut.matrix_multiply(lhs, rhs)
    self.assertTrue(np.allclose(actual, gold))
def test_gufunc_stream(self):
    """Run a CUDA matmul gufunc on a stream, time it, and check the result."""

    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    matmulcore.max_blocksize = 512

    # odd on purpose, to exercise thread/block division in CUDA
    matrix_ct = 1001
    A = np.arange(matrix_ct * 2 * 4,
                  dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5,
                  dtype=np.float32).reshape(matrix_ct, 4, 5)

    # device side: transfers, kernel and copy-back all share one stream
    started = time()
    stream = cuda.stream()
    d_a = cuda.to_device(A, stream)
    d_b = cuda.to_device(B, stream)
    d_c = cuda.device_array(shape=(matrix_ct, 2, 5), dtype=A.dtype,
                            stream=stream)
    d_c = matmulcore(d_a, d_b, out=d_c, stream=stream)
    result = d_c.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - started

    # host reference
    started = time()
    expected = ut.matrix_multiply(A, B)
    tcpu = time() - started

    stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(result, expected))
def _calculateCameraToPixelDirection(self, el, az):
    """Rotate per-pixel direction vectors by the calibration latitude/longitude.

    ``el`` and ``az`` are 2-D arrays in degrees.  Unit vectors are built
    with ``spherical_to_cartesian`` and rotated first about ``Y`` by
    ``90 - lat`` and then about ``Z`` by ``-lon``.

    Returns:
        Array of shape ``(el.shape[0], el.shape[1], 3)``.
    """
    el = np.deg2rad(el)
    az = np.deg2rad(-(az - 180))
    n_rows, n_cols = el.shape[0], el.shape[1]

    x, y, z = spherical_to_cartesian(1, el, az)
    vecs = np.dstack((x, y, z)).reshape(n_rows * n_cols, 3)

    # A simple spherical latitude rotation works here because the latitude
    # is the geodetic latitude, i.e. the angle between the normal and the
    # equatorial plane.
    lat_angle = np.deg2rad(90 - self._calData.lat)
    lon_angle = np.deg2rad(-self._calData.lon)
    matLat = rotation_matrix(lat_angle, Y)[:3, :3]
    matLon = rotation_matrix(lon_angle, Z)[:3, :3]
    # rotate latitude first, then longitude
    mat = np.dot(matLon, matLat)

    rotated = matrix_multiply(mat, vecs[..., np.newaxis])
    return rotated.reshape(n_rows, n_cols, 3)
def setup_(weight):
    """Create random inputs and the expected result for an update test.

    Returns:
        ``(to_update, update_with, expected_result)``: 20 upper-triangular
        4x4 matrices, 20 vectors of length 4, and the transposed Cholesky
        factor of ``to_update^T to_update + weight * outer(update_with)``.
    """
    n_mats, dim = 20, 4
    to_update = np.zeros((n_mats, dim, dim))
    upper = np.zeros((dim, dim), dtype=bool)
    upper[np.triu_indices(dim)] = True
    # fill the upper triangle of every matrix with values around 100
    for mat in to_update:
        mat[upper] = np.random.randn(10) + 100

    pos_def_arr = matrix_multiply(
        np.transpose(to_update, axes=(0, 2, 1)), to_update)

    update_with = np.random.uniform(size=(n_mats, dim))
    as_columns = update_with.reshape(n_mats, dim, 1)
    as_rows = update_with.reshape(n_mats, 1, dim)
    outer_prod = as_columns * as_rows

    updated = pos_def_arr + weight * outer_prod
    expected_result = np.transpose(cholesky(updated), axes=(0, 2, 1))
    return to_update, update_with, expected_result
def test_gufunc_small(self):
    """Time and check the gufunc on a batch of only two matrices."""
    batch = 2
    lhs = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    rhs = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    started = time()
    result = gufunc(lhs, rhs)
    tcuda = time() - started

    started = time()
    expected = ut.matrix_multiply(lhs, rhs)
    tcpu = time() - started

    non_stream_speedups.append(tcpu / tcuda)

    print(result, expected)
    self.assertTrue(np.allclose(result, expected))
def boostrap(i):
    """Run one bootstrap replicate seeded with ``i``.

    A new ``Scms`` is fitted either to the data plus Gaussian noise scaled
    by ``self.adaptive_bw`` (smooth bootstrap) or to a resample of the
    data.  Density, gradient and Hessian are evaluated at
    ``self.landmarks`` and projected on the landmark gradient and Hessian
    eigenvectors.

    Returns:
        ``(bp, bg, bh, gproj, hproj)``.
    """
    np.random.seed(i)

    if smooth_bootstrap:
        noisy = (self.data +
                 np.random.randn(*self.data.shape) * self.adaptive_bw)
        b = Scms(noisy, self.bw, min_radius=self.min_radius)
        if copy_bw:
            b.adaptive_bw = self.adaptive_bw
    else:
        bdata, bi = bootstrap_resample(self.data)
        b = Scms(bdata, self.bw, min_radius=self.min_radius)
        if copy_bw:
            # keep the bandwidths of the resampled rows
            b.adaptive_bw = self.adaptive_bw[bi]

    if method in ('LocInv', 'GradientLogP'):
        bh, bp, bg, _ = b._nlocal_inv_cov(self.landmarks)
    else:
        bp, bg, bh = b._kernel_density_estimate(self.landmarks)

    g_norm = np.linalg.norm(self.landmarks_g, axis=1)
    gproj = np.sum(self.landmarks_g * bg, axis=1) / g_norm

    rotated = matrix_multiply(bh.transpose((0, 2, 1)),
                              self.landmarks_h_eigvecs)
    hproj = np.sum(rotated * self.landmarks_h_eigvecs, axis=1)
    return bp, bg, bh, gproj, hproj
def test_gufunc_auto_transfer(self):
    """Time and check the CUDA gufunc with mixed host/device operands."""

    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    matmulcore.max_blocksize = 512

    batch = 2
    host_a = np.arange(batch * 2 * 4, dtype=np.float32).reshape(batch, 2, 4)
    host_b = np.arange(batch * 4 * 5, dtype=np.float32).reshape(batch, 4, 5)

    # only the second operand lives on the device
    device_b = cuda.to_device(host_b)

    started = time()
    result = matmulcore(host_a, device_b).copy_to_host()
    tcuda = time() - started

    started = time()
    expected = ut.matrix_multiply(host_a, host_b)
    tcpu = time() - started

    non_stream_speedups.append(tcpu / tcuda)

    print(result, expected)
    self.assertTrue(np.allclose(result, expected))
def rotatePole(lats, lons, altitude, angle=90, axis=[1, 0, 0]):
    """
    Rotates the given geodetic lat/lon coordinates around the origin.

    The coordinates are converted to ECEF, rotated by ``angle`` about
    ``axis`` and converted back to geodetic latitude/longitude.

    :param lats, lons: shape (n,) in radians
    :param altitude: in km
    :param angle: degrees
    :param axis: [1, 0, 0], [0, 1, 0], or [0, 0, 1] for x y z axis
    :rtype: tuple (lats, lons) in radians
    """
    assert lats.ndim == 1 and lons.ndim == 1
    assert len(axis) == 3

    x, y, z = geodetic2Ecef(lats, lons, altitude, wgs84A, wgs84B)
    # one ECEF position per row
    xyz = np.asarray([x, y, z]).T

    rot = rotation_matrix(np.deg2rad(angle), axis)[:3, :3]
    columns = xyz[..., np.newaxis]
    xyzRot = matrix_multiply(rot, columns).reshape(xyz.shape)

    x_rot, y_rot, z_rot = xyzRot[:, 0], xyzRot[:, 1], xyzRot[:, 2]
    lats, lons = ecef2Geodetic(x_rot, y_rot, z_rot, wgs84A, wgs84B)
    return lats, lons
def test_gufunc_hidim(self):
    """Check a CUDA matmul gufunc with two leading loop dimensions (4 x 25)."""

    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)',
                 target='cuda')
    def matmulcore(lhs, rhs, out):
        rows, inner = lhs.shape
        inner, cols = rhs.shape
        for r in range(rows):
            for c in range(cols):
                out[r, c] = 0
                for t in range(inner):
                    out[r, c] += lhs[r, t] * rhs[t, c]

    matmulcore.max_blocksize = 512

    # 100 matrices laid out as a 4 x 25 grid
    matrix_ct = 100
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(4, 25, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(4, 25, 4, 5)

    result = matmulcore(A, B)
    expected = ut.matrix_multiply(A, B)
    self.assertTrue(np.allclose(result, expected))
def EM(x, k, omega, mu, sigma, maxIteration, tolerance=0.01):
    """Fit a Gaussian mixture to ``x`` with expectation-maximisation.

    Args:
        x: data of shape ``(n, m)``.
        k: number of mixture components.
        omega: initial mixture weights, indexable by component.
        mu: initial means, indexable by component.
        sigma: initial covariance matrices, indexable by component.
        maxIteration: maximum number of EM iterations.
        tolerance: stop once the log likelihood changes by less than this.

    Returns:
        ``(loglikeN, omega, mu, sigma)``: final log likelihood, weights of
        shape ``(k,)``, means ``(k, m)`` and covariances ``(k, m, m)``.
        With ``maxIteration == 0`` the inputs are returned with a log
        likelihood of 0.
    """
    n, m = x.shape  # dimension of x (n, m)
    loglike0 = 0
    loglikeN = loglike0
    for _ in range(maxIteration):
        # E-step: responsibilities of every component for every sample
        P = np.zeros((k, n))
        for j in range(k):
            P[j, :] = omega[j] * multivariate_normal(mu[j], sigma[j]).pdf(x)
        P /= P.sum(0)

        # M-step
        omega = P.sum(axis=1)
        omega /= n
        mu = np.dot(P, x)
        mu /= P.sum(1)[:, None]
        sigma = np.zeros((k, m, m))
        for j in range(k):
            y = x - mu[j, :]
            # per-sample outer product; the contracted dimension has
            # length 1, so broadcasting equals the batched matrix product
            outer = y[:, :, None] * y[:, None, :]
            sigma[j] = (P[j, :, None, None] * outer).sum(axis=0)
        sigma /= P.sum(axis=1)[:, None, None]

        # Update complete log likelihood.  The loop variables must not
        # reuse the names omega/mu/sigma, otherwise the parameter arrays
        # are replaced by the last component's values.
        mixture = 0
        for weight_j, mu_j, sigma_j in zip(omega, mu, sigma):
            mixture += weight_j * multivariate_normal(mu_j, sigma_j).pdf(x)
        loglikeN = np.log(mixture).sum()

        if np.abs(loglikeN - loglike0) < tolerance:
            break
        loglike0 = loglikeN
    return loglikeN, omega, mu, sigma