def run_test_matmul_ab_dtype_shape(self, shape, k, dtype,
                                   axes_a=None, axes_b=None, transpose=False):
    # TODO: Allow testing separate transpose_a, transpose_b
    ashape = shape[:-2] + (shape[-2], k)
    bshape = shape[:-2] + (k, shape[-1])
    a = (np.random.random(size=ashape) * 127).astype(dtype)
    b = (np.random.random(size=bshape) * 127).astype(dtype)
    if axes_a is None:
        axes_a = list(range(len(ashape)))
    if axes_b is None:
        axes_b = list(range(len(bshape)))
    aa = a.transpose(axes_a)
    bb = b.transpose(axes_b)
    if transpose:
        aa, bb = H(bb), H(aa)
    c_gold = np.matmul(aa, bb)
    a = bf.asarray(a, space='cuda')
    b = bf.asarray(b, space='cuda')
    aa = a.transpose(axes_a)
    bb = b.transpose(axes_b)
    if transpose:
        aa, bb = H(bb), H(aa)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, aa, bb, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
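# Assumed context for this excerpt (a sketch, not the original file's header):
# the tests here reference np, bf, time, RTOL, ATOL, self.linalg, and an H()
# helper. The definitions below are one plausible set consistent with how they
# are used; the exact tolerance values are assumptions.
import time
import numpy as np
import bifrost as bf
from bifrost.linalg import LinAlg

RTOL = 1e-4  # assumed relative tolerance
ATOL = 1e-5  # assumed absolute tolerance

def H(c):
    """Hermitian (conjugate) transpose of the last two axes."""
    return np.swapaxes(c, -1, -2).conj()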
def run_test_matmul_aa_dtype_shape(self, shape, dtype):
    a = (np.random.random(size=shape) * 127).astype(dtype)
    c_gold = np.matmul(a, np.swapaxes(a, -1, -2).conj())
    triu = np.triu_indices(shape[-2], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def test_setitem(self):
    g = bf.zeros_like(self.known_vals, space='cuda')
    g[...] = self.known_vals
    np.testing.assert_equal(g.copy('system'), self.known_vals)
    g[:1, 1:] = [[999]]
    np.testing.assert_equal(g.copy('system'), np.array([[0, 999], [2, 3], [4, 5]]))
    g[0, 0] = 888
    np.testing.assert_equal(g.copy('system'), np.array([[888, 999], [2, 3], [4, 5]]))
    g[0] = [99, 88]
    np.testing.assert_equal(g.copy('system'), np.array([[99, 88], [2, 3], [4, 5]]))
    g[:, 1] = [77, 66, 55]
    np.testing.assert_equal(g.copy('system'), np.array([[99, 77], [2, 66], [4, 55]]))
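# The two ndarray tests in this excerpt (test_setitem above and test_zeros_like
# below) rely on fixtures not shown here. The value of self.known_vals follows
# directly from the assertions in test_setitem; the dtype of self.known_array
# is an assumption. A minimal sketch of the setUp they imply:
def setUp(self):
    self.known_vals = [[0, 1], [2, 3], [4, 5]]
    self.known_array = np.array(self.known_vals, dtype=np.float32)  # dtype assumed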
def run_test_matmul_aa_ci8_shape(self, shape):
    shape_complex = shape[:-1] + (shape[-1] * 2,)
    a8 = (np.random.random(size=shape_complex) * 255).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    a = a8.view(bf.DataType.ci8)
    # Note: np.matmul seems to be slow and inaccurate when there are batch dims
    c_gold = np.matmul(a_gold, np.swapaxes(a_gold, -1, -2).conj())
    triu = np.triu_indices(shape[-2], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
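# Minimal standalone illustration (not part of the test suite) of the int8 ->
# complex packing used above: consecutive int8 values are treated as (re, im)
# pairs, widened to float32, then reinterpreted as complex64.
def _ci8_packing_demo():
    pairs = np.array([1, 2, 3, 4], dtype=np.int8)
    cplx = pairs.astype(np.float32).view(np.complex64)
    assert np.array_equal(cplx, np.array([1 + 2j, 3 + 4j], dtype=np.complex64))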
def run_test_matmul_ab_ci8_shape(self, shape, k, transpose=False):
    ashape_complex = shape[:-2] + (shape[-2], k * 2)
    bshape_complex = shape[:-2] + (k, shape[-1] * 2)
    a8 = (np.random.random(size=ashape_complex) * 255).astype(np.int8)
    b8 = (np.random.random(size=bshape_complex) * 255).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    b_gold = b8.astype(np.float32).view(np.complex64)
    if transpose:
        a_gold, b_gold = H(b_gold), H(a_gold)
    c_gold = np.matmul(a_gold, b_gold)
    a = a8.view(bf.DataType.ci8)
    b = b8.view(bf.DataType.ci8)
    a = bf.asarray(a, space='cuda')
    b = bf.asarray(b, space='cuda')
    if transpose:
        a, b = H(b), H(a)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, b, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
# Disabled: beamformer-kernel test (kept for reference in a triple-quoted block).
'''
def run_test_matmul_ab_beamformer_kernel(self, ntime, nbeam, nstand, nchan):
    x_shape = (ntime, nchan, nstand * 2)
    w_shape = (nbeam, nchan, nstand * 2)
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    w = (((np.random.random(size=w_shape + (2,)) * 2 - 1) * 127)
         .astype(np.int8).astype(np.float32).view(np.complex64).reshape(w_shape))
    b_gold = np.matmul(w.transpose(1, 0, 2), x.transpose(1, 2, 0))
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    w = bf.asarray(w, space='cuda')
    b = bf.zeros_like(b_gold, space='cuda')
    self.linalg.matmul(1, w.transpose(1, 0, 2), x.transpose(1, 2, 0), 0, b)
    b_ = b.copy('system')
    np.testing.assert_allclose(b_, b_gold, RTOL, ATOL)
'''
def run_test_matmul_aa_ci8_shape(self, shape, transpose=False):
    # **TODO: This currently never triggers the transpose path in the backend
    shape_complex = shape[:-1] + (shape[-1] * 2,)
    # Note: The xGPU-like correlation kernel does not support input values of
    #       -128 (only [-127:127])
    a8 = ((np.random.random(size=shape_complex) * 2 - 1) * 127).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    if transpose:
        a_gold = H(a_gold)
    # Note: np.matmul seems to be slow and inaccurate when there are batch dims
    c_gold = np.matmul(a_gold, H(a_gold))
    triu = np.triu_indices(shape[-2] if not transpose else shape[-1], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = a8.view(bf.DataType.ci8)
    a = bf.asarray(a, space='cuda')
    if transpose:
        a = H(a)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def run_test_matmul_aa_dtype_shape(self, shape, dtype, axes=None, conj=False):
    a = (np.random.random(size=shape) * 127).astype(dtype)
    if axes is None:
        axes = list(range(len(shape)))
    aa = a.transpose(axes)
    if conj:
        aa = aa.conj()
    c_gold = np.matmul(aa, H(aa))
    triu = np.triu_indices(shape[axes[-2]], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    aa = a.transpose(axes)
    if conj:
        aa = aa.conj()
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, aa, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def run_test_matmul_aa_correlator_kernel(self, ntime, nstand, nchan, misalign=0):
    x_shape = (ntime, nchan, nstand * 2)
    perm = [1, 0, 2]
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    x = x.transpose(perm)
    x = x[..., misalign:]
    b_gold = np.matmul(H(x), x)
    triu = np.triu_indices(x.shape[-1], 1)
    b_gold[..., triu[0], triu[1]] = 0
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    x = x.transpose(perm)
    x = x[..., misalign:]
    b = bf.zeros_like(b_gold, space='cuda')
    self.linalg.matmul(1, None, x, 0, b)
    b = b.copy('system')
    np.testing.assert_allclose(b, b_gold, RTOL * 10, ATOL)
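# Hypothetical driver showing how the runner above might be parameterized to
# cover several misalignments; the shapes here are illustrative and not taken
# from the original suite.
def test_matmul_aa_correlator_kernel_misaligned(self):
    for misalign in range(4):
        self.run_test_matmul_aa_correlator_kernel(ntime=100, nstand=16,
                                                  nchan=1, misalign=misalign)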
def run_benchmark_matmul_aa_correlator_kernel(self, ntime, nstand, nchan):
    x_shape = (ntime, nchan, nstand * 2)
    perm = [1, 0, 2]
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    x = x.transpose(perm)
    b_gold = np.matmul(H(x[:, [0], :]), x[:, [0], :])
    triu = np.triu_indices(x_shape[-1], 1)
    b_gold[..., triu[0], triu[1]] = 0
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    x = x.transpose(perm)
    b = bf.zeros_like(b_gold, space='cuda')
    bf.device.stream_synchronize()
    t0 = time.time()
    nrep = 200
    for _ in range(nrep):
        self.linalg.matmul(1, None, x, 0, b)
    bf.device.stream_synchronize()
    dt = time.time() - t0
    nflop = nrep * nchan * ntime * nstand * (nstand + 1) / 2 * 2 * 2 * 8
    print(nstand, '\t', nflop / dt / 1e9, 'GFLOP/s')
    print('\t\t', nrep * ntime * nchan / dt / 1e6, 'MHz')
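# Worked example of the FLOP accounting above (assumed interpretation): only
# the lower triangle of the output is computed, counted as nstand*(nstand+1)/2
# station pairs x 2x2 polarization products, with 8 real FLOPs per complex
# multiply-accumulate. For nrep=1, nchan=1, ntime=1, nstand=256:
#   nflop = 256 * 257 / 2 * 2 * 2 * 8 = 1,052,672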
def test_zeros_like(self):
    g = bf.ndarray(self.known_vals, dtype='f32', space='cuda')
    g = bf.zeros_like(g)
    g = g.copy('system')
    known = np.zeros_like(self.known_array)
    np.testing.assert_equal(g, known)