def reduce(idata, odata, op='sum'):
    if op not in REDUCE_MAP:
        raise ValueError("Invalid reduce op: " + str(op))
    op = REDUCE_MAP[op]
    _check(_bf.bfReduce(asarray(idata).as_BFarray(),
                        asarray(odata).as_BFarray(),
                        op))
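# A minimal usage sketch (hedged; assumes a CUDA-enabled bifrost build).
# The reduction factor is implied by the input/output shapes, as the tests
# below exercise: here a (4, 100) array is summed down to (4, 1).
import numpy as np
import bifrost as bf

a_in = bf.asarray(np.random.rand(4, 100).astype(np.float32), space='cuda')
b_out = bf.empty((4, 1), np.float32, space='cuda')
bf.reduce(a_in, b_out, 'sum')
print(b_out.copy('system'))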
def run_test(self, ntime, nchan, max_delay, batch_shape=()):
    fdmt = Fdmt()
    f0 = 1000.
    bw = 400.
    df = bw / nchan
    exponent = -2.0
    fdmt.init(nchan, max_delay, f0, df, exponent, 'cuda')
    ishape = batch_shape + (nchan, ntime)
    oshape = batch_shape + (max_delay, ntime)
    idata = bf.asarray(np.random.normal(size=ishape).astype(np.float32),
                       space='cuda')
    odata1 = bf.asarray(-999 * np.ones(oshape, np.float32), space='cuda')
    fdmt.execute(idata, odata1)
    odata1 = odata1.copy('system')
    if max_delay > 1:
        self.assertEqual(odata1.min(), -999) # TODO: Need better tests
    self.assertLess(odata1.max(), 100.)
    odata2 = bf.asarray(-999 * np.ones(oshape, np.float32), space='cuda')
    workspace_size = fdmt.get_workspace_size(idata, odata2)
    workspace = bf.asarray(np.empty(workspace_size, np.uint8), space='cuda')
    workspace_ptr = workspace.ctypes.data
    fdmt.execute_workspace(idata, odata2, workspace_ptr, workspace_size)
    odata2 = odata2.copy('system')
    np.testing.assert_equal(odata1, odata2)
def run_test_matmul_ab_dtype_shape(self, shape, k, dtype,
                                   axes_a=None, axes_b=None,
                                   transpose=False):
    # TODO: Allow testing separate transpose_a, transpose_b
    ashape = shape[:-2] + (shape[-2], k)
    bshape = shape[:-2] + (k, shape[-1])
    a = (np.random.random(size=ashape) * 127).astype(dtype)
    b = (np.random.random(size=bshape) * 127).astype(dtype)
    if axes_a is None:
        axes_a = range(len(ashape))
    if axes_b is None:
        axes_b = range(len(bshape))
    aa = a.transpose(axes_a)
    bb = b.transpose(axes_b)
    if transpose:
        aa, bb = H(bb), H(aa)
    c_gold = np.matmul(aa, bb)
    a = bf.asarray(a, space='cuda')
    b = bf.asarray(b, space='cuda')
    aa = a.transpose(axes_a)
    bb = b.transpose(axes_b)
    if transpose:
        aa, bb = H(bb), H(aa)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, aa, bb, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
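# The matmul tests use an H() helper for the Hermitian (conjugate) transpose
# that is defined elsewhere in the test module. A minimal sketch of what it
# presumably does:
import numpy as np

def H(c):
    # Swap the last two axes and conjugate: c^H over any leading batch dims
    return np.swapaxes(c, -1, -2).conj()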
def test_fdmt(self):
    fdmt = Fdmt()
    ntime = 1024
    nchan = 128
    max_delay = 200
    f0 = 1000.
    bw = 400.
    df = bw / nchan
    exponent = -2.0
    fdmt.init(nchan, max_delay, f0, df, exponent, 'cuda')
    idata = bf.asarray(np.random.normal(size=(nchan, ntime)).astype(np.float32),
                       space='cuda')
    odata1 = bf.asarray(-999 * np.ones((max_delay, ntime), np.float32),
                        space='cuda')
    fdmt.execute(idata, odata1)
    odata1 = odata1.copy('system')
    self.assertEqual(odata1.min(), -999) # TODO: Need better tests
    self.assertLess(odata1.max(), 100.)
    odata2 = bf.asarray(-999 * np.ones((max_delay, ntime), np.float32),
                        space='cuda')
    workspace_size = fdmt.get_workspace_size(idata, odata2)
    self.assertEqual(workspace_size, 3293184)
    workspace = bf.asarray(np.empty(workspace_size, np.uint8), space='cuda')
    workspace_ptr = workspace.ctypes.data
    fdmt.execute_workspace(idata, odata2, workspace_ptr, workspace_size)
    odata2 = odata2.copy('system')
    np.testing.assert_equal(odata1, odata2)
def run_complex_reduce_test(self, shape, axis, n, op='sum', dtype=np.complex64):
    a = ((np.random.random(size=shape) * 2 - 1) * 127).astype(np.int8).astype(dtype) \
        + 1j * ((np.random.random(size=shape) * 2 - 1) * 127).astype(np.int8).astype(dtype)
    if op[:3] == 'pwr':
        b_gold = pwrscrunch(a.astype(np.complex64), n, axis,
                            NP_OPS[op[3:]]).astype(np.float32)
    else:
        b_gold = scrunch(a.astype(np.complex64), n, axis, NP_OPS[op])
    a = bf.asarray(a, space='cuda')
    b = bf.empty_like(b_gold, space='cuda')
    bf.reduce(a, b, op)
    b = b.copy('system')
    np.testing.assert_allclose(b, b_gold,
                               rtol=1e-3 if op[:3] == 'pwr' else 1e-7)
def test_polarisation_products(self):
    n = 89
    real = np.random.randint(-127, 128, size=(n, 2)).astype(np.float32)
    imag = np.random.randint(-127, 128, size=(n, 2)).astype(np.float32)
    a = real + 1j * imag
    a_orig = a
    a = bf.asarray(a, space='cuda')
    b = bf.empty_like(a)
    for _ in range(3):
        bf.map('''
        auto x = a(_,0);
        auto y = a(_,1);
        b(_,0).assign(x.mag2(), y.mag2());
        b(_,1) = x*y.conj();
        ''', shape=b.shape[:-1], data={'a': a, 'b': b})
    b = b.copy('system')
    a = a_orig
    gold = np.empty_like(a)
    def mag2(x):
        return x.real * x.real + x.imag * x.imag
    gold[..., 0] = mag2(a[..., 0]) + 1j * mag2(a[..., 1])
    gold[..., 1] = a[..., 0] * a[..., 1].conj()
    np.testing.assert_equal(b, gold)
def test_shift(self):
    shape = (55, 66, 77)
    a = np.random.randint(65536, size=shape).astype(np.int32)
    a = bf.asarray(a, space='cuda')
    b = bf.empty_like(a)
    bf.map("b = a(_-a.shape()/2)", a=a, b=b)
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, np.fft.fftshift(a))
def test_explicit_indexing(self):
    shape = (55, 66, 77)
    a = np.random.randint(65536, size=shape).astype(np.int32)
    a = bf.asarray(a, space='cuda')
    b = bf.empty((a.shape[2], a.shape[0], a.shape[1]), a.dtype, 'cuda')
    bf.map("b(i,j,k) = a(j,k,i)", b.shape, 'i', 'j', 'k', a=a, b=b)
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, a.transpose([2, 0, 1]))
def run_test_matmul_aa_dtype_shape(self, shape, dtype):
    a = (np.random.random(size=shape) * 127).astype(dtype)
    c_gold = np.matmul(a, np.swapaxes(a, -1, -2).conj())
    # The kernel writes only the lower triangle, so zero the upper triangle
    # of the reference result to match
    triu = np.triu_indices(shape[-2], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def test_manydim(self):
    known_data = np.arange(3**8).reshape([3] * 8).astype(np.float32)
    a = bf.asarray(known_data, space='cuda')
    a = a[:, :, :, :, :2, :, :, :]
    b = bf.empty_like(a)
    for _ in range(3):
        bf.map("b = a+1", data={'a': a, 'b': b})
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, a + 1)
def test_custom_shape(self):
    shape = (55, 66, 77)
    a = np.random.randint(65536, size=shape).astype(np.int32)
    a = bf.asarray(a, space='cuda')
    b = bf.empty((a.shape[0], a.shape[2]), a.dtype, 'cuda')
    j = 11
    bf.map("b(i,k) = a(i,j,k)", b.shape, 'i', 'k', a=a, b=b, j=j)
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, a[:, j, :])
def run_test_matmul_ab_beamformer_kernel(self, ntime, nbeam, nstand, nchan):
    x_shape = (ntime, nchan, nstand * 2)
    w_shape = (nbeam, nchan, nstand * 2)
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    w = ((np.random.random(size=w_shape + (2,)) * 2 - 1) * 127).astype(np.int8) \
        .astype(np.float32).view(np.complex64).reshape(w_shape)
    b_gold = np.matmul(w.transpose(1, 0, 2), x.transpose(1, 2, 0))
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    w = bf.asarray(w, space='cuda')
    b = bf.zeros_like(b_gold, space='cuda')
    self.linalg.matmul(1, w.transpose(1, 0, 2), x.transpose(1, 2, 0), 0, b)
    b_ = b.copy('system')
    np.testing.assert_allclose(b_, b_gold, RTOL, ATOL)
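# The ci8 tests all lean on the same packing trick: consecutive int8 pairs
# form one 8-bit complex sample (bf.DataType.ci8 is bifrost's packed type).
# A small numpy-side illustration of the reference values the GPU kernels
# are expected to reproduce from the packed input:
import numpy as np

a8 = np.array([3, -4, 100, -100], dtype=np.int8)  # re, im, re, im
a_gold = a8.astype(np.float32).view(np.complex64)
print(a_gold)  # [  3.  -4.j 100.-100.j]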
def test_scalar(self):
    n = 7919
    # Note: Python integer division rounds to -inf, while C rounds toward 0
    # We avoid the problem here by using only positive values
    x = np.random.randint(1, 256, size=n)
    x = bf.asarray(x, space='cuda')
    y = bf.empty_like(x)
    bf.map("y = (x-m)/s", x=x, y=y, m=1, s=3)
    x = x.copy('system')
    y = y.copy('system')
    np.testing.assert_equal(y, (x - 1) // 3)
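# A quick demonstration of the rounding caveat noted above: Python's //
# floors toward -inf, while C-style integer division in the generated kernel
# truncates toward 0, so the two only agree for non-negative operands.
assert 7 // 3 == 2    # Python and C agree here
assert -7 // 3 == -3  # Python floors; C would give -2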
def test_explicit_indexing(self):
    shape = (55, 66, 77)
    a = np.random.randint(65536, size=shape).astype(np.int32)
    a = bf.asarray(a, space='cuda')
    b = bf.empty((a.shape[2], a.shape[0], a.shape[1]), a.dtype, 'cuda')
    for _ in range(3):
        bf.map("b(i,j,k) = a(j,k,i)",
               shape=b.shape, axis_names=('i', 'j', 'k'),
               data={'a': a, 'b': b},
               block_shape=(64, 4), block_axes=('i', 'k'))
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, a.transpose([2, 0, 1]))
def run_test_matmul_ab_ci8_shape(self, shape, k, transpose=False):
    ashape_complex = shape[:-2] + (shape[-2], k * 2)
    bshape_complex = shape[:-2] + (k, shape[-1] * 2)
    a8 = (np.random.random(size=ashape_complex) * 255).astype(np.int8)
    b8 = (np.random.random(size=bshape_complex) * 255).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    b_gold = b8.astype(np.float32).view(np.complex64)
    if transpose:
        a_gold, b_gold = H(b_gold), H(a_gold)
    c_gold = np.matmul(a_gold, b_gold)
    a = a8.view(bf.DataType.ci8)
    b = b8.view(bf.DataType.ci8)
    a = bf.asarray(a, space='cuda')
    b = bf.asarray(b, space='cuda')
    if transpose:
        a, b = H(b), H(a)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, b, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def test_broadcast(self):
    n = 89
    a = np.arange(n).astype(np.float32)
    a = bf.asarray(a, space='cuda')
    b = a[:, None]
    # TODO: Need way to compute broadcast shape
    c = bf.empty((a.shape[0], b.shape[0]), a.dtype, 'cuda')
    bf.map("c = a*b", a=a, b=b, c=c)
    a = a.copy('system')
    b = b.copy('system')
    c = c.copy('system')
    np.testing.assert_equal(c, a * b)
def test_scalar(self):
    n = 7919
    # Note: Python integer division rounds to -inf, while C rounds toward 0
    # We avoid the problem here by using only positive values
    x = np.random.randint(1, 256, size=n)
    x = bf.asarray(x, space='cuda')
    y = bf.empty_like(x)
    for _ in range(3):
        bf.map("y = (x-m)/s", data={'x': x, 'y': y, 'm': 1, 's': 3})
    x = x.copy('system')
    y = y.copy('system')
    np.testing.assert_equal(y, (x - 1) // 3)
def test_custom_shape(self):
    shape = (55, 66, 77)
    a = np.random.randint(65536, size=shape).astype(np.int32)
    a = bf.asarray(a, space='cuda')
    b = bf.empty((a.shape[0], a.shape[2]), a.dtype, 'cuda')
    j = 11
    for _ in range(3):
        bf.map("b(i,k) = a(i,j,k)",
               shape=b.shape, axis_names=('i', 'k'),
               data={'a': a, 'b': b, 'j': j})
    a = a.copy('system')
    b = b.copy('system')
    np.testing.assert_equal(b, a[:, j, :])
def run_reduce_test(self, shape, axis, n, op='sum', dtype=np.float32):
    a = ((np.random.random(size=shape) * 2 - 1) * 127).astype(np.int8).astype(dtype)
    if op[:3] == 'pwr':
        b_gold = pwrscrunch(a.astype(np.float32), n, axis, NP_OPS[op[3:]])
    else:
        b_gold = scrunch(a.astype(np.float32), n, axis, NP_OPS[op])
    a = bf.asarray(a, space='cuda_managed')
    b = bf.empty_like(b_gold, space='cuda_managed')
    bf.reduce(a, b, op)
    stream_synchronize()
    np.testing.assert_allclose(b, b_gold)
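# The reduce tests build their reference results with scrunch(), pwrscrunch(),
# and an NP_OPS table defined elsewhere in the test module. A plausible sketch
# (an assumption, not the module's actual code): reduce every n consecutive
# samples along `axis`, optionally detecting power first.
import numpy as np

NP_OPS = {'sum': np.sum, 'mean': np.mean, 'min': np.min, 'max': np.max}

def scrunch(a, n, axis, op):
    # Fold the reduced axis into (len/n, n) blocks, then reduce each block
    shape = a.shape[:axis] + (a.shape[axis] // n, n) + a.shape[axis + 1:]
    return op(a.reshape(shape), axis=axis + 1)

def pwrscrunch(a, n, axis, op):
    # Same, but reduce |a|^2 (detected power) instead of the raw values
    return scrunch((a * a.conj()).real, n, axis, op)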
def _convert_to_array(arg):
    if _is_literal(arg):
        arr = np.array(arg)
        if isinstance(arg, int) and -(1 << 31) <= arg < (1 << 31):
            arr = arr.astype(np.int32)
        # TODO: Any way to decide when these should be double-precision?
        elif isinstance(arg, float):
            arr = arr.astype(np.float32)
        elif isinstance(arg, complex):
            arr = arr.astype(np.complex64)
        arr.flags['WRITEABLE'] = False
        arg = arr
    return bf.asarray(arg)
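# Illustration of the literal-conversion rules above: small Python ints map
# to int32, floats to float32, and complex values to complex64.
print(_convert_to_array(42).dtype)      # int32
print(_convert_to_array(2.5).dtype)     # float32
print(_convert_to_array(1 + 2j).dtype)  # complex64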
def run_simple_test(self, x, funcstr, func):
    x_orig = x
    x = bf.asarray(x, 'cuda')
    y = bf.empty_like(x)
    x.flags['WRITEABLE'] = False
    x.bf.immutable = True # TODO: Is this actually doing anything? (flags is, just not sure about bf.immutable)
    bf.map(funcstr, x=x, y=y)
    x = x.copy('system')
    y = y.copy('system')
    if isinstance(x_orig, bf.ndarray):
        x_orig = x
    # Note: Using func(x) is dangerous because bf.ndarray does things like
    # lazy .conj(), which break when used as if it were np.ndarray.
    np.testing.assert_equal(y, func(x_orig))
def run_test_matmul_aa_ci8_shape(self, shape):
    shape_complex = shape[:-1] + (shape[-1] * 2,)
    a8 = (np.random.random(size=shape_complex) * 255).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    a = a8.view(bf.DataType.ci8)
    # Note: np.matmul seems to be slow and inaccurate when there are batch dims
    c_gold = np.matmul(a_gold, np.swapaxes(a_gold, -1, -2).conj())
    triu = np.triu_indices(shape[-2], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def run_simple_test(self, x, funcstr, func):
    x_orig = x
    x = bf.asarray(x, 'cuda_managed')
    y = bf.empty_like(x)
    x.flags['WRITEABLE'] = False
    x.bf.immutable = True # TODO: Is this actually doing anything? (flags is, just not sure about bf.immutable)
    for _ in range(3):
        bf.map(funcstr, {'x': x, 'y': y})
    stream_synchronize()
    if isinstance(x_orig, bf.ndarray):
        x_orig = x
    # Note: Using func(x) is dangerous because bf.ndarray does things like
    # lazy .conj(), which break when used as if it were np.ndarray.
    np.testing.assert_equal(y, func(x_orig))
def run_test_matmul_aa_ci8_shape(self, shape, transpose=False):
    # **TODO: This currently never triggers the transpose path in the backend
    shape_complex = shape[:-1] + (shape[-1] * 2,)
    # Note: The xGPU-like correlation kernel does not support input values
    # of -128 (only [-127:127])
    a8 = ((np.random.random(size=shape_complex) * 2 - 1) * 127).astype(np.int8)
    a_gold = a8.astype(np.float32).view(np.complex64)
    if transpose:
        a_gold = H(a_gold)
    # Note: np.matmul seems to be slow and inaccurate when there are batch dims
    c_gold = np.matmul(a_gold, H(a_gold))
    triu = np.triu_indices(shape[-2] if not transpose else shape[-1], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = a8.view(bf.DataType.ci8)
    a = bf.asarray(a, space='cuda')
    if transpose:
        a = H(a)
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, a, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def run_test_matmul_aa_dtype_shape(self, shape, dtype, axes=None, conj=False):
    a = (np.random.random(size=shape) * 127).astype(dtype)
    if axes is None:
        axes = range(len(shape))
    aa = a.transpose(axes)
    if conj:
        aa = aa.conj()
    c_gold = np.matmul(aa, H(aa))
    triu = np.triu_indices(shape[axes[-2]], 1)
    c_gold[..., triu[0], triu[1]] = 0
    a = bf.asarray(a, space='cuda')
    aa = a.transpose(axes)
    if conj:
        aa = aa.conj()
    c = bf.zeros_like(c_gold, space='cuda')
    self.linalg.matmul(1, aa, None, 0, c)
    c = c.copy('system')
    np.testing.assert_allclose(c, c_gold, RTOL, ATOL)
def run_test_matmul_aa_correlator_kernel(self, ntime, nstand, nchan, misalign=0):
    x_shape = (ntime, nchan, nstand * 2)
    perm = [1, 0, 2]
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    x = x.transpose(perm)
    x = x[..., misalign:]
    b_gold = np.matmul(H(x), x)
    triu = np.triu_indices(x.shape[-1], 1)
    b_gold[..., triu[0], triu[1]] = 0
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    x = x.transpose(perm)
    x = x[..., misalign:]
    b = bf.zeros_like(b_gold, space='cuda')
    self.linalg.matmul(1, None, x, 0, b)
    b = b.copy('system')
    np.testing.assert_allclose(b, b_gold, RTOL * 10, ATOL)
def run_benchmark_matmul_aa_correlator_kernel(self, ntime, nstand, nchan):
    x_shape = (ntime, nchan, nstand * 2)
    perm = [1, 0, 2]
    x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
    x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
    x = x.transpose(perm)
    b_gold = np.matmul(H(x[:, [0], :]), x[:, [0], :])
    triu = np.triu_indices(x_shape[-1], 1)
    b_gold[..., triu[0], triu[1]] = 0
    x = x8.view(bf.DataType.ci8).reshape(x_shape)
    x = bf.asarray(x, space='cuda')
    x = x.transpose(perm)
    b = bf.zeros_like(b_gold, space='cuda')
    bf.device.stream_synchronize()
    t0 = time.time()
    nrep = 200
    for _ in range(nrep):
        self.linalg.matmul(1, None, x, 0, b)
    bf.device.stream_synchronize()
    dt = time.time() - t0
    nflop = nrep * nchan * ntime * nstand * (nstand + 1) / 2 * 2 * 2 * 8
    print(nstand, '\t', nflop / dt / 1e9, 'GFLOP/s')
    print('\t\t', nrep * ntime * nchan / dt / 1e6, 'MHz')
def map(func_string, shape=None, *args, **kwargs):
    """Apply a function to a set of ndarrays.

    Arguments:
      func_string: The function to apply to the arrays, as a string (see
        below for examples).
      shape: The shape of the computation. If None, the broadcast shape of
        all of the arrays is used.
      *args: List of string names by which each axis is referenced in
        func_string.
      **kwargs: Map of string names to ndarrays or scalar literals.

    Examples:
      # Add two arrays together
      bf.map("c = a + b", c=c, a=a, b=b)

      # Compute outer product of two arrays
      bf.map("c(i,j) = a(i) * b(j)", 'i', 'j', c=c, a=a, b=b)

      # Split the components of a complex array
      bf.map("a = c.real; b = c.imag", c=c, a=a, b=b)

      # Raise an array to a scalar power
      bf.map("c = pow(a, p)", c=c, a=a, p=2.0)

      # Slice an array with a scalar index
      bf.map("c(i) = a(i,k)", c.shape, 'i', c=c, a=a, k=7)
    """
    if isinstance(shape, str):
        raise TypeError("Invalid type for shape argument")
    if any(not isinstance(arg, str) for arg in args):
        raise TypeError("Invalid type for index name, must be string")
    axis_names = args
    ndim = len(shape) if shape is not None else 0
    narg = len(kwargs)
    def is_literal(x):
        return isinstance(x, (int, float, complex))
    arg_arrays = []
    args = []
    arg_names = []
    for key, arg in kwargs.items():
        if is_literal(arg):
            arr = np.array(arg)
            if isinstance(arg, int) and -(1 << 31) <= arg < (1 << 31):
                arr = arr.astype(np.int32)
            # TODO: Any way to decide when these should be double-precision?
            elif isinstance(arg, float):
                arr = arr.astype(np.float32)
            elif isinstance(arg, complex):
                arr = arr.astype(np.complex64)
            arr.flags['WRITEABLE'] = False
            arr = bf.asarray(arr)
        else:
            arr = bf.asarray(arg)
        # Note: We must keep a reference to each array lest they be garbage
        # collected before their corresponding BFarray is used.
        arg_arrays.append(arr)
        args.append(arr.as_BFarray())
        arg_names.append(key)
    _check(_bf.Map(ndim, _array(shape, dtype=ctypes.c_long),
                   _array(axis_names), narg, _array(args),
                   _array(arg_names), func_string))
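# A short sketch of the two calling conventions exercised by the tests above
# (illustrative names; assumes arrays already in CUDA space):
import numpy as np
import bifrost as bf

a = bf.asarray(np.random.rand(4, 5).astype(np.float32), space='cuda')
b = bf.empty_like(a)

# Implicit indexing: the shape is inferred by broadcasting a and b
bf.map("b = 2*a + 1", a=a, b=b)

# Explicit indexing: shape and axis names are given up front
bf.map("b(i,j) = 2*a(i,j) + 1", b.shape, 'i', 'j', a=a, b=b)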
def test_simple_3D_padded(self):
    n = 23
    x = np.random.randint(256, size=(n, n, n))
    x = bf.asarray(x, space='cuda')
    x = x[:, :, 1:]
    self.run_simple_test_funcs(x)