class TestLinAlg(unittest.TestCase):
    def setUp(self):
        self.linalg = LinAlg()
        np.random.seed(1234)

    def run_test_matmul_aa_ci8_shape(self, shape, transpose=False):
        # **TODO: This currently never triggers the transpose path in the backend
        shape_complex = shape[:-1] + (shape[-1] * 2,)
        # Note: The xGPU-like correlation kernel does not support input values of -128 (only [-127:127])
        a8 = ((np.random.random(size=shape_complex) * 2 - 1) * 127).astype(np.int8)
        a_gold = a8.astype(np.float32).view(np.complex64)
        if transpose:
            a_gold = H(a_gold)
        # Note: np.matmul seems to be slow and inaccurate when there are batch dims
        c_gold = np.matmul(a_gold, H(a_gold))
        triu = np.triu_indices(shape[-2] if not transpose else shape[-1], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = a8.view(bf.DataType.ci8)
        a = bf.asarray(a, space='cuda')
        if transpose:
            a = H(a)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_aa_dtype_shape(self, shape, dtype, axes=None, conj=False):
        a = (np.random.random(size=shape) * 127).astype(dtype)
        if axes is None:
            axes = range(len(shape))
        aa = a.transpose(axes)
        if conj:
            aa = aa.conj()
        c_gold = np.matmul(aa, H(aa))
        triu = np.triu_indices(shape[axes[-2]], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = bf.asarray(a, space='cuda')
        aa = a.transpose(axes)
        if conj:
            aa = aa.conj()
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, aa, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_ci8_shape(self, shape, k, transpose=False):
        ashape_complex = shape[:-2] + (shape[-2], k * 2)
        bshape_complex = shape[:-2] + (k, shape[-1] * 2)
        a8 = (np.random.random(size=ashape_complex) * 255).astype(np.int8)
        b8 = (np.random.random(size=bshape_complex) * 255).astype(np.int8)
        a_gold = a8.astype(np.float32).view(np.complex64)
        b_gold = b8.astype(np.float32).view(np.complex64)
        if transpose:
            a_gold, b_gold = H(b_gold), H(a_gold)
        c_gold = np.matmul(a_gold, b_gold)
        a = a8.view(bf.DataType.ci8)
        b = b8.view(bf.DataType.ci8)
        a = bf.asarray(a, space='cuda')
        b = bf.asarray(b, space='cuda')
        if transpose:
            a, b = H(b), H(a)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, b, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_dtype_shape(self, shape, k, dtype,
                                       axes_a=None, axes_b=None, transpose=False):
        # TODO: Allow testing separate transpose_a, transpose_b
        ashape = shape[:-2] + (shape[-2], k)
        bshape = shape[:-2] + (k, shape[-1])
        a = (np.random.random(size=ashape) * 127).astype(dtype)
        b = (np.random.random(size=bshape) * 127).astype(dtype)
        if axes_a is None:
            axes_a = range(len(ashape))
        if axes_b is None:
            axes_b = range(len(bshape))
        aa = a.transpose(axes_a)
        bb = b.transpose(axes_b)
        if transpose:
            aa, bb = H(bb), H(aa)
        c_gold = np.matmul(aa, bb)
        a = bf.asarray(a, space='cuda')
        b = bf.asarray(b, space='cuda')
        aa = a.transpose(axes_a)
        bb = b.transpose(axes_b)
        if transpose:
            aa, bb = H(bb), H(aa)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, aa, bb, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_beamformer_kernel(self, ntime, nbeam, nstand, nchan):
        x_shape = (ntime, nchan, nstand * 2)
        w_shape = (nbeam, nchan, nstand * 2)
        x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        w = ((np.random.random(size=w_shape + (2,)) * 2 - 1) * 127).astype(
            np.int8).astype(np.float32).view(np.complex64).reshape(w_shape)
        b_gold = np.matmul(w.transpose(1, 0, 2), x.transpose(1, 2, 0))
        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        w = bf.asarray(w, space='cuda')
        b = bf.zeros_like(b_gold, space='cuda')
        self.linalg.matmul(1, w.transpose(1, 0, 2), x.transpose(1, 2, 0), 0, b)
        b_ = b.copy('system')
        np.testing.assert_allclose(b_, b_gold, RTOL, ATOL)
        '''
        # Benchmarking
        nrep = 30
        bf.device.stream_synchronize()
        t0 = time.time()
        for _ in xrange(nrep):
            self.linalg.matmul(1, w.transpose(1,0,2), x.transpose(1,2,0), 0, b)
        bf.device.stream_synchronize()
        dt = time.time() - t0
        nflop = nrep * ntime * nbeam * nstand*2 * nchan * 8
        nbyte = nrep * (x.nbytes + w.nbytes + b.nbytes)
        nsamp = nrep * ntime * nchan
        print nbeam, '\t'*1, nflop / dt / 1e9, 'GFLOP/s'
        print nbeam, '\t'*2, nbyte / dt / 1e9, 'GB/s'
        print nbeam, '\t'*3, nsamp / dt / 1e6, 'MHz/s'
        '''

    def run_test_matmul_aa_correlator_kernel(self, ntime, nstand, nchan, misalign=0):
        x_shape = (ntime, nchan, nstand * 2)
        perm = [1, 0, 2]
        x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        x = x.transpose(perm)
        x = x[..., misalign:]
        b_gold = np.matmul(H(x), x)
        triu = np.triu_indices(x.shape[-1], 1)
        b_gold[..., triu[0], triu[1]] = 0
        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        x = x.transpose(perm)
        x = x[..., misalign:]
        b = bf.zeros_like(b_gold, space='cuda')
        self.linalg.matmul(1, None, x, 0, b)
        b = b.copy('system')
        np.testing.assert_allclose(b, b_gold, RTOL * 10, ATOL)

    def run_benchmark_matmul_aa_correlator_kernel(self, ntime, nstand, nchan):
        x_shape = (ntime, nchan, nstand * 2)
        perm = [1, 0, 2]
        x8 = ((np.random.random(size=x_shape + (2,)) * 2 - 1) * 127).astype(np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        x = x.transpose(perm)
        b_gold = np.matmul(H(x[:, [0], :]), x[:, [0], :])
        triu = np.triu_indices(x_shape[-1], 1)
        b_gold[..., triu[0], triu[1]] = 0
        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        x = x.transpose(perm)
        b = bf.zeros_like(b_gold, space='cuda')
        bf.device.stream_synchronize()
        t0 = time.time()
        nrep = 200
        for _ in xrange(nrep):
            self.linalg.matmul(1, None, x, 0, b)
        bf.device.stream_synchronize()
        dt = time.time() - t0
        nflop = nrep * nchan * ntime * nstand * (nstand + 1) / 2 * 2 * 2 * 8
        print nstand, '\t', nflop / dt / 1e9, 'GFLOP/s'
        print '\t\t', nrep * ntime * nchan / dt / 1e6, 'MHz'

    def test_matmul_ab_beamformer_kernel_small(self):
        for nchan in xrange(1, 1 + 3):
            for ntime in xrange(1, 1 + 8):
                for nstand in [16, 64, 256]:
                    for nbeam in xrange(1, 1 + 12):
                        self.run_test_matmul_ab_beamformer_kernel(
                            ntime=ntime, nbeam=nbeam, nstand=nstand, nchan=nchan)

    def test_matmul_ab_beamformer_kernel_large(self):
        for nbeam in xrange(1, 1 + 12):
            #print "--------------", nbeam, "---------------"
            self.run_test_matmul_ab_beamformer_kernel(ntime=512, nbeam=nbeam,
                                                      nstand=256, nchan=10)

    def test_matmul_aa_correlator_kernel_small(self):
        for nchan in xrange(1, 1 + 5):
            for ntime in [1, 2, 3, 4, 8, 12]:
                for nstand in xrange(1, 1 + 65):
                    for misalign in xrange(0, min(2 * (nstand - 1), 3), 2):
                        self.run_test_matmul_aa_correlator_kernel(
                            ntime=ntime, nstand=nstand, nchan=nchan, misalign=misalign)

    def test_matmul_aa_correlator_kernel_large(self):
        self.run_test_matmul_aa_correlator_kernel(ntime=100, nstand=200, nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=99, nstand=200, nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=100, nstand=200, nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=99, nstand=200, nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=400, nstand=100, nchan=7)
        self.run_test_matmul_aa_correlator_kernel(ntime=399, nstand=100, nchan=7)
        self.run_test_matmul_aa_correlator_kernel(ntime=36, nstand=97, nchan=31)
        self.run_test_matmul_aa_correlator_kernel(ntime=35, nstand=97, nchan=31)
        self.run_test_matmul_aa_correlator_kernel(ntime=4, nstand=512, nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=512, nstand=256, nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=1000, nstand=256, nchan=1)
        # Benchmarks
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=4096, nstand=256, nchan=64)
        #for nstand in [16, 28, 64, 256, 1024]:
        #    self.run_benchmark_matmul_aa_correlator_kernel(ntime=512, nstand=nstand, nchan=256*256*48//2//(nstand*nstand))
        #print
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=512, nstand=256, nchan=96)
        #print
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=2048, nstand=2048, nchan=2)

    def run_test_matmul_aa_dtype(self, dtype):
        self.run_test_matmul_aa_dtype_shape((3, 2), dtype)
        self.run_test_matmul_aa_dtype_shape((11, 23), dtype)
        # Note: Only Hermitian transposes are supported
        self.run_test_matmul_aa_dtype_shape((11, 23), dtype, [1, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((111, 223), dtype, [1, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((1111, 2223), dtype)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype, [0, 2, 1], conj=True)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype, [1, 2, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype, [1, 0, 2])
        # Note: The fastest dim can't be a batch dim, so these aren't supported
        #self.run_test_matmul_aa_dtype_shape((3,111,223), dtype, [2,0,1])
        #self.run_test_matmul_aa_dtype_shape((3,111,223), dtype, [2,1,0])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [0, 1, 3, 2], conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [1, 0, 2, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [1, 0, 3, 2], conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [1, 2, 3, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [1, 2, 0, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [2, 1, 0, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [2, 1, 3, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [2, 0, 3, 1], conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype, [2, 0, 1, 3])
        self.run_test_matmul_aa_dtype_shape((5, 7, 3, 111, 223), dtype)

    def run_test_matmul_ab_dtype_transpose(self, dtype, transpose):
        self.run_test_matmul_ab_dtype_shape((11, 23), 7, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 23), 11, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 23), 23, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 11), 11, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((111, 223), 77, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((111, 2223), 777, dtype, transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((3, 111, 223), 77, dtype, transpose=transpose)

    def run_test_matmul_ab_dtype(self, dtype):
        self.run_test_matmul_ab_dtype_transpose(dtype, False)
        self.run_test_matmul_ab_dtype_transpose(dtype, True)

    def run_test_matmul_aa_ci8_transpose(self, transpose):
        # Note: The xGPU-like correlation kernel is only invoked when k%4 == 0
        for kp in [0, 1]:
            self.run_test_matmul_aa_ci8_shape((99 + kp, 3 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((11 + kp, 3 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((11 + kp, 23 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((111 + kp, 223 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((1111 + kp, 2223 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((3, 111 + kp, 223 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((5, 3, 111 + kp, 223 + kp), transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((5, 7, 3, 111 + kp, 223 + kp), transpose=transpose)

    def test_matmul_aa_ci8(self):
        self.run_test_matmul_aa_ci8_transpose(False)
        self.run_test_matmul_aa_ci8_transpose(True)

    def run_test_matmul_ab_ci8_transpose(self, transpose):
        self.run_test_matmul_ab_ci8_shape((11, 23), 7777, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((111, 223), 777, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((1111, 2223), 77, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((3, 111, 223), 77, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((5, 3, 111, 223), 77, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((5, 7, 3, 111, 223), 77, transpose=transpose)

    def test_matmul_ab_ci8(self):
        self.run_test_matmul_ab_ci8_transpose(False)
        self.run_test_matmul_ab_ci8_transpose(True)

    def test_matmul_aa_f32(self):
        self.run_test_matmul_aa_dtype(np.float32)

    def test_matmul_aa_f64(self):
        self.run_test_matmul_aa_dtype(np.float64)

    def test_matmul_aa_cf32(self):
        self.run_test_matmul_aa_dtype(np.complex64)

    def test_matmul_aa_cf64(self):
        self.run_test_matmul_aa_dtype(np.complex128)

    def test_matmul_ab_f32(self):
        self.run_test_matmul_ab_dtype(np.float32)

    def test_matmul_ab_f64(self):
        self.run_test_matmul_ab_dtype(np.float64)

    def test_matmul_ab_cf32(self):
        self.run_test_matmul_ab_dtype(np.complex64)

    def test_matmul_ab_cf64(self):
        self.run_test_matmul_ab_dtype(np.complex128)
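# Illustrative sketch (not part of the test suite above): every auto-correlation
# ("aa") test builds its reference answer the same way.  Interleaved int8
# (real, imag) samples are reinterpreted as complex64, the product A @ A^H is
# formed with NumPy, and the strict upper triangle is zeroed so the result
# matches the lower-triangular fill mode produced by the GPU kernel.  The helper
# below (a hypothetical name, not used by the tests) restates that pattern in
# plain NumPy.
def _demo_aa_lower_triangle_gold(shape=(7, 12), seed=1234):
    import numpy as np
    rng = np.random.RandomState(seed)
    # Last dim doubled: interleaved (real, imag) int8 pairs, restricted to
    # [-127, 127] as in run_test_matmul_aa_ci8_shape above
    a8 = ((rng.random_sample(shape[:-1] + (shape[-1] * 2,)) * 2 - 1) * 127).astype(np.int8)
    a = a8.astype(np.float32).view(np.complex64)     # back to the requested `shape`
    c = np.matmul(a, np.swapaxes(a, -1, -2).conj())  # A @ A^H
    iu = np.triu_indices(shape[-2], 1)
    c[..., iu[0], iu[1]] = 0                         # keep diagonal + lower triangle only
    return c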
class CorrelateBlock(TransformBlock):
    def __init__(self, iring, nframe_per_integration, *args, **kwargs):
        super(CorrelateBlock, self).__init__(iring, *args, **kwargs)
        self.nframe_per_integration = nframe_per_integration
        self.linalg = LinAlg()

    def define_valid_input_spaces(self):
        return ('cuda',)

    def define_output_nframes(self, input_nframe):
        """Return output nframe, given input_nframes."""
        return 1

    def on_sequence(self, iseq):
        self.nframe_integrated = 0
        ihdr = iseq.header
        itensor = ihdr['_tensor']
        assert itensor['labels'] == ['time', 'freq', 'station', 'pol']
        ohdr = deepcopy(ihdr)
        otensor = ohdr['_tensor']
        otensor['dtype'] = 'cf32'
        for key in ['shape', 'labels', 'scales', 'units']:
            time_val, freq_val, stand_val, pol_val = itensor[key]
            otensor[key] = [time_val, freq_val, stand_val, pol_val,
                            stand_val, pol_val]
        # Append subscripts to stand and pol axis labels
        for i in xrange(2):
            otensor['labels'][2 + i] += '_i'
            otensor['labels'][4 + i] += '_j'
        # Update time scale
        otensor['scales'][0][1] *= self.nframe_per_integration
        # Note: For future reference, possible values for this entry could be:
        #   full, hermitian, lower, upper, strictly_lower, strictly_upper
        ohdr['matrix_fill_mode'] = 'lower'
        ohdr['gulp_nframe'] = min(ohdr['gulp_nframe'], self.nframe_per_integration)
        # Note: User can override by setting self.gulp_nframe
        gulp_nframe_actual = self.gulp_nframe or ohdr['gulp_nframe']
        if self.nframe_per_integration % gulp_nframe_actual != 0:
            raise ValueError("gulp_nframe (%i) does not divide " % gulp_nframe_actual +
                             "nframe_per_integration (%i)" % self.nframe_per_integration)
        return ohdr

    def on_data(self, ispan, ospan):
        idata = ispan.data
        odata = ospan.data
        # TODO: Consider allowing returning (nframe_release, nframe_commit)
        #       from on_data, to enable flexible decoupling of
        #       the amount of data read/written during each gulp.
        beta = 0 if self.nframe_integrated == 0 else 1
        ntime, nchan, nstand, npol = idata.shape
        # Squash stand + pol axes together and permute to get the right matmul
        idata_mm = idata.reshape([ntime, nchan, nstand * npol]) \
                        .transpose([1, 0, 2])
        odata_mm = odata.reshape([nchan, nstand * npol, nstand * npol])
        # Check that the memory addresses haven't changed
        assert idata_mm.ctypes.data == idata.ctypes.data
        assert odata_mm.ctypes.data == odata.ctypes.data
        self.linalg.matmul(1, None, idata_mm, beta, odata_mm)
        self.nframe_integrated += ispan.nframe
        assert self.nframe_integrated <= self.nframe_per_integration
        if self.nframe_integrated == self.nframe_per_integration:
            self.nframe_integrated = 0
            return 1
        else:
            return 0
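# Illustrative sketch (not part of CorrelateBlock): on_data() accumulates
# visibilities across gulps as C <- beta * C + X^H @ X, with beta = 0 on the
# first gulp of an integration and beta = 1 afterwards.  The hypothetical,
# NumPy-only loop below mirrors that accumulation for one integration, assuming
# the same [time, chan, stand*pol] -> [chan, time, stand*pol] layout used for
# idata_mm above.  (The GPU kernel fills only the lower triangle; the full
# matrix is computed here for clarity.)
def _demo_correlator_accumulation(nframe_per_integration=8, gulp_nframe=2,
                                  nchan=3, nstandpol=4, seed=1234):
    import numpy as np
    rng = np.random.RandomState(seed)
    c = np.zeros((nchan, nstandpol, nstandpol), dtype=np.complex64)
    nframe_integrated = 0
    while nframe_integrated < nframe_per_integration:
        # One gulp of input: [ntime, nchan, nstand*npol] complex samples
        x = (rng.standard_normal((gulp_nframe, nchan, nstandpol)) +
             1j * rng.standard_normal((gulp_nframe, nchan, nstandpol))).astype(np.complex64)
        x_mm = x.transpose(1, 0, 2)  # -> [chan, time, stand*pol]
        beta = 0 if nframe_integrated == 0 else 1
        # NumPy equivalent of self.linalg.matmul(1, None, x_mm, beta, c)
        c = beta * c + np.matmul(np.swapaxes(x_mm, -1, -2).conj(), x_mm)
        nframe_integrated += gulp_nframe
    return c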