Example #1
# Imports and tolerances assumed by this snippet (RTOL/ATOL are
# placeholder values, not taken from the example itself):
import unittest
import numpy as np
import bifrost as bf
from bifrost.linalg import LinAlg

RTOL = 1e-4
ATOL = 1e-5

class TestLinAlg(unittest.TestCase):
    def setUp(self):
        self.linalg = LinAlg()
        np.random.seed(1234)

    def run_test_matmul_aa_ci8_shape(self, shape):
        shape_complex = shape[:-1] + (shape[-1] * 2, )
        # Random bytes cast to int8 (wraps into [-128, 127] on typical platforms)
        a8 = (np.random.random(size=shape_complex) * 255).astype(np.int8)
        a_gold = a8.astype(np.float32).view(np.complex64)
        a = a8.view(bf.DataType.ci8)
        # Note: np.matmul seems to be slow and inaccurate when there are batch dims
        c_gold = np.matmul(a_gold, np.swapaxes(a_gold, -1, -2).conj())
        triu = np.triu_indices(shape[-2], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = bf.asarray(a, space='cuda')
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_aa_dtype_shape(self, shape, dtype):
        a = ((np.random.random(size=shape)) * 127).astype(dtype)
        c_gold = np.matmul(a, np.swapaxes(a, -1, -2).conj())
        triu = np.triu_indices(shape[-2], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = bf.asarray(a, space='cuda')
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_aa_dtype(self, dtype):
        self.run_test_matmul_aa_dtype_shape((11, 23), dtype)
        self.run_test_matmul_aa_dtype_shape((111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((1111, 2223), dtype)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((5, 7, 3, 111, 223), dtype)

    def test_matmul_aa_ci8(self):
        self.run_test_matmul_aa_ci8_shape((11, 23))
        self.run_test_matmul_aa_ci8_shape((111, 223))
        self.run_test_matmul_aa_ci8_shape((1111, 2223))
        self.run_test_matmul_aa_ci8_shape((3, 111, 223))
        self.run_test_matmul_aa_ci8_shape((5, 3, 111, 223))
        self.run_test_matmul_aa_ci8_shape((5, 7, 3, 111, 223))

    def test_matmul_aa_f32(self):
        self.run_test_matmul_aa_dtype(np.float32)

    def test_matmul_aa_f64(self):
        self.run_test_matmul_aa_dtype(np.float64)

    def test_matmul_aa_c32(self):
        self.run_test_matmul_aa_dtype(np.complex64)

    def test_matmul_aa_c64(self):
        self.run_test_matmul_aa_dtype(np.complex128)
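
These tests pin down the call convention of LinAlg.matmul by construction: the golden result is always built with plain NumPy. A minimal reference for the auto-correlation form used above (b=None), assuming the semantics implied by c_gold (only the lower triangle of the output is guaranteed):

import numpy as np

def matmul_aa_reference(alpha, a, beta, c):
    # Reference for linalg.matmul(alpha, a, None, beta, c):
    # c = alpha * a @ H(a) + beta * c, lower triangle valid.
    ah = np.swapaxes(a, -1, -2).conj()  # Hermitian transpose
    return alpha * np.matmul(a, ah) + beta * c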
Example #2
def setUp(self):
    self.linalg = LinAlg()
    np.random.seed(1234)
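
The fixture above is all the setup LinAlg needs. A minimal end-to-end sketch of a single call, assuming a CUDA-enabled Bifrost build (the tolerances here are illustrative, not values taken from the examples):

import numpy as np
import bifrost as bf
from bifrost.linalg import LinAlg

linalg = LinAlg()
np.random.seed(1234)
a = np.random.random(size=(11, 23)).astype(np.float32)
c_gold = np.matmul(a, a.T)               # a @ H(a) for real input
a_cuda = bf.asarray(a, space='cuda')
c = bf.zeros_like(c_gold, space='cuda')
linalg.matmul(1, a_cuda, None, 0, c)     # c = 1*a@H(a) + 0*c
# Only the lower triangle is guaranteed, so compare np.tril of both sides
np.testing.assert_allclose(np.tril(c.copy('system')), np.tril(c_gold),
                           rtol=1e-4, atol=1e-5)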
Example #3
# Imports, tolerances, and the H helper assumed by this snippet
# (RTOL/ATOL values are placeholders, not taken from the example itself):
import time
import unittest
import numpy as np
import bifrost as bf
from bifrost.linalg import LinAlg

RTOL = 1e-4
ATOL = 1e-5

def H(c):
    # Hermitian (conjugate) transpose of the last two axes
    return np.swapaxes(c, -1, -2).conj()

class TestLinAlg(unittest.TestCase):
    def setUp(self):
        self.linalg = LinAlg()
        np.random.seed(1234)

    def run_test_matmul_aa_ci8_shape(self, shape, transpose=False):
        # TODO: This currently never triggers the transpose path in the backend
        shape_complex = shape[:-1] + (shape[-1] * 2, )
        # Note: The xGPU-like correlation kernel does not support input values of -128 (only [-127:127])
        a8 = ((np.random.random(size=shape_complex) * 2 - 1) * 127).astype(
            np.int8)
        a_gold = a8.astype(np.float32).view(np.complex64)
        if transpose:
            a_gold = H(a_gold)
        # Note: np.matmul seems to be slow and inaccurate when there are batch dims
        c_gold = np.matmul(a_gold, H(a_gold))
        triu = np.triu_indices(shape[-2] if not transpose else shape[-1], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = a8.view(bf.DataType.ci8)
        a = bf.asarray(a, space='cuda')
        if transpose:
            a = H(a)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_aa_dtype_shape(self,
                                       shape,
                                       dtype,
                                       axes=None,
                                       conj=False):
        a = ((np.random.random(size=shape)) * 127).astype(dtype)
        if axes is None:
            axes = range(len(shape))
        aa = a.transpose(axes)
        if conj:
            aa = aa.conj()
        c_gold = np.matmul(aa, H(aa))
        triu = np.triu_indices(shape[axes[-2]], 1)
        c_gold[..., triu[0], triu[1]] = 0
        a = bf.asarray(a, space='cuda')
        aa = a.transpose(axes)
        if conj:
            aa = aa.conj()
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, aa, None, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_ci8_shape(self, shape, k, transpose=False):
        ashape_complex = shape[:-2] + (shape[-2], k * 2)
        bshape_complex = shape[:-2] + (k, shape[-1] * 2)
        # Random bytes cast to int8 (wraps into [-128, 127] on typical platforms)
        a8 = (np.random.random(size=ashape_complex) * 255).astype(np.int8)
        b8 = (np.random.random(size=bshape_complex) * 255).astype(np.int8)
        a_gold = a8.astype(np.float32).view(np.complex64)
        b_gold = b8.astype(np.float32).view(np.complex64)
        if transpose:
            a_gold, b_gold = H(b_gold), H(a_gold)
        c_gold = np.matmul(a_gold, b_gold)
        a = a8.view(bf.DataType.ci8)
        b = b8.view(bf.DataType.ci8)
        a = bf.asarray(a, space='cuda')
        b = bf.asarray(b, space='cuda')
        if transpose:
            a, b = H(b), H(a)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, a, b, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_dtype_shape(self,
                                       shape,
                                       k,
                                       dtype,
                                       axes_a=None,
                                       axes_b=None,
                                       transpose=False):
        # TODO: Allow testing separate transpose_a, transpose_b
        ashape = shape[:-2] + (shape[-2], k)
        bshape = shape[:-2] + (k, shape[-1])
        a = ((np.random.random(size=ashape)) * 127).astype(dtype)
        b = ((np.random.random(size=bshape)) * 127).astype(dtype)
        if axes_a is None:
            axes_a = range(len(ashape))
        if axes_b is None:
            axes_b = range(len(bshape))
        aa = a.transpose(axes_a)
        bb = b.transpose(axes_b)
        if transpose:
            aa, bb = H(bb), H(aa)
        c_gold = np.matmul(aa, bb)
        a = bf.asarray(a, space='cuda')
        b = bf.asarray(b, space='cuda')
        aa = a.transpose(axes_a)
        bb = b.transpose(axes_b)
        if transpose:
            aa, bb = H(bb), H(aa)
        c = bf.zeros_like(c_gold, space='cuda')
        self.linalg.matmul(1, aa, bb, 0, c)
        c = c.copy('system')
        np.testing.assert_allclose(c, c_gold, RTOL, ATOL)

    def run_test_matmul_ab_beamformer_kernel(self, ntime, nbeam, nstand,
                                             nchan):
        x_shape = (ntime, nchan, nstand * 2)
        w_shape = (nbeam, nchan, nstand * 2)
        x8 = ((np.random.random(size=x_shape + (2, )) * 2 - 1) * 127).astype(
            np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        w = ((np.random.random(size=w_shape + (2, )) * 2 - 1) * 127).astype(
            np.int8).astype(np.float32).view(np.complex64).reshape(w_shape)
        b_gold = np.matmul(w.transpose(1, 0, 2), x.transpose(1, 2, 0))

        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        w = bf.asarray(w, space='cuda')
        b = bf.zeros_like(b_gold, space='cuda')
        self.linalg.matmul(1, w.transpose(1, 0, 2), x.transpose(1, 2, 0), 0, b)
        b_ = b.copy('system')

        np.testing.assert_allclose(b_, b_gold, RTOL, ATOL)
        '''
        # Benchmarking
        nrep = 30
        bf.device.stream_synchronize()
        t0 = time.time()
        for _ in range(nrep):
            self.linalg.matmul(1, w.transpose(1,0,2), x.transpose(1,2,0), 0, b)
        bf.device.stream_synchronize()
        dt = time.time() - t0
        nflop = nrep * ntime * nbeam * nstand*2 * nchan * 8
        nbyte = nrep * (x.nbytes + w.nbytes + b.nbytes)
        nsamp = nrep * ntime * nchan
        print(nbeam, '\t'*1, nflop / dt / 1e9, 'GFLOP/s')
        print(nbeam, '\t'*2, nbyte / dt / 1e9, 'GB/s')
        print(nbeam, '\t'*3, nsamp / dt / 1e6, 'MHz/s')
        '''

    def run_test_matmul_aa_correlator_kernel(self,
                                             ntime,
                                             nstand,
                                             nchan,
                                             misalign=0):
        x_shape = (ntime, nchan, nstand * 2)
        perm = [1, 0, 2]
        x8 = ((np.random.random(size=x_shape + (2, )) * 2 - 1) * 127).astype(
            np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        x = x.transpose(perm)
        x = x[..., misalign:]
        b_gold = np.matmul(H(x), x)
        triu = np.triu_indices(x.shape[-1], 1)
        b_gold[..., triu[0], triu[1]] = 0
        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        x = x.transpose(perm)
        x = x[..., misalign:]
        b = bf.zeros_like(b_gold, space='cuda')
        self.linalg.matmul(1, None, x, 0, b)
        b = b.copy('system')
        np.testing.assert_allclose(b, b_gold, RTOL * 10, ATOL)

    def run_benchmark_matmul_aa_correlator_kernel(self, ntime, nstand, nchan):
        x_shape = (ntime, nchan, nstand * 2)
        perm = [1, 0, 2]
        x8 = ((np.random.random(size=x_shape + (2, )) * 2 - 1) * 127).astype(
            np.int8)
        x = x8.astype(np.float32).view(np.complex64).reshape(x_shape)
        x = x.transpose(perm)
        b_gold = np.matmul(H(x[:, [0], :]), x[:, [0], :])
        triu = np.triu_indices(x_shape[-1], 1)
        b_gold[..., triu[0], triu[1]] = 0
        x = x8.view(bf.DataType.ci8).reshape(x_shape)
        x = bf.asarray(x, space='cuda')
        x = x.transpose(perm)
        b = bf.zeros_like(b_gold, space='cuda')
        bf.device.stream_synchronize()
        t0 = time.time()
        nrep = 200
        for _ in range(nrep):
            self.linalg.matmul(1, None, x, 0, b)
        bf.device.stream_synchronize()
        dt = time.time() - t0
        nflop = nrep * nchan * ntime * nstand * (nstand + 1) // 2 * 2 * 2 * 8
        print(nstand, '\t', nflop / dt / 1e9, 'GFLOP/s')
        print('\t\t', nrep * ntime * nchan / dt / 1e6, 'MHz')

    def test_matmul_ab_beamformer_kernel_small(self):
        for nchan in range(1, 1 + 3):
            for ntime in range(1, 1 + 8):
                for nstand in [16, 64, 256]:
                    for nbeam in range(1, 1 + 12):
                        self.run_test_matmul_ab_beamformer_kernel(
                            ntime=ntime,
                            nbeam=nbeam,
                            nstand=nstand,
                            nchan=nchan)

    def test_matmul_ab_beamformer_kernel_large(self):
        for nbeam in range(1, 1 + 12):
            # print("--------------", nbeam, "---------------")
            self.run_test_matmul_ab_beamformer_kernel(ntime=512,
                                                      nbeam=nbeam,
                                                      nstand=256,
                                                      nchan=10)

    def test_matmul_aa_correlator_kernel_small(self):
        for nchan in range(1, 1 + 5):
            for ntime in [1, 2, 3, 4, 8, 12]:
                for nstand in range(1, 1 + 65):
                    for misalign in range(0, min(2 * (nstand - 1), 3), 2):
                        self.run_test_matmul_aa_correlator_kernel(
                            ntime=ntime,
                            nstand=nstand,
                            nchan=nchan,
                            misalign=misalign)

    def test_matmul_aa_correlator_kernel_large(self):
        self.run_test_matmul_aa_correlator_kernel(ntime=100,
                                                  nstand=200,
                                                  nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=99,
                                                  nstand=200,
                                                  nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=100,
                                                  nstand=200,
                                                  nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=99,
                                                  nstand=200,
                                                  nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=400,
                                                  nstand=100,
                                                  nchan=7)
        self.run_test_matmul_aa_correlator_kernel(ntime=399,
                                                  nstand=100,
                                                  nchan=7)
        self.run_test_matmul_aa_correlator_kernel(ntime=36,
                                                  nstand=97,
                                                  nchan=31)
        self.run_test_matmul_aa_correlator_kernel(ntime=35,
                                                  nstand=97,
                                                  nchan=31)
        self.run_test_matmul_aa_correlator_kernel(ntime=4, nstand=512, nchan=1)
        self.run_test_matmul_aa_correlator_kernel(ntime=512,
                                                  nstand=256,
                                                  nchan=3)
        self.run_test_matmul_aa_correlator_kernel(ntime=1000,
                                                  nstand=256,
                                                  nchan=1)
        # Benchmarks
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=4096, nstand=256, nchan=64)
        #for nstand in [16, 28, 64, 256, 1024]:
        #    self.run_benchmark_matmul_aa_correlator_kernel(ntime=512, nstand=nstand, nchan=256*256*48//2//(nstand*nstand))
        # print()
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=512, nstand=256, nchan=96)
        # print()
        #self.run_benchmark_matmul_aa_correlator_kernel(ntime=2048, nstand=2048, nchan=2)

    def run_test_matmul_aa_dtype(self, dtype):
        self.run_test_matmul_aa_dtype_shape((3, 2), dtype)
        self.run_test_matmul_aa_dtype_shape((11, 23), dtype)
        # Note: Only Hermitian transposes are supported
        self.run_test_matmul_aa_dtype_shape((11, 23), dtype, [1, 0], conj=True)
        self.run_test_matmul_aa_dtype_shape((111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((111, 223),
                                            dtype, [1, 0],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((1111, 2223), dtype)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223),
                                            dtype, [0, 2, 1],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223),
                                            dtype, [1, 2, 0],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((3, 111, 223), dtype, [1, 0, 2])
        # Note: The fastest dim can't be a batch dim, so these aren't supported
        #self.run_test_matmul_aa_dtype_shape((3,111,223),     dtype, [2,0,1])
        #self.run_test_matmul_aa_dtype_shape((3,111,223),     dtype, [2,1,0])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57),
                                            dtype, [0, 1, 3, 2],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype,
                                            [1, 0, 2, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57),
                                            dtype, [1, 0, 3, 2],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57),
                                            dtype, [1, 2, 3, 0],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype,
                                            [1, 2, 0, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype,
                                            [2, 1, 0, 3])
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57),
                                            dtype, [2, 1, 3, 0],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57),
                                            dtype, [2, 0, 3, 1],
                                            conj=True)
        self.run_test_matmul_aa_dtype_shape((5, 3, 111, 57), dtype,
                                            [2, 0, 1, 3])
        self.run_test_matmul_aa_dtype_shape((5, 7, 3, 111, 223), dtype)

    def run_test_matmul_ab_dtype_transpose(self, dtype, transpose):
        self.run_test_matmul_ab_dtype_shape((11, 23),
                                            7,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 23),
                                            11,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 23),
                                            23,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((11, 11),
                                            11,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((111, 223),
                                            77,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((111, 2223),
                                            777,
                                            dtype,
                                            transpose=transpose)
        self.run_test_matmul_ab_dtype_shape((3, 111, 223),
                                            77,
                                            dtype,
                                            transpose=transpose)

    def run_test_matmul_ab_dtype(self, dtype):
        self.run_test_matmul_ab_dtype_transpose(dtype, False)
        self.run_test_matmul_ab_dtype_transpose(dtype, True)

    def run_test_matmul_aa_ci8_transpose(self, transpose):
        # Note: The xGPU-like correlation kernel is only invoked when k%4 == 0
        for kp in [0, 1]:
            self.run_test_matmul_aa_ci8_shape((99 + kp, 3 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((11 + kp, 3 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((11 + kp, 23 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((111 + kp, 223 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((1111 + kp, 2223 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((3, 111 + kp, 223 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((5, 3, 111 + kp, 223 + kp),
                                              transpose=transpose)
            self.run_test_matmul_aa_ci8_shape((5, 7, 3, 111 + kp, 223 + kp),
                                              transpose=transpose)

    def test_matmul_aa_ci8(self):
        self.run_test_matmul_aa_ci8_transpose(False)
        self.run_test_matmul_aa_ci8_transpose(True)

    def run_test_matmul_ab_ci8_transpose(self, transpose):
        self.run_test_matmul_ab_ci8_shape((11, 23), 7777, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((111, 223), 777, transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((1111, 2223),
                                          77,
                                          transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((3, 111, 223),
                                          77,
                                          transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((5, 3, 111, 223),
                                          77,
                                          transpose=transpose)
        self.run_test_matmul_ab_ci8_shape((5, 7, 3, 111, 223),
                                          77,
                                          transpose=transpose)

    def test_matmul_ab_ci8(self):
        self.run_test_matmul_ab_ci8_transpose(False)
        self.run_test_matmul_ab_ci8_transpose(True)

    def test_matmul_aa_f32(self):
        self.run_test_matmul_aa_dtype(np.float32)

    def test_matmul_aa_f64(self):
        self.run_test_matmul_aa_dtype(np.float64)

    def test_matmul_aa_cf32(self):
        self.run_test_matmul_aa_dtype(np.complex64)

    def test_matmul_aa_cf64(self):
        self.run_test_matmul_aa_dtype(np.complex128)

    def test_matmul_ab_f32(self):
        self.run_test_matmul_ab_dtype(np.float32)

    def test_matmul_ab_f64(self):
        self.run_test_matmul_ab_dtype(np.float64)

    def test_matmul_ab_cf32(self):
        self.run_test_matmul_ab_dtype(np.complex64)

    def test_matmul_ab_cf64(self):
        self.run_test_matmul_ab_dtype(np.complex128)
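
Taken together, the tests above exercise three call patterns of LinAlg.matmul. The semantics below are inferred from the NumPy golden references in these tests, not from library documentation; alpha, beta, a, b, x, and c are placeholders:

# With linalg = LinAlg() and H(x) = np.swapaxes(x, -1, -2).conj():
linalg.matmul(alpha, a, None, beta, c)  # c = alpha*a@H(a) + beta*c (lower triangle valid)
linalg.matmul(alpha, None, x, beta, b)  # b = alpha*H(x)@x + beta*b (lower triangle valid)
linalg.matmul(alpha, a, b, beta, c)     # c = alpha*a@b + beta*c (general product)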
Example #4
def __init__(self, iring, nframe_per_integration, *args, **kwargs):
    super(CorrelateBlock, self).__init__(iring, *args, **kwargs)
    self.nframe_per_integration = nframe_per_integration
    self.linalg = LinAlg()
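
A hypothetical instantiation, to show how the extra constructor argument threads through (input_ring stands in for whatever ring precedes this block in a pipeline; the name is illustrative, not from the example):

# Integrate 1000 input frames into each output correlation matrix
corr = CorrelateBlock(input_ring, nframe_per_integration=1000)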
Example #5
# Imports assumed by this snippet (TransformBlock is taken here from
# bifrost.pipeline; treat the exact module paths as assumptions):
from copy import deepcopy
from bifrost.pipeline import TransformBlock
from bifrost.linalg import LinAlg

class CorrelateBlock(TransformBlock):
    def __init__(self, iring, nframe_per_integration, *args, **kwargs):
        super(CorrelateBlock, self).__init__(iring, *args, **kwargs)
        self.nframe_per_integration = nframe_per_integration
        self.linalg = LinAlg()

    def define_valid_input_spaces(self):
        return ('cuda', )

    def define_output_nframes(self, input_nframe):
        """Return output nframe, given input_nframes.
        """
        return 1

    def on_sequence(self, iseq):
        self.nframe_integrated = 0
        ihdr = iseq.header
        itensor = ihdr['_tensor']
        assert (itensor['labels'] == ['time', 'freq', 'station', 'pol'])
        ohdr = deepcopy(ihdr)
        otensor = ohdr['_tensor']
        otensor['dtype'] = 'cf32'
        for key in ['shape', 'labels', 'scales', 'units']:
            time_val, freq_val, stand_val, pol_val = itensor[key]
            otensor[key] = [
                time_val, freq_val, stand_val, pol_val, stand_val, pol_val
            ]
        # Append subscripts to stand and pol axis labels
        for i in range(2):
            otensor['labels'][2 + i] += '_i'
            otensor['labels'][4 + i] += '_j'
        # Update time scale
        otensor['scales'][0][1] *= self.nframe_per_integration
        # Note: For future reference, possible values for this entry could be:
        #         full, hermitian, lower, upper, strictly_lower, strictly_upper
        ohdr['matrix_fill_mode'] = 'lower'
        ohdr['gulp_nframe'] = min(ohdr['gulp_nframe'],
                                  self.nframe_per_integration)
        # Note: User can override by setting self.gulp_nframe
        gulp_nframe_actual = self.gulp_nframe or ohdr['gulp_nframe']
        if self.nframe_per_integration % gulp_nframe_actual != 0:
            raise ValueError(
                "gulp_nframe (%i) does not divide " % gulp_nframe_actual +
                "nframe_per_integration (%i)" % self.nframe_per_integration)
        return ohdr

    def on_data(self, ispan, ospan):
        idata = ispan.data
        odata = ospan.data
        # TODO: Consider allowing returning (nframe_release, nframe_commit)
        #         from on_data, to enable flexible decoupling of
        #         the amount of data read/written during each gulp.
        beta = 0 if self.nframe_integrated == 0 else 1

        ntime, nchan, nstand, npol = idata.shape
        # Squash stand + pol axes together and permute to get the right matmul
        idata_mm = idata.reshape([ntime, nchan, nstand * npol]) \
                        .transpose([1, 0, 2])
        odata_mm = odata.reshape([nchan, nstand * npol, nstand * npol])
        # Check that the memory addresses haven't changed
        assert (idata_mm.ctypes.data == idata.ctypes.data)
        assert (odata_mm.ctypes.data == odata.ctypes.data)
        self.linalg.matmul(1, None, idata_mm, beta, odata_mm)

        self.nframe_integrated += ispan.nframe
        assert (self.nframe_integrated <= self.nframe_per_integration)
        if self.nframe_integrated == self.nframe_per_integration:
            self.nframe_integrated = 0
            return 1
        else:
            return 0
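
The beta trick in on_data is what turns a plain matrix multiply into an integrator: the first gulp of each integration overwrites the output (beta=0) and every later gulp accumulates into it (beta=1). A NumPy sketch of the same accumulation, assuming the a=None semantics observed in the tests above:

import numpy as np

def integrate_gulp(idata_mm, odata_mm, first_gulp):
    # Equivalent of linalg.matmul(1, None, idata_mm, beta, odata_mm);
    # only the lower triangle of odata_mm is guaranteed by the kernel.
    beta = 0 if first_gulp else 1
    xh = np.swapaxes(idata_mm, -1, -2).conj()   # H(idata_mm)
    odata_mm[...] = beta * odata_mm + np.matmul(xh, idata_mm)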