Beispiel #1
0
def beam_cube_dde(beam, beam_lm_ext, beam_freq_map,
                  lm, parangles, pointing_errors,
                  antenna_scaling, frequencies):

    corrs = beam.shape[3:]

    if beam.shape[2] >= BEAM_NUD_LIMIT:
        raise ValueError("beam_nud exceeds %d" % BEAM_NUD_LIMIT)

    nsrc = lm.shape[0]
    ntime, na = parangles.shape
    nchan = frequencies.shape[0]
    ncorr = reduce(mul, corrs, 1)
    nchancorr = nchan*ncorr

    oshape = (nsrc, ntime, na, nchan) + corrs

    if len(corrs) > 1:
        # Flatten the beam correlation dims
        fbeam = beam.reshape(beam.shape[:3] + (ncorr,))
    else:
        fbeam = beam

    # Generate frequency interpolation kernel
    ikernel, iblock, idt = _generate_interp_kernel(beam_freq_map, frequencies)

    # Generate main beam cube kernel
    kernel, block, dtype = _generate_main_kernel(fbeam, beam_lm_ext,
                                                 beam_freq_map,
                                                 lm, parangles,
                                                 pointing_errors,
                                                 antenna_scaling,
                                                 frequencies,
                                                 len(oshape),
                                                 ncorr)
    # Call frequency interpolation kernel
    igrid = grids((nchan, 1, 1), iblock)
    freq_data = cp.empty((3, nchan), dtype=frequencies.dtype)

    try:
        ikernel(igrid, iblock, (frequencies, beam_freq_map, freq_data))
    except CompileException:
        log.exception(format_code(ikernel.code))
        raise

    # Call main beam cube kernel
    out = cp.empty((nsrc, ntime, na, nchan) + (ncorr,), dtype=beam.dtype)
    grid = grids((nchancorr, na, ntime), block)

    try:
        kernel(grid, block, (fbeam, beam_lm_ext, beam_freq_map,
                             lm, parangles, pointing_errors,
                             antenna_scaling, frequencies, freq_data,
                             nsrc, out))
    except CompileException:
        log.exception(format_code(kernel.code))
        raise

    return out.reshape(oshape)
Beispiel #2
0
def convert(inputs, input_schema, output_schema):
    (kernel, block, in_shape, out_shape,
     dtype) = _generate_kernel(inputs, input_schema, output_schema)

    # Flatten non-schema input dimensions,
    # from inspection of the cupy reshape code,
    # this incurs a copy when inputs is non-contiguous
    nsrc = reduce(mul, inputs.shape[:-len(in_shape)], 1)
    nelems = reduce(mul, in_shape, 1)

    rinputs = inputs.reshape(nsrc, nelems)
    assert rinputs.flags.c_contiguous
    grid = grids((nsrc, 1, 1), block)

    outputs = cp.empty(shape=rinputs.shape, dtype=dtype)

    try:
        kernel(grid, block, (rinputs, outputs))
    except CompileException:
        log.exception(format_code(kernel.code))
        raise

    shape = inputs.shape[:-len(in_shape)] + out_shape
    outputs = outputs.reshape(shape)
    assert outputs.flags.c_contiguous
    return outputs
Beispiel #3
0
def test_cuda_inplace_warp_transpose(ncorrs, dtype, nvis, debug):
    cp = pytest.importorskip('cupy')

    path = pjoin("rime", "cuda", "tests", "test_warp_transpose.cu.j2")
    render = jinja_env.get_template(path).render

    dtypes = {
        np.float32: 'float',
        np.float64: 'double',
        np.int32: 'int',
        np.complex64: 'float2',
        np.complex128: 'double2',
    }

    code = render(type=dtypes[dtype], warp_size=32, corrs=ncorrs,
                  debug=debug).encode("utf-8")
    kernel = cp.RawKernel(code, "kernel")

    inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs)
    outputs = cp.empty_like(inputs)
    args = (inputs, outputs)
    block = (256, 1, 1)
    grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block))

    try:
        kernel(grid, block, args)
    except cp.cuda.compiler.CompileException:
        print(format_code(kernel.code))
        raise

    np.testing.assert_array_almost_equal(cp.asnumpy(inputs),
                                         cp.asnumpy(outputs))
Beispiel #4
0
def compile_using_nvcc(source, options=None, arch=None, filename='kern.cu'):
    options = options or []

    if arch is None:
        cuda_info = get_cuda_info()
        arch = min(
            [dev['major'] * 10 + dev['minor'] for dev in cuda_info['devices']])

    cc = get_compiler()
    settings = get_compiler_setting()
    arch = "--generate-code=arch=compute_{a},code=sm_{a}".format(a=arch)

    options += ['-cubin']

    cupy_path = resource_filename("cupy", pjoin("core", "include"))
    settings['include_dirs'].append(cupy_path)

    with _tempdir() as tmpdir:
        tmpfile = pjoin(tmpdir, filename)

        with open(tmpfile, "w") as f:
            f.write(source)

        try:
            stderr_file = pjoin(tmpdir, "stderr.txt")

            with stdchannel_redirected(sys.stderr, stderr_file):
                objects = cc.compile([tmpfile],
                                     include_dirs=settings['include_dirs'],
                                     macros=settings['define_macros'],
                                     extra_postargs=options)
        except errors.CompileError as e:
            with open(stderr_file, "r") as f:
                errs = f.read()

            lines = [
                "The following source code",
                format_code(source), "",
                "created the following compilation errors", "",
                errs.strip(),
                str(e).strip()
            ]

            ex = errors.CompileError("\n".join(lines))
            raise (ex, None, sys.exc_info()[2])

        assert len(objects) == 1

        mod = cp.cuda.function.Module()
        mod.load_file(objects[0])

        return mod
Beispiel #5
0
def phase_delay(lm, uvw, frequency):
    kernel, block, out_dtype = _generate_kernel(lm, uvw, frequency)
    grid = grids((frequency.shape[0], uvw.shape[0], 1), block)
    out = cp.empty(shape=(lm.shape[0], uvw.shape[0], frequency.shape[0]),
                   dtype=out_dtype)

    try:
        kernel(grid, block, (lm, uvw, frequency, out))
    except CompileException:
        log.exception(format_code(kernel.code))
        raise

    return out
Beispiel #6
0
def feed_rotation(parallactic_angles, feed_type='linear'):
    """ Cupy implementation of the feed_rotation kernel. """
    kernel, block, out_dtype = _generate_kernel(parallactic_angles, feed_type)
    in_shape = parallactic_angles.shape
    parallactic_angles = parallactic_angles.ravel()
    grid = grids((parallactic_angles.shape[0], 1, 1), block)
    out = cp.empty(shape=(parallactic_angles.shape[0], 4), dtype=out_dtype)

    try:
        kernel(grid, block, (parallactic_angles, out))
    except CompileException:
        log.exception(format_code(kernel.code))
        raise

    return out.reshape(in_shape + (2, 2))
Beispiel #7
0
def test_cuda_shuffle_transpose_2(ncorrs):
    cp = pytest.importorskip("cupy")
    jinja2 = pytest.importorskip("jinja2")

    # Implement a warp transpose using Kepler's register shuffle instructions
    # as described in del Mundo's
    # `Towards a performance-portable FFT library for heterogeneous computing`
    # https://doi.org/10.1145/2597917.2597943
    # https://homes.cs.washington.edu/~cdel/papers/cf14-fft.pdf
    # The poster is especially informative
    # https://homes.cs.washington.edu/~cdel/posters/073113-on-efficacy-shuffle-sc2013.pdf
    # and
    # `Enabling Efficient Intra-Warp Communication for
    #  Fourier Transforms in a Many-Core Architecture.`
    # https://homes.cs.washington.edu/~cdel/papers/sc13-shuffle-abstract.pdf
    # http://sc13.supercomputing.org/sites/default/files/PostersArchive/spost142.html

    _TEMPLATE = jinja2.Template("""
    #include <cupy/carray.cuh>

    {%- if (corrs < 1 or (corrs.__and__(corrs - 1) != 0)) %}
    {{ throw("corrs must be 1 or a power of 2") }}
    {%- endif %}

    {% macro warp_transpose(var_name, var_type, var_length, tmp_name="tmp") %}
    {% if var_length > 1 %}
        {
            int mask = __activemask();
            int case_id = threadIdx.x & {{var_length - 1}};
            {{var_type}} {{tmp_name}};  // For variable swaps

            // Horizontal (inter-thread) Rotation
            int addr = case_id;
            {%- for case in range(var_length) %}
            {{var_name}}[{{case}}] = __shfl_sync(mask, {{var_name}}[{{case}}], addr, {{var_length}});
            {%- if not loop.last %}
            addr = __shfl_sync(mask, addr, (case_id + 1) & {{var_length - 1}}, {{var_length}});
            {%- endif %}
            {%- endfor %}

            // Vertical (intra-thread) Rotation
            {%- for case in range(var_length) %}
            // Case {{case}}
            {%- set cycles = register_assign_cycles(corrs, case) %}
            {%- for cycle in cycles %}
            {%- set cstart = cycle[0][0] %}
            {{tmp_name}} = {{var_name}}[{{cstart}}];
            {%- for dest, src in cycle %}
            {%- set src_var = tmp_name if cstart == src else var_name + "[" + src|string + "]" %}
            {{var_name}}[{{dest}}] = case_id == {{case}} ? {{src_var}} : {{var_name}}[{{dest}}];
            {%- endfor %}
            {%- endfor %}
            {%- endfor %}

            // Horizontal (inter-thread) Rotation
            addr = ({{var_length}} - case_id) & {{var_length - 1}};
            {%- for case in range(var_length) %}
            {{var_name}}[{{case}}] = __shfl_sync(mask, {{var_name}}[{{case}}], addr, {{var_length}});
            {%- if not loop.last %}
            addr = __shfl_sync(mask, addr, (case_id + {{var_length - 1}}) & {{var_length - 1}}, {{var_length}});
            {%- endif %}
            {%- endfor %}
        }
    {%- endif %}
    {%- endmacro %}

    {%- set width = corrs %}

    extern "C" __global__ void kernel(
        const CArray<{{type}}, 2> input,
        CArray<{{type}}, 2> output)
    {
        const ptrdiff_t & nvis = input.shape()[0];
        int v = blockIdx.x*blockDim.x + threadIdx.x;

        if(v >= nvis)
            { return; }

        // Array to hold our variables
        {{type}} values[{{corrs}}];

        {% for corr in range(corrs) %}
        values[{{corr}}] = input[v + {{corr}}*nvis];
        {%- endfor %}

        if({{debug}})
        {
            if(threadIdx.x == 0)
                { printf("mask %d\\n", __activemask()); }

            printf("[%d] %d %d %d %d\\n",
                   threadIdx.x & {{warp_size - 1}},
                   values[0], values[1],
                   values[2], values[3]);

            if(threadIdx.x == 0)
                { printf("\\n"); }
        }

        {{ warp_transpose("values", type, corrs) }}
        {{ warp_transpose("values", type, corrs) }}

        if({{debug}})
        {
            if(threadIdx.x == 0)
                { printf("\\n"); }

            printf("[%d] %d %d %d %d\\n",
                   threadIdx.x & {{warp_size - 1}},
                   values[0], values[1],
                   values[2], values[3]);
        }

        {% for corr in range(corrs) %}
        output[v + {{corr}}*nvis] = values[{{corr}}];
        {%- endfor %}
    }
    """)  # noqa

    nvis = 32
    dtype = np.int32

    dtypes = {
        np.float32: 'float',
        np.float64: 'double',
        np.int32: 'int',
    }

    code = _TEMPLATE.render(type=dtypes[dtype],
                            throw=throw_helper,
                            register_assign_cycles=register_assign_cycles,
                            warp_size=32,
                            corrs=ncorrs,
                            debug="false").encode("utf-8")
    kernel = cp.RawKernel(code, "kernel")

    inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs)
    outputs = cp.empty_like(inputs)
    args = (inputs, outputs)
    block = (256, 1, 1)
    grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block))

    try:
        kernel(grid, block, args)
    except cp.cuda.compiler.CompileException:
        print(format_code(kernel.code))
        raise

    np.testing.assert_array_almost_equal(cp.asnumpy(inputs),
                                         cp.asnumpy(outputs))
    return

    # Dead code
    print(grid, block)
    print("\n")
    print(inputs)
    print(outputs)
Beispiel #8
0
def test_cuda_shuffle_transpose():
    cp = pytest.importorskip("cupy")
    jinja2 = pytest.importorskip("jinja2")

    _TEMPLATE = jinja2.Template("""
    #include <cupy/carray.cuh>

    #define debug {{debug}}

    extern "C" __global__ void kernel(
        const CArray<{{type}}, 2> input,
        CArray<{{type}}, 2> output)
    {
        const ptrdiff_t & nvis = input.shape()[0];
        int v = blockIdx.x*blockDim.x + threadIdx.x;
        int lane_id = threadIdx.x & ({{warp_size}} - 1);

        if(v >= nvis)
            { return; }

        // Input correlation handled by this thread
        int mask = __activemask();


        {{type}} loads[{{corrs}}];
        {{type}} values[{{corrs}}];

        {% for corr in range(corrs) %}
        loads[{{corr}}] = input[v + {{corr}}*nvis];
        {%- endfor %}

        __syncthreads();

        if(debug)
        {
            if(threadIdx.x == 0)
                { printf("mask %d\\n", mask); }

            printf("[%d] %d %d %d %d\\n",
                   lane_id,
                   loads[0], loads[1],
                   loads[2], loads[3]);

            if(threadIdx.x == 0)
                { printf("\\n"); }
        }


        // Tranpose forward
        #pragma unroll ({{corrs}})
        for(int corr=0; corr < {{corrs}}; ++corr)
        {
            int src_corr = ({{corrs}} - corr + lane_id) % {{corrs}};
            int dest_corr = (lane_id + corr) % {{corrs}};
            int src_lane = (lane_id / {{corrs}})*{{corrs}} + dest_corr;

            values[dest_corr] = __shfl_sync(mask, loads[src_corr],
                                     src_lane, {{warp_size}});
        }

        // Copy
        #pragma unroll ({{corrs}})
        for(int corr=0; corr < {{corrs}}; ++corr)
        {
            loads[corr] = values[corr];
        }

        // Transpose backward
        #pragma unroll ({{corrs}})
        for(int corr=0; corr < {{corrs}}; ++corr)
        {
            int src_corr = ({{corrs}} - corr + lane_id) % {{corrs}};
            int dest_corr = (lane_id + corr) % {{corrs}};
            int src_lane = (lane_id / {{corrs}})*{{corrs}} + dest_corr;

            values[dest_corr] = __shfl_sync(mask, loads[src_corr],
                                     src_lane, {{warp_size}});
        }


        __syncthreads();

        if(debug)
        {
            if(threadIdx.x == 0)
                { printf("\\n"); }

            printf("[%d] %d %d %d %d\\n",
                   lane_id,
                   values[0], values[1],
                   values[2], values[3]);
        }


        {% for corr in range(corrs) %}
        output[v + {{corr}}*nvis] = values[{{corr}}];
        {%- endfor %}
    }
    """)

    nvis = 32
    ncorrs = 4
    dtype = np.int32

    dtypes = {
        np.float32: 'float',
        np.float64: 'double',
        np.int32: 'int',
    }

    code = _TEMPLATE.render(type=dtypes[dtype],
                            warp_size=32,
                            corrs=ncorrs,
                            debug="false").encode("utf-8")
    kernel = cp.RawKernel(code, "kernel")

    inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs)
    outputs = cp.empty_like(inputs)
    args = (inputs, outputs)
    block = (256, 1, 1)
    grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block))

    try:
        kernel(grid, block, args)
    except cp.cuda.compiler.CompileException:
        print(format_code(kernel.code))
        raise

    np.testing.assert_array_almost_equal(cp.asnumpy(inputs),
                                         cp.asnumpy(outputs))
    return

    # Dead code
    print(grid, block)
    print("\n")
    print(inputs)
    print(outputs)
Beispiel #9
0
def predict_vis(time_index,
                antenna1,
                antenna2,
                dde1_jones=None,
                source_coh=None,
                dde2_jones=None,
                die1_jones=None,
                base_vis=None,
                die2_jones=None):
    """ Cupy implementation of the feed_rotation kernel. """

    have_ddes = dde1_jones is not None and dde2_jones is not None
    have_dies = die1_jones is not None and die2_jones is not None
    have_coh = source_coh is not None
    have_bvis = base_vis is not None

    # Infer the output shape
    if have_ddes:
        row = time_index.shape[0]
        chan = dde1_jones.shape[3]
        corrs = dde1_jones.shape[4:]
    elif have_coh:
        row = time_index.shape[0]
        chan = source_coh.shape[2]
        corrs = source_coh.shape[3:]
    elif have_dies:
        row = time_index.shape[0]
        chan = die1_jones.shape[2]
        corrs = die1_jones.shape[3:]
    elif have_bvis:
        row = time_index.shape[0]
        chan = base_vis.shape[1]
        corrs = base_vis.shape[2:]
    else:
        raise ValueError("Insufficient inputs supplied for determining "
                         "the output shape")

    ncorrs = len(corrs)

    # Flatten correlations
    if ncorrs == 2:
        flat_corrs = reduce(mul, corrs, 1)

        if have_ddes:
            dde_shape = dde1_jones.shape[:-ncorrs] + (flat_corrs, )
            dde1_jones = dde1_jones.reshape(dde_shape)
            dde2_jones = dde2_jones.reshape(dde_shape)

        if have_coh:
            coh_shape = source_coh.shape[:-ncorrs] + (flat_corrs, )
            source_coh = source_coh.reshape(coh_shape)

        if have_dies:
            die_shape = die1_jones.shape[:-ncorrs] + (flat_corrs, )
            die1_jones = die1_jones.reshape(die_shape)
            die2_jones = die2_jones.reshape(die_shape)

        if have_bvis:
            bvis_shape = base_vis.shape[:-ncorrs] + (flat_corrs, )
            base_vis = base_vis.reshape(bvis_shape)

    elif ncorrs == 1:
        flat_corrs = corrs[0]
    else:
        raise ValueError("Invalid correlation setup %s" % (corrs, ))

    out_shape = (row, chan) + (flat_corrs, )

    kernel, block, out_dtype = _generate_kernel(time_index, antenna1, antenna2,
                                                dde1_jones, source_coh,
                                                dde2_jones, die1_jones,
                                                base_vis, die2_jones, corrs,
                                                len(out_shape))

    grid = grids((chan * flat_corrs, row, 1), block)
    out = cp.empty(shape=out_shape, dtype=out_dtype)

    # Normalise the time index
    # TODO(sjperkins)
    # Normalise the time index with a device-wide reduction
    norm_time_index = time_index - time_index.min()

    args = (norm_time_index, antenna1, antenna2, dde1_jones, source_coh,
            dde2_jones, die1_jones, base_vis, die2_jones, out)

    try:
        kernel(grid, block, tuple(a for a in args if a is not None))
    except CompileException:
        log.exception(format_code(kernel.code))
        raise

    return out.reshape((row, chan) + corrs)