Example #1
0
def _create_vector_types():
    """Register numpy dtypes mirroring CUDA's built-in vector types
    (int2, float4, ...) and attach each dtype, together with a
    ``make_<name>`` constructor helper, to the ``vec`` namespace."""
    from pycuda.characterize import platform_bits

    # C "long" follows the platform word size.
    if platform_bits() == 32:
        long_dtype, ulong_dtype = np.int32, np.uint32
    else:
        long_dtype, ulong_dtype = np.int64, np.uint64

    component_names = ["x", "y", "z", "w"]

    from pycuda.tools import get_or_register_dtype

    type_table = [
        ('char', np.int8, [1, 2, 3, 4]),
        ('uchar', np.uint8, [1, 2, 3, 4]),
        ('short', np.int16, [1, 2, 3, 4]),
        ('ushort', np.uint16, [1, 2, 3, 4]),
        ('int', np.int32, [1, 2, 3, 4]),
        ('uint', np.uint32, [1, 2, 3, 4]),
        ('long', long_dtype, [1, 2, 3, 4]),
        ('ulong', ulong_dtype, [1, 2, 3, 4]),
        ('longlong', np.int64, [1, 2]),
        ('ulonglong', np.uint64, [1, 2]),
        ('float', np.float32, [1, 2, 3, 4]),
        ('double', np.float64, [1, 2]),
    ]

    for prefix, scalar_dtype, widths in type_table:
        for width in widths:
            vec_name = "%s%d" % (prefix, width)
            fields = [(component_names[i], scalar_dtype)
                      for i in range(width)]
            dtype = np.dtype(fields)

            get_or_register_dtype(vec_name, dtype)
            setattr(vec, vec_name, dtype)

            # Build e.g. make_float4(x, y, z, w).  eval runs only on a
            # trusted, locally assembled string; it is used so the
            # factory exposes real named parameters (x, y, z, w).
            arg_list = ",".join(component_names[:width])
            maker = eval(
                "lambda %s: array((%s), dtype=my_dtype)"
                % (arg_list, arg_list),
                dict(array=np.array, my_dtype=dtype))
            setattr(vec, "make_" + vec_name, staticmethod(maker))
Example #2
0
def _create_vector_types():
    """Populate the ``vec`` namespace with numpy equivalents of CUDA's
    vector types and with ``make_<name>`` factory functions."""
    from pycuda.characterize import platform_bits
    from pycuda.tools import get_or_register_dtype

    bits = platform_bits()
    # The width of C "long" matches the platform pointer size.
    long_dtype = np.int32 if bits == 32 else np.int64
    ulong_dtype = np.uint32 if bits == 32 else np.uint64

    field_names = ["x", "y", "z", "w"]

    for base_name, base_type, counts in (
            ('char', np.int8, [1, 2, 3, 4]),
            ('uchar', np.uint8, [1, 2, 3, 4]),
            ('short', np.int16, [1, 2, 3, 4]),
            ('ushort', np.uint16, [1, 2, 3, 4]),
            ('int', np.int32, [1, 2, 3, 4]),
            ('uint', np.uint32, [1, 2, 3, 4]),
            ('long', long_dtype, [1, 2, 3, 4]),
            ('ulong', ulong_dtype, [1, 2, 3, 4]),
            ('longlong', np.int64, [1, 2]),
            ('ulonglong', np.uint64, [1, 2]),
            ('float', np.float32, [1, 2, 3, 4]),
            ('double', np.float64, [1, 2]),
            ):
        for count in counts:
            name = base_name + str(count)

            struct_fields = []
            for fname in field_names[:count]:
                struct_fields.append((fname, base_type))
            dtype = np.dtype(struct_fields)

            get_or_register_dtype(name, dtype)
            setattr(vec, name, dtype)

            # The generated lambda gives the factory real parameter
            # names (x, y, z, w); eval only ever sees this locally
            # assembled, trusted source string.
            args = ",".join(field_names[:count])
            src = "lambda %s: array((%s), dtype=my_dtype)" % (args, args)
            factory = staticmethod(
                eval(src, dict(array=np.array, my_dtype=dtype)))
            setattr(vec, "make_" + name, factory)
Example #3
0
    + maxloc_preamble
)

# Double-precision variant of the max+location C preamble: the #defines
# select the struct name ("maxlocd") and member types that the shared
# maxloc_preamble body (defined earlier in the file) is written against.
maxloc_preamble_double = (
    """
    #define MAXLOCN maxlocd
    #define TTYPE double
    #define LTYPE long
"""
    + maxloc_preamble
)

# Records pairing a running maximum ("max") with a location ("loc") —
# presumably the index the maximum was found at (see map_expr below).
maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])

# Register the dtypes under the same names the preambles #define for
# MAXLOCN, so the GPU code and numpy refer to the same record type.
maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)

# Single-precision max+location reduction kernel.  maxloc_start/
# maxloc_red/maxloc_map are expected to come from maxloc_preamble.
mls = LowerLatencyReductionKernel(
    maxloc_dtype_single,
    neutral="maxloc_start()",
    reduce_expr="maxloc_red(a, b)",
    map_expr="maxloc_map(x[i], i)",
    arguments="float *x",
    preamble=maxloc_preamble_single,
)

mld = LowerLatencyReductionKernel(
    maxloc_dtype_double,
    neutral="maxloc_start()",
    reduce_expr="maxloc_red(a, b)",
Example #4
0
# Single-precision variant of the max+location C preamble: the #defines
# select the struct name ("maxlocs") and member types that the shared
# maxloc_preamble body (defined earlier in the file) is written against.
maxloc_preamble_single = """
    #define MAXLOCN maxlocs
    #define TTYPE float
    #define LTYPE int
""" + maxloc_preamble

# Double-precision variant of the same preamble.
maxloc_preamble_double = """
    #define MAXLOCN maxlocd
    #define TTYPE double
    #define LTYPE long
""" + maxloc_preamble

# Records pairing a running maximum ("max") with a location ("loc") —
# presumably the index the maximum was found at (see map_expr below).
maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])

# Register the dtypes under the same names the preambles #define for
# MAXLOCN, so the GPU code and numpy refer to the same record type.
maxloc_dtype_single = get_or_register_dtype("maxlocs",
                                            dtype=maxloc_dtype_single)
maxloc_dtype_double = get_or_register_dtype("maxlocd",
                                            dtype=maxloc_dtype_double)

# Single-precision max+location reduction kernel.  maxloc_start/
# maxloc_red/maxloc_map are expected to come from maxloc_preamble.
mls = LowerLatencyReductionKernel(maxloc_dtype_single,
                                  neutral="maxloc_start()",
                                  reduce_expr="maxloc_red(a, b)",
                                  map_expr="maxloc_map(x[i], i)",
                                  arguments="float *x",
                                  preamble=maxloc_preamble_single)

mld = LowerLatencyReductionKernel(maxloc_dtype_double,
                                  neutral="maxloc_start()",
                                  reduce_expr="maxloc_red(a, b)",
                                  map_expr="maxloc_map(x[i], i)",
                                  arguments="double *x",
Example #5
0
def get_shift_kernel(num_shifts: int):
    """Build a reduction kernel that applies ``num_shifts`` different
    phase shifts to complex single-precision input and accumulates a
    separate (real, imag) sum per shift.

    num_shifts -- number of shift arguments (shift0 .. shift<N-1>) the
    generated kernel accepts.  Returns a LowerLatencyReductionKernel
    whose result is a shift_t<N> record (N packed complex64 sums).
    """
    # Mako template for the CUDA-side struct and helpers.  shift_t<N>
    # carries one real/imag accumulator pair (vr<i>, vi<i>) per shift;
    # shift_red adds structs component-wise, shift_start zeroes one, and
    # shift_map rotates one input sample by a shift-dependent phase.
    shift_preamble = Template("""
    struct shift_t${n}{
        % for i in range(n):
             float vr${i};
             float vi${i};
        % endfor
        
        __device__
        shift_t${n}(){}
        
        __device__
        shift_t${n}(shift_t${n} const &src): 
                    % for i in range(n-1):
                        vr${i}(src.vr${i}), 
                        vi${i}(src.vi${i}),
                    % endfor
                    vr${n-1}(src.vr${n-1}), 
                    vi${n-1}(src.vi${n-1})
                    {}
                   
        __device__
        shift_t${n}(shift_t${n} const volatile &src): 
                    % for i in range(n-1):
                        vr${i}(src.vr${i}), 
                        vi${i}(src.vi${i}),
                    % endfor
                    vr${n-1}(src.vr${n-1}), 
                    vi${n-1}(src.vi${n-1})
                    {}
        
        __device__
        shift_t${n} volatile &operator=( shift_t${n} const &src) volatile{
            % for i in range(n):
                 vr${i} = src.vr${i};
                 vi${i} = src.vi${i};
            % endfor
            return *this;
        }
    };
    


    __device__ shift_t${n} shift_red(shift_t${n} a, shift_t${n} b){
        % for i in range(n):
             a.vr${i} += b.vr${i};
             a.vi${i} += b.vi${i};
        % endfor
        return a;
    }

    __device__ shift_t${n} shift_start(){
        shift_t${n} t;
        % for i in range(n):
             t.vr${i}=0;
             t.vi${i}=0;
        % endfor
        return t;
    }

    __device__ shift_t${n} shift_map(pycuda::complex<float> x, 
                               % for i in range(n):
                                    float shift${i},
                               % endfor
                               float offset, float slen){
        shift_t${n} t; 
        float pphase = offset * 2 * 3.141592653  / slen;
        float  pr, pi;
        
        % for i in range(n):
            __sincosf(pphase * shift${i}, &pi, &pr);
           
            // Phase shift the input data (x) to correspond to a time shift
            t.vr${i} = x._M_re * pr - x._M_im * pi;
            t.vi${i} = x._M_re * pi + x._M_im * pr;  
        % endfor
        return t;
    }
    """).render(n = num_shifts)
    
    # Comma-separated argument fragments spliced into map_expr and the
    # kernel signature below.  Both end with a separator that the
    # trailing "offset+i, slen" / "float offset, ..." text completes.
    shift_map_args = ""
    shift_krnl_args = ""
    for i in range(num_shifts):
        shift_map_args += " shift%s," % i
        shift_krnl_args += " float shift%s, " % i        

    # numpy view of shift_t<N>: one field holding N complex64 values,
    # i.e. 2*N packed floats — assumed to match the C struct's vr/vi
    # layout; confirm if the struct definition above ever changes.
    sd = np.dtype([("v1", np.complex64, num_shifts)])
    shift_t = get_or_register_dtype('shift_t%s' % num_shifts, sd)

    shift_krnl = LowerLatencyReductionKernel(shift_t, neutral="shift_start()",
                reduce_expr="shift_red(a, b)", map_expr="shift_map(x[i], " + shift_map_args + " offset+i, slen)",
                arguments="pycuda::complex<float> *x," + shift_krnl_args + "float offset, float slen ",
                preamble=shift_preamble)
                
    return shift_krnl