Example #1
    def _apply_transform_cp(self):
        source_shape_y: cp.int32 = cp.int32(self._source_cp.shape[0])
        source_shape_x: cp.int32 = cp.int32(self._source_cp.shape[1])
        frame_shape_y: cp.int32 = cp.int32(self.frame_shape.y)
        frame_shape_x: cp.int32 = cp.int32(self.frame_shape.x)

        matrix = cp.asarray(self._matrix, dtype=cp.float32)
        vector = cp.asarray(self._vector, dtype=cp.float32)
        frame_to_source_fp32 = cp.matmul(self._frame_pixels_cp,
                                         matrix.T) + vector
        frame_to_source_int32 = cp.rint(frame_to_source_fp32).astype(cp.int32)

        # for 3D array: 0 is x, 1 is y
        source_x = frame_to_source_int32[:, :, 0]  # 2D array of x pixel in source
        source_y = frame_to_source_int32[:, :, 1]  # 2D array of y pixel in source

        # boolean 2-D array of portal pixels that map to a pixel on the source
        # will be false if the pixel on source would fall outside of source
        mapped = (source_y >= 0) & (source_y < source_shape_y) & \
                 (source_x >= 0) & (source_x < source_shape_x)

        frame_rgba = cp.zeros(shape=(frame_shape_y, frame_shape_x, 4),
                              dtype=cp.uint8)
        # if cp.all(mapped):
        #     frame_rgba[:, :] = self._source_cp[source_y, source_x, :]
        # else:
        #     frame_rgba[mapped, :] = self._source_cp[source_y[mapped], source_x[mapped], :]
        frame_rgba[mapped, :] = self._source_cp[source_y[mapped],
                                                source_x[mapped], :]
        # self.image_rgba[~mapped, :] = self._zero_uint     probably slower than just zeroing everything first

        self._frame_rgba = cp.asnumpy(frame_rgba)
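The masked-gather pattern above is easy to lift out of the class; here is a minimal standalone sketch (the array sizes and the affine transform are invented for illustration):

import cupy as cp

# Hypothetical source image and frame size; only the masked gather comes from the example.
src = cp.random.randint(0, 255, (480, 640, 4), dtype=cp.uint8)   # source RGBA
ys, xs = cp.meshgrid(cp.arange(240), cp.arange(320), indexing='ij')
pix = cp.stack([xs, ys], axis=-1).astype(cp.float32)             # per-pixel (x, y)
matrix = cp.asarray([[2.0, 0.0], [0.0, 2.0]], dtype=cp.float32)  # frame -> source
vector = cp.asarray([10.0, 20.0], dtype=cp.float32)

src_xy = cp.rint(pix @ matrix.T + vector).astype(cp.int32)
sx, sy = src_xy[..., 0], src_xy[..., 1]
mapped = (sy >= 0) & (sy < src.shape[0]) & (sx >= 0) & (sx < src.shape[1])

frame = cp.zeros((240, 320, 4), dtype=cp.uint8)
frame[mapped] = src[sy[mapped], sx[mapped]]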
Example #2
@cuda.jit  # Numba CUDA kernel: cuda.grid() is only valid in jit-compiled kernels
def _numba_psf(in_x, in_y, psf, spacing_x, spacing_y, O_begin, O_end, Nx, Ny,
               width_x, width_y, mesh_ratio):

    btx, bty = cuda.grid(2)

    x = in_x[bty, btx]
    y = in_y[bty, btx]

    temp = 0

    for order in range(cupy.int32(O_begin), cupy.int32(O_end), 1):
        if order == 0:
            continue

        intensity = (math.sin(order / mesh_ratio) / (order / mesh_ratio))**2

        for i in range(cupy.int32(len(spacing_x))):
            dx = spacing_x[i]
            dy = spacing_y[i]
            x_centered = x - (0.5 * Nx + dx * order + 0.5)
            y_centered = y - (0.5 * Ny + dy * order + 0.5)
            temp += math.exp(-width_x * x_centered * x_centered -
                             width_y * y_centered * y_centered) * intensity

    psf[bty, btx] = temp
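A hedged launch sketch for the kernel above; the grid geometry and every physical parameter are assumptions. Note the grid must tile the psf array exactly, because the kernel writes psf[bty, btx] without a bounds check:

import numpy as np
from numba import cuda

ny, nx = 256, 256                                  # assumed PSF image size
yy, xx = np.mgrid[0:ny, 0:nx].astype(np.float32)   # pixel coordinate grids
in_x, in_y = cuda.to_device(xx), cuda.to_device(yy)
psf = cuda.device_array((ny, nx), dtype=np.float32)
spacing_x = cuda.to_device(np.array([1.0, 0.0], dtype=np.float32))
spacing_y = cuda.to_device(np.array([0.0, 1.0], dtype=np.float32))

threads = (16, 16)
blocks = (nx // threads[0], ny // threads[1])      # exact tiling, no remainder
_numba_psf[blocks, threads](in_x, in_y, psf, spacing_x, spacing_y,
                            -5, 6, nx, ny, 0.01, 0.01, 3.0)
print(psf.copy_to_host().sum())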
Example #3
def load_image(path_dir, mode):

    # -----------------------------------------------------------------------------
    if mode:  # prepare the training dataset
        fnames = sorted(glob.glob(path_dir + "/*"))
        #  file = [(cp.load(f)/255.0).astype(cp.float32).transpose([2, 0, 1]) for f in fnames]
        file = []
        for index, item in enumerate(fnames):
            item = cp.load(item).astype(cp.float32).transpose([2, 0, 1]) / 255.0
            file.append(item)
        # label = [os.path.basename(f) for f in fnames]
        labels = []
        for num in range(len(fnames) // 2):  # two consecutive files share a label
            labels.append(cp.int32(num))
            labels.append(cp.int32(num))

        # List = list(zip(file, labels))
        # data = chainer.datasets.TupleDataset(List)
        data = chainer.datasets.TupleDataset(file, labels)
        return data
        # -------------------------------------------------------------------------------
    if not mode:  # prepare the classification (inference) dataset
        data = []
        labels = []
        filelist = sorted(glob.glob(path_dir + "/*"))
        for index, item in enumerate(filelist):
            item = np.load(item).astype(np.float32).transpose([2, 0, 1]) / 255.0
            item = item.reshape((1, item.shape[0], item.shape[1], item.shape[2]))
            data.append(item)
            labels.append(int(index))

        return data, labels
Example #4
def conv_forward_pass(image, filt, bias):
    # print("CUDA cODE")
    (n_f, n_c_f, f, _) = filt.shape # filter dimensions
    n_c, in_dim, _ = image.shape # image dimensions

    convfilter_kernel = cp.RawKernel(r'''
        extern "C" __global__
        void my_conv(const float* img, const float * filt, const float * bias, float * out, const int depth, const int img_dim, const int f, const int out_dim ) {
            const unsigned int j = blockDim.x*blockIdx.x + threadIdx.x;
            const unsigned int i = blockDim.y*blockIdx.y + threadIdx.y;
            const unsigned int pp = blockIdx.z;
            const unsigned int dp = depth;
            //printf("%d,%d ::: ",i,j);

            if(i<out_dim && j<out_dim){  // guard against out-of-bounds output writes
                unsigned int oPixelPos = i*out_dim + j + pp*out_dim*out_dim;
                out[oPixelPos] = 0;
                for(int h=0;h<dp;h++){
                    for(int k=0;k<f;k++){
                        for(int l=0;l<f;l++){
                            unsigned int iPixelPos = ((i+k)*img_dim + (j+l)) + img_dim*img_dim*h;
                            unsigned int filtPos = (k*f+l) + f*f*h + f*f*dp*pp;
                            out[oPixelPos] += img[iPixelPos] * filt[filtPos];
                        }
                    }
                }
                out[oPixelPos] += bias[pp];  // add the bias once, not once per tap
            }
        }
        ''', 'my_conv')

    out_dim = int(in_dim - f)+1 # calculate output dimensions
    img_gpu = cp.asarray(image, dtype=cp.float32) #28X28
    img_dim_gpu = cp.int32(in_dim) #28
    filt_size_gpu = cp.int32(f) # 5
    out_dim_gpu = cp.int32(out_dim) #24
    depth_gpu = cp.int32(n_c)
    threads_per_block = f
    num_blocks = (in_dim//f) +1

    filt_gpu = cp.asarray(filt, dtype=cp.float32) #5X5
    # print(filt)
    # filt_gpu = cp.zeros((n_f, n_c_f, f, f), dtype=cp.float32) #5X5
    filt_gpu = cp.asarray(filt_gpu.flatten(), dtype=cp.float32)
    # print(filt_gpu)


    # out_gpu = cp.zeros((out_dim,out_dim,n_f), dtype=cp.float32) # 24 X 24
    out_gpu = cp.zeros((n_f,out_dim,out_dim), dtype=cp.float32) # 24 X 24
    out_gpu = out_gpu.flatten()

    bias_gpu = cp.asarray(bias, dtype=cp.float32)  # one bias per filter, shape (n_f,)
    bias_gpu = bias_gpu.flatten()

    # z-dimension of the grid is one block layer per filter (matches blockIdx.z use)
    convfilter_kernel((num_blocks, num_blocks, n_f), (threads_per_block, threads_per_block),
                      (img_gpu, filt_gpu, bias_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu))
    # # convfilter_kernel((576,), (24,5), (img_gpu, filt_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu))
    out_gpu = cp.reshape(out_gpu,(n_f,out_dim,out_dim))
    output = cp.asnumpy(out_gpu)
    return output
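A quick smoke test with assumed shapes; the image and filter sizes here are invented, and the filter count must match the z-dimension of the launch grid (n_f):

import numpy as np

image = np.random.rand(3, 28, 28).astype(np.float32)   # (n_c, in_dim, in_dim)
filt = np.random.rand(8, 3, 5, 5).astype(np.float32)   # (n_f, n_c_f, f, f)
bias = np.zeros(8, dtype=np.float32)                    # one bias per filter

out = conv_forward_pass(image, filt, bias)
print(out.shape)  # (8, 24, 24)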
Example #5
def prominent_peaks_optimized(img,
                              min_xdistance=1,
                              min_ydistance=1,
                              threshold=None,
                              num_peaks=cp.inf):
    """Return peaks with non-maximum suppression.
    Identifies most prominent features separated by certain distances.
    Non-maximum suppression with different sizes is applied separately
    in the first and second dimension of the image to identify peaks.

    Parameters
    ----------
    img : (M, N) ndarray
        Input image.
    min_xdistance : int
        Minimum distance separating features in the x dimension.
    min_ydistance : int
        Minimum distance separating features in the y dimension.
    threshold : float
        Minimum intensity of peaks. Default is `0.5 * max(image)`.
    num_peaks : int
        Maximum number of peaks. When the number of peaks exceeds `num_peaks`,
        return `num_peaks` coordinates based on peak intensity.

    Returns
    -------
    intensity, xcoords, ycoords : tuple of array
        Peak intensity values, x and y indices.

    Notes
    -----
    Modified from https://github.com/mritools/cupyimg _prominent_peaks method
    """
    if threshold is None:
        threshold = 0.5 * float(img.max())

    THREADS_PER_BLOCK = (32, 1)
    # Each thread is responsible for a (min_ydistance * min_xdistance) patch
    # THREADS_PER_BLOCK is in the order of (x, y), but img.shape is in the order of (y, x)
    NUM_BLOCKS = (img.shape[1] // (THREADS_PER_BLOCK[0] * min_xdistance) +
                  ((img.shape[1] %
                    (THREADS_PER_BLOCK[0] * min_xdistance)) > 0),
                  img.shape[0] // (THREADS_PER_BLOCK[1] * min_ydistance) +
                  ((img.shape[0] %
                    (THREADS_PER_BLOCK[1] * min_ydistance)) > 0))
    NUM_THREADS = np.multiply(THREADS_PER_BLOCK, NUM_BLOCKS)
    elems = (NUM_THREADS[0] * NUM_THREADS[1], )
    intensity = cp.zeros(elems, dtype=cp.float32)
    xcoords = cp.zeros(elems, dtype=cp.int32)
    ycoords = cp.zeros(elems, dtype=cp.int32)
    prominent_peaks_kernel(
        NUM_BLOCKS, THREADS_PER_BLOCK,
        (img, cp.int32(img.shape[0]), cp.int32(
            img.shape[1]), cp.int32(min_xdistance), cp.int32(min_ydistance),
         cp.float32(threshold), intensity, xcoords, ycoords))
    indices = intensity != 0.0
    return intensity[indices], xcoords[indices], ycoords[indices]
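A hypothetical call, assuming prominent_peaks_kernel has been compiled from the cupyimg source mentioned in the Notes:

import cupy as cp

img = cp.random.rand(512, 512).astype(cp.float32)
intensity, xc, yc = prominent_peaks_optimized(img, min_xdistance=5,
                                              min_ydistance=5, threshold=0.7)
print(int(intensity.size), 'peaks found')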
Example #6
    def forward(self, one, two):
        rbot0 = one.new_zeros([ one.shape[0], one.shape[2] + 40, one.shape[3] + 40, one.shape[1] ])
        rbot1 = one.new_zeros([ one.shape[0], one.shape[2] + 40, one.shape[3] + 40, one.shape[1] ])

        one = one.contiguous(); assert(one.is_cuda == True)
        two = two.contiguous(); assert(two.is_cuda == True)

        output = one.new_zeros([ one.shape[0], 441, one.shape[2], one.shape[3] ])

        if one.is_cuda == True:
            n = one.shape[2] * one.shape[3]
            cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', {
                'input': one,
                'output': rbot0
            }))(
                grid=tuple([ int((n + 16 - 1) / 16), one.shape[1], one.shape[0] ]),
                block=tuple([ 16, 1, 1 ]),
                args=[ cupy.int32(n), one.data_ptr(), rbot0.data_ptr() ]
            )

            n = two.shape[2] * two.shape[3]
            cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', {
                'input': two,
                'output': rbot1
            }))(
                grid=tuple([ int((n + 16 - 1) / 16), two.shape[1], two.shape[0] ]),
                block=tuple([ 16, 1, 1 ]),
                args=[ cupy.int32(n), two.data_ptr(), rbot1.data_ptr() ]
            )

            n = output.shape[1] * output.shape[2] * output.shape[3]
            cupy_launch('kernel_Correlation_updateOutput', cupy_kernel('kernel_Correlation_updateOutput', {
                'rbot0': rbot0,
                'rbot1': rbot1,
                'top': output
            }))(
                grid=tuple([ output.shape[3], output.shape[2], output.shape[0] ]),
                block=tuple([ 32, 1, 1 ]),
                shared_mem=one.shape[1] * 4,
                args=[ cupy.int32(n), rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr() ]
            )

        elif one.is_cuda == False:
            raise NotImplementedError()

        # end

        self.save_for_backward(one, two, rbot0, rbot1)

        return output
Example #7
    def backward(self, gradOutput):
        one, two, rbot0, rbot1 = self.saved_tensors

        gradOutput = gradOutput.contiguous(); assert(gradOutput.is_cuda == True)

        gradOne = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[0] == True else None
        gradTwo = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[1] == True else None

        if one.is_cuda == True:
            if gradOne is not None:
                for intSample in range(one.shape[0]):
                    n = one.shape[1] * one.shape[2] * one.shape[3]
                    cupy_launch('kernel_Correlation_updateGradOne', cupy_kernel('kernel_Correlation_updateGradOne', {
                        'rbot0': rbot0,
                        'rbot1': rbot1,
                        'gradOutput': gradOutput,
                        'gradOne': gradOne,
                        'gradTwo': None
                    }))(
                        grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]),
                        block=tuple([ 512, 1, 1 ]),
                        args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), gradOne.data_ptr(), None ]
                    )
                # end
            # end

            if gradTwo is not None:
                for intSample in range(one.shape[0]):
                    n = one.shape[1] * one.shape[2] * one.shape[3]
                    cupy_launch('kernel_Correlation_updateGradTwo', cupy_kernel('kernel_Correlation_updateGradTwo', {
                        'rbot0': rbot0,
                        'rbot1': rbot1,
                        'gradOutput': gradOutput,
                        'gradOne': None,
                        'gradTwo': gradTwo
                    }))(
                        grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]),
                        block=tuple([ 512, 1, 1 ]),
                        args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), None, gradTwo.data_ptr() ]
                    )
                # end
            # end

        elif one.is_cuda == False:
            raise NotImplementedError()

        # end

        return gradOne, gradTwo
Example #8
@cuda.jit  # Numba CUDA kernel
def CFAR_CA_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, outputBoolVector):

    thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and (
            signal_ext[thrdID] >= signal_ext[thrdID + 1]):

        count = cp.int32(0)
        for i in range(thrdID - guardBandLen_1side - validSampLen_1side,
                       thrdID - guardBandLen_1side):
            # scratchPad[count] = signal_ext[i]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i]
            count += 1

        for j in range(thrdID + guardBandLen_1side + 1,
                       thrdID + guardBandLen_1side + validSampLen_1side + 1):
            # scratchPad[count] = signal_ext[j]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j]
            count += 1
        avgNoisePower = cp.float32(0)
        for ele in range(2 * validSampLen_1side):
            avgNoisePower += scratchPad[thrdID - (origSignalLen - 1), ele]
        avgNoisePower = avgNoisePower / (2 * validSampLen_1side)

        if (signal_ext[thrdID] > noiseMargin * avgNoisePower):
            outputBoolVector[thrdID - (origSignalLen - 1)] = 1
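A hedged launch sketch: the thread-ID guard implies the cells under test occupy indices origSignalLen-1 .. 2*origSignalLen-2 of signal_ext, so the extended signal is assumed to be roughly three times the original length:

import numpy as np

orig_len, guard, valid = 1024, 4, 16                  # assumed CFAR geometry
signal_ext = np.abs(np.random.randn(3 * orig_len - 2)).astype(np.float32)
scratch = np.zeros((orig_len, 2 * valid), dtype=np.float32)  # one row per thread
detections = np.zeros(orig_len, dtype=np.uint8)

threads = 128
blocks = (2 * orig_len - 1 + threads - 1) // threads  # cover every CUT index
CFAR_CA_GPU[blocks, threads](signal_ext, orig_len, guard, valid,
                             scratch, np.float32(8.0), detections)
print(detections.sum(), 'detections')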
Example #9
    def compress(self, tensor, name):
        shape = tensor.size()
        tensor_flatten = tensor.flatten()
        cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten))
        tensor_cast = cupy_tensor.view(cupy.int32)
        sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000)
        exp = tensor_cast & cupy.int32(0b01111111100000000000000000000000)
        mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111)
        exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111,
                                                     size=cupy_tensor.shape,
                                                     dtype=cupy.int32)
        exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp)
        exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000)
        exps = cupy.right_shift(exp_shift, 23)
        exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18)
        tensor_compressed = exps.astype(cupy.uint8)
        return [from_dlpack(tensor_compressed.toDlpack())], shape
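A hypothetical round trip of the compressor's host side; the class name is invented, and the module is assumed to import to_dlpack/from_dlpack from torch.utils.dlpack:

import torch

compressor = RandomizedExponentCompressor()  # hypothetical class holding compress()
t = torch.randn(1 << 20, device='cuda')
[compressed], shape = compressor.compress(t, name='grad')
print(compressed.dtype, compressed.shape, shape)  # torch.uint8, flattened length, original size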
Example #10
def Q_inner_product_cupy(Q, A, start_indices, window_size):
    num_time_points, num_lms = Q.shape
    num_extrinsic_samples, _ = A.shape

    assert not cupy.isfortran(Q)
    assert not cupy.isfortran(A)

    out = cupy.empty(
        (num_extrinsic_samples, window_size),
        dtype=cupy.complex128,
        order="C",
    )

    global _cuda_code
    if _cuda_code is None:
        # it's assumed that cuda_Q_inner_product.cu is placed in the same folder as this code
        path = os.path.join(os.path.dirname(__file__),
                            'cuda_Q_inner_product.cu')
        # alternative to deal with packaging in another directory
        if not (os.path.isfile(path)):
            path = os.path.join(
                os.path.split(os.path.dirname(__file__))[0],
                'cuda_Q_inner_product.cu')
        with open(path, 'r') as f:
            _cuda_code = f.read()

    # compile from the cached source (behavior unchanged; avoids the duplicated call)
    Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner")

    float_prec = 16  # bytes per complex128 value; used to size the shared-memory tile
    num_threads_x = 4
    num_threads_y = 1024 // 4
    block_size = (num_threads_x, num_threads_y, 1)  # CUDA dims must be >= 1
    grid_size = (
        (num_extrinsic_samples + num_threads_x - 1) // num_threads_x,
        1,
        1,
    )
    args = (
        Q,
        A,
        start_indices,
        window_size,
        num_time_points,
        num_extrinsic_samples,
        num_lms,
        out,
    )
    Q_prod_fn(
        grid_size,
        block_size,
        args,
        shared_mem=cupy.int32(num_threads_x * num_lms * float_prec),
    )

    return out
Example #11
    def test_const_memory(self):
        mod = cupy.RawModule(code=test_const_mem, backend=self.backend)
        ker = mod.get_function('multiply_by_const')
        mem_ptr = mod.get_global('some_array')
        const_arr = cupy.ndarray((100, ), cupy.float32, mem_ptr)
        data = cupy.arange(100, dtype=cupy.float32)
        const_arr[...] = data
        output_arr = cupy.ones(100, dtype=cupy.float32)
        ker((1, ), (100, ), (output_arr, cupy.int32(100)))
        assert (data == output_arr).all()
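The test assumes a module source string test_const_mem that declares some_array in constant memory; a plausible reconstruction (not the verbatim CuPy test source) is:

test_const_mem = r'''
__constant__ float some_array[100];
extern "C" __global__ void multiply_by_const(float* x, int N) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < N) {
        x[id] *= some_array[id];   // output starts as ones, so it ends up equal to some_array
    }
}
'''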
Example #12
def xengine_full(signalsFX, validFX, signalsFY, validFY, blockDim=(4,16)):
    """
    X-engine for the outputs of fengine().
    """
    
    nStand, nChan, nWin = signalsFX.shape
    nBL = nStand*(nStand+1) // 2
    
    with cupy.cuda.Stream():
        try:
            signalsFX = cupy.asarray(signalsFX)
            signalsFY = cupy.asarray(signalsFY)
            validFX = cupy.asarray(validFX)
            validFY = cupy.asarray(validFY)
        except cupy.cuda.memory.OutOfMemoryError:
            _CACHE.free()
            signalsFX = cupy.asarray(signalsFX)
            signalsFY = cupy.asarray(signalsFY)
            validFX = cupy.asarray(validFX)
            validFY = cupy.asarray(validFY)
            
        try:
            combined = _CACHE[(2*nStand,nChan,nWin,numpy.complex64)]
            valid = _CACHE[(2*nStand,nWin,numpy.uint8)]
        except KeyError:
            combined = cupy.empty((2*nStand,nChan,nWin), dtype=numpy.complex64)
            valid = cupy.empty((2*nStand,nWin), dtype=numpy.uint8)
            _CACHE[(2*nStand,nChan,nWin,numpy.complex64)] = combined
            _CACHE[(2*nStand,nWin,numpy.uint8)] = valid
            
        nct, nwt = blockDim
        ncb = int(numpy.ceil(nChan/nct))
        nwb = int(numpy.ceil(nWin/nwt))
            
        _INTERLEAVE((nStand,ncb,nwb), (nct,nwt),
                    (signalsFX, signalsFY, validFX, validFY,
                     cupy.int32(nStand), cupy.int32(nChan), cupy.int32(nWin),
                     combined, valid))
        
        try:
            output = _CACHE[(4,nBL,nChan,numpy.complex64)]
        except KeyError:
            output = cupy.empty((4,nBL,nChan), dtype=numpy.complex64)
            _CACHE[(4,nBL,nChan,numpy.complex64)] = output
            
        nbt, nct = blockDim
        nbb = int(numpy.ceil(nBL/nbt))
        ncb = int(numpy.ceil(nChan/nct))
        
        _XENGINE3((nbb,ncb), (nbt, nct),
                  (combined, valid,
                   cupy.int32(nStand), cupy.int32(nBL), cupy.int32(nChan), cupy.int32(nWin),
                    output))
        
        output_cpu = cupy.asnumpy(output)
    return output_cpu[0,:,:], output_cpu[1,:,:], output_cpu[2,:,:], output_cpu[3,:,:]
Example #13
    def start(self, rand_seed=None):
        if rand_seed is None:
            rand_seed = np.random.randint(1e5)
        self.nPh = int(self.nPh)
        self._reset_results()
        self._generate_initial_coodinate(self.nPh)

        M = np.int32(self.model.voxel_model.shape[1])
        L = np.int32(self.model.voxel_model.shape[2])

        print("")
        print("###### Start (Random seed: %s) ######" % rand_seed)
        print("")
        start_ = time.time()
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()

        add_ = cp.asarray(self.add.astype(np.int32), dtype=np.int32)
        p_ = cp.asarray(self.p.astype(np.float32), dtype=np.float32)
        v_ = cp.asarray(self.v.astype(np.float32), dtype=np.float32)
        w_ = cp.asarray(self.w.astype(np.float32), dtype=np.float32)
        ma_ = cp.asarray(self.model.ma.astype(np.float32))
        ms_ = cp.asarray(self.model.ms.astype(np.float32))
        n_ = cp.asarray(self.model.n.astype(np.float32))
        g_ = cp.asarray(self.model.g.astype(np.float32))
        v_model = cp.asarray(self.model.voxel_model.astype(np.int8),
                             dtype=np.int8)
        l_ = cp.float32(self.model.voxel_space)
        nph = cp.int32(self.nPh)
        end_p = cp.int8(self.model.end_point)

        func((int((self.nPh + self.threadnum - 1) / self.threadnum), 1),
             (self.threadnum, 1), (add_, p_, v_, w_, ma_, ms_, n_, g_, v_model,
                                   l_, M, L, nph, end_p, np.int32(rand_seed)))

        self.add = cp.asnumpy(add_)
        self.p = cp.asnumpy(p_)
        self.v = cp.asnumpy(v_)
        self.w = cp.asnumpy(w_)

        del add_, p_, v_, w_, ma_, ms_, n_, g_,
        del v_model, l_, M, L, nph, end_p, rand_seed,
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()
        gc.collect()

        self._end_process()
        print("###### End ######")
        self.getRdTtRate()
        calTime(time.time(), start_)

        return self
Example #14
def xengine(signalsF1, validF1, signalsF2, validF2, blockDim=(4,16)):
    """
    X-engine for the outputs of fengine().
    """
    
    nStand, nChan, nWin = signalsF1.shape
    nBL = nStand*(nStand+1) // 2
    
    with cupy.cuda.Stream():
        try:
            signalsF1 = cupy.asarray(signalsF1)
            signalsF2 = cupy.asarray(signalsF2)
            validF1 = cupy.asarray(validF1)
            validF2 = cupy.asarray(validF2)
        except cupy.cuda.memory.OutOfMemoryError:
            _CACHE.free()
            signalsF1 = cupy.asarray(signalsF1)
            signalsF2 = cupy.asarray(signalsF2)
            validF1 = cupy.asarray(validF1)
            validF2 = cupy.asarray(validF2)
            
        try:
            output = _CACHE[(1,nBL,nChan,numpy.complex64)]
        except KeyError:
            output = cupy.empty((nBL,nChan), dtype=numpy.complex64)
            _CACHE[(1,nBL,nChan,numpy.complex64)] = output
            
        nbt, nct = blockDim
        nbb = int(numpy.ceil(nBL/nbt))
        ncb = int(numpy.ceil(nChan/nct))
        
        _XENGINE2((nbb,ncb), (nbt, nct),
                  (signalsF1, signalsF2, validF1, validF2,
                   cupy.int32(nStand), cupy.int32(nBL), cupy.int32(nChan), cupy.int32(nWin),
                   output))
        
        output_cpu = cupy.asnumpy(output)
    return output_cpu
Example #15
def image_to_data(image, code):
    # resize to a common size
    img64 = image.resize((64, 64))
    # convert the image to a numeric array
    pixels = xp.array(img64, dtype=xp.float32)
    pixels = pixels.reshape((1, 64, 64))
    pixels /= 255
    # determine the label index
    if code in all_labels:
        label = all_labels[code]
    else:
        label = len(all_labels)
        all_labels[code] = label
    return (pixels, xp.int32(label))
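A hypothetical call site, assuming xp is the module's NumPy/CuPy alias and all_labels is a module-level dict, as the function implies:

from PIL import Image

all_labels = {}
img = Image.open('sample.png').convert('L')  # placeholder path; single channel to match (1, 64, 64)
pixels, label = image_to_data(img, 'A')
print(pixels.shape, label)  # (1, 64, 64) 0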
Example #16
    def test_shfl_width(self):
        @jit.rawkernel()
        def f(a, b, w):
            laneId = jit.threadIdx.x & 0x1f
            value = jit.shfl_sync(0xffffffff, b[jit.threadIdx.x], 0, width=w)
            b[laneId] = value

        c = cupy.arange(32, dtype=cupy.int32)
        for w in (2, 4, 8, 16, 32):
            a = cupy.int32(100)
            b = cupy.arange(32, dtype=cupy.int32)
            f[1, 32](a, b, w)
            c[c % w != 0] = c[c % w == 0]
            assert (b == c).all()
Example #17
def _call_nms_kernel(bbox, thresh):
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
    blocks = (col_blocks, col_blocks, 1)
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
    kern = _load_kernel('nms_kernel', _nms_gpu_code)
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
                                bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks)
    return selection, n_selec
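Minimal usage, assuming _nms_gpu_code and _nms_gpu_post are available as in the source repo; the (y_min, x_min, y_max, x_max) box layout and score-sorted ordering are assumptions made explicit here:

import numpy as np
import cupy as cp

raw = np.random.rand(200, 4).astype(np.float32)
bbox = np.concatenate([np.minimum(raw[:, :2], raw[:, 2:]),      # well-formed corners
                       np.maximum(raw[:, :2], raw[:, 2:])], axis=1)
keep, n_keep = _call_nms_kernel(cp.asarray(bbox), thresh=0.7)
print(n_keep, 'boxes kept')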
Example #19
    def forward(self, tenOne, tenTwo):
        tenOne = tenOne.contiguous()
        assert (tenOne.is_cuda == True)
        tenTwo = tenTwo.contiguous()
        assert (tenTwo.is_cuda == True)

        tenOut = tenOne.new_zeros([
            tenOne.shape[0], tenOne.shape[1], tenOne.shape[2], tenOne.shape[3]
        ])

        if tenOne.is_cuda == True:
            cuda_launch(
                'hadamard_out', '''
                extern "C" __global__ void __launch_bounds__(512) hadamard_out(
                    const int n,
                    const float* __restrict__ tenOne,
                    const float* __restrict__ tenTwo,
                    float* __restrict__ tenOut
                ) {
                    int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x;

                    if (intIndex >= n) {
                        return;
                    }

                    tenOut[intIndex] = tenOne[intIndex] * tenTwo[intIndex];
                }
            ''')(grid=tuple([int((tenOut.nelement() + 512 - 1) / 512), 1, 1]),
                 block=tuple([512, 1, 1]),
                 args=[
                     cupy.int32(tenOut.nelement()),
                     tenOne.data_ptr(),
                     tenTwo.data_ptr(),
                     tenOut.data_ptr()
                 ],
                 stream=collections.namedtuple('Stream', 'ptr')(
                     torch.cuda.current_stream().cuda_stream))

        elif tenOne.is_cuda == False:
            raise NotImplementedError()

        # end

        self.save_for_backward(tenOne, tenTwo)

        return tenOut
Example #20
def _call_nms_kernel(bbox, thresh):
    # PyTorch does not support unsigned long Tensors.
    # That doesn't matter, since this returns an ndarray in the end,
    # so the code is kept unmodified.
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
    blocks = (col_blocks, col_blocks, 1)
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
    kern = _load_kernel('nms_kernel', _nms_gpu_code)
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
                                bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks)
    return selection, n_selec
Example #21
def _call_nms_kernel(bbox, thresh):
    # PyTorch does not support unsigned long Tensors.
    # That doesn't matter, since this returns an ndarray in the end,
    # so the code is kept unmodified.
    n_bbox = bbox.shape[0]  # number of boxes
    threads_per_block = 64  # threads per block
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)  # round up so the threads fully cover the data
    blocks = (col_blocks, col_blocks, 1)  # a 1-D launch would be (n_blocks, 1, 1); the square grid compares every block pair
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)  # zeroed contiguous uint64 buffer for the result bitmask
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)  # move bbox into contiguous CuPy memory for the CUDA kernel; this matters
    kern = _load_kernel('nms_kernel', _nms_gpu_code)  # load the hand-written CUDA kernel
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),  # launch the kernel
                                bbox, mask_dev))

    mask_host = mask_dev.get()  # copy the result bitmask back to the host
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks)  # post-process with the Cython _nms_gpu_post helper
    return selection, n_selec
Example #22
@cuda.jit  # Numba CUDA kernel
def CFAR_OS_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, ordStat,
                outputBoolVector):

    thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and (
            signal_ext[thrdID] >= signal_ext[thrdID + 1]):

        count = cp.int32(0)
        for i in range(thrdID - guardBandLen_1side - validSampLen_1side,
                       thrdID - guardBandLen_1side):
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i]
            count += 1

        for j in range(thrdID + guardBandLen_1side + 1,
                       thrdID + guardBandLen_1side + validSampLen_1side + 1):
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j]
            count += 1

        temp = cp.float32(0)
        ordStat_largestVal = cp.float32(0)
        # sort in decreasing order of strength up to the ordStat-th largest value
        for i in range(ordStat):
            for j in range(i + 1, 2 * validSampLen_1side):
                if (scratchPad[thrdID - (origSignalLen - 1), i] <
                        scratchPad[thrdID - (origSignalLen - 1), j]):
                    temp = scratchPad[thrdID - (origSignalLen - 1), i]
                    scratchPad[thrdID - (origSignalLen - 1),
                               i] = scratchPad[thrdID - (origSignalLen - 1), j]
                    scratchPad[thrdID - (origSignalLen - 1), j] = temp

        ordStat_largestVal = scratchPad[thrdID - (origSignalLen - 1),
                                        ordStat - 1]

        if (signal_ext[thrdID] > noiseMargin * ordStat_largestVal):
            outputBoolVector[thrdID - (origSignalLen - 1)] = 1
Example #23
    def get_number_of_ranges(record: OrderedDict) -> int:
        """
        Gets the number of ranges for the record.

        Parameters
        ----------
        record: OrderedDict
            hdf5 record containing antennas_iq data and metadata

        Returns
        -------
        num_ranges: int
            The number of ranges of the data
        """
        # Infer the number of ranges from the record metadata
        first_range_offset = ProcessAntennasIQ2Bfiq.calculate_first_range_rtt(record) * 1e-6 * record['rx_sample_rate']
        num_ranges = record['num_samps'] - xp.int32(first_range_offset) - record['blanked_samples'][-1]

        # 3 extra samples taken for each record (not sure why)
        num_ranges = num_ranges - 3

        return xp.uint32(num_ranges)
Example #24
@cuda.jit  # Numba CUDA kernel with a grid-stride loop
def _numba_upfirdn_1d(
    x, h_trans_flip, up, down, axis, x_shape_a, h_per_phase, padded_len, out
):

    X = cuda.grid(1)
    strideX = cuda.gridsize(1)

    for i in range(X, cp.int32(out.shape[0]), strideX):

        x_idx = cp.int32(cp.int32(cp.int32(i * down) // up) % padded_len)
        h_idx = cp.int32(cp.int32(cp.int32(i * down) % up) * h_per_phase)

        x_conv_idx = x_idx - h_per_phase + 1
        if x_conv_idx < 0:
            h_idx -= x_conv_idx
            x_conv_idx = 0

        # If axis = 0, we need to know each column in x.
        for x_conv_idx in range(x_conv_idx, x_idx + 1):
            if x_conv_idx < x_shape_a and x_conv_idx >= 0:
                out[i] += x[x_conv_idx] * h_trans_flip[h_idx]
            h_idx += 1
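A self-contained check in the up = down = 1 special case, where the kernel reduces to plain FIR convolution; the parameter choices (flipped filter, h_per_phase = len(h), padded_len = len(out)) follow upfirdn conventions for this case and are assumptions about the general wrapper:

import numpy as np
from numba import cuda

x = np.random.randn(64)
h = np.random.randn(9)
out = np.zeros(len(x) + len(h) - 1)

threads = 128
blocks = (out.shape[0] + threads - 1) // threads
_numba_upfirdn_1d[blocks, threads](x, h[::-1].copy(), 1, 1, 0,
                                   len(x), len(h), len(out), out)
np.testing.assert_allclose(out, np.convolve(x, h))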
Example #25
    def batch(self, batchsize=2):
        x, c = self.data.next(batchSize=batchsize,
                              dataSize=[8190],
                              dataSelect=[0])
        x = x[0].reshape(batchsize, 1, 1, -1)
        c = xp.asarray(c[0])
        c_ = xp.random.randint(0, 111, batchsize)
        c_ = c_ + (c_ >= c)
        # t = next(self.test)
        # t = self.data.test(size=6143)
        # _ = lambda x:self.encode(x)
        # _ = lambda x:x/xp.float32(32768)
        # B0_ = _(B0)
        A_gen = self.generator(x, c_)
        # print(A_gen.shape)
        B_gen = self.generator(x, c)

        F_tf, F_c = self.discriminator(A_gen[:, :, :, 5119:])
        T_tf, T_c = self.discriminator(x[:, :, :, 2047:-5119])

        dis_acc = (F.argmax(F_tf, axis=1).data.sum(),
                   xp.int32(batchsize) - F.argmax(T_tf, axis=1).data.sum(),
                   (T_c.data.argmax(axis=-1) == c).sum())
        # acc = (dis_acc[0]+dis_acc[1])/8

        # self.dataRate = self.dataRate if dis_acc[0] == dis_acc[1] else self.dataRate / xp.float32(0.99) if dis_acc[0] > dis_acc[1] else self.dataRate * xp.float32(0.99)

        # receptionSize = B0.shape[-1] - B_gen.shape[-1]
        # L_gen0 = F.softmax_cross_entropy(B_gen, B0[:,:,receptionSize:].reshape(batchsize,-1))
        # print(B_gen.shape)
        # print(B0_.shape)
        # L_gen0 = 0
        L_gen0 = F.mean_squared_error(B_gen, x[:, :, :, 1023:-1024])
        L_gen1 = F.softmax_cross_entropy(F_tf,
                                         xp.zeros(batchsize, dtype=np.int32))
        L_gen2 = F.softmax_cross_entropy(F_c, c_)
        gen_loss = (L_gen0.data, L_gen1.data)
        L_gen = L_gen1 + L_gen0 + L_gen2
        # L_gen = L_gen1 + (L_gen0 if L_gen0.data > 0.0001 else 0)

        L_dis0 = F.softmax_cross_entropy(F_tf,
                                         xp.ones(batchsize, dtype=np.int32))
        L_dis1 = F.softmax_cross_entropy(T_tf,
                                         xp.zeros(batchsize, dtype=np.int32))
        L_dis2 = F.softmax_cross_entropy(T_c, c)
        dis_loss = (L_dis0.data.get(), L_dis1.data.get(), L_dis2.data.get())
        # L_dis = L_dis0 * min(xp.float32(1), 1 / self.dataRate) + L_dis1 * min(xp.float32(1), self.dataRate)
        L_dis = L_dis0 + L_dis1 + L_dis2

        self.generator.cleargrads()
        L_gen.backward()
        self.gen_opt.update()

        self.discriminator.cleargrads()
        L_dis.backward()
        self.dis_opt.update()

        self.dis_opt.alpha *= 0.99999
        self.gen_opt.alpha *= 0.99999
        return (gen_loss, dis_loss, dis_acc, self.dataRate, (F_tf.data,
                                                             T_tf.data))
Example #26
def cuda_int32(intIn: int):
    return cupy.int32(intIn)
Example #27
    # for GPU timing using CuPy
    start = cp.cuda.Event()
    end = cp.cuda.Event()
    timing_cp = 0
    timing_cp_wall = 0

    # running the kernel using CuPy's functionality
    for i in range(4):
        if i > 0:  # warm-up not needed if using RawModule
            start.record()
            _s = time.time()
        brute_force_pairs_kernel(
            (blocks, ), (threads, ),
            (d_x1, d_y1, d_z1, d_w1, d_x2, d_y2, d_z2, d_w2, d_rbins_squared,
             d_result_cp, cp.int32(d_x1.shape[0]), cp.int32(
                 d_x2.shape[0]), cp.int32(d_rbins_squared.shape[0])))
        if i > 0:  # warm-up not needed if using RawModule
            end.record()
            end.synchronize()
            _e = time.time()
            timing_cp += cp.cuda.get_elapsed_time(start, end)
            timing_cp_wall += (_e - _s)

    print('cupy+CUDA events:', timing_cp / 3, 'ms')
    print('cupy+CUDA wall  :', timing_cp_wall / 3 * 1000, 'ms')
    d_result_cp = d_result_cp.copy()

if kind in ['both', 'numba']:
    # for GPU timing using Numba
    @cuda.jit
Example #28
    def _calculate_to(self, end_point: int) -> int:
        end = cp.int32(end_point)
        self._mandel_kernel((self._total_blocks, ), (self._BLOCK_SIZE, ),
                            (self._c, self._z, self._iteration, end))
        # approximation to the work done
        return self._request_size * self.iterations_per_kernel
Example #29
@cuda.jit  # Numba CUDA kernel
def CFAR_OS_2D_cross_GPU(signal_ext, origSignalLenX, origSignalLenY,
                         guardBandLen_1sideX, guardBandLen_1sideY,
                         validSampLen_1sideX, validSampLen_1sideY, scratchPad,
                         noiseMargin, ordStat, outputBoolVector):

    thrdIDx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    thrdIDy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if (thrdIDx < guardBandLen_1sideX + validSampLen_1sideX) or (
            thrdIDx >
            origSignalLenX + guardBandLen_1sideX + validSampLen_1sideX -
            1) or (thrdIDy < guardBandLen_1sideY + validSampLen_1sideY) or (
                thrdIDy > origSignalLenY + guardBandLen_1sideY +
                validSampLen_1sideY - 1):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx - 1]) and (
            signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx + 1]
    ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy - 1, thrdIDx]
           ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy + 1,
                                                             thrdIDx]):
        count = cp.int32(0)
        for i in range(thrdIDx - guardBandLen_1sideX - validSampLen_1sideX,
                       thrdIDx - guardBandLen_1sideX):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,i]
            count += 1

        for j in range(thrdIDx + guardBandLen_1sideX + 1, thrdIDx +
                       guardBandLen_1sideX + validSampLen_1sideX + 1):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,j]

            count += 1

        for k in range(thrdIDy - guardBandLen_1sideY - validSampLen_1sideY,
                       thrdIDy - guardBandLen_1sideY):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[k,thrdIDx]

            count += 1

        for l in range(thrdIDy + guardBandLen_1sideY + 1, thrdIDy +
                       guardBandLen_1sideY + validSampLen_1sideY + 1):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[l,thrdIDx]

            count += 1

        temp = cp.float32(0)
        ordStat_largestVal = cp.float32(0)
        # sort in decreasing order of strength up to the ordStat-th largest value
        for i in range(ordStat):
            for j in range(i + 1,
                           2 * validSampLen_1sideX + 2 * validSampLen_1sideY):
                if (scratchPad[thrdIDy -
                               (guardBandLen_1sideY + validSampLen_1sideY),
                               thrdIDx -
                               (guardBandLen_1sideX + validSampLen_1sideX), i]
                        <
                        scratchPad[thrdIDy -
                                   (guardBandLen_1sideY + validSampLen_1sideY),
                                   thrdIDx -
                                   (guardBandLen_1sideX + validSampLen_1sideX),
                                   j]):
                    temp = scratchPad[
                        thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY),
                        thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX),
                        i]
                    scratchPad[thrdIDy -
                               (guardBandLen_1sideY + validSampLen_1sideY),
                               thrdIDx -
                               (guardBandLen_1sideX + validSampLen_1sideX),
                               i] = scratchPad[
                                   thrdIDy -
                                   (guardBandLen_1sideY + validSampLen_1sideY),
                                   thrdIDx -
                                   (guardBandLen_1sideX + validSampLen_1sideX),
                                   j]
                    scratchPad[thrdIDy -
                               (guardBandLen_1sideY + validSampLen_1sideY),
                               thrdIDx -
                               (guardBandLen_1sideX + validSampLen_1sideX),
                               j] = temp

        ordStat_largestVal = scratchPad[
            thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY),
            thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), ordStat - 1]

        if (signal_ext[thrdIDy, thrdIDx] > noiseMargin * ordStat_largestVal):
            outputBoolVector[thrdIDy -
                             (guardBandLen_1sideY + validSampLen_1sideY),
                             thrdIDx -
                             (guardBandLen_1sideX + validSampLen_1sideX)] = 1
Example #30
@cuda.jit  # Numba CUDA kernel
def CFAR_CA_2D_cross_GPU(signal_ext, origSignalLenX, origSignalLenY,
                         guardBandLen_1sideX, guardBandLen_1sideY,
                         validSampLen_1sideX, validSampLen_1sideY, scratchPad,
                         noiseMargin, outputBoolVector):

    thrdIDx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    thrdIDy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if (thrdIDx < guardBandLen_1sideX + validSampLen_1sideX) or (
            thrdIDx >
            origSignalLenX + guardBandLen_1sideX + validSampLen_1sideX -
            1) or (thrdIDy < guardBandLen_1sideY + validSampLen_1sideY) or (
                thrdIDy > origSignalLenY + guardBandLen_1sideY +
                validSampLen_1sideY - 1):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx - 1]) and (
            signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx + 1]
    ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy - 1, thrdIDx]
           ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy + 1,
                                                             thrdIDx]):
        count = cp.int32(0)
        for i in range(thrdIDx - guardBandLen_1sideX - validSampLen_1sideX,
                       thrdIDx - guardBandLen_1sideX):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,i]
            count += 1

        for j in range(thrdIDx + guardBandLen_1sideX + 1, thrdIDx +
                       guardBandLen_1sideX + validSampLen_1sideX + 1):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,j]

            count += 1

        for k in range(thrdIDy - guardBandLen_1sideY - validSampLen_1sideY,
                       thrdIDy - guardBandLen_1sideY):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[k,thrdIDx]

            count += 1

        for l in range(thrdIDy + guardBandLen_1sideY + 1, thrdIDy +
                       guardBandLen_1sideY + validSampLen_1sideY + 1):
            scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[l,thrdIDx]

            count += 1

        avgNoisePower = cp.float32(0)
        # the scratch row holds 2*validX samples from the x arm and 2*validY from the y arm
        for ele in range(2 * validSampLen_1sideX + 2 * validSampLen_1sideY):
            avgNoisePower += scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \
                                        thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),ele]

        avgNoisePower = avgNoisePower / (2 * validSampLen_1sideX +
                                         2 * validSampLen_1sideY)

        if (signal_ext[thrdIDy, thrdIDx] > noiseMargin * avgNoisePower):
            outputBoolVector[thrdIDy -
                             (guardBandLen_1sideY + validSampLen_1sideY),
                             thrdIDx -
                             (guardBandLen_1sideX + validSampLen_1sideX)] = 1
Example #31
    def correlations_from_samples(beamformed_samples_1: np.ndarray,
                                  beamformed_samples_2: np.ndarray,
                                  record: OrderedDict) -> np.ndarray:
        """
        Correlate two sets of beamformed samples together. Correlation matrices are used and
        indices corresponding to lag pulse pairs are extracted.

        Parameters
        ----------
        beamformed_samples_1: ndarray [num_slices, num_beams, num_samples]
            The first beamformed samples.
        beamformed_samples_2: ndarray [num_slices, num_beams, num_samples]
            The second beamformed samples.
        record: OrderedDict
            hdf5 record containing bfiq data and metadata

        Returns
        -------
        values: np.array
            Array of correlations for each beam, range, and lag
        """

        # beamformed_samples_1: [num_beams, num_samples]
        # beamformed_samples_2: [num_beams, num_samples]
        # correlated:           [num_beams, num_samples, num_samples]
        correlated = xp.einsum('jk,jl->jkl', beamformed_samples_1,
                               beamformed_samples_2.conj())

        if cupy_available:
            correlated = xp.asnumpy(correlated)

        values = []
        if record['lags'].size == 0:
            values.append(xp.array([]))
            return values

        # First range offset in samples
        sample_off = record['first_range_rtt'] * 1e-6 * record['rx_sample_rate']
        sample_off = xp.int32(sample_off)

        # Helpful values converted to units of samples
        range_off = xp.arange(record['num_ranges'],
                              dtype=xp.int32) + sample_off
        tau_in_samples = record['tau_spacing'] * 1e-6 * record['rx_sample_rate']
        lag_pulses_as_samples = xp.array(record['lags'],
                                         xp.int32) * xp.int32(tau_in_samples)

        # [num_range_gates, 1, 1]
        # [1, num_lags, 2]
        samples_for_all_range_lags = (range_off[..., xp.newaxis, xp.newaxis] +
                                      lag_pulses_as_samples[xp.newaxis, :, :])

        # [num_range_gates, num_lags, 2]
        row = samples_for_all_range_lags[..., 1].astype(xp.int32)

        # [num_range_gates, num_lags, 2]
        column = samples_for_all_range_lags[..., 0].astype(xp.int32)

        # [num_beams, num_range_gates, num_lags]
        values = correlated[:, row, column]

        # Find the sample that corresponds to the second pulse transmitting
        second_pulse_sample_num = xp.int32(
            tau_in_samples) * record['pulses'][1] - sample_off - 1

        # Replace all ranges which are contaminated by the second pulse for lag 0
        # with the data from those ranges after the final pulse.
        values[:, second_pulse_sample_num:,
               0] = values[:, second_pulse_sample_num:, -1]

        return values