Python float32の例、cupy.float32 Pythonの例

コード例 #1

0

ファイルを表示

    def __init__(self):
        """Initialise the MuJoCo environment and load reference data from disk.

        Reads a state file (lines 1 and 2 are parsed into ``self.pos`` and
        ``self.vel``) and a motion file (lines 4..30 are parsed into
        ``self.motion``, one list of ``float32`` values per line).

        NOTE(review): both paths are absolute and machine-specific; consider
        making them configurable.
        """
        mujoco_env.MujocoEnv.__init__(self, 'humanoidasimoMRD3.xml', 5)
        utils.EzPickle.__init__(self)

        self.pos = []
        self.vel = []
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(
                '/home/initial/my_project_folder/my_project/src/python_code3/trpo-master/data/states/biped3d_sim_walk_state-asimo.txt',
                'r') as state_file:
            state_lines = state_file.readlines()
        # The fixed slices drop a leading prefix and a trailing suffix around
        # the comma-separated numbers (presumably a label and closing
        # bracket/newline -- confirm against the data file).
        self.pos.extend(
            float32(token) for token in state_lines[1][8:-3].split(','))
        self.vel.extend(
            float32(token) for token in state_lines[2][7:-2].split(','))

        with open(
                '/home/initial/my_project_folder/my_project/src/python_code3/trpo-master/data/motions/mocap/asimo/0007_Walking001_motion_00000_retargeted_asimo.txt',
                'r') as motion_file:
            motion_lines = motion_file.readlines()
        # One frame per line, taken from lines 4..30 of the motion file.
        self.motion = []
        for row in range(4, 31):
            self.motion.append(
                [float32(token) for token in motion_lines[row][1:-3].split(',')])

        self.time_step = 0.0  #e-3
        self.i = 0

コード例 #2

0

ファイルを表示

ファイル: numbaCudaKernels.py プロジェクト: Dibakar1/gpu_programming

def conv_2d(inputImageExtnd, pointSpreadFn, outputImage, InputLenX, InputLenY,
            psfOneSideLenX, psfOneSideLenY):
    """Kernel body: each thread writes one sample of a 2-D convolution.

    A thread derives its (x, y) position from the CUDA block/thread indices,
    accumulates the products of the window of ``inputImageExtnd`` centred on
    that position with the index-reversed ``pointSpreadFn``, and stores the
    sum in ``outputImage`` at the position shifted back by the one-sided PSF
    lengths.

    NOTE(review): ``inputImageExtnd`` is presumably the input padded so that
    every window access below stays in bounds -- confirm against the caller.
    """

    # Global thread coordinates.
    thrdIDx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    thrdIDy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    # Full PSF extent along each axis (both sides plus the centre tap).
    psfLenX = 2 * psfOneSideLenX + 1
    psfLenY = 2 * psfOneSideLenY + 1

    # Only threads inside [psfOneSideLen, psfOneSideLen + InputLen + psfLen - 1)
    # on both axes produce an output sample; all others exit immediately.
    if (thrdIDx >= psfOneSideLenX + InputLenX + psfLenX - 1) or (
            thrdIDx < psfOneSideLenX) or (
                thrdIDy >= psfOneSideLenY + InputLenY + psfLenY - 1) or (
                    thrdIDy < psfOneSideLenY):
        return

    convSum = cp.float32(0)
    # Offsets run over the PSF window; the PSF is indexed with the negated
    # offset, which is what makes this a convolution rather than a correlation.
    for x in range(-psfOneSideLenX, psfOneSideLenX + 1):
        for y in range(-psfOneSideLenY, psfOneSideLenY + 1):
            convSum += inputImageExtnd[thrdIDy + y, thrdIDx +
                                       x] * pointSpreadFn[psfOneSideLenY - y,
                                                          psfOneSideLenX - x]
            # if (thrdIDx == psfOneSideLenX + InputLenX + psfLenX -2) and (thrdIDy == psfOneSideLenY + InputLenY + psfLenY -2):
            #     print('x:',x,'y:',y,'imagVal:',inputImageExtnd[thrdIDy+y,thrdIDx+x],'psfval:',pointSpreadFn[2*psfOneSideLenY-y,2*psfOneSideLenX-x])

    # if (thrdIDx == psfOneSideLenX + InputLenX + psfLenX -2) and (thrdIDy == psfOneSideLenY + InputLenY + psfLenY -2):
    #     print('conSum:',convSum)
    # Output indices are relative to the first valid thread position.
    outputImage[thrdIDy - psfOneSideLenY, thrdIDx - psfOneSideLenX] = convSum

コード例 #3

0

ファイルを表示

 def updateGeometry(self):
     """Recompute the coordinate-dependent arrays and emit sigUpdateGeometry.

     Allocates the device-side work arrays (``_psi``, ``_phi``, ``_theta``,
     ``_rho``), builds the scaled x/y coordinate vectors, derives the
     complex phase factors from them, fills ``_theta`` and ``_rho`` through
     ``outeratan2f`` / ``outerhypot``, and finally copies the results that
     are needed on the host into ``phi``, ``iqx``, ``iqy``, ``theta`` and
     ``qr``.
     """
     # GPU variables
     self._psi = cp.zeros(self.shape, dtype=cp.complex64)
     self._phi = cp.zeros(self.shape, dtype=cp.uint8)
     self._theta = cp.zeros(self.shape, dtype=cp.float32)
     self._rho = cp.zeros(self.shape, dtype=cp.float32)
     alpha = cp.cos(cp.radians(self.phis, dtype=cp.float32))
     # Keep the offset in float32 like the y axis below: a float64 scalar
     # can promote x (and everything derived from it) to float64 under
     # value-independent promotion rules, while the rest of the pipeline
     # is single precision.
     x = alpha*(cp.arange(self.width, dtype=cp.float32) -
                cp.float32(self.xs))
     y = cp.arange(self.height, dtype=cp.float32) - cp.float32(self.ys)
     qx = self.qprp * x
     qy = self.qprp * y
     self._iqx = (1j * qx).astype(cp.complex64)
     self._iqy = (1j * qy).astype(cp.complex64)
     self._iqxz = (1j * self.qpar * x * x).astype(cp.complex64)
     self._iqyz = (1j * self.qpar * y * y).astype(cp.complex64)
     # Presumably element-wise outer kernels writing into their last
     # argument -- confirm against their definitions.
     self.outeratan2f(y, x, self._theta)
     self.outerhypot(qy, qx, self._rho)
     # CPU variables
     self.phi = self._phi.get()
     self.iqx = self._iqx.get()
     self.iqy = self._iqy.get()
     self.theta = self._theta.get().astype(cp.float64)
     self.qr = self._rho.get().astype(cp.float64)
     self.sigUpdateGeometry.emit()

コード例 #4

0

ファイルを表示

ファイル: lpmethods.py プロジェクト: nikitinvv/lprec

def em(lp, init_recon, tomo0, num_iter, reg_par, gpu):
    """
    Expectation-Maximization reconstruction.

    ``reg_par`` is added to the forward projection before dividing, so the
    ratio never divides by zero.  Runs ``num_iter`` multiplicative updates
    starting from ``init_recon`` and returns the result as a host array.
    """

    # Select the device to run on.
    cp.cuda.Device(gpu).use()

    # Device copies of the inputs.
    recon = cp.array(init_recon)
    tomo = cp.array(tomo0)

    # Zero-valued work buffers shaped like the inputs.
    normaliser = recon * 0
    backprojection = recon * 0
    projection = tomo * 0

    # Normaliser: adjoint applied to an all-ones sinogram, nudged away from
    # zero.  (lp.adjp / lp.fwdp presumably write into their first argument
    # -- confirm against the lp implementation.)
    lp.adjp(normaliser, tomo * 0 + 1, gpu)
    normaliser = normaliser + 1e-5

    # EM iterations.
    for _ in range(num_iter):
        lp.fwdp(projection, recon, gpu)
        ratio = tomo / (projection + cp.float32(reg_par))
        lp.adjp(backprojection, ratio, gpu)
        recon = recon * (backprojection / normaliser)

    return recon.get()

コード例 #5

0

ファイルを表示

ファイル: numbaCudaKernels.py プロジェクト: Dibakar1/gpu_programming

def CFAR_CA_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, outputBoolVector):
    """Kernel body: cell-averaging CFAR test, one cell under test per thread.

    A thread whose index lies in ``[origSignalLen - 1, 2 * origSignalLen - 2]``
    checks whether its sample is a local maximum; if so it gathers
    ``validSampLen_1side`` samples from each side (skipping
    ``guardBandLen_1side`` guard cells), averages them, and writes 1 to
    ``outputBoolVector`` when the sample exceeds ``noiseMargin`` times that
    average.  Output and scratch rows are indexed by
    ``thrdID - (origSignalLen - 1)``.

    NOTE(review): ``signal_ext`` is presumably the signal extended on both
    sides so that the neighbour reads stay in bounds -- confirm with caller.
    """

    thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads outside the processed index window do nothing.
    if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and (
            signal_ext[thrdID] >= signal_ext[thrdID + 1]):

        count = cp.int32(0)
        # Reference cells before the cell under test (beyond the guard band).
        for i in range(thrdID - guardBandLen_1side - validSampLen_1side,
                       thrdID - guardBandLen_1side):
            # scratchPad[count] = signal_ext[i]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i]
            count += 1

        # Reference cells after the cell under test (beyond the guard band).
        for j in range(thrdID + guardBandLen_1side + 1,
                       thrdID + guardBandLen_1side + validSampLen_1side + 1):
            # scratchPad[count] = signal_ext[j]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j]
            count += 1
        # Mean of the 2 * validSampLen_1side gathered reference cells.
        avgNoisePower = cp.float32(0)
        for ele in range(2 * validSampLen_1side):
            avgNoisePower += scratchPad[thrdID - (origSignalLen - 1), ele]
        avgNoisePower = avgNoisePower / (2 * validSampLen_1side)

        # Detection: sample exceeds the scaled noise estimate.
        if (signal_ext[thrdID] > noiseMargin * avgNoisePower):
            outputBoolVector[thrdID - (origSignalLen - 1)] = 1

コード例 #6

0

ファイルを表示

def GeneralizedDiceLossFunction(y,t,w):
    """Return the negated weighted Dice score of predictions ``y``.

    ``y`` is soft-maxed along axis 1.  For every sample ``i`` and class
    ``j`` the flattened class probability map is compared with the binary
    mask ``t[i] == j``, both weighted element-wise by ``w[i][j]``.  The
    weighted overlaps and weighted sums are accumulated over all samples
    and classes and combined as ``2 * numerator / (denominator + eps)``.

    Args:
        y: Prediction scores; axis 0 is iterated as batch, axis 1 as class.
        t: Per-sample label arrays, indexable by batch index.
        w: Per-sample, per-class weight arrays, indexed as ``w[i][j]``.

    Returns:
        The negative Dice score (so that minimising it maximises overlap).
    """
    dice_numerator = 0.0
    dice_denominator = 0.0
    eps = 0.0001  # keeps the final division finite when the denominator is 0

    y = F.softmax(y,axis=1)
    for i in range(y.shape[0]):#batch-size
        soft = y[i]
        tb = cp.array(t[i].flatten()).astype(cp.float32)
        for j in range(y.shape[1]):#class-size
            wb = cp.array(w[i][j].flatten()).astype(cp.float32)
            # Binary mask of the positions labelled with class j.
            V_in = cp.where(tb == j,1,0).astype(cp.float32)

            t_temp = chainer.Variable(V_in)
            w_temp = chainer.Variable(wb)
            soft_temp = F.flatten(soft[j])

            dice_numerator += F.sum(w_temp * soft_temp * t_temp)
            dice_denominator += F.sum(w_temp * (soft_temp + t_temp))

    loss =  2.0 * dice_numerator / (dice_denominator+eps)

    return -loss

コード例 #7

0

ファイルを表示

ファイル: PoseDetector.py プロジェクト: kpwelsh/Mesh-Pose-Detector

def valid_positions(R, vertices, depth, K, mask, lower, grid_size):
    """Launch ``valid_positions_device`` over every (mask element, vertex) pair.

    The launch uses 512 threads per block and enough blocks to cover
    ``mask.size * len(vertices)`` threads.  Array inputs are flattened and
    moved to the device; the shapes of ``depth`` and ``mask`` are passed as
    integer arrays so the kernel can index the flattened data.

    The builtin ``int`` is used instead of the ``cp.int`` alias: that alias
    was only ever another name for ``int`` (mirroring ``numpy.int``, which
    NumPy deprecated in 1.20 and removed in 1.24), so behaviour is unchanged
    while the code keeps working on releases that no longer provide it.
    """
    n_vertices = len(vertices)
    threads_per_block = 512
    grid = ((mask.size * n_vertices) // threads_per_block + 1, )
    block = (threads_per_block, )
    valid_positions_device(
        grid, block,
        (cp.asarray(K.flatten()), cp.asarray(R.flatten()),
         cp.asarray(vertices.flatten()), cp.asarray(depth.flatten()),
         cp.array(depth.shape, int), cp.float32(grid_size),
         cp.asarray(mask.flatten(), int), cp.array(mask.shape, int),
         cp.asarray(lower), n_vertices))

コード例 #8

0

ファイルを表示

def reduction(x, y, size):
    # Sums x[0:size] into y[0].
    # Each thread accumulates a strided partial sum, the partial sums are
    # staged in a 1024-element float32 shared-memory array, and thread 0 adds
    # them up after a barrier.
    # NOTE(review): indexing uses threadIdx/blockDim only, so this looks
    # intended for a single-block launch with blockDim.x <= 1024 -- confirm.
    tid = jit.threadIdx.x
    ntid = jit.blockDim.x

    # Per-thread partial sum over elements tid, tid + ntid, tid + 2*ntid, ...
    value = cupy.float32(0)
    for i in range(tid, size, ntid):
        value += x[i]

    smem = jit.shared_memory(cupy.float32, 1024)
    smem[tid] = value

    # All partial sums must be stored before thread 0 reads them.
    jit.syncthreads()

    if tid == cupy.uint32(0):
        value = cupy.float32(0)
        for i in range(ntid):
            value += smem[i]
        y[0] = value

コード例 #9

0

ファイルを表示

ファイル: reduction_atomic.py プロジェクト: toslunar/cupy

def reduction(x, y, size):
    # Adds the sum of x[0:size] to y[0].
    # Each thread accumulates a partial sum with a stride of the total
    # thread count (blockDim.x * gridDim.x); every block then combines its
    # partial sums through a 1024-element float32 shared-memory array and
    # its thread 0 adds the block total to y[0] with an atomic add.
    # NOTE(review): y[0] is only added to, never reset here -- the caller
    # presumably zeroes it first; also assumes blockDim.x <= 1024.
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.blockDim.x * jit.gridDim.x

    value = cupy.float32(0)
    for i in range(tid, size, ntid):
        value += x[i]

    smem = jit.shared_memory(cupy.float32, 1024)
    smem[jit.threadIdx.x] = value

    # Partial sums of the whole block must be visible before they are read.
    jit.syncthreads()

    if jit.threadIdx.x == cupy.uint32(0):
        value = cupy.float32(0)
        for i in range(jit.blockDim.x):
            value += smem[i]
        jit.atomic_add(y, 0, value)

コード例 #10

0

ファイルを表示

ファイル: __init__.py プロジェクト: sephirothalucard/myblendercontrib

def cumulative_distribution(data, bins):
    """Return the cumulative distribution of ``data`` over ``bins`` bins.

    Values (required to lie in [0, 1]) are mapped to bin
    ``floor(value * (bins - 1))``.  The number of values per occupied bin is
    normalised to sum to 1, scattered into a length-``bins`` histogram and
    cumulatively summed.

    Args:
        data: Array of values in the closed interval [0, 1].
        bins: Number of histogram bins.

    Returns:
        Array of length ``bins`` whose last occupied entry onward equals 1.
    """
    assert cup.min(data) >= 0.0 and cup.max(data) <= 1.0
    # return_counts gives the occurrences per bin; return_index would give
    # the position of each bin's first occurrence, which is not a histogram.
    hg_av, hg_a = cup.unique(cup.floor(data * (bins - 1)), return_counts=True)
    hg_a = hg_a.astype(cup.float32)
    hg_a /= cup.sum(hg_a)
    res = cup.zeros((bins, ))
    res[hg_av.astype(cup.int64)] = hg_a
    return cup.cumsum(res)

コード例 #11

0

ファイルを表示

ファイル: numbaCudaKernels.py プロジェクト: Dibakar1/gpu_programming

def CFAR_OS_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, ordStat,
                outputBoolVector):
    """Kernel body: ordered-statistic CFAR test, one cell under test per thread.

    A thread whose index lies in ``[origSignalLen - 1, 2 * origSignalLen - 2]``
    checks whether its sample is a local maximum; if so it gathers
    ``validSampLen_1side`` samples from each side (skipping
    ``guardBandLen_1side`` guard cells) into its own ``scratchPad`` row,
    partially sorts that row in decreasing order to find the ``ordStat``-th
    largest value, and writes 1 to ``outputBoolVector`` when the sample
    exceeds ``noiseMargin`` times that value.

    NOTE(review): ``signal_ext`` is presumably the signal extended on both
    sides so that the neighbour reads stay in bounds -- confirm with caller.
    """

    thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads outside the processed index window do nothing.
    if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2):
        return

    # check for local maxima on the CUT i.e. signal_ext[thrdID]
    if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and (
            signal_ext[thrdID] >= signal_ext[thrdID + 1]):

        count = cp.int32(0)
        # Reference cells before the cell under test (beyond the guard band).
        for i in range(thrdID - guardBandLen_1side - validSampLen_1side,
                       thrdID - guardBandLen_1side):
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i]
            count += 1

        # Reference cells after the cell under test (beyond the guard band).
        for j in range(thrdID + guardBandLen_1side + 1,
                       thrdID + guardBandLen_1side + validSampLen_1side + 1):
            scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j]
            count += 1

        temp = cp.float32(0)
        ordStat_largestVal = cp.float32(0)
        # sort in decreasing order of strength upto the ordStat kth largest value
        # (after pass i, position i holds the largest of the remaining cells,
        # so only the first ordStat positions need to be settled)
        for i in range(ordStat):
            for j in range(i + 1, 2 * validSampLen_1side):
                if (scratchPad[thrdID - (origSignalLen - 1), i] <
                        scratchPad[thrdID - (origSignalLen - 1), j]):
                    temp = scratchPad[thrdID - (origSignalLen - 1), i]
                    scratchPad[thrdID - (origSignalLen - 1),
                               i] = scratchPad[thrdID - (origSignalLen - 1), j]
                    scratchPad[thrdID - (origSignalLen - 1), j] = temp

        ordStat_largestVal = scratchPad[thrdID - (origSignalLen - 1),
                                        ordStat - 1]

        # Detection: sample exceeds the scaled order statistic.
        if (signal_ext[thrdID] > noiseMargin * ordStat_largestVal):
            outputBoolVector[thrdID - (origSignalLen - 1)] = 1

コード例 #12

0

ファイルを表示

def prominent_peaks_optimized(img,
                              min_xdistance=1,
                              min_ydistance=1,
                              threshold=None,
                              num_peaks=cp.inf):
    """Return peaks with non-maximum suppression.
    Identifies most prominent features separated by certain distances.
    Non-maximum suppression with different sizes is applied separately
    in the first and second dimension of the image to identify peaks.

    Parameters
    ----------
    img : (M, N) ndarray
        Input image.
    min_xdistance : int
        Minimum distance separating features in the x dimension.
    min_ydistance : int
        Minimum distance separating features in the y dimension.
    threshold : float
        Minimum intensity of peaks. Default is `0.5 * max(img)`.
    num_peaks : int
        Maximum number of peaks. Accepted for interface compatibility;
        this implementation does not limit the number of returned peaks.

    Returns
    -------
    intensity, xcoords, ycoords : tuple of array
        Peak intensity values, x and y indices.

    Notes
    -----
    Modified from https://github.com/mritools/cupyimg _prominent_peaks method
    """
    if threshold is None:
        # Apply the documented default; without this the kernel would
        # receive float32(None), i.e. NaN, as its threshold.
        threshold = 0.5 * float(img.max())

    THREADS_PER_BLOCK = (32, 1)
    # Each thread is responsible for a (min_ydistance * min_xdistance) patch
    # THREADS_PER_BLOCK is in the order of (x, y), but img.shape is in the order of (y, x)
    span_x = THREADS_PER_BLOCK[0] * min_xdistance
    span_y = THREADS_PER_BLOCK[1] * min_ydistance
    # Ceiling division: enough blocks to cover the whole image on each axis.
    NUM_BLOCKS = (-(-img.shape[1] // span_x), -(-img.shape[0] // span_y))
    # One output slot per launched thread; slots of threads that report no
    # peak keep their zero intensity and are filtered out below.
    n_slots = (THREADS_PER_BLOCK[0] * NUM_BLOCKS[0] *
               THREADS_PER_BLOCK[1] * NUM_BLOCKS[1])
    elems = (n_slots, )
    intensity = cp.zeros(elems, dtype=cp.float32)
    xcoords = cp.zeros(elems, dtype=cp.int32)
    ycoords = cp.zeros(elems, dtype=cp.int32)
    prominent_peaks_kernel(
        NUM_BLOCKS, THREADS_PER_BLOCK,
        (img, cp.int32(img.shape[0]), cp.int32(
            img.shape[1]), cp.int32(min_xdistance), cp.int32(min_ydistance),
         cp.float32(threshold), intensity, xcoords, ycoords))
    indices = intensity != 0.0
    return intensity[indices], xcoords[indices], ycoords[indices]

コード例 #13

0

ファイルを表示

    def start(self, rand_seed=None):
        """Run the simulation kernel once and copy the results back to host.

        Uploads the per-photon state (``add``, ``p``, ``v``, ``w``) and the
        model parameters to the device, launches ``func`` with one thread per
        photon (rounded up to whole blocks of ``self.threadnum``), downloads
        the updated per-photon state, frees the device buffers and then runs
        the post-processing helpers.

        Args:
            rand_seed: Seed passed to the kernel; a random integer below 1e5
                is drawn when omitted.

        Returns:
            ``self``, to allow call chaining.
        """
        if rand_seed is None:
            rand_seed = np.random.randint(1e5)
        self.nPh = int(self.nPh)
        self._reset_results()
        self._generate_initial_coodinate(self.nPh)

        # Sizes of axes 1 and 2 of the voxel model, passed to the kernel as
        # 32-bit integers.
        M = np.int32(self.model.voxel_model.shape[1])
        L = np.int32(self.model.voxel_model.shape[2])

        print("")
        print("###### Start (Random seed: %s) ######" % rand_seed)
        print("")
        start_ = time.time()
        # Release cached device/pinned memory before the large uploads.
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()

        # Device copies with explicit fixed-width dtypes for the kernel.
        add_ = cp.asarray(self.add.astype(np.int32), dtype=np.int32)
        p_ = cp.asarray(self.p.astype(np.float32), dtype=np.float32)
        v_ = cp.asarray(self.v.astype(np.float32), dtype=np.float32)
        w_ = cp.asarray(self.w.astype(np.float32), dtype=np.float32)
        ma_ = cp.asarray(self.model.ma.astype(np.float32))
        ms_ = cp.asarray(self.model.ms.astype(np.float32))
        n_ = cp.asarray(self.model.n.astype(np.float32))
        g_ = cp.asarray(self.model.g.astype(np.float32))
        v_model = cp.asarray(self.model.voxel_model.astype(np.int8),
                             dtype=np.int8)
        l_ = cp.float32(self.model.voxel_space)
        nph = cp.int32(self.nPh)
        end_p = cp.int8(self.model.end_point)

        # Grid size is nPh / threadnum rounded up, so every photon gets a thread.
        func((int((self.nPh + self.threadnum - 1) / self.threadnum), 1),
             (self.threadnum, 1), (add_, p_, v_, w_, ma_, ms_, n_, g_, v_model,
                                   l_, M, L, nph, end_p, np.int32(rand_seed)))

        # Download the per-photon state the kernel updated.
        self.add = cp.asnumpy(add_)
        self.p = cp.asnumpy(p_)
        self.v = cp.asnumpy(v_)
        self.w = cp.asnumpy(w_)

        # Drop the references first so the pools can actually free the blocks.
        del add_, p_, v_, w_, ma_, ms_, n_, g_,
        del v_model, l_, M, L, nph, end_p, rand_seed,
        cp.get_default_memory_pool().free_all_blocks()
        cp.get_default_pinned_memory_pool().free_all_blocks()
        gc.collect()

        self._end_process()
        print("###### End ######")
        self.getRdTtRate()
        calTime(time.time(), start_)

        return self

コード例 #14

0

ファイルを表示

 def __init__(self):
     """Create the dataset, reset state, configure and immediately train.

     NOTE(review): construction does not return until
     ``self.training(batchsize=6)`` has finished.
     """
     self.data = dataset()
     self.data.reset()
     self.reset()
     # self.load(1)
     self.setLR()
     # Wall-clock reference taken at construction.
     self.time = time.time()
     self.dataRate = xp.float32(0.8)
     # 442-point Hanning window stored as float32.
     self.mado = xp.hanning(442).astype(xp.float32)
     # n=10
     # load_npz(f"param/gen/gen_{n}.npz",self.generator)
     # load_npz(f"param/dis/dis_{n}.npz",self.discriminator)
     self.training(batchsize=6)

コード例 #15

0

ファイルを表示

def _call_nms_kernel(bbox, thresh):
    """Run the NMS kernel on ``bbox`` and post-process its bit mask.

    The kernel is launched on a ``col_blocks x col_blocks`` grid with 64
    threads per block, where ``col_blocks = ceil(n_bbox / 64)``.  It fills a
    ``uint64`` mask of ``n_bbox * col_blocks`` words, which is copied to the
    host and handed to ``_nms_gpu_post``.

    Args:
        bbox: Array of boxes; axis 0 indexes the boxes.
        thresh: Threshold forwarded to the kernel as a 32-bit float.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
    blocks = (col_blocks, col_blocks, 1)
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks, ), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
    kern = _load_kernel('nms_kernel', _nms_gpu_code)
    # Pass the box count as an explicit 32-bit scalar, matching the explicit
    # float32 threshold: a bare Python int is handed to the kernel as a
    # 64-bit value (the kernel presumably declares a C int -- confirm
    # against _nms_gpu_code).
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
                                bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(mask_host, n_bbox, threads_per_block,
                                       col_blocks)
    return selection, n_selec

コード例 #16

0

ファイルを表示

ファイル: psroi_module.py プロジェクト: pyCERN/Simple-RFCN-PyTorch

    def backward(ctx, grad_output):
        """
        Backward pass of psRoI pooling.

        Allocates a zero gradient tensor with the saved input size on the
        GPU and launches ``psROI_backward_fn`` on the current torch CUDA
        stream with raw device pointers to the tensors involved.

        :param ctx:         context variable holding the tensors saved for backward
        :param grad_output: gradient flowing into the psRoI module from above
        :return:            ``(grad_input, None, None)``; all three entries are
                            ``None`` when ``grad_output`` is ``None``
        """
        # Here we must handle None grad_output tensor. In this case we
        # can skip unnecessary computations and just return None.
        if grad_output is None:
            return None, None, None

        # The kernel receives a raw pointer, so the memory must be contiguous.
        grad_output = grad_output.contiguous()

        int_info, in_size, spatial_scale, rois, mapping_channel = ctx.saved_tensors
        count, N, out_dim, outh, outw = int_info.tolist()
        in_size = tuple(in_size.tolist())
        B, C, H, W = in_size  # e.g.(b, 21 * 7 * 7, h, w)
        grad_input = t.zeros(in_size).cuda(
        )  # zero-initialised GPU buffer that receives the input gradient

        # Wrap torch's current CUDA stream so the kernel is enqueued on it
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

        # Positional kernel arguments; presumably the order must match the
        # kernel signature -- do not reorder.
        args = [
            count,
            grad_output.data_ptr(),
            mapping_channel.data_ptr(),
            N,
            cp.float32(spatial_scale),
            C,
            H,
            W,
            outh,
            outw,
            out_dim,
            grad_input.data_ptr(),
            rois.data_ptr(),
        ]

        psROI_backward_fn(args=args,
                          block=(CUDA_NUM_THREADS, 1, 1),
                          grid=(GET_BLOCKS(grad_output.numel()), 1, 1),
                          stream=stream)

        return grad_input, None, None  # no gradient is returned for the second and third forward inputs

コード例 #17

0

ファイルを表示

ファイル: non_maximum_suppression.py プロジェクト: gwtnb/chainercv

def _call_nms_kernel(bbox, thresh):
    """Launch the NMS kernel for ``bbox`` and decode the resulting mask.

    Uses 64 threads per block on a square grid of
    ``ceil(n_bbox / 64)`` blocks per side and returns the
    ``(selection, n_selec)`` pair computed by ``_nms_gpu_post``.
    """
    box_count = bbox.shape[0]
    block_width = 64
    blocks_per_side = np.ceil(box_count / block_width).astype(np.int32)
    grid_dim = (blocks_per_side, blocks_per_side, 1)
    block_dim = (block_width, 1, 1)

    # Zeroed uint64 mask the kernel writes into, and a contiguous float32
    # copy of the boxes for the kernel to read.
    device_mask = cp.zeros((box_count * blocks_per_side,), dtype=np.uint64)
    device_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    nms_kernel = _load_kernel('nms_kernel', _nms_gpu_code)
    kernel_args = (cp.int32(box_count), cp.float32(thresh),
                   device_boxes, device_mask)
    nms_kernel(grid_dim, block_dim, args=kernel_args)

    host_mask = device_mask.get()
    selection, n_selec = _nms_gpu_post(
        host_mask, box_count, block_width, blocks_per_side)
    return selection, n_selec

コード例 #18

0

ファイルを表示

ファイル: cudadice.py プロジェクト: hardbyte/anonlink-cuda

def compute_similarities(input_a, input_b, chunk_id, threshold):
    """Compute thresholded similarities between two inputs on the GPU.

    Launches ``dice_kernel`` over a 16x16-thread grid, thresholds and sorts
    the dense result into a sparse structure, copies it to the host, and
    prints timing statistics.

    Returns:
        ``(data, len(data.data))`` where ``data`` is the host copy of the
        sparse similarities.

    NOTE(review): the grid size is floor-divided, so rows/columns beyond the
    last full 16-wide block are not covered by any thread -- confirm inputs
    are always a multiple of 16 records.
    """
    compute_started = time.time()

    # Record counts are the input lengths divided by 32.
    size_a = len(input_a) // 32
    size_b = len(input_b) // 32

    similarities = cp.zeros((size_a, size_b), dtype=cp.float16)

    threads_per_block = (16, 16)
    nblocks = (size_a // threads_per_block[0], size_b // threads_per_block[1])

    kernel_args = (similarities, input_a, input_b, size_a, size_b,
                   cp.float32(threshold))
    dice_kernel(nblocks, threads_per_block, kernel_args)

    sparse_similarities = apply_threshold(
        similarities, size_a, size_b, threshold=threshold)
    sort_sparse_similarities(sparse_similarities)

    # Wait for the device so the measured compute time is meaningful.
    cp.cuda.Stream.null.synchronize()
    compute_time = time.time() - compute_started

    # Time the device-to-host transfer of the sparse matrix separately.
    transfer_started = time.time()
    data = sparse_similarities.get()
    transfer_time = time.time() - transfer_started

    comparisons = size_a * size_b
    cmp_per_sec = comparisons / compute_time

    if chunk_id % 1 == 0:
        print(
            f"{chunk_id}: Comparisons: {humanize.intword(comparisons)}, Rate: {humanize.intword(cmp_per_sec)} cmp/s. Computation: {compute_time:.3f} Result transfer: {transfer_time:.6f}s"
        )

    return data, len(data.data)

コード例 #19

0

ファイルを表示

ファイル: antennas_iq_to_bfiq.py プロジェクト: SuperDARNCanada/borealis_postprocessors

    def calculate_first_range_rtt(record: OrderedDict) -> float:
        """
        Round-trip time, in microseconds, to the first range of a record.

        Parameters
        ----------
        record: OrderedDict
            hdf5 record containing antennas_iq data and metadata; its
            'first_range' entry is read as a distance in km.

        Returns
        -------
        first_range_rtt: float
            Time for the signal to reach the first range gate and return,
            in microseconds (as a 32-bit float).
        """
        # Same arithmetic order as a single chained expression:
        # km -> there and back -> metres -> scaled by 1e6 (s to us) -> / c
        two_way_km = record['first_range'] * 2.0
        two_way_m = two_way_km * 1.0e3
        scaled = two_way_m * 1e6
        return xp.float32(scaled / speed_of_light)

コード例 #20

0

ファイルを表示

ファイル: antennas_iq_to_bfiq.py プロジェクト: SuperDARNCanada/borealis_postprocessors

    def calculate_range_separation(record: OrderedDict) -> float:
        """
        Separation between adjacent ranges, in km.

        Parameters
        ----------
        record: OrderedDict
            hdf5 record containing antennas_iq data and metadata; its
            'rx_sample_rate' entry is read.

        Returns
        -------
        range_sep: float
            The separation between adjacent ranges, in km (as a 32-bit float).
        """
        # Same arithmetic order as a single chained expression:
        # sample period -> distance travelled (m) -> km -> halved (two-way)
        sample_period = 1 / record['rx_sample_rate']
        metres_per_sample = sample_period * speed_of_light
        km_per_sample = metres_per_sample / 1.0e3
        return xp.float32(km_per_sample / 2.0)

コード例 #21

0

ファイルを表示

ファイル: image_cupy_N.py プロジェクト: puat133/MCMC-MultiSPDE

    def __init__(self,n_layers,rg,measurement,f,beta=1,variant="dunlop",verbose=True,hybrid_mode=False,mempool=None):
        """Store the sampler configuration and prepare the measurement matrices.

        Keeps references to the random generator, measurement and Fourier
        helper objects, creates or loads the measurement matrix depending on
        whether its file exists, normalises ``H`` and ``H_t_H`` by the
        measurement noise, and initialises identity matrices, the padded
        observation vector and the adaptation/recording counters.

        NOTE(review): ``self.H``, ``self.H_t_H`` and
        ``self.measurement_matrix_file`` are not assigned in this method;
        they are presumably set by ``create_matrix_file_name`` and
        ``create_H_matrix`` / ``load_H_matrix`` -- confirm.
        """
        self.n_layers = n_layers
        self.beta = cp.float32(beta)
        self.betaZ = cp.sqrt(1-beta**2).astype(cp.float32)
        self.random_gen = rg
        self.measurement = measurement
        self.fourier = f
        self.variant=variant
        # Measurement noise variance.
        self.meas_var = self.measurement.stdev**2
        self.verbose = verbose
        self.hybrid_mode = hybrid_mode
        self.epsilon = 0
        self.cholesky_stabilizer = 0
        # Only used below for reporting memory usage.
        if mempool is None:
            mempool = cp.get_default_memory_pool()

        self.create_matrix_file_name()
        if not (self.measurement_matrix_file).exists():
            self.create_H_matrix()
            
            
        else:
            self.load_H_matrix()

        #do normalizing
        self.H /= self.measurement.stdev
        if self.verbose:
            print("Used bytes so far, after creating H {}".format(mempool.used_bytes()))
        # self.H_t_H = self.H.conj().T@self.H
        self.H_t_H /= self.meas_var
        self.I = cp.eye(self.measurement.num_sample,dtype=cp.float32)
        self.In = cp.eye(self.fourier.basis_number_2D_sym,dtype=cp.float32)
        # self.I = cpx.scipy.sparse.identity(self.measurement.num_sample)
        # self.In = cpx.scipy.sparse.identity(self.fourier.basis_number_2D_sym)
        
        self.y = self.measurement.y
        # Observation vector padded with 2*basis_number_2D_ravel-1 zeros.
        self.yBar = cp.concatenate((self.y,cp.zeros(2*self.fourier.basis_number_2D_ravel-1)))


        # Step-size adaptation and sample-recording settings.
        self.beta_feedback_gain = 2.1
        self.target_acceptance_rate = 0.234
        self.record_skip = 1
        self.record_count = 0
        self.max_record_history = 1000000

コード例 #22

0

ファイルを表示

ファイル: antennas_iq_to_bfiq.py プロジェクト: SuperDARNCanada/borealis_postprocessors

    def calculate_first_range(record: OrderedDict) -> float:
        """
        Distance from the main array to the first range, in km.

        Parameters
        ----------
        record: OrderedDict
            hdf5 record containing antennas_iq data and metadata
            (currently not consulted).

        Returns
        -------
        first_range: float
            Distance to first range in km (as a 32-bit float).
        """
        # TODO: Get this from somewhere, probably linked to the experiment ran. Might need to look up
        #   based on githash
        # Hard-coded value, see scf.FIRST_RANGE
        return xp.float32(180.0)

コード例 #23

0

ファイルを表示

def _call_nms_kernel(bbox, thresh):
    """Run the GPU NMS kernel and return ``(selection, n_selec)``.

    The mask stays a CuPy ``uint64`` array rather than a PyTorch tensor
    (PyTorch has no unsigned long tensor); this is harmless because the
    mask is converted to a host ndarray before post-processing.
    """
    threads_per_block = 64
    n_bbox = bbox.shape[0]
    # Number of 64-wide column blocks needed to cover every box.
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
    boxes_dev = cp.ascontiguousarray(bbox, dtype=np.float32)

    launch_grid = (col_blocks, col_blocks, 1)
    launch_block = (threads_per_block, 1, 1)
    kernel = _load_kernel('nms_kernel', _nms_gpu_code)
    kernel(
        launch_grid,
        launch_block,
        args=(cp.int32(n_bbox), cp.float32(thresh), boxes_dev, mask_dev),
    )

    selection, n_selec = _nms_gpu_post(
        mask_dev.get(), n_bbox, threads_per_block, col_blocks)
    return selection, n_selec

コード例 #24

0

ファイルを表示

ファイル: non_maximum_suppression.py プロジェクト: wen0618/simple-faster-rcnn-pytorch

def _call_nms_kernel(bbox, thresh):
    """Run the GPU NMS kernel on ``bbox`` and return ``(selection, n_selec)``."""
    # PyTorch does not support unsigned long Tensor.
    # Doesn't matter,since it returns ndarray finally.
    # So I'll keep it unmodified.
    n_bbox = bbox.shape[0] # number of boxes
    threads_per_block = 64  # threads in one block
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)# usual CUDA round-up so the blocks cover all of the data
    blocks = (col_blocks, col_blocks, 1)  # a plain 1-D launch would be (n_blocks,1,1); the square grid suggests block pairs are compared
    threads = (threads_per_block, 1, 1)

    mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)# zero-filled contiguous uint64 buffer of n_bbox * col_blocks words that receives the result
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32) # contiguous float32 CuPy copy of bbox so the kernel can read it
    kern = _load_kernel('nms_kernel', _nms_gpu_code)# load the hand-written CUDA C kernel
    kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),   # launch the kernel
                                bbox, mask_dev))

    mask_host = mask_dev.get() # copy the result from the GPU to the host
    selection, n_selec = _nms_gpu_post(
        mask_host, n_bbox, threads_per_block, col_blocks) # post-process with the nms function (per the original note, imported from Cython)
    return selection, n_selec

コード例 #25

0

ファイルを表示

ファイル: QCuCustomTrap.py プロジェクト: mal858/pyfab

 def integrate(self, t, x, y, S_T, L, rho, m, f, lamb, x_0, y_0, z_0, S,
               dx_0, dy_0, out, shape):
     """Launch the integration kernel and return its output on the host.

     Scalar parameters are cast to ``float32`` and the per-sample arrays
     are converted to ``float32`` device arrays before being passed, with
     the step size ``dt`` and the sizes ``nx``, ``ny``, ``nt``, to
     ``self._integrate`` on ``self.grid`` / ``self.block``.

     Returns:
         A host (NumPy) copy of ``out`` after the kernel has run.  The
         copy used to be bound to a local name and dropped, so callers
         had no way to obtain it; callers that ignore the return value
         are unaffected.
     """
     # Get shape and step size
     nx, ny = shape
     nt = t.size
     dt = cp.float32((t[-1] - t[0]) / t.size)
     # Type cast
     S_T, L = (cp.float32(S_T), cp.float32(L))
     rho, m = (cp.float32(rho), cp.float32(m))
     f, lamb = (cp.float32(f), cp.float32(lamb))
     x_0, y_0, z_0 = (cp.asarray(x_0, dtype=cp.float32),
                      cp.asarray(y_0, dtype=cp.float32),
                      cp.asarray(z_0, dtype=cp.float32))
     dx_0, dy_0, S = (cp.asarray(dx_0, dtype=cp.float32),
                      cp.asarray(dy_0, dtype=cp.float32),
                      cp.asarray(S, dtype=cp.float32))
     # Integrate
     self._integrate(self.grid, self.block,
                     (x, y, S_T, L, rho, m, f, lamb, x_0, y_0, z_0, S, dx_0,
                      dy_0, out, dt, nx, ny, nt))
     return cp.asnumpy(out)

コード例 #26

0

ファイルを表示

def create_gl(N0, Nproj, Nslices, cor, interp_type):
    """Build the global parameter struct and its flat float32 representation.

    Returns:
        ``(P, params)`` where ``P`` is the ``Pgl`` struct and ``params`` is
        a host array holding ten scalar parameters followed by the
        flattened ``exp(rho)`` grid.
    """
    Nspan = 3
    beta = cp.pi / Nspan

    # Radial size after zero padding, rounded up to a multiple of 16.
    padded_width = N0 + abs(N0 / 2 - cor) * 2.0
    N = int(cp.ceil(padded_width / 16.0) * 16)

    # Angular oversampling factor (for nondense sampling rate), at least 1.
    osangles = int(max(round(3.0 * N / 2.0 / Nproj), 1))
    Nproj = osangles * Nproj

    # Polar grid.
    proj = cp.arange(0, Nproj) * cp.pi / Nproj - beta / 2
    s = cp.linspace(-1, 1, N)

    # Log-polar parameters.
    lp_params = getparameters(beta, proj[1] - proj[0], 2.0 / (N - 1), N, Nproj)
    Nrho, Ntheta, dtheta, drho, aR, am, g = lp_params

    # Log-polar grid.
    theta_index = cp.arange(-Ntheta / 2, Ntheta / 2)
    thsp = (theta_index * cp.float32(dtheta)).astype('float32')
    rhosp = (cp.arange(-Nrho, 0) * drho).astype('float32')
    erho = cp.tile(cp.exp(rhosp)[..., cp.newaxis], [1, Ntheta])

    # Compensation for cubic interpolation.
    spline_theta_hat = cp.fft.fft(cp.fft.ifftshift(splineB3(thsp, 1)))
    spline_rho_hat = cp.fft.fft(cp.fft.ifftshift(splineB3(rhosp, 1)))
    B3com = cp.outer(spline_rho_hat, spline_theta_hat)

    # Struct with global parameters.
    P = Pgl(Nspan, N, N0, Nproj, Nslices, Ntheta, Nrho, proj, s, thsp, rhosp,
            aR, beta, B3com, am, g, cor, osangles, interp_type)

    # Flat array form: scalar parameters first, then the exp(rho) grid.
    scalar_fields = [
        P.N, P.N0, P.Ntheta, P.Nrho, P.Nspan, P.Nproj, P.Nslices, P.cor,
        P.osangles, P.interp_type == 'cubic'
    ]
    parsi = cp.array(scalar_fields, dtype='float32')
    params = cp.concatenate((parsi, erho.flatten())).get()
    return (P, params)

コード例 #27

0

ファイルを表示

def DiceLossFunction(y,t):
    """Return the negated mean Dice score of predictions ``y`` against ``t``.

    ``y`` is soft-maxed along axis 1.  For each sample and class a Dice
    term is computed between the flattened class probability map and the
    binary mask ``t[i] == j``; the terms are summed and divided by the
    number of (sample, class) pairs whose mask is non-empty.
    """
    eps = 0.0001
    n_samples, n_classes = y.shape[0], y.shape[1]
    # Starts at the total pair count; reduced for every empty class mask.
    div = cp.float32(n_samples * n_classes)
    probs = F.softmax(y,axis=1)

    total = 0.0
    for sample_idx in range(n_samples):
        sample_probs = probs[sample_idx]
        labels = cp.array(t[sample_idx].flatten())
        for class_idx in range(n_classes):
            mask = cp.where(labels == class_idx,1,0).astype(cp.float32)
            if cp.sum(mask) == 0.0:
                div -= 1.0
            target = chainer.Variable(mask)
            pred = F.flatten(sample_probs[class_idx])

            overlap = F.sum(pred*target)
            total += 2.0*overlap/(F.sum(pred + target) + eps)

    return -(total/div)

コード例 #28

0

ファイルを表示

# In-place chain on dst_cu through the external routines: add a constant,
# multiply the array by itself, then add it to itself.  (c_float_p / c_int_p
# are presumably pointer casts for the foreign-function calls -- confirm.)
__add_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), add, c_int_p(pnsz))
__mul_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu),
             c_int_p(pnsz))
__add_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu),
             c_int_p(pnsz))

# print(dst_np[:10, 0, 0])
# print(dst_cu[:10, 0, 0])
#
# print(np.sum(dst_np - dst_cu))

## Cupy in GPU
# Device copy of a deep copy of src; dst_cp initially aliases src_cp, but
# each statement below rebinds dst_cp to a new array.
src_cp = cp.asarray(copy.deepcopy(src))
dst_cp = src_cp

mul_cp = cp.float32(mul)
add_cp = cp.float32(add)

# Scale, offset, square, double.
dst_cp = dst_cp * mul_cp
dst_cp = dst_cp + add_cp
dst_cp = dst_cp * dst_cp
dst_cp = dst_cp + dst_cp

# Host copy of the CuPy result for comparison.
dst_cp2np = cp.asnumpy(dst_cp)

# print(dst_np[:10, 0, 0])
# print(dst_cp[:10, 0, 0])
# print(np.sum(dst_np - dst_cp2np))

## Cupy with kernel in GPU
# Kernel source file located next to this script.
cu_file = os.path.join(os.path.dirname(__file__), 'math_cu.cuh')

コード例 #29

0

ファイルを表示

# Device copy of the extended signal as float32.
signalExtend = cp.asarray(signalExtend.copy(),dtype=cp.float32)    
    
# Original (unextended) signal size; axis 1 is treated as x, axis 0 as y.
origSigShapeX = np.int32(signal_mag.shape[1]);
origSigShapeY = np.int32(signal_mag.shape[0]);
# 16 x 16 threads per block; enough blocks to cover the signal on each axis.
thrdsPerBlkx = 16;
thrdsPerBlky = 16;
thrdsPerBlk = (thrdsPerBlky,thrdsPerBlkx)
blksPerGridx = np.int32(np.ceil(origSigShapeX/thrdsPerBlkx));
blksPerGridy = np.int32(np.ceil(origSigShapeY/thrdsPerBlky));
# NOTE(review): launch tuples are ordered (y, x) -- confirm this matches how
# the kernel maps its block/thread indices to rows and columns.
blksPerGrid = (blksPerGridy,blksPerGridx)

# Reference cells per cell under test, and one scratch row of that length
# for every cell of the signal.
valid_samp_num = 2*(valid_samp_len_x + valid_samp_len_y)
scratchPad = cp.zeros((signal_mag.shape[0],signal_mag.shape[1],valid_samp_num),dtype=cp.float32);


# Threshold multiplier derived from the false alarm rate and the number of
# reference cells.
noiseMargin = cp.float32(valid_samp_num*(false_alarm_rate**(-1/valid_samp_num) -1))
outPutBoolArray_CAcross = cp.zeros((signal_mag.shape[0], signal_mag.shape[1]),dtype=cp.int32)

# GPU_memorySize = (signal_mag.nbytes + origSigShapeX.nbytes + origSigShapeY.nbytes + guardband_len_x.nbytes, guardband_len_y.nbytes + valid_samp_len_x.nbytes + valid_samp_len_y.nbytes + scratchPad.nbytes + noiseMargin.nbytes + outPutBoolArray_CAcross.nbytes)/(1024*1024);
# NOTE(review): this counts signal_mag.nbytes although the array uploaded
# above is signalExtend -- the reported size may underestimate; confirm.
GPU_memorySize = (signal_mag.nbytes + scratchPad.nbytes + outPutBoolArray_CAcross.nbytes)/(1024*1024);
print('Total memory on GPU = {0:.2f} MB\n'.format(GPU_memorySize));

# Launch the kernel with the grid/block configuration built above, then
# bring the detection map to the host and extract the detected indices.
CFAR_CA_2D_cross_GPU[blksPerGrid,thrdsPerBlk](signalExtend, origSigShapeX, origSigShapeY, guardband_len_x, guardband_len_y, valid_samp_len_x, valid_samp_len_y,  scratchPad, noiseMargin, outPutBoolArray_CAcross)
outPutBoolArray_CAcross = cp.asnumpy(outPutBoolArray_CAcross)
det_indices_caCross_gpu = np.where(outPutBoolArray_CAcross>0)
t6 = time()

print('CFAR CA cross GPU det range bins:', det_indices_caCross_gpu[0])
print('CFAR CA cross GPU det doppler bins:', det_indices_caCross_gpu[1],'\n')

# Fresh zeroed detection map (named for an OS-cross variant).
outPutBoolArray_OScross = cp.zeros((signal_mag.shape[0], signal_mag.shape[1]),dtype=cp.int32)

コード例 #30

0

ファイルを表示

ファイル: psroi_module.py プロジェクト: pyCERN/Simple-RFCN-PyTorch

    def forward(ctx, x, rois, Info: psRoI_Info):
        """
        Forward pass of psRoI pooling.

        Allocates the output and channel-mapping tensors on the GPU,
        launches ``Info.forward_fn`` on the current torch CUDA stream with
        raw device pointers, and saves what the backward pass needs.

        :param ctx:     context variable(similar to 'self')
        :param x:       input feature map
        :param rois:    rois generated by rpn,
                        note:this 'rois' is indices_and_rois combined indexes and rois
                        ==> [batch_ind, x_min, y_min, x_max, y_max]
        :param Info:    pooling configuration (group_size, outh, outw,
                        spatial_scale, forward_fn)
        :return:        pooled output of shape (N, out_dim, outh, outw)
        :raises ValueError: if the channel count of ``x`` is not a multiple
                        of ``group_size * group_size``
        """
        # Ensure memory contiguous
        x = x.contiguous()
        rois = rois.contiguous()

        in_size = B, C, H, W = x.size()  # e.g.(b, 21 * 7 * 7, h, w)
        N = rois.size(0)  # the numbers of roi
        group_area = Info.group_size * Info.group_size
        if C % group_area != 0:
            # The requirement is on the channel count, not on group_size.
            raise ValueError(
                "The number of input channels must be an integral multiple "
                "of group_size * group_size!"
            )
        out_dim = C // group_area

        output = t.zeros(N, out_dim, Info.outh,
                         Info.outw).cuda()  # Used to save output
        count = output.numel()  # the number of sub regions for psROI
        # Which channel each output element was pooled from.  The builtin
        # ``int`` replaces the ``cp.int`` alias (which was just another name
        # for ``int`` and is gone from current NumPy), so the dtype torch
        # selects is unchanged.
        # NOTE(review): confirm the kernel's element type for this buffer
        # matches the integer dtype torch picks here.
        mapping_channel = torch.zeros(count, dtype=int).cuda()

        # Packing parameters; presumably the order must match the kernel
        # signature -- do not reorder.
        args = [
            count,
            x.data_ptr(),
            cp.float32(
                Info.spatial_scale),  # must convert float param to cp.float32
            C,
            H,
            W,
            Info.outh,
            Info.outw,
            rois.data_ptr(),
            out_dim,
            Info.group_size,
            output.data_ptr(),
            mapping_channel.data_ptr(),
        ]

        # Wrap torch's current CUDA stream so the kernel is enqueued on it
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

        # using one-dimensional index for block and thread
        Info.forward_fn(args=args,
                        block=(CUDA_NUM_THREADS, 1, 1),
                        grid=(GET_BLOCKS(count), 1, 1),
                        stream=stream)

        # save info for backward
        saveBackwardInfo_int = torch.tensor(
            [count, N, out_dim, Info.outh, Info.outw])

        ctx.save_for_backward(saveBackwardInfo_int, torch.tensor(in_size),
                              torch.tensor(Info.spatial_scale), rois,
                              mapping_channel)

        return output

コード例 #31

0

ファイルを表示

def cuda_float32(fltIn: float):
    """Convert ``fltIn`` to a ``cupy.float32`` scalar."""
    as_single = cupy.float32(fltIn)
    return as_single