Python Context.pop Examples

Programming Language: Python

Namespace/Package Name: pycuda.driver

Class/Type: Context

Method/Function: pop

Examples at hotexamples.com: 2

Python Context.pop - 2 examples found. These are the top rated real world Python examples of pycuda.driver.Context.pop extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

get_device(9)

synchronize(3)

pop(2)

push(2)

attach(1)

Example #1

Show file

    def _init_thread_memory(self, dev_id: int, ctx: cuda.Context,
                            alloc_size: int) -> None:
        '''
        Single thread that initializes the memory for all the stream for a single 
        GPU. 
        '''

        ctx.push()
        size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))
        # Initialize streams
        for i in range(self.num_stream):
            self.streams[dev_id].append(cuda.Stream())

        for i in range(0, self.num_stream, 1):
            # allocate memory on device
            self.moments_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10))))
            self.w_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.x_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.y_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))

            # set host memory for returned output
            self.c_moments[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7))))
            self.mu[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.yf[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

            self.m1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5))))
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(0),
                                         size_per_batch,
                                         size_per_batch,
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(1),
                                         size_per_batch,
                                         np.int32(0),
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])

            self.x1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.x2[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w2[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

        ctx.synchronize()
        ctx.pop()

Example #2

Show file

    def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                         moment: np.ndarray, w_out: np.ndarray,
                         x_out: np.ndarray, y_out: np.ndarray):
        '''
        Set the input moment for all the stream for a specific GPU
        '''

        ctx.push()
        # number of input for this GPU
        max_size = moment.shape[1]
        # loop through the streams to set their input
        for i in range(0, self.num_stream, 1):
            # Size of input allocated for each stream
            size_per_batch = int(np.ceil(max_size / self.num_stream))
            # location on the original input array where the input to this stream starts
            loc = np.int32((i) * size_per_batch)
            if loc + size_per_batch > max_size:
                size_per_batch = max_size - loc

            self.moment_chunk_host[dev_id].append(
                np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                     dtype=np.float32))
            self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.moment_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)
            self.w_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(w_out[:, loc:loc + size_per_batch])))
            self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.w_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.x_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(x_out[:, loc:loc + size_per_batch])))
            self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.x_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.y_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(y_out[:, loc:loc + size_per_batch])))
            self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.y_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

        ctx.synchronize()
        ctx.pop()