Example #1
0
File: scan.py Project: lichinka/cai
        def __call__(self, input_ary, output_ary=None, allocator=None,
                stream=None):
            """Scan *input_ary* and return the array holding the result.

            :arg input_ary: array to scan. Its shape must have exactly one
                axis and ``input_ary.flags.forc`` must be true.
            :arg output_ary: destination of the scan. ``None`` (the default)
                writes the result back into *input_ary*; the string ``"new"``
                allocates a fresh array with ``cl_array.empty_like``; any
                other value is used as the destination as-is.
            :arg allocator: callable taking a size in bytes, used for the
                temporary buffers. Defaults to ``input_ary.allocator``.
            :arg stream: handed unchanged to every ``prepared_async_call``.
            :returns: the output array (which may be *input_ary* itself).
            :raises ValueError: if input and output shapes differ.
            :raises RuntimeError: if *input_ary* is not contiguous.
            """
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                # default: scan in place
                output_ary = input_ary

            # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
            # this check works on both without referencing the ``unicode``
            # builtin, which no longer exists on Python 3. The isinstance
            # guard keeps ``==`` from being applied to an array.
            if isinstance(output_ary, (str, type(u""))) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary, allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                        "deal with non-contiguous arrays")

            # single-element unpacking: fails for anything but a 1-axis shape
            n, = input_ary.shape

            if not n:
                # nothing to scan
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            dev = driver.Context.get_device()
            # launch at most three groups per multiprocessor
            max_groups = 3*dev.get_attribute(
                    driver.device_attribute.MULTIPROCESSOR_COUNT)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                    n, unit_size, max_groups)

            # one partial result per group, plus a one-element buffer that
            # only exists to fill the kernel's last argument in the second
            # pass (it is never read back here)
            block_results = allocator(self.dtype.itemsize*num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl.prepared_async_call(
                    (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
                    input_ary.gpudata,
                    n, interval_size,
                    output_ary.gpudata,
                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl.prepared_async_call(
                    (1, 1), (self.scan_wg_size, 1, 1), stream,
                    block_results,
                    num_groups, interval_size,
                    block_results,
                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl.prepared_async_call(
                    (num_groups, 1), (self.update_wg_size, 1, 1), stream,
                    output_ary.gpudata,
                    n, interval_size,
                    block_results)

            return output_ary
Example #2
0
        def __call__(self,
                     input_ary,
                     output_ary=None,
                     allocator=None,
                     stream=None):
            """Scan *input_ary* and return the array that holds the result.

            :arg input_ary: array to scan; its shape must consist of a single
                axis and ``input_ary.flags.forc`` must be true.
            :arg output_ary: ``None`` (default) to overwrite *input_ary*, the
                string ``"new"`` to allocate a result array through
                ``cl_array.empty_like``, or an existing array to write into.
            :arg allocator: callable taking a byte count; used for temporary
                buffers. Falls back to ``input_ary.allocator``.
            :arg stream: forwarded to each ``prepared_async_call``.
            :returns: the output array.
            :raises ValueError: when input and output shapes do not match.
            :raises RuntimeError: when *input_ary* is not contiguous.
            """
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                # in-place operation is the default
                output_ary = input_ary

            # ``unicode`` is not a builtin on Python 3; type(u"") yields
            # ``unicode`` on Python 2 and ``str`` on Python 3, keeping the
            # check valid on both. The isinstance test comes first so that
            # ``==`` is never evaluated against an array.
            string_types = (str, type(u""))
            if isinstance(output_ary, string_types) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary,
                                                 allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                                   "deal with non-contiguous arrays")

            # unpacking enforces a one-axis shape
            n, = input_ary.shape

            if not n:
                # empty input: no kernel launches needed
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            dev = driver.Context.get_device()
            # cap the grid at three groups per multiprocessor
            max_groups = 3 * dev.get_attribute(
                driver.device_attribute.MULTIPROCESSOR_COUNT)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups)

            # per-group partial results, and a one-element placeholder for
            # the second pass's final argument (not read afterwards)
            block_results = allocator(self.dtype.itemsize * num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            scan_block = (self.scan_wg_size, 1, 1)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl.prepared_async_call(
                (num_groups, 1), scan_block, stream,
                input_ary.gpudata, n, interval_size, output_ary.gpudata,
                block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl.prepared_async_call(
                (1, 1), scan_block, stream, block_results,
                num_groups, interval_size, block_results, dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl.prepared_async_call(
                (num_groups, 1), (self.update_wg_size, 1, 1), stream,
                output_ary.gpudata, n, interval_size, block_results)

            return output_ary
Example #3
0
File: scan.py Project: lichinka/cai
        def __call__(self, input_ary, output_ary=None, allocator=None,
                queue=None):
            """Scan *input_ary* and return the array holding the result.

            :arg input_ary: array to scan. Its shape must have exactly one
                axis and ``input_ary.flags.forc`` must be true.
            :arg output_ary: destination of the scan. ``None`` (the default)
                writes the result back into *input_ary*; the string ``"new"``
                allocates a fresh array with ``cl_array.empty_like``; any
                other value is used as the destination as-is.
            :arg allocator: callable taking a size in bytes, used for the
                temporary buffers. Defaults to ``input_ary.allocator``.
            :arg queue: passed as the first argument of every kernel call.
                Defaults to ``input_ary.queue`` and, failing that, to the
                queue of the resolved output array.
            :returns: the output array (which may be *input_ary* itself).
            :raises ValueError: if input and output shapes differ.
            :raises RuntimeError: if *input_ary* is not contiguous.
            """
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                # default: scan in place
                output_ary = input_ary

            # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
            # this check works on both without referencing the ``unicode``
            # builtin, which no longer exists on Python 3. The isinstance
            # guard keeps ``==`` from being applied to an array.
            if isinstance(output_ary, (str, type(u""))) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary, allocator=allocator)

            # Resolve the queue only now that output_ary is guaranteed to be
            # an array: doing it before the normalisation above would
            # dereference ``None.queue`` or ``"new".queue`` whenever no queue
            # was passed and input_ary has none attached.
            queue = queue or input_ary.queue or output_ary.queue

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                        "deal with non-contiguous arrays")

            # single-element unpacking: fails for anything but a 1-axis shape
            n, = input_ary.shape

            if not n:
                # nothing to scan
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            # at most three groups per compute unit of the largest device
            max_groups = 3*max(dev.max_compute_units for dev in self.devices)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                    n, unit_size, max_groups)

            # one partial result per group, plus a one-element buffer that
            # only exists to fill the kernel's last argument in the second
            # pass (it is never read back here)
            block_results = allocator(self.dtype.itemsize*num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl(
                    queue, (num_groups*self.scan_wg_size,), (self.scan_wg_size,),
                    input_ary.data,
                    n, interval_size,
                    output_ary.data,
                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl(
                    queue, (self.scan_wg_size,), (self.scan_wg_size,),
                    block_results,
                    num_groups, interval_size,
                    block_results,
                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl(
                    queue, (num_groups*self.update_wg_size,), (self.update_wg_size,),
                    output_ary.data,
                    n, interval_size,
                    block_results)

            return output_ary
Example #4
0
        def __call__(self,
                     input_ary,
                     output_ary=None,
                     allocator=None,
                     queue=None):
            """Scan *input_ary* and return the array that holds the result.

            :arg input_ary: array to scan; its shape must consist of a single
                axis and ``input_ary.flags.forc`` must be true.
            :arg output_ary: ``None`` (default) to overwrite *input_ary*, the
                string ``"new"`` to allocate a result array through
                ``cl_array.empty_like``, or an existing array to write into.
            :arg allocator: callable taking a byte count; used for temporary
                buffers. Falls back to ``input_ary.allocator``.
            :arg queue: first argument of each kernel invocation. Falls back
                to ``input_ary.queue``, then to the queue of the resolved
                output array.
            :returns: the output array.
            :raises ValueError: when input and output shapes do not match.
            :raises RuntimeError: when *input_ary* is not contiguous.
            """
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                # in-place operation is the default
                output_ary = input_ary

            # ``unicode`` is not a builtin on Python 3; type(u"") yields
            # ``unicode`` on Python 2 and ``str`` on Python 3, keeping the
            # check valid on both. The isinstance test comes first so that
            # ``==`` is never evaluated against an array.
            string_types = (str, type(u""))
            if isinstance(output_ary, string_types) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary,
                                                 allocator=allocator)

            # The queue fallback must come after output_ary has been turned
            # into a real array; evaluated earlier it could hit
            # ``None.queue`` or ``"new".queue`` when neither an explicit
            # queue nor ``input_ary.queue`` is available.
            queue = queue or input_ary.queue or output_ary.queue

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                                   "deal with non-contiguous arrays")

            # unpacking enforces a one-axis shape
            n, = input_ary.shape

            if not n:
                # empty input: no kernel launches needed
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            # cap the launch at three groups per compute unit, sized for the
            # device with the most compute units
            max_groups = 3 * max(dev.max_compute_units for dev in self.devices)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups)

            # per-group partial results, and a one-element placeholder for
            # the second pass's final argument (not read afterwards)
            block_results = allocator(self.dtype.itemsize * num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            scan_local = (self.scan_wg_size, )
            update_local = (self.update_wg_size, )

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl(queue, (num_groups * self.scan_wg_size, ),
                                    scan_local, input_ary.data, n,
                                    interval_size, output_ary.data,
                                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl(queue, (self.scan_wg_size, ),
                                    scan_local, block_results,
                                    num_groups, interval_size, block_results,
                                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl(queue, (num_groups * self.update_wg_size, ),
                                  update_local, output_ary.data, n,
                                  interval_size, block_results)

            return output_ary