Example #1
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_side,   2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        def launch_iter(n):
            if n == 0: return
            launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                    self.fb.d_front, self.fb.d_side,
                    self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                    self.info_a.d_params)
        # Split the launch into multiple rounds, possibly (slightly) reducing
        # work overlap but avoiding stalls when working on a device with an
        # active X session. TODO: characterize performance impact, autodetect
        BLOCK_SIZE = 16
        for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
            launch_iter(BLOCK_SIZE)
        launch_iter(nrounds%BLOCK_SIZE)

        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
        launch('flush_atom', self.mod, self.stream_a,
                256, (nblocks, nblocks),
                u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
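
The round-splitting loop above issues floor(nrounds / BLOCK_SIZE) full batches plus one
remainder batch, and launch_iter guards against an empty launch. A device-free sketch of
the same chunking arithmetic (split_rounds is a hypothetical helper, not part of the
original code):

def split_rounds(nrounds, block_size=16):
    """Yield batch sizes: full blocks of `block_size`, then the remainder."""
    for _ in range(nrounds // block_size):
        yield block_size
    if nrounds % block_size:
        yield nrounds % block_size

assert sum(split_rounds(37, 16)) == 37           # 16 + 16 + 5
assert list(split_rounds(32, 16)) == [16, 16]    # no empty trailing batch
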
Example #2
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)

        dsc = mkdsc(dim, 1)
        tref = mktref(self.mod, 'chan1_src')

        set_blur_width(self.mod, fb.pool, stream=stream)
        launch2('apply_gamma', self.mod, stream, dim, fb.d_left, fb.d_front,
                f32(0.1))
        tref.set_address_2d(fb.d_left, dsc, 4 * dim.astride)
        launch2('den_blur_1c',
                self.mod,
                stream,
                dim,
                fb.d_back,
                i32(2),
                i32(0),
                texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride)
        launch2('den_blur_1c',
                self.mod,
                stream,
                dim,
                fb.d_left,
                i32(3),
                i32(0),
                texrefs=[tref])

        launch2('haloclip', self.mod, stream, dim, fb.d_front, fb.d_left, gam)
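
The two den_blur_1c launches ping-pong between d_left and d_back, blurring one direction
per pass while the texture reference is rebound to the previous pass's output: the usual
separable-filter decomposition. A NumPy sketch of that buffer ping-pong (illustrative
only; the real blur lives in the CUDA module):

import numpy as np

def blur_1d(img, kernel, axis):
    """Convolve a 2-D image with a 1-D kernel along one axis."""
    return np.apply_along_axis(
        lambda row: np.convolve(row, kernel, mode='same'), axis, img)

img = np.random.rand(64, 64).astype(np.float32)
k = np.array([1., 4., 6., 4., 1.], dtype=np.float32)
k /= k.sum()
tmp = blur_1d(img, k, axis=1)  # first pass writes into a scratch buffer
out = blur_1d(tmp, k, axis=0)  # second pass reads it back, like the rebinding above
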
Example #3
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_left, 4 * nbins)
        fill(self.fb.d_right, 4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft, nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
            n = min(nrounds, block_size)
            now = time.time()
            launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                   self.fb.d_front, self.fb.d_left, self.fb.d_rb,
                   self.fb.d_seeds, self.fb.d_points, self.fb.d_uleft,
                   self.info_a.d_params)
            delta = time.time() - now
            if delta > 0.1:
                # More than 100ms passed attempting to launch. The GPU is likely
                # out of queued execution resources on a long render, and scheduling
                # additional work will just keep spinning the CPU at 100%.
                # Do a blocking sync to free up resources. This may slightly reduce
                # parallelism but makes it a whole heck of a lot easier to keep
                # using the computer while things render.
                print >> sys.stderr, 'Launches became blocking, synchronizing'
                iter_stream_right.synchronize()

            # Make sure the other stream is done flushing before we start
            iter_stream_left.wait_for_event(
                cuda.Event().record(iter_stream_right))

            launch('flush_atom', rdr.mod, iter_stream_left, (16, 16, 1),
                   (dim.astride / 16, dim.ah / 16), u64(self.fb.d_front),
                   u64(self.fb.d_left), u64(self.fb.d_uleft), i32(nbins))

            self.fb.flip_side()
            iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
            nrounds -= n
            block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
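
The loop above alternates two streams and grows block_size by half each round, so early
rounds stay responsive while later rounds amortize launch overhead. A CPU-only sketch of
the resulting schedule (batch_schedule is a hypothetical helper):

def batch_schedule(nrounds, start=4):
    """Batch sizes produced by the `block_size += block_size / 2` growth rule."""
    sizes, block = [], start
    while nrounds:
        n = min(nrounds, block)
        sizes.append(n)
        nrounds -= n
        block += block // 2  # Python 3 spelling of the Python 2 `block_size / 2`
    return sizes

assert batch_schedule(100) == [4, 6, 9, 13, 19, 28, 21]
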
Example #4
    def _iter(self, rdr, gnm, gprof, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_left,   4 * nbins)
        fill(self.fb.d_right,  4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft,  nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
          n = min(nrounds, block_size)
          now = time.time()
          launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                 self.fb.d_front, self.fb.d_left,
                 self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                 self.fb.d_uleft, self.info_a.d_params)
          delta = time.time() - now
          if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

          # Make sure the other stream is done flushing before we start
          iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))

          launch('flush_atom', rdr.mod, iter_stream_left,
                  (16, 16, 1), (dim.astride / 16, dim.ah / 16),
                  u64(self.fb.d_front), u64(self.fb.d_left),
                  u64(self.fb.d_uleft), i32(nbins))

          self.fb.flip_side()
          iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
          nrounds -= n
          block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
Example #5
 def __init__(self, start_address):
     super().__init__(start_address)
     self.start_address = start_address
     self.bg_block_bitmap = i32(0)
     self.bg_inode_bitmap = i32(0)
     self.bg_inode_table = i32(0)
     self.bg_free_blocks_count = i16(0)
     self.bg_free_inodes_count = i16(0)
     self.bg_used_dirs_count = i16(0)
     self.bg_pad = i16(0)
     self.bg_reserved = [byte(0)] * 12
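
The fields above mirror the 32-byte ext2 block group descriptor. Assuming the
little-endian on-disk layout, the same record can be decoded with the standard struct
module (a sketch; parse_bgd is not part of the original code):

import struct

BGD_FORMAT = '<IIIHHHH12s'  # 3 x u32, 4 x u16, 12 reserved bytes = 32 bytes
assert struct.calcsize(BGD_FORMAT) == 32

def parse_bgd(raw):
    """Unpack one block group descriptor from 32 raw bytes."""
    (block_bitmap, inode_bitmap, inode_table, free_blocks,
     free_inodes, used_dirs, _pad, _reserved) = struct.unpack(BGD_FORMAT, raw)
    return dict(block_bitmap=block_bitmap, inode_bitmap=inode_bitmap,
                inode_table=inode_table, free_blocks=free_blocks,
                free_inodes=free_inodes, used_dirs=used_dirs)
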
Example #6
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)

        dsc = mkdsc(dim, 1)
        tref = mktref(self.mod, "chan1_src")

        set_blur_width(self.mod, fb.pool, stream=stream)
        launch2("apply_gamma", self.mod, stream, dim, fb.d_side, fb.d_front, f32(0.1))
        tref.set_address_2d(fb.d_side, dsc, 4 * dim.astride)
        launch2("den_blur_1c", self.mod, stream, dim, fb.d_back, i32(2), i32(0), texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride)
        launch2("den_blur_1c", self.mod, stream, dim, fb.d_side, i32(3), i32(0), texrefs=[tref])

        launch2("haloclip", self.mod, stream, dim, fb.d_front, fb.d_side, gam)
Example #7
def launchC(name, mod, stream, dim, fb, *args):
    launch(
        name,
        mod,
        stream,
        (32, 8, 1),
        (int(np.ceil(dim.w / 32.0)), int(np.ceil(dim.h / 8.0))),
        fb.d_back,
        fb.d_front,
        i32(fb.gutter),
        i32(dim.w),
        i32(dim.astride),
        i32(dim.h),
        *args
    )
Example #8
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah

        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')
        grad_dsc = mkdsc(dim, 1)
        grad_tref = mktref(self.mod, 'chan1_src')
        set_blur_width(self.mod, fb.pool, stream=stream)

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = params.spatial_std(tc) * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch2('den_blur', self.mod, stream, dim,
                    fb.d_back, i32(pattern), i32(0), texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
            launch2('den_blur_1c', self.mod, stream, dim,
                    fb.d_left, i32(pattern), i32(1), texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_left, grad_dsc, sb / 4)

            launch2('bilateral', self.mod, stream, dim,
                    fb.d_back, i32(pattern), i32(self.radius),
                    f32(sstd), f32(params.color_std(tc)),
                    f32(params.density_std(tc)), f32(params.density_pow(tc)),
                    f32(params.gradient(tc)),
                    texrefs=[tref, grad_tref])
            fb.flip()
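
The sstd line normalizes the spatial standard deviation so a preset tuned at 1080p
behaves the same at other resolutions. The arithmetic in isolation (a hypothetical
helper, not from the original):

def scale_spatial_std(sstd_at_1080p, render_width):
    """Scale a 1080p-referenced stddev (in pixels) to the actual render width."""
    return sstd_at_1080p * render_width / 1920.

assert scale_spatial_std(6.0, 3840) == 12.0  # 4K is twice as wide: twice the radius
assert scale_spatial_std(6.0, 960) == 3.0
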
Example #9
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, "chan4_src")

        set_blur_width(self.mod, fb.pool, params.width(tc), stream)
        launch2("apply_gamma_full_hi", self.mod, stream, dim, fb.d_side, fb.d_front, f32(gam - 1))
        tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride)
        launch2("full_blur", self.mod, stream, dim, fb.d_back, i32(2), i32(0), texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
        launch2("full_blur", self.mod, stream, dim, fb.d_side, i32(3), i32(0), texrefs=[tref])
        tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride)
        launch2("full_blur", self.mod, stream, dim, fb.d_back, i32(0), i32(0), texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
        launch2("full_blur", self.mod, stream, dim, fb.d_side, i32(1), i32(0), texrefs=[tref])
        launch2("smearclip", self.mod, stream, dim, fb.d_front, fb.d_side, f32(gam - 1), lin, lingam)
Example #10
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        gam, lin, lingam = calc_lingam(gprof.filters.colorclip, tc)
        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')

        set_blur_width(self.mod, fb.pool, params.width(tc), stream)
        launch2('apply_gamma_full_hi', self.mod, stream, dim, fb.d_left,
                fb.d_front, f32(gam - 1))
        tref.set_address_2d(fb.d_left, dsc, 16 * dim.astride)
        launch2('full_blur',
                self.mod,
                stream,
                dim,
                fb.d_back,
                i32(2),
                i32(0),
                texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
        launch2('full_blur',
                self.mod,
                stream,
                dim,
                fb.d_left,
                i32(3),
                i32(0),
                texrefs=[tref])
        tref.set_address_2d(fb.d_left, dsc, 16 * dim.astride)
        launch2('full_blur',
                self.mod,
                stream,
                dim,
                fb.d_back,
                i32(0),
                i32(0),
                texrefs=[tref])
        tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
        launch2('full_blur',
                self.mod,
                stream,
                dim,
                fb.d_left,
                i32(1),
                i32(0),
                texrefs=[tref])
        launch2('smearclip', self.mod, stream, dim, fb.d_front, fb.d_left,
                f32(gam - 1), lin, lingam)
Example #11
def conv(
        # input and kernel tensors
        x: OpenCLTensor,
        kernel: OpenCLTensor,
        # strides
        strides: tuple) -> OpenCLTensor:
    # get device
    device = x.device
    # get dimensions
    ndim = len(x.shape)
    kernel_dim = len(kernel.shape) - 2  # without in- and output channels
    # TODO: flatten additional dimensions in input
    assert ndim == kernel_dim + 2
    assert len(strides) == kernel_dim
    # build output tensor
    out_image_shape = tuple(
        (s - k) // st + 1
        for s, k, st in zip(x.shape[-kernel_dim:], kernel.shape[2:], strides))
    out_shape = x.shape[:-kernel_dim - 1] + (kernel.shape[0],) + out_image_shape
    out_tensor = device.Tensor.empty(out_shape, dtype=x.dtype)
    # build kernel
    knl = cache_build_conv_kernel(device.context,
                                  kernel_dim=kernel_dim,
                                  dtype=x.dtype)
    # set input arguments
    knl.set_args(
        # input and kernel data
        x.contiguous().data,
        kernel.contiguous().data,
        out_tensor.data,
        # in- and output shape
        i32(kernel.shape[1]),
        i32(prod(x.shape[-kernel_dim:])),
        *(i32(s) for s in x.shape[-kernel_dim:]),
        *(i32(s) for s in out_image_shape),
        # kernel shape without output channels
        i32(prod(kernel.shape[-kernel_dim:])),
        *(i32(s) for s in kernel.shape[-kernel_dim:]),
        # strides
        *(i32(st) for st in strides))
    # execute kernel
    global_shape = [prod(out_image_shape), kernel.shape[0],
                    x.shape[0]]  # flat-out-image, out-channels, batch
    local_shape = None
    cl.enqueue_nd_range_kernel(device.queue, knl, global_shape,
                               local_shape).wait()
    # return output tensor
    return out_tensor
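
The out_image_shape expression is the standard valid-convolution size,
(s - k) // st + 1 per spatial dimension (no padding). A quick check of that formula
with hypothetical sizes:

def conv_out_shape(in_shape, kernel_shape, strides):
    """Output spatial size of an unpadded (valid) convolution."""
    return tuple((s - k) // st + 1
                 for s, k, st in zip(in_shape, kernel_shape, strides))

assert conv_out_shape((32, 32), (3, 3), (1, 1)) == (30, 30)
assert conv_out_shape((32, 32), (5, 5), (2, 2)) == (14, 14)
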
Example #12
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim), ), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a, 256,
               self.info_a.palette_height, self.fb.d_rb, self.fb.d_seeds,
               self.src_a.d_ptimes, self.src_a.d_pals, f32(ts),
               f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a, 256,
               np.ceil(nts / 256.), self.info_a.d_params, self.src_a.d_times,
               self.src_a.d_knots, f32(ts), f32(td / nts), i32(nts))
Example #13
    def apply(self, fb, gprof, params, dim, tc, stream=None):
        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah

        dsc = mkdsc(dim, 4)
        tref = mktref(self.mod, 'chan4_src')
        grad_dsc = mkdsc(dim, 1)
        grad_tref = mktref(self.mod, 'chan1_src')
        set_blur_width(self.mod, fb.pool, stream=stream)

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = params.spatial_std(tc) * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch2('den_blur',
                    self.mod,
                    stream,
                    dim,
                    fb.d_back,
                    i32(pattern),
                    i32(0),
                    texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
            launch2('den_blur_1c',
                    self.mod,
                    stream,
                    dim,
                    fb.d_left,
                    i32(pattern),
                    i32(1),
                    texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_left, grad_dsc, sb / 4)

            launch2('bilateral',
                    self.mod,
                    stream,
                    dim,
                    fb.d_back,
                    i32(pattern),
                    i32(self.radius),
                    f32(sstd),
                    f32(params.color_std(tc)),
                    f32(params.density_std(tc)),
                    f32(params.density_pow(tc)),
                    f32(params.gradient(tc)),
                    texrefs=[tref, grad_tref])
            fb.flip()
Example #14
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
Example #15
def dot(
        # inputs
        X: OpenCLTensor,
        Y: OpenCLTensor,
        # kernel information
        block_size: int = 8 * 16,
        work_per_thread: int = 8) -> OpenCLTensor:
    assert 3 >= len(X.shape) == len(Y.shape) >= 2
    assert X.shape[:-2] == Y.shape[:-2]
    assert X.shape[-1] == Y.shape[-2]
    # get tensor information
    device = X.device
    n, M, N, K = len(X.shape), X.shape[-2], Y.shape[-1], X.shape[-1]
    # flatten batch dimensions
    X = X.reshape(-1, M, K)
    Y = Y.reshape(-1, K, N)
    assert X.shape[0] == Y.shape[0], "Batches do not align! (%i != %i)" % (
        X.shape[0], Y.shape[0])
    # pad inputs to be multiple of block size in both directions
    X = _match_blocks(X, block_size)
    Y = _match_blocks(Y, block_size)
    # create output tensor
    B, pad_M, pad_N, pad_K = X.shape[0], X.shape[1], Y.shape[2], X.shape[2]
    B, pad_M, pad_N, pad_K = i32(B), i32(pad_M), i32(pad_N), i32(pad_K)
    O = device.Tensor.empty(shape=(B, pad_M, pad_N),
                            dtype=X.dtype)  # TODO: broadcast dtype
    # kernel global and local thread layout
    global_shape = [B, pad_M // work_per_thread, pad_N // work_per_thread]
    local_shape = [1] + [block_size // work_per_thread] * 2
    # build and call kernel
    knl = cache_build_dot_kernel(device.context, X.dtype, Y.dtype, O.dtype,
                                 block_size, work_per_thread)
    e = knl(device.queue, global_shape, local_shape,
            X.contiguous().data,
            Y.contiguous().data, O.data, i32(X.offset), i32(Y.offset), pad_M,
            pad_N, pad_K)
    e.wait()
    # remove padding from output
    idx = (slice(0, B) if (n == 3) else 0, slice(0, M), slice(0, N))
    return O[idx]
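
_match_blocks is not shown in this excerpt; judging by how the padding is stripped
afterwards, a NumPy stand-in that zero-pads the trailing two dimensions up to multiples
of the block size might look like this (an assumption, not the library's actual helper):

import numpy as np

def match_blocks(a, block):
    """Zero-pad the last two dims of a (B, M, N) array to multiples of `block`."""
    pad_m = (-a.shape[1]) % block
    pad_n = (-a.shape[2]) % block
    return np.pad(a, ((0, 0), (0, pad_m), (0, pad_n)))

x = np.ones((2, 100, 70), dtype=np.float32)
assert match_blocks(x, 128).shape == (2, 128, 128)
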
Example #16
def atom(
        op: str,
        # input / output tensors
        # (by default we only read the values of tensors mentioned in output)
        additional_read: tuple = tuple(),
        # output tensors; if not mentioned in named tensors, a new tensor is created
        output=('o', ),
        # kernel information
        block_size: int = 256,  # local block size
        # inputs: all named tensors (except output tensors) needed for execution of op
        **named_tensors) -> Tuple[OpenCLTensor]:
    # separate tensors from scalars
    named_scalars = {
        n: v
        for n, v in named_tensors.items() if not isinstance(v, OpenCLTensor)
    }
    named_tensors = {
        n: v
        for n, v in named_tensors.items() if isinstance(v, OpenCLTensor)
    }
    # separate names and values
    tensor_names, tensors = zip(*named_tensors.items())
    tensor_names, tensors = tuple(tensor_names), tuple(tensors)
    if len(named_scalars) > 0:
        scalar_names, scalars = zip(*named_scalars.items())
        scalar_names, scalars = tuple(scalar_names), tuple(scalars)
    else:
        scalar_names, scalars = tuple(), tuple()
    # get device and dtype
    t0 = tensors[0]
    device, dtype = t0.device, t0.dtype

    shapes = (t.shape for t in tensors)
    strides = (t.strides for t in tensors)
    # broadcast shape
    shape = map(max, zip_longest(*map(reversed, shapes), fillvalue=1))
    shape = tuple(map(i32, shape))[::-1]
    ndim, numel = len(shape), prod(shape)

    # create output tensors if necessary
    for out in output:
        if out not in tensor_names:
            tensor_names += (out, )
            tensors += (device.Tensor.empty(shape, dtype=dtype), )

    # build strides
    strides = tuple((i32(0), ) * (ndim - len(t.strides)) + tuple(
        map(lambda st_sh: i32(st_sh[0] if st_sh[1] > 1 else 0),
            zip(t.strides, t.shape))) for t in tensors)
    # collapse contiguous dimensions to minimize index computations in kernel
    if ndim > 1:
        shape, strides = _collapse_contiguous_dims(shape, strides)
        ndim = len(shape)

    # by default we read only tensors that are not in output
    read = tuple(n for n in tensor_names if n not in output) + additional_read
    buffer_dtypes = tuple(map(lambda t: t.dtype, tensors))
    scalar_dtypes = tuple(map(lambda s: np.dtype(type(s)), scalars))
    # build kernel and set arguments
    knl = cache_build_atom_kernel(device.context,
                                  op=op,
                                  buffers=tensor_names,
                                  buffer_dtypes=buffer_dtypes,
                                  scalars=scalar_names,
                                  scalar_dtypes=scalar_dtypes,
                                  ndim=ndim,
                                  read=read,
                                  write=output)
    knl.set_args(
        *(t.data for t in tensors),  # buffers
        *(t.type(s) for t, s in zip(scalar_dtypes, scalars)),  # scalars
        *shape,
        *chain(*strides),  # shapes and strides
        *(i32(t.offset) for t in tensors),  # offsets
        i32(numel)  # number of elements to compute
    )
    # execute kernel and return output tensors
    cl.enqueue_nd_range_kernel(device.queue, knl,
                               [ceil(numel / block_size) * block_size],
                               [block_size]).wait()
    return tuple(t for n, t in zip(tensor_names, tensors) if n in output)
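
The shape computation above broadcasts all input shapes NumPy-style: align shapes on
the right, pad missing leading dimensions with 1, and take the per-axis maximum.
Isolated from the i32 casts, that is:

from itertools import zip_longest

def broadcast_shape(*shapes):
    """Right-aligned, max-per-axis broadcast of several shapes."""
    rev = zip_longest(*map(reversed, shapes), fillvalue=1)
    return tuple(max(dims) for dims in rev)[::-1]

assert broadcast_shape((8, 1, 6), (7, 1)) == (8, 7, 6)
assert broadcast_shape((256, 256, 3), (3,)) == (256, 256, 3)

Note that, like the original, this takes the maximum without verifying that mismatched
axes are of size 1, so incompatible shapes are not rejected here.
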
Example #17
 def __init__(self, start_address):
     super().__init__(start_address)
     self.start_address = 1024  # the superblock always begins at byte offset 1024
     self.s_inodes_count = i32(0)
     self.s_blocks_count = i32(0)
     self.s_r_blocks_count = i32(0)
     self.s_free_blocks_count = i32(0)
     self.s_free_inodes_count = i32(0)
     self.s_first_data_block = i32(0)
     self.s_log_block_size = i32(0)
     self.s_log_frag_size = i32(0)
     self.s_blocks_per_group = i32(0)
     self.s_frags_per_group = i32(0)
     self.s_inodes_per_group = i32(0)
     self.s_mtime = i32(0)
     self.s_wtime = i32(0)
     self.s_mnt_count = i16(0)
     self.s_max_mnt_count = i16(0)
     self.s_magic = i16(0)
     self.s_state = i16(0)
     self.s_errors = i16(0)
     self.s_minor_rev_level = i16(0)
     self.s_lastcheck = i32(0)
     self.s_checkinterval = i32(0)
     self.s_creator_os = i32(0)
     self.s_rev_level = i32(0)
     self.s_def_resuid = i16(0)
     self.s_def_resgid = i16(0)
     # EXT2_DYNAMIC_REV Specific
     self.s_first_ino = i32(0)
     self.s_inode_size = i16(0)
     self.s_block_group_nr = i16(0)
     self.s_feature_compat = i32(0)
     self.s_feature_incompat = i32(0)
     self.s_feature_ro_compat = i32(0)
     # o 104 s 16
     self.s_uuid = [byte(0)] * 16  # 128-bit volume UUID
     # o 120 s 16
     # 16-byte volume name, mostly unused; a valid name consists of ISO-Latin-1
     # characters and is zero-terminated.
     self.s_volume_name = [byte(0)] * 16
     # o 136 s 64
     # 64-byte directory path where the file system was last mounted. Not
     # normally used, but could serve for auto-finding the mountpoint when not
     # indicated on the command line; zero-terminated ISO-Latin-1 path.
     self.s_last_mounted = [byte(0)] * 64
     # o 200 s 4
     # 32-bit value used by compression algorithms to determine the
     # compression method(s) used.
     self.s_algo_bitmap = i32(0)
     # Performance Hints
     self.s_prealloc_blocks = byte(0)
     self.s_prealloc_dir_blocks = byte(0)
     self.alignment = i16(0)  # o 206
     # Journaling Support
     # 16-byte uuid of the journal superblock; see Ext3 Journaling for more
     # information.
     self.s_journal_uuid = [byte(0)] * 16
     self.s_journal_inum = i32(0)
     self.s_journal_dev = i32(0)
     self.s_last_orphan = i32(0)
     # Directory Indexing Support
     self.s_hash_seed = [i32(0)] * 4
     self.s_def_hash_version = byte(0)
     self.padding = [byte(0)] * 3  # reserved for future expansion
     # Other options
     self.s_default_mount_options = i32(0)
     self.s_first_meta_bg = i32(0)
     # o 264 s 760 - reserved for future revisions
     self.unused = [byte(0)] * 760
Example #18
 def __init__(self, start_address):
     super().__init__(start_address)
     self.start_address = start_address  # bg_inode_table
     self.i_mode = i16(0)
     self.i_uid = i16(0)
     self.i_size = i32(0)
     self.i_atime = i32(0)
     self.i_ctime = i32(0)
     self.i_mtime = i32(0)
     self.i_dtime = i32(0)
     self.i_gid = i16(0)
     self.i_links_count = i16(0)
     self.i_blocks = i32(0)
     self.i_flags = i32(0)
     self.i_osd1 = i32(0)
     self.i_block = [i32(0)] * 15  # the first 12 entries are direct block pointers
     # 13th entry in this array is the block number of the first indirect block
     # 14th entry in this array is the block number of the first doubly-indirect block
     # 15th entry in this array is the block number of the triply-indirect block
     self.i_generation = i32(0)
     self.i_file_acl = i32(0)
     self.i_dir_acl = i32(0)
     self.i_faddr = i32(0)
     self.i_osd2 = [byte(0)] * 12  # 96-bit OS-dependent field
Example #19
def launchC(name, mod, stream, dim, fb, *args):
    launch(name, mod, stream, (32, 8, 1),
           (int(np.ceil(dim.w / 32.)), int(np.ceil(dim.h / 8.))),
           fb.d_back, fb.d_front, i32(fb.gutter), i32(dim.w), i32(dim.astride),
           i32(dim.h), *args)
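
launchC covers a w x h image with 32x8-thread blocks, rounding the grid up with np.ceil
so partial edge tiles still get launched. The grid arithmetic on its own (hypothetical
values):

import numpy as np

def grid_2d(w, h, block_w=32, block_h=8):
    """Blocks needed to cover a w x h image, rounding up at the edges."""
    return (int(np.ceil(w / float(block_w))), int(np.ceil(h / float(block_h))))

assert grid_2d(1920, 1080) == (60, 135)
assert grid_2d(33, 9) == (2, 2)  # partial tiles still get a block
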
Example #20
def reduce(
        reduction: str,  # reduction expression using variables 'a' and 'b'
        # input tensor
        T: OpenCLTensor,
        # options
        axis: Tuple[int],
        neutral: str = "0",
        # kernel information
        group_size: int = 128) -> OpenCLTensor:
    # get device
    device = T.device
    # total number of elements to reduce
    reduce_numel = prod((T.shape[i] for i in axis))
    keep_numel = prod(
        (T.shape[i] for i in range(len(T.shape)) if i not in axis))
    n_work_groups = ceil(reduce_numel /
                         (group_size * 2))  # number of work-groups needed

    # build output tensor
    shape = tuple(s if i not in axis else 1 for i, s in enumerate(T.shape))
    shape = (1, ) if len(shape) == 0 else shape
    # output tensor also stores partial sums of each iterations, thus n_work_groups
    O = device.Tensor.empty(shape + (n_work_groups, ), dtype=T.dtype)

    # transpose to have reduction dimensions at last
    if len(axis) < len(T.shape):
        perm = list(range(len(T.shape)))
        for i, j in enumerate(axis, 1):
            perm[-i], perm[j] = perm[j], perm[-i]
        T = T.transpose(*perm)

    # build kernels
    use_strides = (len(axis) < len(T.shape) and not T.is_contiguous())
    knl = cache_build_reduction_kernel(
        device.context,
        reduction=reduction,
        dtype=T.dtype,
        neutral=neutral,
        ndim=len(T.shape) if use_strides else
        0,  # set to 0 if not needed to prevent compiling a new kernel
        use_strides=use_strides,
        block_size=group_size)
    next_knl = cache_build_reduction_kernel(device.context,
                                            reduction=reduction,
                                            dtype=T.dtype,
                                            neutral=neutral,
                                            ndim=0,
                                            use_strides=False,
                                            block_size=group_size)

    # build additional strided input arguments
    stride_args = (*(i32(s) for s in T.shape),
                   *(i32(st) for st in T.strides)) if use_strides else tuple()

    e = None
    while reduce_numel > 1:
        knl.set_args(T.data, O.data, i32(T.offset), *stride_args,
                     i32(reduce_numel))
        e = cl.enqueue_nd_range_kernel(
            device.queue, knl, [keep_numel, n_work_groups * group_size],
            [1, group_size])
        # update values
        T = O  # input of further iterations is output of current iteration
        reduce_numel = n_work_groups
        n_work_groups = ceil(reduce_numel / (group_size * 2))
        knl = next_knl
        stride_args = tuple()

    # wait for the queue to finish; nothing was enqueued if the reduction
    # dimensions were already of size 1
    if e is not None:
        e.wait()
    # remove partial sums stored in last dimension of output
    return device.Tensor(O.data, shape=shape, dtype=O.dtype)
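
Each pass folds up to group_size * 2 elements per work-group, so the number of
survivors shrinks geometrically until one value per kept index remains. A sketch of the
pass sizes (reduction_passes is illustrative, not part of the original code):

from math import ceil

def reduction_passes(reduce_numel, group_size=128):
    """(elements, work-groups) per pass; each group folds 2 * group_size values."""
    passes = []
    while reduce_numel > 1:
        n_wg = ceil(reduce_numel / (group_size * 2))
        passes.append((reduce_numel, n_wg))
        reduce_numel = n_wg
    return passes

assert reduction_passes(1_000_000) == [(1000000, 3907), (3907, 16), (16, 1)]
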