Example #1
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a c or f contiguous array, then use the speedy driver kernel
            if self.flags.forc and float(value) >= 0:
                drv.memset_d16(self.gpudata,
                               Flexpt.flex_from_native(value, self.iwl),
                               self.size)
            # otherwise use our copy kernel
            else:
                OpTreeNode.build("copy", value, None, out=self)

        elif isinstance(value, FlexptArray):
            if self.flags.forc and value.flags.forc and self.iwl == value.iwl:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("copy", value, None, out=self)

        elif isinstance(value, OpTreeNode):
            value.execute(out=self)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self

    def ones(self, shape, iwl, allocator=drv.mem_alloc, order="C"):
        """
        Returns an array of the given shape and iwl filled with 1's.
        """

        result = FlexptArray(self, shape, iwl, allocator, order=order)

        drv.memset_d16(result.gpudata, self.flex_from_native(1, iwl), result.size)

        return result
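
The fast path in _assign leans on the driver-level 16-bit memset: every element of a contiguous buffer is stamped with the same 16-bit pattern in a single call, with no kernel launch. Below is a minimal, self-contained sketch of that trick using plain uint16 data and stock pycuda; the Flexpt.flex_from_native conversion is specific to this codebase, so a hand-picked bit pattern stands in for it.

    import numpy as np
    import pycuda.autoinit            # creates a CUDA context on import
    import pycuda.driver as drv

    n = 1024
    host = np.empty(n, dtype=np.uint16)

    # allocate n 16-bit words on the device, then stamp each with one pattern
    gpudata = drv.mem_alloc(host.nbytes)
    drv.memset_d16(gpudata, 0x3C00, n)    # any 16-bit pattern works here

    drv.memcpy_dtoh(host, gpudata)
    assert (host == 0x3C00).all()
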
Example #3
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8( self.gpudata,
                                   unpack_from('B', value)[0],
                                   self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0],
                                   self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0],
                                   self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
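
The constant-fill branch above works for any element width because the scalar is first cast with self.dtype.type(value) and its raw bytes are then reinterpreted as an unsigned integer of the same size for the driver memset. A standalone sketch of that reinterpretation step, assuming a float32 buffer (numpy scalars support the buffer protocol, which is what lets unpack_from read them directly):

    from struct import unpack_from

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv

    n = 256
    value = np.float32(3.5)               # cast the Python scalar to the array dtype

    # reinterpret the scalar's 4 bytes as a uint32 bit pattern
    pattern = unpack_from('I', value)[0]  # 0x40600000 for 3.5f

    gpudata = drv.mem_alloc(n * value.itemsize)
    drv.memset_d32(gpudata, pattern, n)   # stamp n 32-bit words

    host = np.empty(n, dtype=np.float32)
    drv.memcpy_dtoh(host, gpudata)
    assert (host == 3.5).all()
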
Example #4
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8(self.gpudata,
                                  unpack_from('B', value)[0], self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0], self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0], self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
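
When both tensors are contiguous and share a dtype, assignment degenerates to a single device-to-device copy with no kernel launch at all. A minimal sketch of that fast path, with raw pycuda allocations standing in for the GPUTensor wrappers:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv

    src_host = np.arange(1024, dtype=np.float32)

    # stage the source buffer on the device
    src = drv.mem_alloc(src_host.nbytes)
    drv.memcpy_htod(src, src_host)

    # contiguous, same-dtype assignment: one driver call
    dst = drv.mem_alloc(src_host.nbytes)
    drv.memcpy_dtod(dst, src, src_host.nbytes)

    dst_host = np.empty_like(src_host)
    drv.memcpy_dtoh(dst_host, dst)
    assert (dst_host == src_host).all()
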
def _autotune_kernel(n, k, type_args, tune_args):

    load_cnt = 0
    array_ids = set()
    kernel_args = list()
    tune_args = list(tune_args)

    # Set up some fake data to autotune on.
    # We could tune on the real data, but we'd need a custom memoize for that,
    # and we wouldn't be able to use the n,k abstracted sizing.
    data = drv.mem_alloc(n * k * 2)
    drv.memset_d16(data, 0, n * k)
    kernel_args.extend((data, k, 1, k))

    for arg in type_args:

        arg_type, arg_idx = arg[0:2]

        # Array operands
        if arg_type is flexpt_array.FlexptArray:
            if arg_idx not in array_ids:
                array_ids.add(arg_idx)
                flags = tune_args.pop()

                size = 1
                if flags & 1 == 0:
                    size *= n
                if flags & 2 == 0:
                    size *= k

                data = drv.mem_alloc(size * 2)
                drv.memset_d16(data, 0, size)
                kernel_args.extend((data, k, 1, 15, flags))
                load_cnt += 1

        # Constant operands
        elif arg_type is int:

            kernel_args.extend((0, 15))

        # Operations
        elif arg[2]:  # postop_convert_to_flex

            kernel_args.append(15)

    repeat = min(500, max(1, 8192**2 // (n * k)))

    # warm up the gpu clocks so our timings are accurate
    cur_ctx = drv.Context.get_current()
    if cur_ctx not in _context_warmup_set:
        _context_warmup_set.add(cur_ctx)
        kernel = _get_compound_ew_kernel(0, type_args)
        for r in range(repeat * 10):
            kernel.prepared_call((n, 1, 1), (32, 1, 1), *kernel_args)

    start = drv.Event()
    end = drv.Event()

    min_msecs = 99999999.9
    min_blocks = 1
    min_threads = 32
    min_factor = 0
    max_factor = 3 if load_cnt < 4 else 2

    for unroll_factor in range(max_factor):

        kernel = _get_compound_ew_kernel(unroll_factor, type_args)
        unroll = 1 << unroll_factor

        for threads in (32, 64, 128, 256):

            for blocks in (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024):

                loads = blocks * threads * unroll

                if loads > k and min_msecs != 99999999.9:
                    #print "skipping %d loads for %d" % (loads, k)
                    continue

                loops = k // loads

                if (loops > 8 or loops < 1) and min_msecs != 99999999.9:
                    print "skipping %d loops %d // %d" % (loops, k, loads)
                    continue

                start.record()
                for r in range(repeat):
                    kernel.prepared_call((n, blocks, 1), (threads, 1, 1),
                                         *kernel_args)
                end.record()
                end.synchronize()
                msecs = end.time_since(start)

                if msecs < min_msecs:
                    min_msecs = msecs
                    min_factor = unroll_factor
                    min_blocks = blocks
                    min_threads = threads

                #print "%4d %d %4d %3d %4d (%4d, %4d) %4d %.5f" % \
                #    (repeat, unroll, blocks, threads, loads, n, k, loops, msecs)

    #print "Final: %d %4d %3d %.5f" % (1<<min_factor, min_blocks, min_threads, min_msecs)

    return (min_factor, min_blocks, min_threads)
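
The timing harness pattern used here (warm up the GPU clocks, bracket a repeat loop with two CUDA events, keep the arg-min over launch configurations) is independent of the compound element-wise kernels being tuned. The sketch below reduces it to a trivial fill kernel built with stock pycuda; the kernel itself is a placeholder, not part of this codebase.

    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void fill(float *out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = 1.0f;
    }
    """)
    fill = mod.get_function("fill")
    fill.prepare("Pi")                    # pointer + int argument signature

    n = 1 << 20
    out = drv.mem_alloc(n * 4)
    repeat = 50

    start, end = drv.Event(), drv.Event()
    best_msecs, best_threads = float("inf"), None

    for threads in (32, 64, 128, 256):
        blocks = (n + threads - 1) // threads

        # warm-up launch so clock ramp-up doesn't pollute the timing
        fill.prepared_call((blocks, 1, 1), (threads, 1, 1), out, n)

        start.record()
        for _ in range(repeat):
            fill.prepared_call((blocks, 1, 1), (threads, 1, 1), out, n)
        end.record()
        end.synchronize()

        msecs = end.time_since(start)
        if msecs < best_msecs:
            best_msecs, best_threads = msecs, threads

    print("best: %d threads at %.5f msecs" % (best_threads, best_msecs))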