Example #1
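Both listings below are excerpts from a larger module: names such as np, ng, ceil, _float_ops, _reduction_ops, _get_fast_ew_dims, and _get_compound_kernel are defined at the enclosing module's top level (e.g. import numpy as np, from math import ceil).
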
def call_compound_kernel(rand_state, *args):
    """
    Pass in a list of GPUTensor objects, constants and operators in postfix notation.

    C +=  2.5 * A * B + 1
    call_compound_kernel(C, 2.5, A, "mul", B, "mul", 1, "add", C, "add", "assign")
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    const_ids = {}
    kernel_args = [rand_state]
    type_args = []
    shape_stack = []
    threads = 32
    red_depth = 0  # deepest reduction seen; used to size the thread block at "assign"
    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    contiguous = True
    reduction = False
    broadcast = False
    transpose = False
    argminmax = False
    takeop = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            op_name = arg["op"]
            if op_name in _reduction_ops:

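                # argmax/argmin are flagged so "assign" keeps the default 32-thread block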
                if op_name[0:3] == "arg":
                    argminmax = True

                # To reduce a whole tensor (axis=None) reduce along each axis in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError("Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError("Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]
            elif op_name == "onehot":
                takeop = True

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2 or arg.shape[0] == 1 or arg.shape[1] == 1:
                broadcast = True
            elif arg.is_trans:
                transpose = True
            elif arg.take_array:
                takeop = True
            elif not arg.is_contiguous:
                contiguous = False

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    stride_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations, use the native dimensions
            if broadcast or reduction or transpose or takeop or not contiguous:
                if len(arg.shape) == 2:
                    shape = arg.shape
                    strides = list(arg.strides[::stride_order])
                else:
                    raise ValueError(
                        "Operations that are not simple elementwise are currently supported only in 2 dimensions."
                    )

            # use more efficient 2d dimensions if this is a plain ew op.
            else:
                shape, strides = _get_fast_ew_dims(arg.size)
                strides = list(strides[::stride_order])

            # If same array is passed in multiple times to expression,
            # consolidate them into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:

                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support broadcast: a unit-length axis gets stride 0 so the same element is re-read
                if shape[0] == 1:
                    strides[1 - axis] = 0
                if shape[1] == 1:
                    strides[axis] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

                # fancy indexing/take
                if arg.take_array:
                    kernel_args.append(arg.take_array[0].gpudata)

            # swap the take axis when reducing axis=0
            # also add 1 to distinguish between no take operations
            if arg.take_array:
                if axis != 1:
                    take_axis = 2 - arg.take_array[1]
                else:
                    take_axis = arg.take_array[1] + 1
            # no take operation
            else:
                take_axis = 0

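            # dtype.str[1:] drops numpy's byte-order character (e.g. "<f4" becomes "f4")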
            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:], take_axis))

            shape_stack.append(shape)

        # Constant operand
        elif type(arg) in (int, float):

            arg = float(arg)
            if arg in const_ids:
                indx = const_ids[arg]
            else:
                indx = const_ids[arg] = arg_cnt
                arg_cnt += 1

                kernel_args.append(arg)

            type_args.append((float, indx))
            shape_stack.append((1, 1))

        # Operation
        elif type(arg) is dict:

            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
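                # e.g. (64, 1) combined with (1, 128) broadcasts to [64, 128]; (64, 3) with (64, 128) raises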
                max_shape = [1, 1]
                for op_num in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow the output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example assigning a constant.
                            # You just don't want a tensor being assigned to a smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError("Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 15)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    # speed up deep reduction by using more than 32 threads
                    if reduction and not argminmax:
                        if red_depth >= 4096:
                            threads = 1024
                        elif red_depth >= 2048:
                            threads = 512
                        elif red_depth >= 1024:
                            threads = 256
                        elif red_depth >= 512:
                            threads = 128
                        elif red_depth >= 256:
                            threads = 64

                    # speed up deep broadcast by using more than 32 threads
                    elif not (reduction or transpose) and max_shape[1] >= 512:
                        threads = 256

                    type_args.append((op_name, op_cnt, rounding > 0, threads))

                elif op_name == "onehot":

                    # flip the one hot axis if reducing axis=0
                    hot_axis = arg["axis"] if axis else 1 - arg["axis"]

                    type_args.append((op_name, op_cnt, hot_axis))
                    shape_stack.append(max_shape)
                    kernel_args.append(arg["idx"].gpudata)

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                red_depth = max(red_depth, shape[axis])

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError("args must be instance of GPUTensor, int, float, or dict (for operators)")

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args))

    shared = threads * 4 if reduction and threads > 32 else 0  # one 32-bit word of shared memory per thread

    if out.backend.bench > 1:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):

        # call the kernel with the number of blocks set to the size of the off-axis
        # Maxwell does well with 32-thread blocks, so there is no need to autotune.
        kernel.prepared_async_call(
            (max_shape[1 - axis], 1, 1), (threads, 1, 1), out.backend.stream, *kernel_args, shared_size=shared
        )

    if out.backend.bench > 1:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print(
            "%7.3f msecs shape(%d,%d) blk,thd(%d,%d) %s"
            % (msecs, max_shape[0], max_shape[1], max_shape[1 - axis], threads, kernel.name)
        )

    return out
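
The postfix convention in the docstring can be illustrated without a GPU. The sketch below is a minimal, hypothetical pure-NumPy evaluator (evaluate_postfix is not part of the listing above): it walks the same kind of argument list with an operand stack, resolves broadcast shapes the way the shape_stack logic does, and treats "assign" as the terminal store into the first array passed in. Note that the docstring abbreviates operators as bare strings, while the function itself expects dicts with an "op" key; the sketch follows the dict form.

import numpy as np

# Hypothetical illustration of the postfix calling convention; the real
# function above records dtypes/strides to build a CUDA kernel instead
# of computing values on the host.
_BINARY = {"add": np.add, "sub": np.subtract, "mul": np.multiply, "div": np.divide}
_REDUCTIONS = {"sum": np.sum, "max": np.max, "min": np.min}

def evaluate_postfix(*args):
    stack = []
    for arg in args:
        if isinstance(arg, (np.ndarray, int, float)):
            stack.append(arg)                      # operands are pushed
        elif isinstance(arg, dict):
            op = arg["op"]
            if op in _REDUCTIONS:
                # keepdims=True mirrors shape[axis] = 1 in the code above
                stack.append(_REDUCTIONS[op](stack.pop(), axis=arg["axis"], keepdims=True))
            elif op == "assign":
                rhs = stack.pop()
                out = stack.pop()                  # the first array passed in
                out[...] = rhs                     # numpy checks broadcast compatibility
                return out
            else:
                b, a = stack.pop(), stack.pop()
                stack.append(_BINARY[op](a, b))
    raise ValueError("expression must end with an assign op")

# C += 2.5 * A * B + 1, written in the same postfix order as the docstring:
A = np.ones((4, 4)); B = np.full((4, 4), 2.0); C = np.zeros((4, 4))
evaluate_postfix(C, 2.5, A, {"op": "mul"}, B, {"op": "mul"},
                 1, {"op": "add"}, C, {"op": "add"}, {"op": "assign"})
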
Example #2
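A second variant of the same function, without the take/onehot handling, argmin/argmax special-casing, or variable thread counts of Example #1; it relies on the same module-level names.
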
def call_compound_kernel(rand_state, *args):
    """
    Pass in a list of GPUTensor objects, constants and operators in postfix notation.

    C +=  2.5 * A * B + 1
    call_compound_kernel(C, 2.5, A, "mul", B, "mul", 1, "add", C, "add", "assign")
    """
    out         = None
    arg_cnt     = 0
    op_cnt      = 0
    array_ids   = {}
    kernel_args = [ rand_state, ]
    type_args   = []
    shape_stack = []
    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    reduction = False
    broadcast = False
    transpose = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            if arg["op"] in _reduction_ops:
                
                # To reduce a whole tensor (axis=None) reduce along each axis in succession.
                if arg.get("axis",None) not in (0,1):
                    raise ValueError("Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError("Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]
        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2 or arg.shape[0] == 1 or arg.shape[1] == 1:
                broadcast = True
            elif arg.is_trans:
                transpose = True


    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    stride_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations (broadcast, reduction, transpose) use the native dimensions
            if len(arg.shape) == 2 and (broadcast or reduction or transpose):
                shape   = arg.shape
                strides = arg.strides
            # otherwise use the more efficient flattened dimensions for a plain ew op.
            else:
                shape   = arg.shape_ew
                strides = arg.strides_ew

            # If same array is passed in multiple times to expression,
            # consolidate them into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:

                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const pointer.
                if out is None:
                    out  = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support transposed striding or reduction along an axis
                # let C pointer arithmetic handle itemsize for us
                strides = [s // arg.dtype.itemsize for s in strides[::stride_order]]

                # special case of reducing and outputing along axis=0
                if arg is out and axis == 0 and shape[0] == 1:
                    strides[0] = 1
                    strides[1] = 0
                else:
                    # support broadcast of a row vector
                    if shape[0] == 1: strides[0] = 0
                    
                    # If we're traversing down the columns and this tensor has only one column,
                    # we preserve the col_stride to allow us to jump to the next row.
                    # This is probably a hack so maybe investigate this further.
                    if axis == 1:
                        # For the common case of traversing down the rows, zero the stride to 
                        # support broadcast of column vector.
                        if shape[1] == 1: strides[1] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:]))

            shape_stack.append(shape)

        # Constant operand
        elif type(arg) in (int, float):

            kernel_args.append(float(arg))
            type_args.append((float, arg_cnt))
            shape_stack.append((1,1))
            arg_cnt += 1

        # Operation
        elif type(arg) is dict:

            op_name = arg["op"]

            if op_name in _float_ops:
                
                # we need to do the shape arithmetic for the current operation
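                # e.g. (64, 1) combined with (1, 128) broadcasts to [64, 128]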
                max_shape = [1,1]
                for op_num in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow the output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example assigning a constant.
                            # You just don't want a tensor being assigned to a smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError("Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # break deep broadcast operations up into pieces tracked with blockId.y
                    if not (reduction or transpose) and max_shape[1] >= 512:
                        gridY = int(ceil(max_shape[1] / 256.))
                        assert gridY < 2**16
                    else:
                        gridY = 1
                    
                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding
                    
                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding,22)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding,10)

                        kernel_args.append(max(rounding,1))

                    type_args.append((op_name, op_cnt, rounding > 0, gridY > 1))

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)
            
            op_cnt += 1

        else:
            raise TypeError("args must be instance of GPUTensor, int, float, or dict (for operators)")

    # print "\n".join(str(s) for s in args)
    # print "\n"
    # print "\n".join(str(s) for s in kernel_args)
    # print "\n"
    # print "\n".join(str(s) for s in type_args)

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args))

    if out.backend.bench:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):

        # call the kernel with the number of blocks set to the size of the off-axis
        # Maxwell does well with 32-thread blocks, so there is no need to autotune.
        kernel.prepared_async_call((max_shape[1-axis],gridY,1), (32,1,1), out.backend.stream, *kernel_args)

    if out.backend.bench:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print("%7.3f msecs shape(%d,%d) grid(%d,%d) %s" % (msecs, max_shape[0], max_shape[1], max_shape[1-axis], gridY, kernel.name))

    return out
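
Both variants implement broadcasting and axis-0 traversal with stride arithmetic rather than data movement: a unit-length axis gets stride 0 so every step re-reads the same element, and reducing along axis 0 simply reverses the stride order. A small NumPy sketch of the same two tricks (illustrative only; the kernels above do the equivalent pointer arithmetic in CUDA):

import numpy as np
from numpy.lib.stride_tricks import as_strided

# Trick 1: stride 0 broadcasts a row down several rows without copying.
row = np.arange(4, dtype=np.float32).reshape(1, 4)
broadcast = as_strided(row, shape=(3, 4), strides=(0, row.strides[1]))
assert (broadcast == np.tile(row, (3, 1))).all()

# Trick 2: reversing the strides of a 2d array traverses the other axis
# first, which is exactly what stride_order = -1 does for axis=0 reductions.
M = np.arange(12, dtype=np.float32).reshape(3, 4)
assert M.T.strides == M.strides[::-1]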