Example 1
def call_compound_kernel(rand_state, compute_capability, *args):
    """
    Pass in a list of GPUTensor objects, constants and operators in postfix notation.

    C +=  2.5 * A * B + 1
    call_compound_ew_kernel(C, 2.5, A, "mul", B, "mul", 1, "add", C, "add", "assign")
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    const_ids = {}
    kernel_args = [
        rand_state,
    ]
    type_args = []
    shape_stack = []
    threads = 32
    red_depth = 0
    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    contiguous = True
    reduction = False
    broadcast = False
    transpose = False
    argminmax = False
    takeop = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            op_name = arg["op"]
            if op_name in _reduction_ops:

                if op_name[0:3] == "arg":
                    argminmax = True

                # To reduce a whole tensor (axis=None) reduce along each axis
                # in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError(
                        "Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError(
                            "Reduction only allowed along one axis per kernel."
                        )
                else:
                    reduction = True
                    axis = arg["axis"]
            elif op_name == "onehot":
                takeop = True

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2:
                broadcast = True
            elif (len(arg.shape) == 2
                  and (arg.shape[0] == 1 or arg.shape[1] == 1)):
                broadcast = True
            elif arg.is_trans:
                transpose = True
            elif arg.take_array:
                takeop = True
            elif not arg.is_contiguous:
                contiguous = False

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    strides_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations, use the native dimensions
            if broadcast or reduction or transpose or takeop or not contiguous:
                if len(arg.shape) == 2:
                    shape = arg.shape
                    strides = list(arg.strides[::strides_order])
                else:
                    raise ValueError(
                        "Operations that are not simple elementwise are only "
                        "currently supported in 2 dimensions.")

            # use more efficient 2d dimensions if this is a plain ew op.
            else:
                shape, strides = _get_fast_ew_dims(arg.size)
                strides = list(strides[::strides_order])

            # If the same array is passed into the expression multiple times,
            # consolidate the uses into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:

                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const
                # pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support broadcast
                # Need to use the shape of the base array to determine the
                # stride if this operation is a take
                if arg.take_array:
                    if arg.base.shape[0] == 1:
                        strides[1 - axis] = 0
                    if arg.base.shape[1] == 1:
                        strides[axis] = 0
                else:
                    if shape[0] == 1:
                        strides[1 - axis] = 0
                    if shape[1] == 1:
                        strides[axis] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

                # fancy indexing/take
                if arg.take_array:
                    kernel_args.append(arg.take_array[0].gpudata)

            # swap the take axis when reducing axis=0
            # also add 1 to distinguish it from the no-take case
            if arg.take_array:
                if axis != 1:
                    take_axis = 2 - arg.take_array[1]
                else:
                    take_axis = arg.take_array[1] + 1
            # no take operation
            else:
                take_axis = 0

            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:], take_axis,
                              shape[axis] == 1))

            shape_stack.append(shape)

        # Constant operand
        elif type(arg) in (int, float):

            arg = float(arg)
            if arg in const_ids:
                indx = const_ids[arg]
            else:
                indx = const_ids[arg] = arg_cnt
                arg_cnt += 1

                kernel_args.append(arg)

            type_args.append((float, indx))
            shape_stack.append((1, 1))

        # Operation
        elif type(arg) is dict:

            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
                max_shape = [1, 1]
                for op_num in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example
                            # assigning a constant.
                            # You just don't want a tensor being assigned to a
                            # smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError(
                                    "Input shape:%s not compatible" %
                                    (shape, ))

                if op_name == "assign":

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 15)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    # speed up deep reduction by using more than 32 threads
                    if not argminmax:
                        if reduction:
                            if red_depth >= 256:
                                threads = 64

                            # Try to bring this code back after figuring out race conditions
                            # if red_depth >= 4096:
                            #     threads = 1024
                            # elif red_depth >= 2048:
                            #     threads = 512
                            # elif red_depth >= 1024:
                            #     threads = 256
                            # elif red_depth >= 512:
                            #     threads = 128
                            # elif red_depth >= 256:
                            #     threads = 64
                        # speed up deep broadcast by using more than 32 threads
                        elif not (reduction
                                  or transpose) and max_shape[1] >= 512:
                            threads = 256

                    type_args.append((op_name, op_cnt, rounding > 0, threads))

                elif op_name == "onehot":

                    # flip the one hot axis if reducing axis=0
                    hot_axis = arg["axis"] if axis else 1 - arg["axis"]

                    type_args.append((op_name, op_cnt, hot_axis))
                    shape_stack.append(max_shape)
                    kernel_args.append(arg["idx"].gpudata)

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                red_depth = max(red_depth, shape[axis])

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError(
                "args must be instance of GPUTensor, int, float, or dict (for operators)"
            )

    # for s in argsprint:   print s
    # for s in kernel_args: print s
    # for s in type_args:   print s

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args), compute_capability)

    shared = threads * 4 if reduction and threads > 32 else 0

    if out.backend.bench > 1:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):

        # call the kernel with the number of blocks set as the size of the off-axis
        # Maxwell does well with 32 thread sized blocks, no need to autotune.
        # for a in kernel_args: print a
        kernel.prepared_async_call((max_shape[1 - axis], 1, 1),
                                   (threads, 1, 1),
                                   out.backend.stream,
                                   *kernel_args,
                                   shared_size=shared)

    if out.backend.bench > 1:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print("%7.3f msecs shape(%d,%d) blk,thd(%d,%d) %s" %
              (msecs, max_shape[0], max_shape[1], max_shape[1 - axis], threads,
               kernel.name))

    return out
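
A call-site sketch of the postfix convention described in the docstring: the operator loop above dispatches on dict arguments via arg["op"], so each operator is passed as a dict rather than the bare strings shown in the docstring. The tensors A, B and C, the rand_state and the compute_capability values below are assumed to already exist on the backend; this is only an illustration of how C += 2.5 * A * B + 1 would be flattened into postfix arguments, not part of the library.

# Hypothetical call-site sketch: A, B, C, rand_state and compute_capability
# are assumed to exist already. Each operator is a dict with an "op" key,
# matching the dispatch in the loop above.
postfix_args = (
    C,                       # output tensor first: mutable pointer
    2.5, A, {"op": "mul"},   # 2.5 * A
    B,      {"op": "mul"},   # (2.5 * A) * B
    1,      {"op": "add"},   # ... + 1
    C,      {"op": "add"},   # C + (...)   (the "+=" part)
    {"op": "assign"},        # write the result back into C
)
# out = call_compound_kernel(rand_state, compute_capability, *postfix_args)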
Example 2
def call_compound_kernel(rand_state, *args):
    """
    Pass in a list of GPUTensor objects, constants and operators in postfix notation.

    C +=  2.5 * A * B + 1
    call_compound_ew_kernel(C, 2.5, A, "mul", B, "mul", 1, "add", C, "add", "assign")
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    const_ids = {}
    kernel_args = [rand_state, ]
    type_args = []
    shape_stack = []
    threads = 32
    red_depth = 0
    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    contiguous = True
    reduction = False
    broadcast = False
    transpose = False
    argminmax = False
    takeop = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            op_name = arg["op"]
            if op_name in _reduction_ops:

                if op_name[0:3] == "arg":
                    argminmax = True

                # To reduce a whole tensor (axis=None) reduce along each axis
                # in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError(
                        "Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError(
                            "Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]
            elif op_name == "onehot":
                takeop = True

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2:
                broadcast = True
            elif (len(arg.shape) == 2 and (arg.shape[0] == 1 or arg.shape[1] == 1)):
                broadcast = True
            elif arg.is_trans:
                transpose = True
            elif arg.take_array:
                takeop = True
            elif not arg.is_contiguous:
                contiguous = False

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    strides_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations, use the native dimensions
            if broadcast or reduction or transpose or takeop or not contiguous:
                if len(arg.shape) == 2:
                    shape = arg.shape
                    strides = list(arg.strides[::strides_order])
                else:
                    raise ValueError(
                        "Operations that are not simple elementwise are only "
                        "currently supported in 2 dimensions.")

            # use more efficient 2d dimensions if this is a plain ew op.
            else:
                shape, strides = _get_fast_ew_dims(arg.size)
                strides = list(strides[::strides_order])

            # If the same array is passed into the expression multiple times,
            # consolidate the uses into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:

                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const
                # pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support broadcast
                if shape[0] == 1:
                    strides[1 - axis] = 0
                if shape[1] == 1:
                    strides[axis] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

                # fancy indexing/take
                if arg.take_array:
                    kernel_args.append(arg.take_array[0].gpudata)

            # swap the take axis when reducing axis=0
            # also add 1 to distinguish it from the no-take case
            if arg.take_array:
                if axis != 1:
                    take_axis = 2 - arg.take_array[1]
                else:
                    take_axis = arg.take_array[1] + 1
            # no take operation
            else:
                take_axis = 0

            type_args.append(
                (ng.GPUTensor, indx, arg.dtype.str[1:], take_axis, shape[axis]==1))

            shape_stack.append(shape)

        # Constant operand
        elif type(arg) in (int, float):

            arg = float(arg)
            if arg in const_ids:
                indx = const_ids[arg]
            else:
                indx = const_ids[arg] = arg_cnt
                arg_cnt += 1

                kernel_args.append(arg)

            type_args.append((float, indx))
            shape_stack.append((1, 1))

        # Operation
        elif type(arg) is dict:

            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
                max_shape = [1, 1]
                for op_num in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example
                            # assigning a constant.
                            # You just don't want a tensor being assigned to a
                            # smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError(
                                    "Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 15)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    # speed up deep reduction by using more than 32 threads
                    if reduction and not argminmax:
                        if red_depth >= 4096:
                            threads = 1024
                        elif red_depth >= 2048:
                            threads = 512
                        elif red_depth >= 1024:
                            threads = 256
                        elif red_depth >= 512:
                            threads = 128
                        elif red_depth >= 256:
                            threads = 64

                    # speed up deep broadcast by using more than 32 threads
                    elif not (reduction or transpose) and max_shape[1] >= 512:
                        threads = 256

                    type_args.append((op_name, op_cnt, rounding > 0, threads))

                elif op_name == "onehot":

                    # flip the one hot axis if reducing axis=0
                    hot_axis = arg["axis"] if axis else 1 - arg["axis"]

                    type_args.append((op_name, op_cnt, hot_axis))
                    shape_stack.append(max_shape)
                    kernel_args.append(arg["idx"].gpudata)

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                red_depth = max(red_depth, shape[axis])

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError(
                "args must be instance of GPUTensor, int, float, or dict (for operators)")

    # for s in argsprint:   print s
    # for s in kernel_args: print s
    # for s in type_args:   print s

    # import ipdb; ipdb.set_trace()

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args))

    shared = threads * 4 if reduction and threads > 32 else 0

    if out.backend.bench > 1:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):

        # call the kernel with the number of blocks set as the size of the off-axis
        # Maxwell does well with 32 thread sized blocks, no need to autotune.
        # for a in kernel_args: print a
        kernel.prepared_async_call((max_shape[1 - axis], 1, 1),
                                   (threads, 1, 1), out.backend.stream,
                                   *kernel_args, shared_size=shared)

    if out.backend.bench > 1:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print("%7.3f msecs shape(%d,%d) blk,thd(%d,%d) %s" % (
            msecs, max_shape[0], max_shape[1], max_shape[1 - axis], threads, kernel.name))

    return out
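
For reference, the broadcast shape arithmetic performed in the _float_ops branch above can be expressed on its own as a small helper. This is a minimal, self-contained sketch of the rule the loop applies (per dimension the two extents must match or one of them must be 1, and the result takes the larger extent); the helper name broadcast_shape is hypothetical and not part of the library.

def broadcast_shape(shape_a, shape_b):
    """Combine two (rows, cols) shapes the way the _float_ops branch does."""
    result = []
    for a, b in zip(shape_a, shape_b):
        if a == b or a == 1 or b == 1:
            # compatible: take the larger extent along this dimension
            result.append(max(a, b))
        else:
            raise TypeError("Input shape:%s not compatible" % (shape_a,))
    return tuple(result)

print(broadcast_shape((128, 1), (128, 64)))  # (128, 64)
print(broadcast_shape((1, 1), (32, 64)))     # (32, 64)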