def call_compound_kernel(rand_state, *args):
    """
    Fetch (or build) and launch one compound elementwise/reduction GPU kernel.

    The expression is passed in postfix notation as a flat sequence of
    operands and operators.  Operators are dicts, not bare strings.  For
    example ``C += 2.5 * A * B + 1`` is written as::

        call_compound_kernel(rand_state,
                             C, 2.5, A, {"op": "mul"}, B, {"op": "mul"},
                             1, {"op": "add"}, C, {"op": "add"},
                             {"op": "assign"})

    (the operator names above follow the original example; every name must be
    a key of ``_float_ops`` or ``_reduction_ops``).

    Arguments:
        rand_state: forwarded unchanged as the first kernel argument.
        *args: postfix expression items, each one of

            * ``ng.GPUTensor`` -- array operand.  The first tensor encountered
              is treated as the output.
            * ``int`` / ``float`` (exact types) -- scalar constant.
            * ``dict`` -- operator; ``arg["op"]`` selects the operation.
              Reductions additionally need ``"axis"`` (0 or 1); ``"onehot"``
              needs ``"axis"`` and ``"idx"``.

    Returns:
        The output tensor (the first GPUTensor found in ``args``).

    Raises:
        ValueError: a reduction has an axis other than 0/1, reductions in one
            kernel use different axes, a non-2D tensor is used in a
            non-trivial (broadcast/reduction/transpose/take/non-contiguous)
            expression, or the expression contains no output tensor or no
            float operation.
        TypeError: an item has an unsupported type, an operator is unknown, or
            operand shapes cannot be broadcast together.
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    const_ids = {}
    kernel_args = [rand_state]
    type_args = []
    shape_stack = []
    threads = 32
    red_depth = 0
    # Shape of the most recent float operation; stays None when the
    # expression never performs one (checked before the launch).
    max_shape = None

    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    contiguous = True
    reduction = False
    broadcast = False
    transpose = False
    argminmax = False
    takeop = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            op_name = arg["op"]
            if op_name in _reduction_ops:
                if op_name[0:3] == "arg":
                    argminmax = True

                # To reduce a whole tensor (axis=None) reduce along each axis
                # in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError("Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError("Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]
            elif op_name == "onehot":
                takeop = True

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2 or arg.shape[0] == 1 or arg.shape[1] == 1:
                broadcast = True
            elif arg.is_trans:
                transpose = True
            elif arg.take_array:
                takeop = True
            elif not arg.is_contiguous:
                contiguous = False

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    stride_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations, use the native dimensions
            if broadcast or reduction or transpose or takeop or not contiguous:
                if len(arg.shape) == 2:
                    shape = arg.shape
                    strides = list(arg.strides[::stride_order])
                else:
                    raise ValueError(
                        "Operations that are not simple elementwise are only currently supported in 2 dimensions."
                    )
            # use more efficient 2d dimensions if this is a plain ew op.
            else:
                shape, strides = _get_fast_ew_dims(arg.size)
                strides = list(strides[::stride_order])

            # If same array is passed in multiple times to expression,
            # consolidate them into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:
                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support broadcast
                if shape[0] == 1:
                    strides[1 - axis] = 0
                if shape[1] == 1:
                    strides[axis] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

                # fancy indexing/take
                if arg.take_array:
                    kernel_args.append(arg.take_array[0].gpudata)

            # swap the take axis when reducing axis=0
            # also add 1 to distinguish between no take operations
            if arg.take_array:
                if axis != 1:
                    take_axis = 2 - arg.take_array[1]
                else:
                    take_axis = arg.take_array[1] + 1
            # no take operation
            else:
                take_axis = 0

            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:], take_axis))
            shape_stack.append(shape)

        # Constant operand (exact type match: bool and other numeric types
        # fall through to the TypeError below)
        elif type(arg) in (int, float):
            arg = float(arg)
            # equal constants share a single kernel argument
            if arg in const_ids:
                indx = const_ids[arg]
            else:
                indx = const_ids[arg] = arg_cnt
                arg_cnt += 1
                kernel_args.append(arg)

            type_args.append((float, indx))
            shape_stack.append((1, 1))

        # Operation
        elif type(arg) is dict:
            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
                max_shape = [1, 1]
                for _ in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example
                            # assigning a constant.
                            # You just dont want a tensor being assigned to a smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError("Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 15)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    # speed up deep reduction by using more than 32 threads
                    if reduction and not argminmax:
                        if red_depth >= 4096:
                            threads = 1024
                        elif red_depth >= 2048:
                            threads = 512
                        elif red_depth >= 1024:
                            threads = 256
                        elif red_depth >= 512:
                            threads = 128
                        elif red_depth >= 256:
                            threads = 64
                    # speed up deep broadcast by using more than 32 threads
                    elif not (reduction or transpose) and max_shape[1] >= 512:
                        threads = 256

                    type_args.append((op_name, op_cnt, rounding > 0, threads))

                elif op_name == "onehot":

                    # flip the one hot axis if reducing axis=0
                    hot_axis = arg["axis"] if axis else 1 - arg["axis"]

                    type_args.append((op_name, op_cnt, hot_axis))
                    shape_stack.append(max_shape)

                    kernel_args.append(arg["idx"].gpudata)

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())
                red_depth = max(red_depth, shape[axis])

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError("args must be instance of GPUTensor, int, float, or dict (for operators)")

    # The launch below needs both an output tensor (for its backend/stream)
    # and the shape of the last float operation (for the grid size); fail
    # early with a clear message instead of a NameError/AttributeError.
    if out is None or max_shape is None:
        raise ValueError(
            "Expression must contain an output GPUTensor and at least one float operation."
        )

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args))

    # Dynamic shared memory is only requested for reductions that use more
    # than 32 threads (NOTE(review): presumably 4 bytes per thread for the
    # cross-warp reduction -- confirm against the kernel template).
    shared = threads * 4 if reduction and threads > 32 else 0

    if out.backend.bench > 1:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):
        # call the kernel with the number of blocks set as the size of the off-axis
        # Maxwell does well with 32 thread sized blocks, no need to autotune.
        kernel.prepared_async_call(
            (max_shape[1 - axis], 1, 1), (threads, 1, 1), out.backend.stream, *kernel_args, shared_size=shared
        )

    if out.backend.bench > 1:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print(
            "%7.3f msecs shape(%d,%d) blk,thd(%d,%d) %s"
            % (msecs, max_shape[0], max_shape[1], max_shape[1 - axis], threads, kernel.name)
        )

    return out
def call_compound_kernel(rand_state, *args):
    """
    Fetch (or build) and launch one compound elementwise/reduction GPU kernel.

    The expression is passed in postfix notation as a flat sequence of
    operands and operators.  Operators are dicts, not bare strings.  For
    example ``C += 2.5 * A * B + 1`` is written as::

        call_compound_kernel(rand_state,
                             C, 2.5, A, {"op": "mul"}, B, {"op": "mul"},
                             1, {"op": "add"}, C, {"op": "add"},
                             {"op": "assign"})

    (the operator names above follow the original example; every name must be
    a key of ``_float_ops`` or ``_reduction_ops``).

    Arguments:
        rand_state: forwarded unchanged as the first kernel argument.
        *args: postfix expression items, each one of

            * ``ng.GPUTensor`` -- array operand.  The first tensor encountered
              is treated as the output.
            * ``int`` / ``float`` (exact types) -- scalar constant.
            * ``dict`` -- operator; ``arg["op"]`` selects the operation.
              Reductions additionally need ``"axis"`` (0 or 1).

    Returns:
        The output tensor (the first GPUTensor found in ``args``).

    Raises:
        ValueError: a reduction has an axis other than 0/1, reductions in one
            kernel use different axes, the broadcast split would need 2**16
            or more blocks along grid y, or the expression has no output
            tensor / no "assign" operation.
        TypeError: an item has an unsupported type, an operator is unknown, or
            operand shapes cannot be broadcast together.
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    kernel_args = [rand_state]
    type_args = []
    shape_stack = []
    # Both are assigned while processing operators ("assign" sets gridY);
    # they stay None if the expression never reaches that point.
    max_shape = None
    gridY = None

    # Apply reduction constraints and determine thread axis
    # Blocks will be allocated counter to this axis
    # Also detect if this is a broadcast or transpose op.
    reduction = False
    broadcast = False
    transpose = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            if arg["op"] in _reduction_ops:

                # To reduce a whole tensor (axis=None) reduce along each axis
                # in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError("Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError("Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2 or arg.shape[0] == 1 or arg.shape[1] == 1:
                broadcast = True
            elif arg.is_trans:
                transpose = True

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    stride_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # use the more efficient dimensions if this is a plain ew op.
            if len(arg.shape) == 2 and (broadcast or reduction or transpose):
                shape = arg.shape
                strides = arg.strides
            else:
                shape = arg.shape_ew
                strides = arg.strides_ew

            # If same array is passed in multiple times to expression,
            # consolidate them into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:
                # The first array passed in should be the output.
                # It's ok if this array is duplicated as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support transposed striding or reduction along an axis
                # let C pointer arithmetic handle itemsize for us
                strides = [s // arg.dtype.itemsize for s in strides[::stride_order]]

                # special case of reducing and outputing along axis=0
                if arg is out and axis == 0 and shape[0] == 1:
                    strides[0] = 1
                    strides[1] = 0
                else:
                    # support broadcast of a row vector
                    if shape[0] == 1:
                        strides[0] = 0

                    # If we're traversing down the columns and this tensor has only one column,
                    # we preserve the col_stride to allow us to jump to the next row.
                    # This is probably a hack so maybe investigate this further.
                    if axis == 1:
                        # For the common case of traversing down the rows, zero the stride to
                        # support broadcast of column vector.
                        if shape[1] == 1:
                            strides[1] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:]))
            shape_stack.append(shape)

        # Constant operand (exact type match: bool and other numeric types
        # fall through to the TypeError below)
        elif type(arg) in (int, float):
            kernel_args.append(float(arg))
            type_args.append((float, arg_cnt))
            shape_stack.append((1, 1))
            arg_cnt += 1

        # Operation
        elif type(arg) is dict:
            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
                max_shape = [1, 1]
                for _ in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow output tensor itself to be broadcastable.
                            # The final output is fine as a broadcast, for example
                            # assigning a constant.
                            # You just dont want a tensor being assigned to a smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError("Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # break deep broadcast operations up into pieces tracked with blockId.y
                    if not (reduction or transpose) and max_shape[1] >= 512:
                        gridY = int(ceil(max_shape[1] / 256.))
                        # Explicit check rather than `assert`, so the limit is
                        # still enforced when Python runs with -O.
                        if gridY >= 2 ** 16:
                            raise ValueError(
                                "Broadcast dimension %d needs %d blocks along grid y; must be below 65536."
                                % (max_shape[1], gridY)
                            )
                    else:
                        gridY = 1

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:
                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 22)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    type_args.append((op_name, op_cnt, rounding > 0, gridY > 1))

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                # Allow a new axis size if doing post reduction broadcast.
                # So we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError("args must be instance of GPUTensor, int, float, or dict (for operators)")

    # The launch below needs the output tensor (backend/stream) and the grid
    # layout computed by the "assign" operation; fail early with a clear
    # message instead of a NameError/AttributeError.
    if out is None or gridY is None:
        raise ValueError(
            "Expression must contain an output GPUTensor and an 'assign' operation."
        )

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args))

    if out.backend.bench:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):
        # call the kernel with the number of blocks set as the size of the off-axis
        # Maxwell does well with 32 thread sized blocks, no need to autotune.
        kernel.prepared_async_call(
            (max_shape[1 - axis], gridY, 1), (32, 1, 1), out.backend.stream, *kernel_args
        )

    if out.backend.bench:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print("%7.3f msecs shape(%d,%d) grid(%d,%d) %s" %
              (msecs, max_shape[0], max_shape[1], max_shape[1 - axis], gridY, kernel.name))

    return out