def c_support_code_apply(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_sm = node.outputs[0].dtype ret1 = nvcc_kernel( "kSoftmax_%s" % nodename, params=[ "int M", "int N", "const npy_%(dtype_x)s * x", "const int sx0", "const int sx1", "npy_%(dtype_sm)s * sm", "const int sm_s0", "const int sm_s1", ], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "npy_%(dtype_sm)s * buf2 = buf + N", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf2[tx] = buf[tx]", "}", "__syncthreads()", inline_softmax("N", "buf", "buf2", "threadIdx.x", "blockDim.x", dtype_sm), "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", # This set all value correctly "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}", "__syncthreads()", "}", ], ) ret2 = nvcc_kernel( "kSoftmax_fixed_shared%s" % nodename, params=[ "int M", "int N", "const npy_%(dtype_x)s * x", "const int sx0", "const int sx1", "npy_%(dtype_sm)s * sm", "const int sm_s0", "const int sm_s1", ], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", inline_softmax_fixed_shared( "N", "buf", "x_ptr", "sx1", "sm_ptr", "sm_s1", "threadIdx.x", "blockDim.x", dtype=dtype_sm ), "__syncthreads()", "}", ], ) return (ret1 + "\n" + ret2) % locals()
def c_support_code_apply(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_sm = node.outputs[0].dtype ret1 = nvcc_kernel( "kSoftmax_%s" % nodename, params=[ 'int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1' ], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "npy_%(dtype_sm)s * buf2 = buf + N", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf2[tx] = buf[tx]", "}", "__syncthreads()", inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x', dtype_sm), "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", # This set all value correctly "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}", "__syncthreads()", "}", ]) ret2 = nvcc_kernel( "kSoftmax_fixed_shared%s" % nodename, params=[ 'int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1' ], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', 'sm_ptr', 'sm_s1', 'threadIdx.x', 'blockDim.x', dtype=dtype_sm), "__syncthreads()", "}", ]) return (ret1 + "\n" + ret2) % locals()
def c_support_code_apply(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_sm = node.outputs[0].dtype ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, params=['int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_b)s * b', 'const int sb0', 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "npy_%(dtype_sm)s * buf2 = buf + N", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] += b[tx * sb0]", "buf2[tx] = buf[tx]", "}", "__syncthreads()", inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x', dtype_sm), "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}", "__syncthreads()", "}", ]) ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, params=['int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_b)s * b', 'const int sb0', 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], body=[ "extern __shared__ npy_%(dtype_sm)s buf[]", "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', 'sm_ptr', 'sm_s1', 'threadIdx.x', 'blockDim.x', 'b', 'sb0', dtype_sm), "__syncthreads()", "}", ]) return (ret1 + "\n" + ret2) % locals()