Exemple #1
0
 def c_support_code_apply(self, node, nodename):
     dtype_x = node.inputs[0].dtype
     dtype_sm = node.outputs[0].dtype
     ret1 = nvcc_kernel(
         "kSoftmax_%s" % nodename,
         params=[
             "int M",
             "int N",
             "const npy_%(dtype_x)s * x",
             "const int sx0",
             "const int sx1",
             "npy_%(dtype_sm)s * sm",
             "const int sm_s0",
             "const int sm_s1",
         ],
         body=[
             "extern __shared__ npy_%(dtype_sm)s buf[]",
             "npy_%(dtype_sm)s * buf2 = buf + N",
             "for (int blockIDX = blockIdx.x; blockIDX < M;" "     blockIDX += gridDim.x){",
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
             "buf2[tx] = buf[tx]",
             "}",
             "__syncthreads()",
             inline_softmax("N", "buf", "buf2", "threadIdx.x", "blockDim.x", dtype_sm),
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             # This set all value correctly
             "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
             "}",
             "__syncthreads()",
             "}",
         ],
     )
     ret2 = nvcc_kernel(
         "kSoftmax_fixed_shared%s" % nodename,
         params=[
             "int M",
             "int N",
             "const npy_%(dtype_x)s * x",
             "const int sx0",
             "const int sx1",
             "npy_%(dtype_sm)s * sm",
             "const int sm_s0",
             "const int sm_s1",
         ],
         body=[
             "extern __shared__ npy_%(dtype_sm)s buf[]",
             "for (int blockIDX = blockIdx.x; blockIDX < M;" "     blockIDX += gridDim.x){",
             "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
             "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
             inline_softmax_fixed_shared(
                 "N", "buf", "x_ptr", "sx1", "sm_ptr", "sm_s1", "threadIdx.x", "blockDim.x", dtype=dtype_sm
             ),
             "__syncthreads()",
             "}",
         ],
     )
     return (ret1 + "\n" + ret2) % locals()
Exemple #2
0
 def c_support_code_apply(self, node, nodename):
     dtype_x = node.inputs[0].dtype
     dtype_sm = node.outputs[0].dtype
     ret1 = nvcc_kernel(
         "kSoftmax_%s" % nodename,
         params=[
             'int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0',
             'const int sx1', 'npy_%(dtype_sm)s * sm', 'const int sm_s0',
             'const int sm_s1'
         ],
         body=[
             "extern __shared__ npy_%(dtype_sm)s buf[]",
             "npy_%(dtype_sm)s * buf2 = buf + N",
             "for (int blockIDX = blockIdx.x; blockIDX < M;"
             "     blockIDX += gridDim.x){",
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
             "buf2[tx] = buf[tx]",
             "}",
             "__syncthreads()",
             inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x',
                            dtype_sm),
             "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
             # This set all value correctly
             "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
             "}",
             "__syncthreads()",
             "}",
         ])
     ret2 = nvcc_kernel(
         "kSoftmax_fixed_shared%s" % nodename,
         params=[
             'int M', 'int N', 'const npy_%(dtype_x)s * x', 'const int sx0',
             'const int sx1', 'npy_%(dtype_sm)s * sm', 'const int sm_s0',
             'const int sm_s1'
         ],
         body=[
             "extern __shared__ npy_%(dtype_sm)s buf[]",
             "for (int blockIDX = blockIdx.x; blockIDX < M;"
             "     blockIDX += gridDim.x){",
             "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
             "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
             inline_softmax_fixed_shared('N',
                                         'buf',
                                         'x_ptr',
                                         'sx1',
                                         'sm_ptr',
                                         'sm_s1',
                                         'threadIdx.x',
                                         'blockDim.x',
                                         dtype=dtype_sm),
             "__syncthreads()",
             "}",
         ])
     return (ret1 + "\n" + ret2) % locals()
Exemple #3
0
 def c_support_code_apply(self, node, nodename):
     dtype_x = node.inputs[0].dtype
     dtype_b = node.inputs[1].dtype
     dtype_sm = node.outputs[0].dtype
     ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
             params=['int M', 'int N',
                     'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
                     'const npy_%(dtype_b)s * b', 'const int sb0',
                     'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
             body=[
                 "extern __shared__ npy_%(dtype_sm)s buf[]",
                 "npy_%(dtype_sm)s * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
                 "     blockIDX += gridDim.x){",
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                      "buf[tx] += b[tx * sb0]",
                      "buf2[tx] = buf[tx]",
                   "}",
                    "__syncthreads()",
                    inline_softmax('N', 'buf', 'buf2',
                                   'threadIdx.x', 'blockDim.x', dtype_sm),
                   "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                      "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                   "}",
                   "__syncthreads()",
                 "}",
                 ])
     ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                        params=['int M', 'int N',
                                'const npy_%(dtype_x)s * x',
                                'const int sx0', 'const int sx1',
                                'const npy_%(dtype_b)s * b', 'const int sb0',
                                'npy_%(dtype_sm)s * sm',
                                'const int sm_s0', 'const int sm_s1'],
                        body=[
                            "extern __shared__ npy_%(dtype_sm)s buf[]",
                            "for (int blockIDX = blockIdx.x; blockIDX < M;"
                            "     blockIDX += gridDim.x){",
                            "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
                            "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
                            inline_softmax_fixed_shared('N', 'buf',
                                                        'x_ptr', 'sx1',
                                                        'sm_ptr', 'sm_s1',
                                                        'threadIdx.x',
                                                        'blockDim.x',
                                                        'b', 'sb0',
                                                        dtype_sm),
                            "__syncthreads()",
                            "}",
                        ])
     return (ret1 + "\n" + ret2) % locals()