Example #1
0
    def getSourceString(self, solution):
        kernels = solution.getKernels()
        kernelNames = []
        for kernel in kernels:
            kernelName = self.kernelWriter.getKernelName(kernel)
            kernelNames.append(kernelName)

        s = ""
        t = ""
        # includes

        if not globalParameters["MergeFiles"]:
            solutionName = self.getSolutionName(solution)
            s += "#include \"%s.h\"\n" % solutionName
            #s += "#include \"MathTemplates.h\"\n"
            s += "\n"

        # solution function signature
        s += self.getSolutionSignature(solution)
        s += " {\n"
        t += "  "
        s += "%sTensileStatus status;\n" % (t)

        # hipFunction Struct
        if solution["KernelLanguage"] == "Assembly":
            s += "\n"
            s += "%s/* module function args */\n" % (t)
            s += "%sstruct {\n" % t
            t += "  "
            if globalParameters["DebugKernel"]:
                s += "%sunsigned int *debugBuffer;\n" % t
            solutionArgs = self.getArgList(solution["ProblemType"], True,
                                           False, False)
            for arg in solutionArgs:
                if arg[0] == "TensileHalf":
                    s += "%s%s %s[2];\n" % (t, arg[0], arg[1])
                else:
                    s += "%s%s %s;\n" % (t, arg[0], arg[1])

            if solution["PersistentKernel"]:
                # pass in the number of groups since not available in WG
                s += "%sunsigned int numGroupTiles0;\n" % t
                s += "%sunsigned int numGroupTiles1;\n" % t

            s += "%sunsigned int pad;\n" % t  # FIXME can this be removed?
            t = t[2:]
            s += "%s} hipFunctionArgs;\n" % t
            #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
            s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t
            s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t
            #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t
            #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
            #for arg in solutionArgs:
            #  s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1])

        # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu

        # kernels
        s += "\n%s/* kernels */\n" % (t)
        s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (
            t, len(kernels))

        s += "%sint deviceId;\n" % (t)
        s += "%shipCtxGetDevice(&deviceId);\n" % (t)
        s += "%shipDeviceProp_t deviceProperties;\n" % (t)
        s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t)
        if solution["KernelLanguage"] == "Source" and globalParameters[
                "RuntimeLanguage"] == "OCL":
            s += "%sconst char *kernelSources[numKernels] = {\n" % (t)
            t += "  "
            for kernelIdx in range(0, len(kernelNames)):
                kernelName = kernelNames[kernelIdx]
                s += "%s%s_src%s\n" % (t, kernelName, \
                    "," if kernelIdx < len(kernels)-1 else "" )
            t = t[2:]
            s += "%s};\n" % (t)
            s += "%scl_kernel kernels[numKernels];\n" % (t)
            s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t)
            s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t)
            s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
            s += "%s      &kernels[i],\n" % (t)
            s += "%s      kernelSources[i],\n" % (t)
            s += "%s      stream,\n" % (t)
            s += "%s      buildOptions);\n" % (t)
            s += "%s}\n" % (t)

            if solution["GlobalSplitU"] > 1:
                for beta in solution.getKernelsBetaOnly():
                    kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
                    s += "%scl_kernel kernel_%s;\n" % (t, kernelName)
                    s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
                    s += "%s      &kernel_%s,\n" % (t, kernelName)
                    s += "%s      %s_src,\n" % (t, kernelName)
                    s += "%s      stream,\n" % (t)
                    s += "%s      buildOptions);\n" % (t)

        elif solution["KernelLanguage"] == "Assembly":
            localStatic = True
            kernel = kernels[0]
            s += "%sint isa = deviceProperties.gcnArch;\n" % (t)
            s += "%shipFunction_t hipFunction;\n" % (t)

            kernelName = self.kernelWriter.getKernelName(kernel)
            s += t
            if localStatic:
                s += "%sstatic hipFunction_t *hipFunctions = nullptr;\n" % (t)
                s += "%sif ( !hipFunctions ) {\n" % (
                    t
                )  # not locking here means array might be double allocated and memory leak
                t += "  "
                s += "%sstatic std::mutex initFunctionsMutex;\n" % (t)
                s += "%sstd::lock_guard<std::mutex> initFunctionsLock(initFunctionsMutex);\n" % (
                    t)
                s += "%sif ( !hipFunctions ) {\n" % (
                    t
                )  # not locking here means array might be double allocated and memory leak
                t += "  "
                s += "%sstatic int numDevices = -1;\n" % (t)
                s += "%sstatus = hipGetDeviceCount( &numDevices );\n" % (t)
                s += "%shipFunction_t *tmp = new hipFunction_t[numDevices];\n" % (
                    t)
                s += "%sfor ( int i = 0; i < numDevices; i++) {\n" % (t)
                s += "%s  tmp[i] = nullptr;\n" % (t)
                s += "%s}\n" % (t)
                s += "%shipFunctions = tmp;\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                s += "%sif ( !hipFunctions[deviceId] ) {\n" % (t)
                t += "  "
                s += "%sstatic std::mutex loadModuleMutex;\n" % (t)
                s += "%sstd::lock_guard<std::mutex> loadModuleLock(loadModuleMutex);\n" % (
                    t)
                s += "%sif (!hipFunctions[deviceId]) {\n" % (t)
                t += "  "
                s += "%shipModule_t module = nullptr;\n" % (t)
                s += "%shipModuleLoadData(&module, %s_coba);\n" % (t,
                                                                   kernelName)
                s += "%shipModuleGetFunction(&hipFunctions[deviceId], module, \"%s\");\n" % (
                    t, kernelName)
                t = t[2:]
                s += "%s}\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                s += "%shipFunction = hipFunctions[deviceId];\n" % (t)
            else:
                s += "%stensileGetHipFunctionFromCodeObjectByteArray(\n" % (t)
                s += "%s    &hipFunction,\n" % (t)
                s += "%s    \"%s\",\n" % (t, kernelName)
                s += "%s    %s_coba); // code object byte array\n" % (
                    t, kernelName)

        typeName = solution["ProblemType"]["DataType"].toCpp()

        # index assignments
        s += "\n%s/* index assignments */\n" % (t)
        s += "%sconst unsigned int indexD0 = %u;\n" \
            % (t, solution["ProblemType"]["Index0"])
        s += "%sconst unsigned int indexD1 = %u;\n" \
            % (t, solution["ProblemType"]["Index1"])
        s += "%sconst unsigned int indexDU = %u;\n" \
            % (t, solution["ProblemType"]["IndexUnroll"])

        # num enqueues
        s += "\n%s/* num kernels */\n" % (t)
        s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t)
        for i in range(1, len(kernels)):
            s += ", 1"
        s += " };\n"

        # grid size
        s += "\n%s/* grid sizes */\n" % (t)
        s += "%sconst unsigned int workDim = 3;\n" % (t)
        s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \
            % (t, solution["ThreadTile0"], solution["ThreadTile1"])
        s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \
            % (t, solution["SubGroup0"], solution["SubGroup1"])
        s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \
            % (t, solution["NumThreads"])
        s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t)
        # grid size [2]
        s += "%sglobalWorkSize[0][2] = 1;\n" % (t)
        for i in range(0, solution["ProblemType"]["NumIndicesC"]):
            if i != solution["ProblemType"]["Index0"] and i != solution[
                    "ProblemType"]["Index1"]:
                s += "%sglobalWorkSize[0][2] *= size%s;\n" % (
                    t, self.indexChars[i])

        # grid size [0,1]
        s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
            self.indexChars[solution["ProblemType"]["Index0"]])
        s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
            self.indexChars[solution["ProblemType"]["Index1"]])
        s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (
            t)
        s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (
            t)
        s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t)
        s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t)

        if kernel["EdgeType"] != "None":
            s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (
                t)
            s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (
                t)
            s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (
                t)
        if kernel["WorkGroupMappingType"] == "Z" and abs(
                kernel["WorkGroupMapping"]) == 2:
            s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (
                t)
            s += "%stotalWorkGroupsPow2--;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t)
            s += "%stotalWorkGroupsPow2++;\n" % (t)
            s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t)
            s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t)

        if solution["GlobalSplitU"] > 1:
            s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (
                t, solution["GlobalSplitU"])
        if solution["PersistentKernel"]:
            s += "%sglobalWorkSize[0][0] = deviceProperties.multiProcessorCount * %u;\n" \
                    % (t, solution["PersistentKernel"])
            s += "%sglobalWorkSize[0][1] = 1;\n" % t
        else:
            s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (
                t, 0 if kernel["WorkGroupMapping"] > 0 else 1,
                "*localWorkSize[0]" if self.language == "OCL" else "")
            s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (
                t, 1 if kernel["WorkGroupMapping"] > 0 else 0,
                "*localWorkSize[1]" if self.language == "OCL" else "")

        # offsets
        s += "\n%s/* offsets */\n" % (t)
        s += "%sunsigned int offsets[numKernels][1][3];\n" % (t)
        for kernelIdx in range(0, len(kernels)):
            s += "%soffsets[%u][0][0] = offsetC; // tensorC\n" % (t, kernelIdx)
            s += "%soffsets[%u][0][1] = offsetA; // tensorA\n" % (t, kernelIdx)
            s += "%soffsets[%u][0][2] = offsetB; // tensorB\n" % (t, kernelIdx)

        # index sizes
        s += "\n%s/* index sizes */\n" % (t)
        s += "%sunsigned int sizes[numKernels][1][%u];\n" \
            % (t, solution["ProblemType"]["TotalIndices"])
        for kernelIdx in range(0, len(kernels)):
            kernel = kernels[kernelIdx]
            kernelName = self.kernelWriter.getKernelName(kernel)
            # free index sizes
            for i in range(0,solution["ProblemType"]["NumIndicesFree"] \
                + solution["ProblemType"]["NumIndicesBatch"] ):
                s += "%ssizes[%u][0][%u] = size%s;\n" \
                    % (t, kernelIdx, i, self.indexChars[i])
            # summation index sizes
            for i in range(solution["ProblemType"]["NumIndicesC"], \
                    solution["ProblemType"]["TotalIndices"] ):
                lastParam = i == solution["ProblemType"]["TotalIndices"] - 1
                s += "%ssizes[%u][0][%u] = size%s;\n" \
                    % (t, kernelIdx, i, self.indexChars[i])

        #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n"
        s += "\n"

        ########################################
        # Enqueue Beta-Only Kernel
        ########################################
        if solution["GlobalSplitU"] > 1:
            kernelNamesBetaOnly = []
            numStridesC = solution["ProblemType"]["NumIndicesC"] - \
                (0 if solution["ProblemType"]["UseInitialStrides"] else 1)
            for beta in solution.getKernelsBetaOnly():
                kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
                kernelNamesBetaOnly.append(kernelName)
            s += "%s// enqueue Beta-Only kernel\n" % (t)

            # grid sizes
            s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t)
            s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t)
            #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
            #    self.indexChars[solution["ProblemType"]["Index0"]])
            #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
            #    self.indexChars[solution["ProblemType"]["Index1"]])
            s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (
                t)
            s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (
                t)
            s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (
                t)
            s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (
                t)
            s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (
                t)
            s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (
                t,
                "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "")
            s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (
                t,
                "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "")
            s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t)
            for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                if i != solution["ProblemType"]["Index0"] and i != solution[
                        "ProblemType"]["Index1"]:
                    s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (
                        t, self.indexChars[i])

            if solution["ProblemType"]["UseBeta"]:
                s += "%sbool betaZero = beta == 0;\n" % (t)
            if self.language == "OCL":
                if solution["ProblemType"]["UseBeta"]:
                    s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \
                        % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1])
                else:
                    #s += "%sbool betaZero = true;\n" % (t)
                    s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \
                        % (t, kernelNamesBetaOnly[0])
                argIdx = 0
                s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &offsetC ); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                # strides
                for i in range(0, numStridesC):
                    s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, self.strideList[i])
                    argIdx += 1
                # sizes
                for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                    s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, self.indexChars[i])
                    argIdx += 1
                # beta
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sif (!betaZero) {\n" % (t)
                    s += "%s  status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (
                        t, argIdx, typeName)
                    argIdx += 1
                    s += "%s}\n" % (t)
                # enqueue
                s += "%scl_event kernelEventBetaOnly;\n" % (t)
                s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
                t += "  "
                s += "%sstream,\n" % (t)
                s += "%skernelBetaOnly,\n" % (t)
                s += "%sworkDim,\n" % (t)
                s += "%sNULL, // globalWorkOffset\n" % (t)
                s += "%sglobalWorkSizeBetaOnly,\n" % (t)
                s += "%slocalWorkSizeBetaOnly,\n" % (t)
                s += "%snumInputEvents,\n" % (t)
                s += "%sinputEvents,\n" % (t)
                #s += "%soutputEvent );\n" % (t)
                s += "%s&kernelEventBetaOnly );\n" % (t)
                t = t[2:]
                s += "%stensileStatusCheck(status);\n" % (t)
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sbeta = %s;\n" % (
                        t, solution["ProblemType"]["DataType"].zeroString(
                            self.language, 1))
                #s += "%sreturn tensileStatusSuccess;\n" % (t)
                s += "%sstatus = clFinish(stream);\n" % (t)
                s += "%stensileStatusCheck(status);\n" % (t)
                #s += " float tmp[128*128];\n"
                #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n"
                #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n"

            else:
                s += "%sif( inputEvents != NULL )\n" % (t)
                t += "  "
                s += "%shipEventRecord(inputEvents[0], stream );\n" % (t)
                t += "  "
                s += "%stry {\n" % (t)
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sif (betaZero) {\n" % (t)
                    t += "  "
                s += "%shipLaunchKernelGGL(\n" % (t)
                t += "  "
                s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0])
                s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (
                    t)
                s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (
                    t)
                s += "%s0, // groupMemBytes\n" % (t)
                s += "%sstream,\n" % (t)
                s += "%sdataC,\n" % (t)
                s += "%soffsetC,\n" % (t)
                # strides
                for i in range(0, numStridesC):
                    s += "%s%s,\n" % (t, self.strideList[i])
                # sizes
                for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                    s += "%ssize%s%s" % (
                        t, self.indexChars[i], ",\n" if i <
                        solution["ProblemType"]["NumIndicesC"] - 1 else ");\n")

                if solution["ProblemType"]["UseBeta"]:
                    s += "%s} else {\n" % (t)
                    s += "%shipLaunchKernelGGL(\n" % (t)
                    t += "  "
                    s += "%sHIP_KERNEL_NAME(%s),\n" % (t,
                                                       kernelNamesBetaOnly[1])
                    s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (
                        t)
                    s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (
                        t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sdataC,\n" % (t)
                    s += "%soffsetC,\n" % (t)
                    # strides
                    for i in range(0, numStridesC):
                        s += "%s%s,\n" % (t, self.strideList[i])
                    # sizes
                    for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                        s += "%ssize%s,\n" % (t, self.indexChars[i])
                    s += "%sbeta);\n" % (t)
                    s += "%s}\n" % (t)

                s += "%s} catch (const std::exception& e) {\n" % (t)
                s += "#ifdef DEBUG\n"
                s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
                s += "#endif\n"
                s += "%s  return tensileStatusFailure;\n" % (t)
                s += "%s}\n" % (t)

        ########################################
        # Enqueue Kernels
        ########################################
        for kernelIdx in range(0, len(kernels)):
            kernel = kernels[kernelIdx]
            if kernel["KernelLanguage"] == "Source":
                kernel["ISA"] = (
                    0, 0, 0)  # HIP source kernels needs dummy ISA version
            kernelName = self.kernelWriter.getKernelName(kernel)
            s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName)
            s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx)
            if self.language == "OCL":
                # set kernel args same for all enqueues
                s += "%s// kernel args same for all enqueues\n" % (t)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (
                    t, 0)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (
                    t, 1)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (
                    t, 2)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (
                    t, 3, typeName)
                s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \
                    "" if solution["ProblemType"]["UseBeta"] else "//", 4, typeName)
                argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4
                argIdx += 3  # skipping offsets here
                for stride in self.strideList:
                    s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, stride)
                    argIdx += 1
                for sizeIdx in range(0,
                                     solution["ProblemType"]["TotalIndices"]):
                    if sizeIdx not in [
                            solution["ProblemType"]["Index0"],
                            solution["ProblemType"]["Index1"],
                            solution["ProblemType"]["IndexUnroll"]
                    ]:
                        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                            t, argIdx, self.indexChars[sizeIdx])
                    argIdx += 1

            s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (
                t, kernelIdx)
            t += "  "
            # debug print kernel dimensions
            if globalParameters["LibraryPrintDebug"]:
                s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (
                    t, kernelName)
                # debug print kernel arguments
                # offsets
                for i in range(0, 3):
                    s += "%sprintf(\"  offset[%u] = %%u\\n\", offsets[kernelIdx][enqueueIdx][%u]);\n" % (
                        t, i, i)
                # strides
                for stride in self.strideList:
                    s += "%sprintf(\"  %s = %%u\\n\", %s);\n" % (t, stride,
                                                                 stride)
                # sizes
                for i in range(0, solution["ProblemType"]["TotalIndices"]):
                    s += "%sprintf(\"  sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (
                        t, i, i)

            ########################################
            # OpenCL Runtime
            ########################################
            if self.language == "OCL":
                # set kernel args different for all enqueues
                argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4
                # offsets
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][0]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][1]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][2]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                argIdx += len(self.strideList)
                # sizes
                for sizeIdx in range(0,
                                     solution["ProblemType"]["TotalIndices"]):
                    if sizeIdx in [
                            solution["ProblemType"]["Index0"],
                            solution["ProblemType"]["Index1"],
                            solution["ProblemType"]["IndexUnroll"]
                    ]:
                        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                            t, argIdx, self.indexChars[sizeIdx])
                    argIdx += 1

                # enqueue
                s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
                t += "  "
                s += "%sstream,\n" % (t)
                s += "%skernels[kernelIdx],\n" % (t)
                s += "%sworkDim,\n" % (t)
                s += "%sNULL, // globalWorkOffset\n" % (t)
                s += "%sglobalWorkSize[kernelIdx],\n" % (t)
                s += "%slocalWorkSize,\n" % (t)
                if False:  # solution["GlobalSplitU"] > 1:
                    s += "%s1,\n" % (t)
                    s += "%s&kernelEventBetaOnly,\n" % (t)
                else:
                    s += "%snumInputEvents,\n" % (t)
                    s += "%sinputEvents,\n" % (t)
                s += "%soutputEvent );\n" % (t)
                s += "%stensileStatusCheck(status);\n" % (t)

            ########################################
            # HIP Runtime
            ########################################
            else:
                if not globalParameters["PreciseKernelTime"] or solution[
                        "KernelLanguage"] == "Source":
                    s += "%sif( inputEvents != NULL )\n" % (t)
                    t += "  "
                    s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (
                        t)
                t = t[2:]
                s += "%stry {\n" % (t)
                t += "  "
                # hip kernel
                if solution["KernelLanguage"] == "Source":
                    s += "%shipLaunchKernelGGL(\n" % (t)
                    t += "  "
                    s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName)
                    s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (
                        t)
                    s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (
                        t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sdataC,\n" % (t)
                    s += "%sdataA,\n" % (t)
                    s += "%sdataB,\n" % (t)
                    s += "%salpha,\n" % (t)
                    s += "%s%sbeta,\n" % (t, \
                        "" if solution["ProblemType"]["UseBeta"] else "//")
                    s += "%soffsets[kernelIdx][enqueueIdx][0],\n" % (t)
                    s += "%soffsets[kernelIdx][enqueueIdx][1],\n" % (t)
                    s += "%soffsets[kernelIdx][enqueueIdx][2],\n" % (t)
                    # strides
                    for stride in self.strideList:
                        s += "%s%s,\n" % (t, stride)
                    # sizes
                    for i in range(0, solution["ProblemType"]["TotalIndices"]):
                        lastParam = i == solution["ProblemType"][
                            "TotalIndices"] - 1
                        s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \
                            % (t, i, "" if lastParam else "," )
                    if solution["PersistentKernel"]:
                        s += "%s,totalWorkGroups%u\n" % (
                            t, 0 if kernel["WorkGroupMapping"] > 0 else 1)
                        s += "%s,totalWorkGroups%u\n" % (
                            t, 1 if kernel["WorkGroupMapping"] > 0 else 0)
                    s += "%s);\n" % (t)

                # assembly kernel
                else:
                    if globalParameters["DebugKernel"]:
                        s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t
                        s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (
                            t)
                        s += "%s  printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (
                            t)
                        s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (
                            t)
                        s += "%shipDevice_t device;\n" % t
                        s += "%shipDeviceGet(&device, 0);\n" % t
                        s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t
                        s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (
                            t)
                        s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (
                            t)
                        s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (
                            t)
                        s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (
                            t)

                    # hip assembly function
                    s += "%shipFunctionArgs.dataC = dataC;\n" % (t)
                    s += "%shipFunctionArgs.dataA = dataA;\n" % (t)
                    s += "%shipFunctionArgs.dataB = dataB;\n" % (t)
                    if solution["ProblemType"]["DataType"].isHalf():
                        s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t)
                        s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t)
                    else:
                        s += "%shipFunctionArgs.alpha = alpha;\n" % (t)
                    if solution["ProblemType"]["UseBeta"]:
                        if solution["ProblemType"]["DataType"].isHalf():
                            s += "%shipFunctionArgs.beta[0] = beta;\n" % (t)
                            s += "%shipFunctionArgs.beta[1] = beta;\n" % (t)
                        else:
                            s += "%shipFunctionArgs.beta = beta;\n" % (t)
                    s += "%shipFunctionArgs.offsetC = offsets[kernelIdx][enqueueIdx][0];\n" % (
                        t)
                    s += "%shipFunctionArgs.offsetA = offsets[kernelIdx][enqueueIdx][1];\n" % (
                        t)
                    s += "%shipFunctionArgs.offsetB = offsets[kernelIdx][enqueueIdx][2];\n" % (
                        t)
                    # strides
                    for stride in self.strideList:
                        s += "%shipFunctionArgs.%s = %s;\n" % (t, stride,
                                                               stride)
                    # sizes
                    for i in range(0, solution["ProblemType"]["TotalIndices"]):
                        lastParam = i == solution["ProblemType"][
                            "TotalIndices"] - 1
                        s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \
                            % (t, globalParameters["IndexChars"][i], i )

                    if solution["PersistentKernel"]:
                        # pass in the number of groups since not available in WG
                        s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % (
                            t)
                        s += "%shipFunctionArgs.numGroupTiles1 = totalWorkGroups1;\n" % (
                            t)

                    s += "%shipHccModuleLaunchKernel(\n" % (t)
                    t += "  "
                    s += "%shipFunction,\n" % (t)
                    s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (
                        t)
                    s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (
                        t)
                    s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (
                        t)
                    s += "%slocalWorkSize[0],\n" % (t)
                    s += "%slocalWorkSize[1],\n" % (t)
                    s += "%slocalWorkSize[2],\n" % (t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sNULL,\n" % (t)
                    s += "%s(void**)hipLaunchParams\n" % (t)
                    if globalParameters["PreciseKernelTime"]:
                        s += "%s,inputEvents ? inputEvents[enqueueIdx]:nullptr\n" % (
                            t)
                        s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (
                            t)

                    s += "%s);\n" % (t)
                    t = t[2:]
                    if globalParameters["DebugKernel"]:
                        # copy debug buffer
                        s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (
                            t)
                        s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (
                            t)
                        s += "%s  printf(\"%%04i\", i);\n" % (t)
                        s += "%s  char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (
                            t)
                        #s += "%s  char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t)
                        #s += "%s  char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t)
                        s += "%s  for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (
                            t)
                        s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (
                            t)
                        s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (
                            t)

                        s += "%s  }\n" % (t)
                        s += "%s  printf(\"\\n\");\n" % (t)
                        s += "%s}\n" % (t)

                t = t[2:]
                s += "%s} catch (const std::exception& e) {\n" % (t)
                s += "#ifdef DEBUG\n"
                s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
                s += "#endif\n"
                s += "%s  return tensileStatusFailure;\n" % (t)
                s += "%s}\n" % (t)
                if not globalParameters["PreciseKernelTime"] or solution[
                        "KernelLanguage"] == "Source":
                    s += "%sif( outputEvent != NULL )\n" % (t)
                    s += "%s  hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (
                        t)
                s += "  }\n"
        s += "\n"
        s += "  return tensileStatusSuccess;\n"
        s += "}\n"
        s += "\n"
        s += "/* Solution Parameters\n"
        s += Solution.getParametersIndented(solution.state, "  ")
        s += "*/\n"
        s += "\n"

        return s
Example #2
0
  def getProblemSourceString(self, problemType, solution, kernelsWithBuildErrs):
    gsu = solution["GlobalSplitU"]
    persistent = solution["PersistentKernel"]
    kernelLanguage = solution["KernelLanguage"]
    tt0 = solution["ThreadTile0"]
    tt1 = solution["ThreadTile1"]
    sg0 = solution["SubGroup0"]
    sg1 = solution["SubGroup1"]
    nt  = solution["NumThreads"]

    kernels = solution.getKernels()
    kernelNames = []
    kernelBuildErr = 0
    for kernel in kernels:
      kernelName = self.kernelWriter.getKernelName(kernel)
      if kernelName in kernelsWithBuildErrs:
        kernelBuildErr = 1
      kernelNames.append( kernelName )


    s = ""
    t = ""
    # includes

    problemType = solution["ProblemType"] # shortcut

    if not globalParameters["MergeFiles"]:
      solutionName = self.getSolutionName(solution)
      s += "#include \"%s.h\"\n" % solutionName
      s += "\n"

    # problem function signature
    #argList = self.getArgList(problemType, True, True, True, True)
    #for i in range(0, len(argList)):
    #  argString = "%s %s" % argList[i]
    #  s += "%s%s%s" % (t, argString, ",\n" if i < len(argList)-1 else ")" )

    s += self.getSolutionSignature(solution)

    s += " {\n"
    if kernelBuildErr:
      s += "%s  return tensileStatusFailure; // One or more kernels had build failures (%s)\n" % (t, kernelNames)
      s += "%s}\n" % (t)
      return s

    t += "  "
    s += "%sTensileStatus status;\n" % (t)


    # hipFunction Struct
    if kernelLanguage == "Assembly":
      s += "\n"
      s += "%s/* module function args */\n" % (t)
      s += "%sstruct {\n" % t
      t += "  "
      if globalParameters["DebugKernel"]:
        s += "%sunsigned int *debugBuffer;\n" % t
      # Tensor sizes in elements, including only packed dims,
      # and accounting for zero or other strides < size
      # Place these first in the structure since they are 64-bits
      # and need to avoid any unneeded padding:
      s += "%s// Size of Tensor's packed dims, in elements\n" % t
      s += "%suint64_t tensor2dSizeC;\n" % t
      s += "%suint64_t tensor2dSizeA;\n" % t
      s += "%suint64_t tensor2dSizeB;\n" % t
      solutionArgs = self.getArgList(problemType, False, True, False, False)
      for arg in solutionArgs:
        if arg[0] == "TensileHalf":
          s += "%s%s %s[2];\n" % (t, arg[0], arg[1])
        else:
          s += "%s%s %s;\n" % (t, arg[0], arg[1])
      for idxChar in solution["PackedC0Indices"][:-1]:
        s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar)
        s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar)
      for idxChar in solution["PackedC1Indices"][:-1]:
        s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar)
        s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar)

      # number of unroll loop iterations to stagger the start in "U" dim.
      s += "%sint staggerUIter;\n" % t

      # persistent
      s += "%sunsigned int problemNumGroupTiles0;\n" % t
      s += "%sunsigned int problemNumGroupTiles1;\n" % t
      s += "%sunsigned int magicNumberProblemNumGroupTiles0;\n" % t
      s += "%sunsigned int gridNumWorkGroups0;\n" % t
      s += "%sunsigned int numFullBlocks;\n" % t
      s += "%sunsigned int wgmRemainder1;\n" % t
      s += "%sunsigned int magicNumberWgmRemainder1;\n" % t

      s += "%sunsigned int pad;\n" % t # FIXME can this be removed?
      t = t[2:]
      s += "%s} hipFunctionArgs;\n" % t
      #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
      s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t
      s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t
      #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t
      #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
      #for arg in solutionArgs:
      #  s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1])

    # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu

    if globalParameters["RuntimeLanguage"] == "HIP":
      s += "%sint deviceId;\n" % (t)
      s += "%shipGetDevice(&deviceId);\n" % (t)

    # kernels
    s += "\n%s/* kernels */\n" % (t)
    s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels))

    if kernelLanguage == "Source" and globalParameters["RuntimeLanguage"] == "OCL":
      s += "%sconst char *kernelSources[numKernels] = {\n" % (t)
      t += "  "
      for kernelIdx in range(0, len(kernelNames)):
        kernelName = kernelNames[kernelIdx]
        s += "%s%s_src%s\n" % (t, kernelName, \
            "," if kernelIdx < len(kernels)-1 else "" )
      t = t[2:]
      s += "%s};\n" % (t)
      s += "%scl_kernel kernels[numKernels];\n" % (t)
      s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t)
      s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t)
      s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
      s += "%s      &kernels[i],\n" % (t)
      s += "%s      kernelSources[i],\n" % (t)
      s += "%s      stream,\n" % (t)
      s += "%s      buildOptions);\n" % (t)
      s += "%s}\n" % (t)

      if gsu > 1:
        for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu):
          kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
          s += "%scl_kernel kernel_%s;\n" % (t, kernelName)
          s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
          s += "%s      &kernel_%s,\n" % (t, kernelName)
          s += "%s      %s_src,\n" % (t, kernelName)
          s += "%s      stream,\n" % (t)
          s += "%s      buildOptions);\n" % (t)

    elif kernelLanguage == "Assembly":
      kernel = kernels[0]
      s += "%shipFunction_t hipFunction;\n" % (t)
      # if !CodeFromFiles then pass global _coba that points to code object
      s += "%sstatus = solutionLock->getFunction(&hipFunction, deviceId, \"%s\", %s);;\n" \
              % (t, kernelName, "nullptr" if globalParameters["CodeFromFiles"] else kernelName+"_coba" )
      s += "%sif (status) return status;\n" % (t)

    typeName = problemType["DataType"].toCpp()

    # num enqueues
    s += "\n%s/* num kernels */\n" % (t)
    s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t)
    for i in range(1, len(kernels)):
      s += ", 1"
    s += " };\n"

    # grid size
    s += "\n%s/* grid sizes */\n" % (t)
    s += "%sconst unsigned int workDim = 3;\n" % (t)
    s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \
        % (t, tt0, tt1)
    s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \
        % (t, sg0, sg1)
    s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \
        % (t, nt)
    s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t)
    # grid size [2]
    s += "%sglobalWorkSize[0][2] = 1;\n" % (t)
    for i in range(0, problemType["NumIndicesC"]):
      if i != problemType["Index0"] and i != problemType["Index1"]:
        s += "%sglobalWorkSize[0][2] *= size%s;\n" % (t, self.indexChars[i])

    s += "%sunsigned int sizeOfC0 = " % (t)
    s += " * ".join(["size" + i for i in solution["PackedC0Indices"]])
    s += ";\n"

    s += "%sunsigned int sizeOfC1 = " % (t)
    s += " * ".join(["size" + i for i in solution["PackedC1Indices"]])
    s += ";\n"

    for idxChar in solution["PackedC0Indices"][:-1]:
      s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar)
      s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \
          % (t, idxChar, idxChar, idxChar)
    for idxChar in solution["PackedC1Indices"][:-1]:
      s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar)
      s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \
              % (t, idxChar, idxChar, idxChar)

    s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (t)
    s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (t)
    s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t)
    s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t)

    if kernel["EdgeType"] != "None":
      s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t)
      s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (t)
      s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (t)
    if kernel["WorkGroupMappingType"] == "Z" and abs(kernel["WorkGroupMapping"]) == 2:
      s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (t)
      s += "%stotalWorkGroupsPow2--;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t)
      s += "%stotalWorkGroupsPow2++;\n" % (t)
      s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t)
      s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t)

    # persistent:
    s += "%sunsigned int problemNumGroupTiles0 = totalWorkGroups0;\n" % (t)
    s += "%sunsigned int problemNumGroupTiles1 = totalWorkGroups1;\n" % (t)
    s += "%sconst unsigned smallNumMagicShift = 31; // bozo, review\n" % (t)
    s += "%sunsigned magicNumberProblemNumGroupTiles0 = (1L<<smallNumMagicShift) / problemNumGroupTiles0 + 1; // bozo, review\n"  % (t)
    s += "%sunsigned numFullBlocks =  problemNumGroupTiles1 / %u; // divide by WorkGroupMapping\n" \
            % (t, abs(kernel["WorkGroupMapping"]) if abs(kernel["WorkGroupMapping"])>0 else 1)
    s += "%sunsigned wgmRemainder1 =  %u ? (problemNumGroupTiles1 %% %u) : 0;\n" % \
            (t, abs(kernel["WorkGroupMapping"]), abs(kernel["WorkGroupMapping"]))
    s += "%sif (wgmRemainder1 == 0) wgmRemainder1 = %u;\n" % (t, abs(kernel["WorkGroupMapping"]))
    s += "%sunsigned magicNumberWgmRemainder1 = ((1L<<smallNumMagicShift) / wgmRemainder1 + 1);\n"  % (t)
    #s += '  printf ("wgmRemainder1=%u \\n", wgmRemainder1);'

    if gsu> 1:
      s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (t, gsu)
    if persistent:
      s += "%shipDeviceProp_t deviceProperties;\n" % (t)
      # TODO - should cache the device properties - expensive to call on each iteration here:
      s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t)
      s += "%sunsigned int numGroups = totalWorkGroups0 * totalWorkGroups1;\n" % (t)
      s += "%sglobalWorkSize[0][0] = (deviceProperties.multiProcessorCount * %u < numGroups) ? (deviceProperties.multiProcessorCount * %u) : numGroups;\n" \
              % (t, persistent, persistent)

      s += "%sglobalWorkSize[0][1] = 1;\n" % t
    else:
      s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (t, 0 if kernel["WorkGroupMapping"] >= 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "")
      s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (t, 1 if kernel["WorkGroupMapping"] >= 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "")

    # index sizes
    s += "\n%s/* index sizes */\n" % (t)
    s += "%sunsigned int sizes[numKernels][1][%u];\n" \
        % (t, problemType["TotalIndices"])
    for kernelIdx in range(0, len(kernels)):
      kernel = kernels[kernelIdx]
      kernelName = self.kernelWriter.getKernelName(kernel)
      # free index sizes
      for i in range(0,problemType["NumIndicesFree"] \
          + problemType["NumIndicesBatch"] ):
        s += "%ssizes[%u][0][%u] = size%s;\n" \
            % (t, kernelIdx, i, self.indexChars[i])
      # summation index sizes
      for i in range(problemType["NumIndicesC"], \
              problemType["TotalIndices"] ):
        lastParam = i == problemType["TotalIndices"]-1
        s += "%ssizes[%u][0][%u] = size%s;\n" \
            % (t, kernelIdx, i, self.indexChars[i])

      # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0
      #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"]
      firstStride = 0 if problemType["UseInitialStrides"] else 1
      del i

      numIdx = problemType["NumIndicesC"]
      printMe = 0
      s += "%suint64_t tensor2dSizeC = %s" % \
          (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0]))
      for idx in range(0,numIdx):
        # Multiply only by packed tensor dims
        if idx in problemType["IndicesFree"]:
          printMe = True
        else:
          printMe = False

        if printMe:
          if idx+1 < numIdx:
            strideIdx = idx+1
            s += " * std::max(size%s, strideC%u%s)" % \
                (self.indexChars[idx], idx+1, self.indexChars[strideIdx])
          else:
            s += " * size%s" % (self.indexChars[idx])
      s += ";\n"

      s += "%suint64_t tensor2dSizeA = 1;\n" % t
      numIdx = len(problemType["IndexAssignmentsA"])

      printMe = printedSum = False
      for i in range(0,numIdx):
        idx = problemType["IndexAssignmentsA"][i]

        # Multiply only by first free and first summation
        if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC0Indices"]]:
          printMe = True
        elif idx in problemType["IndicesSummation"] and not printedSum:
          printMe = printedSum = True
        else:
          printMe = False

        if printMe:
          s += "%stensor2dSizeA = " % t
          if i+1 < numIdx:
            strideIdx = problemType["IndexAssignmentsA"][i+1]
            s += "std::max(tensor2dSizeA*size%s, (uint64_t)strideA%u%s);\n" \
                % (self.indexChars[idx], i+1, self.indexChars[strideIdx])
          else:
            s += " tensor2dSizeA * size%s" % (self.indexChars[idx])
      s += ";\n"

      s += "%suint64_t tensor2dSizeB = 1;\n" % t
      numIdx = len(problemType["IndexAssignmentsB"])
      printMe = printedSum = False
      for i in range(0,numIdx):
        idx = problemType["IndexAssignmentsB"][i]

        # Multiply only by first free and first summation
        if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC1Indices"]]:
          printMe = True
        elif idx in problemType["IndicesSummation"] and not printedSum:
          printMe = printedSum = True
        else:
          printMe = False

        if printMe:
          s += "%stensor2dSizeB = " % t
          if i+1 < numIdx:
            strideIdx = problemType["IndexAssignmentsB"][i+1]
            s += "std::max(tensor2dSizeB*size%s, (uint64_t)strideB%u%s);\n" \
                % (self.indexChars[idx], i+1, self.indexChars[strideIdx])
          else:
            s += " tensor2dSizeB * size%s" % (self.indexChars[idx])
      s += ";\n"

    unrollChar = globalParameters["IndexChars"][problemType["IndexUnroll"]]

    s += "  unsigned int staggerUIter = %s; // how many stride-sized clicks to stagger start offset\n" \
        % (solution["StaggerU"])
    s += "  int unrollLoopIters = size%s/%u/%u; // /DepthU/GSU\n" % (unrollChar, solution["DepthU"], gsu)
    s += "  while (staggerUIter>1) {\n"
    s += "    if (unrollLoopIters >= (staggerUIter*%u)) {\n" % (1<<solution["_staggerStrideShift"])
    s += "      break;}\n"
    s += "    staggerUIter /= 2; // step down to smaller stagger\n"
    s += "  }\n"
    s += "  if (staggerUIter>=1) staggerUIter -= 1;\n" # convert to a mask
    #s += '  printf ("size%s=%%u StaggerU=%s unrollLoopIters=%%u, staggerUIter=%%d\\n", size%s, unrollLoopIters, staggerUIter);\n' % (unrollChar, solution["StaggerU"], unrollChar)



    #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n"
    s += "\n"
    s += "%sint kernelsLaunched=0;\n" % (t)

    ########################################
    # Enqueue Beta-Only Kernel
    ########################################
    if gsu > 1:
      kernelNamesBetaOnly = []
      numStridesC = problemType["NumIndicesC"] - \
          (0 if problemType["UseInitialStrides"] else 1)
      for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu):
        kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
        kernelNamesBetaOnly.append(kernelName)
      s += "%s// enqueue Beta-Only kernel\n" % (t)

      # grid sizes
      s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t)
      s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t)
      #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
      #    self.indexChars[problemType["Index0"]])
      #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
      #    self.indexChars[problemType["Index1"]])
      s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (t)
      s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (t)
      s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t)
      s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (t)
      s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (t)
      s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "")
      s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "")
      s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t)
      for i in range(0, problemType["NumIndicesC"]):
        if i != problemType["Index0"] and i != problemType["Index1"]:
          s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (t, self.indexChars[i])

      if problemType["UseBeta"]:
        s += "%sbool betaZero = beta == 0;\n" % (t)
      if self.language == "OCL":
        if problemType["UseBeta"]:
          s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \
              % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1])
        else:
          #s += "%sbool betaZero = true;\n" % (t)
          s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \
              % (t, kernelNamesBetaOnly[0])
        argIdx = 0
        s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, argIdx); argIdx+=1
        # strides
        for i in range(0,numStridesC):
          s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.strideList[i]); argIdx+=1
        # sizes
        for i in range(0, problemType["NumIndicesC"]):
          s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[i]); argIdx+=1
        # beta
        if problemType["UseBeta"]:
          s += "%sif (!betaZero) {\n" % (t)
          s += "%s  status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, argIdx, typeName); argIdx+=1
          s += "%s}\n" % (t)
        # enqueue
        s += "%scl_event kernelEventBetaOnly;\n" % (t)
        s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
        t += "  "
        s += "%sstream,\n" % (t)
        s += "%skernelBetaOnly,\n" % (t)
        s += "%sworkDim,\n" % (t)
        s += "%sNULL, // globalWorkOffset\n" % (t)
        s += "%sglobalWorkSizeBetaOnly,\n" % (t)
        s += "%slocalWorkSizeBetaOnly,\n" % (t)
        s += "%snumInputEvents,\n" % (t)
        s += "%sinputEvents,\n" % (t)
        #s += "%soutputEvent );\n" % (t)
        s += "%s&kernelEventBetaOnly );\n" % (t)
        t = t[2:]
        s += "%stensileStatusCheck(status);\n" % (t)
        if problemType["UseBeta"]:
          s += "%sbeta = %s;\n" % (t, problemType["DataType"].zeroString(self.language, 1) )
        #s += "%sreturn tensileStatusSuccess;\n" % (t)
        s += "%sstatus = clFinish(stream);\n" % (t)
        s += "%stensileStatusCheck(status);\n" % (t)
        #s += " float tmp[128*128];\n"
        #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n"
        #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n"
      else:
        s += "%stry {\n" % (t)
        # TODO - timing with beta kernels is somewhat pessimistic since it has this separate event only on the GSU path.
        # Introduces 2-3us of overhead ; may want to disable PreciseKernelTime so non-GSU have same overhead.
        # Long-term fix would be to launch the beta kernel with the hipHccModule* API and set start-event in that call
        if problemType["UseBeta"]:
          s += "%sif (betaZero) {\n" % (t)
          t += "  "
        s += "%sif( inputEvents != NULL )\n" % (t)
        s += "%s  hipEventRecord(inputEvents[0], stream );\n" % (t)
        s += "%skernelsLaunched++;\n" % (t)
        s += "%shipLaunchKernelGGL(\n" % (t)
        t += "  "
        s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0])
        s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t)
        s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t)
        s += "%s0, // groupMemBytes\n" % (t)
        s += "%sstream,\n" % (t)
        s += "%sdataD,\n" % (t)
        s += "%sdataC,\n" % (t)
        # strides
        for i in range(0,numStridesC*2):
          s += "%s%s,\n" % (t, self.strideList[i])
        # sizes
        for i in range(0, problemType["NumIndicesC"]):
          s += "%ssize%s%s" % (t, self.indexChars[i], ",\n" if i < problemType["NumIndicesC"]-1 else ");\n")

        if problemType["UseBeta"]:
          s += "%s} else {\n" % (t)
          t = t[:-2]
          s += "%sif( inputEvents != NULL )\n" % (t)
          s += "%s  hipEventRecord(inputEvents[0], stream );\n" % (t)
          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipLaunchKernelGGL(\n" % (t)
          t += "  "
          s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1])
          s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t)
          s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sdataD,\n" % (t)
          s += "%sdataC,\n" % (t)
          # strides
          for i in range(0,numStridesC*2):
            s += "%s%s,\n" % (t, self.strideList[i])
          # sizes
          for i in range(0, problemType["NumIndicesC"]):
            s += "%ssize%s,\n" % (t, self.indexChars[i])
          s += "%sbeta);\n" % (t)
          s += "}\n"

        t = "  "
        s += "%s} catch (const std::exception& e) {\n" % (t)
        s += "#ifdef DEBUG\n"
        s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
        s += "#endif\n"
        s += "%s  return tensileStatusFailure;\n" % (t)
        s += "%s}\n" % (t)

    ########################################
    # Enqueue Kernels
    ########################################
    for kernelIdx in range(0, len(kernels)):
      kernel = kernels[kernelIdx]
      if kernel["KernelLanguage"] == "Source":
        kernel["ISA"] = (0, 0, 0) # HIP source kernels needs dummy ISA version
      kernelName = self.kernelWriter.getKernelName(kernel)
      s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName)
      s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx)
      if self.language == "OCL":
        # set kernel args same for all enqueues
        s += "%s// kernel args same for all enqueues\n" % (t)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataD ); tensileStatusCheck(status);\n" % (t, 0)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, 1)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (t, 2)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (t, 3)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (t, 4, typeName)
        s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \
            "" if problemType["UseBeta"] else "//", 5, typeName)
        argIdx = 6 if problemType["UseBeta"] else 5
        for stride in self.strideList:
          s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, stride)
          argIdx += 1
        for sizeIdx in range(0, problemType["TotalIndices"]):
          if sizeIdx not in [ problemType["Index0"],  problemType["Index1"], problemType["IndexUnroll"] ]:
            s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx])
          argIdx += 1

        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(staggerUIter), &staggerUIter ); tensileStatusCheck(status);\n" % (t, argIdx)
        argIdx += 1

      s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (t, kernelIdx)
      t += "  "
      # debug print kernel dimensions
      if globalParameters["LibraryPrintDebug"]:
        s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (t, kernelName)
        # debug print kernel arguments
        # strides
        for stride in self.strideList:
          s += "%sprintf(\"  %s = %%u\\n\", %s);\n" % (t, stride, stride)
        # sizes
        for i in range(0, problemType["TotalIndices"]):
          s += "%sprintf(\"  sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i )
        s += "%sprintf(\"  problemNumGroupTiles0== %%u\\n\", problemNumGroupTiles0 );\n" % (t)
        s += "%sprintf(\"  problemNumGroupTiles1== %%u\\n\", problemNumGroupTiles1 );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t)
        for idxChar in solution["PackedC0Indices"][:-1]:
          s += "%sprintf(\"  magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\",  magicNumberSize%s, magicShiftSize%s);\n" \
              % (t, idxChar, idxChar, idxChar, idxChar)
        for idxChar in solution["PackedC1Indices"][:-1]:
          s += "%sprintf(\"  magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\",  magicNumberSize%s, magicShiftSize%s);\n" \
              % (t, idxChar, idxChar, idxChar, idxChar)

      ########################################
      # OpenCL Runtime
      ########################################
      if self.language == "OCL":
        # set kernel args different for all enqueues
        argIdx = 6 if problemType["UseBeta"] else 5
        argIdx += len(self.strideList)
        # sizes
        for sizeIdx in range(0, problemType["TotalIndices"]):
          if sizeIdx in [ problemType["Index0"],  problemType["Index1"], problemType["IndexUnroll"] ]:
            s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx])
          argIdx += 1

        # enqueue
        s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
        t += "  "
        s += "%sstream,\n" % (t)
        s += "%skernels[kernelIdx],\n" % (t)
        s += "%sworkDim,\n" % (t)
        s += "%sNULL, // globalWorkOffset\n" % (t)
        s += "%sglobalWorkSize[kernelIdx],\n" % (t)
        s += "%slocalWorkSize,\n" % (t)
        if False: # gsu > 1:
          s += "%s1,\n" % (t)
          s += "%s&kernelEventBetaOnly,\n" % (t)
        else:
          s += "%snumInputEvents,\n" % (t)
          s += "%sinputEvents,\n" % (t)
        s += "%soutputEvent );\n" % (t)
        s += "%stensileStatusCheck(status);\n" % (t)
        s += "%s}\n" % (t)

      ########################################
      # HIP Runtime
      ########################################
      else:
        if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source":
          s += "%sif( inputEvents != NULL )\n" % (t)
          t += "  "
          s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (t)
        t = t[2:]
        s += "%stry {\n" % (t)
        t += "  "
        # hip kernel
        if kernelLanguage == "Source":
          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipLaunchKernelGGL(\n" % (t)
          t += "  "
          s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName)
          s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (t)
          s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sdataD,\n" % (t)
          s += "%sdataC,\n" % (t)
          s += "%sdataA,\n" % (t)
          s += "%sdataB,\n" % (t)
          s += "%salpha,\n" % (t)
          s += "%s%sbeta,\n" % (t, \
              "" if problemType["UseBeta"] else "//")
          # strides
          for stride in self.strideList:
            s += "%s%s,\n" % (t, stride)
          # sizes
          for i in range(0, problemType["TotalIndices"]):
            lastParam = i == problemType["TotalIndices"]-1
            s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \
                % (t, i, "" if lastParam else "," )
          for idxChar in solution["PackedC0Indices"][:-1]:
            s += "%s,magicNumberSize%s\n" % (t, idxChar)
            s += "%s,magicShiftSize%s\n" % (t, idxChar)
          for idxChar in solution["PackedC1Indices"][:-1]:
            s += "%s,magicNumberSize%s\n" % (t, idxChar)
            s += "%s,magicShiftSize%s\n" % (t, idxChar)
          s += "%s,staggerUIter\n" % (t)
          #persistent:
          s += "%s,problemNumGroupTiles0\n" % (t)
          s += "%s,problemNumGroupTiles1\n" % (t)
          s += "%s,magicNumberProblemNumGroupTiles0\n" % (t) # magic number to use when dividing by problemNumGroupTiles0
          s += "%s);\n" % (t)

        # assembly kernel
        else:
          if globalParameters["DebugKernel"]:
            s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t
            s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (t)
            s += "%s  printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (t)
            s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (t)
            s += "%shipDevice_t device;\n" % t
            s += "%shipDeviceGet(&device, 0);\n" % t
            s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t
            s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (t)
            s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (t)
            s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (t)
            s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (t)

          # hip assembly function
          s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t)

          s += "%shipFunctionArgs.dataD = dataD;\n" % (t)
          s += "%shipFunctionArgs.dataC = dataC;\n" % (t)
          s += "%shipFunctionArgs.dataA = dataA;\n" % (t)
          s += "%shipFunctionArgs.dataB = dataB;\n" % (t)

          if problemType["DataType"].isHalf():
            s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t)
            s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t)
          else:
            s += "%shipFunctionArgs.alpha = alpha;\n" % (t)
          if problemType["UseBeta"]:
            if problemType["DataType"].isHalf():
              s += "%shipFunctionArgs.beta[0] = beta;\n" % (t)
              s += "%shipFunctionArgs.beta[1] = beta;\n" % (t)
            else:
              s += "%shipFunctionArgs.beta = beta;\n" % (t)
          # strides
          for stride in self.strideList:
            s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride)
          # sizes
          for i in range(0, problemType["TotalIndices"]):
            lastParam = i == problemType["TotalIndices"]-1
            s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \
                % (t, globalParameters["IndexChars"][i], i )

          s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t)

          s += "%shipFunctionArgs.staggerUIter = staggerUIter;\n" % (t)
          # persistent - pass in the number of tiles in problem since not available in WG
          s += "\n"
          s += "%shipFunctionArgs.problemNumGroupTiles0 = problemNumGroupTiles0;\n" % (t)
          s += "%shipFunctionArgs.problemNumGroupTiles1 = problemNumGroupTiles1;\n" % (t)
          s += "%shipFunctionArgs.magicNumberProblemNumGroupTiles0 = magicNumberProblemNumGroupTiles0;\n" % (t)
          s += "%shipFunctionArgs.gridNumWorkGroups0 = globalWorkSize[kernelIdx][0];\n" % (t) #
          s += "%shipFunctionArgs.numFullBlocks = numFullBlocks;\n" % (t)
          s += "%shipFunctionArgs.wgmRemainder1 = wgmRemainder1;\n" % (t)
          s += "%shipFunctionArgs.magicNumberWgmRemainder1 = magicNumberWgmRemainder1;\n" % (t)

          # Magic numbers for packed indices:
          for idxChar in solution["PackedC0Indices"][:-1]:
            s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar)
            s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar)
          for idxChar in solution["PackedC1Indices"][:-1]:
            s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar)
            s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar)

          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipHccModuleLaunchKernel(\n" % (t)
          t += "  "
          s += "%shipFunction,\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (t)
          s += "%slocalWorkSize[0],\n" % (t)
          s += "%slocalWorkSize[1],\n" % (t)
          s += "%slocalWorkSize[2],\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sNULL,\n" % (t)
          s += "%s(void**)hipLaunchParams\n" % (t)
          if globalParameters["PreciseKernelTime"]:
            s += "%s,(inputEvents && kernelsLaunched==1) ? inputEvents[enqueueIdx]:nullptr\n" %(t)
            s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (t)

          s += "%s);\n" % (t)
          t = t[2:]
          if globalParameters["DebugKernel"]:
            # copy debug buffer
            s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (t)
            s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (t)
            s += "%s  printf(\"%%04i\", i);\n" % (t)
            s += "%s  char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (t)
            #s += "%s  char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t)
            #s += "%s  char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t)
            s += "%s  for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (t)
            s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (t)
            s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (t)

            s += "%s  }\n" % (t)
            s += "%s  printf(\"\\n\");\n" % (t)
            s += "%s}\n" % (t)


        t = t[2:]
        s += "%s} catch (const std::exception& e) {\n" % (t)
        s += "#ifdef DEBUG\n"
        s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
        s += "#endif\n"
        s += "%s  return tensileStatusFailure;\n" % (t)
        s += "%s}\n" % (t)
        if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source":
          s += "%sif( outputEvent != NULL )\n" % (t)
          s += "%s  hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (t)
        s += "  }\n"
    s += "\n"
    s += "  return tensileStatusSuccess;\n"
    s += "}\n"
    s += "\n"
    s += "/* Solution Parameters\n"
    s += Solution.getParametersIndented(solution.getAttributes(), "  ")
    s += "*/\n"
    s += "\n"

    return s