Beispiel #1
0
 def getSolutionName(self, solution):
     if globalParameters["ShortNames"]:
         solutionName = Solution.getNameSerial(solution,
                                               self.solutionSerialNaming)
     else:
         solutionName = Solution.getNameMin(solution,
                                            self.solutionMinNaming)
     return solutionName
Beispiel #2
0
def writeBenchmarkFiles(stepBaseDir, solutions, problemSizes, stepName,
                        filesToCopy):
    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(globalParameters["WorkingPath"], "Solutions"))
        ensurePath(os.path.join(globalParameters["WorkingPath"], "Kernels"))

    ##############################################################################
    # Min Naming
    ##############################################################################
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    solutionSerialNaming = Solution.getSerialNaming(solutions)
    kernelSerialNaming = Solution.getSerialNaming(kernels)
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solution, kernels and CMake
    problemType = solutions[0]["ProblemType"]
    writeSolutionsAndKernels( \
        globalParameters["WorkingPath"], [problemType], solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly )

    ##############################################################################
    # Write CMake
    ##############################################################################

    clientName = "TensileBenchmark_%s" % stepName
    writeCMake(globalParameters["WorkingPath"], solutions, kernels, filesToCopy, \
        clientName)

    forBenchmark = True
    writeClientParameters(forBenchmark, solutions, problemSizes, stepName, \
        filesToCopy, stepBaseDir)
Beispiel #3
0
def readSolutions(filename):
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    solutionStates = yaml.load(stream, yaml.SafeLoader)
    stream.close()

    # verify
    if len(solutionStates) < 2:
        printExit("len(%s) %u < 2" % (filename, len(solutionStates)))
    versionString = solutionStates[0]["MinimumRequiredVersion"]
    if not versionIsCompatible(versionString):
        printWarning("File \"%s\" version=%s does not match current Tensile version=%s" \
            % (filename, versionString, __version__) )

    if "ProblemSizes" not in solutionStates[1]:
        printExit("%s doesn't begin with ProblemSizes" % filename)
    else:
        problemSizesConfig = solutionStates[1]["ProblemSizes"]

    solutions = []
    for i in range(2, len(solutionStates)):
        solutionState = solutionStates[i]
        solutionObject = Solution(solutionState)
        solutions.append(solutionObject)
    problemType = solutions[0]["ProblemType"]
    problemSizes = ProblemSizes(problemType, problemSizesConfig)
    return (problemSizes, solutions)
 def abbreviation(self):
     string = "%02u" % self.stepIdx
     if self.isFinal():
         string += "_Final"
     else:
         for param in self.benchmarkParameters:
             string += "_%s" % Solution.getParameterNameAbbreviation(param)
     return string
Beispiel #5
0
 def __str__(self):
     state = ""
     idx = 0
     for hardcodedParameters in self.winners:
         winningParameters = self.winners[hardcodedParameters][0]
         score = self.winners[hardcodedParameters][1]
         state += "  %2u: %s -> %s %f GFlop/s\n" % (idx, hardcodedParameters, \
             Solution.getNameFull(winningParameters), score)
         idx += 1
     return state
Beispiel #6
0
    def get(lookupHardcodedParameters, winners):
        matches = []

        # only 1 winner, when benchmarking 1 solution
        if len(winners) == 1:
            hardcodedFrozen = winners.keys()[0]
            winningParameters = winners[hardcodedFrozen][0]
            score = winners[hardcodedFrozen][1]
            matches.append([hardcodedFrozen, winningParameters, score])
            return matches

        for hardcodedFrozen in winners:
            winningParameters = winners[hardcodedFrozen][0]
            score = winners[hardcodedFrozen][1]
            frozenMatch = True
            # a match if no key in hardcoded has a different value than lookup
            for paramName in hardcodedFrozen:
                if paramName in lookupHardcodedParameters:
                    if lookupHardcodedParameters[paramName] != \
                        hardcodedFrozen[paramName]:
                        frozenMatch = False
                        break
            if frozenMatch:
                matchMacroTile = True
                matchUnion = {}
                matchUnion.update(hardcodedFrozen.parameters)
                matchUnion.update(winningParameters)
                if "MacroTile0" in lookupHardcodedParameters:
                    lookupMacroTile0 = lookupHardcodedParameters["MacroTile0"]
                    lookupMacroTile1 = lookupHardcodedParameters["MacroTile1"]
                    Solution.assignProblemIndependentDerivedParameters(
                        matchUnion)
                    Solution.assignProblemIndependentDerivedParameters(
                        hardcodedFrozen.parameters)
                    if matchUnion["MacroTile0"] != lookupMacroTile0 \
                        or matchUnion["MacroTile1"] != lookupMacroTile1:
                        matchMacroTile = False
                if matchMacroTile:
                    matches.append([hardcodedFrozen, winningParameters, score])
            else:
                pass

        return matches
Beispiel #7
0
def readLibraryLogicForSchedule(filename):
    print1("# Reading Library Logic: %s" % (filename))
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    data = yaml.load(stream, yaml.SafeLoader)
    stream.close()

    # verify
    if len(data) < 6:
        printExit("len(%s) %u < 7" % (filename, len(data)))

    # parse out objects
    versionString = data[0]["MinimumRequiredVersion"]
    scheduleName = data[1]
    architectureName = data[2]
    deviceNames = data[3]
    problemTypeState = data[4]
    solutionStates = data[5]
    indexOrder = data[6]
    exactLogic = data[7]
    rangeLogic = data[8]

    # does version match
    if not versionIsCompatible(versionString):
        printWarning("File \"%s\" version=%s does not match Tensile version=%s" \
            % (filename, versionString, __version__) )

    # unpack problemType
    problemType = ProblemType(problemTypeState)
    # unpack solutions
    solutions = []
    for i in range(0, len(solutionStates)):
        solutionState = solutionStates[i]
        if solutionState["KernelLanguage"] == "Assembly":
            isa0 = int(architectureName[3])
            isa1 = int(architectureName[4])
            isa2 = int(architectureName[5])
            solutionState["ISA"] = (isa0, isa1, isa2)
        else:
            solutionState["ISA"] = (0, 0, 0)
        solutionObject = Solution(solutionState)
        if solutionObject["ProblemType"] != problemType:
            printExit("ProblemType of file doesn't match solution: %s != %s" \
                % (problemType, solutionObject["ProblemType"]))
        solutions.append(solutionObject)

    return (scheduleName, deviceNames, problemType, solutions, indexOrder, \
        exactLogic, rangeLogic )
Beispiel #8
0
 def __str__(self):
     return Solution.getNameFull(self.parameters)
Beispiel #9
0
 def __init__(self, parameters):
     self.parameters = deepcopy(parameters)
     self.hashValue = hash(Solution.getNameFull(self.parameters))
Beispiel #10
0
def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \
    problemSizeGroupIdx ):

    benchmarkTestFails = 0

    # convert config to full benchmark process (resolves defaults)
    print1("")
    print1(HR)
    print1("# Converting Config to BenchmarkProcess Object")
    print1(HR)
    print1("")
    benchmarkProcess = BenchmarkProcess( problemTypeConfig, \
        problemSizeGroupConfig )

    problemTypeName = str(benchmarkProcess.problemType)
    problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx)
    pushWorkingPath(problemSizeGroupName)
    ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))

    totalBenchmarkSteps = len(benchmarkProcess)
    resultsFileBaseFinal = None
    winners = WinningParameterDict()
    print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
    print1("")
    print1(HR)
    print1("# Done Creating BenchmarkProcess Object")
    print1(HR)

    ##############################################################################
    # For Each Benchmark Step
    ##############################################################################
    for benchmarkStepIdx in range(0, totalBenchmarkSteps):

        benchmarkStep = benchmarkProcess[benchmarkStepIdx]
        if winners.winners == {}:
            # perf optimization to skip the initial winners creation
            # this helps a little here but really helps below with avoiding the super-expensive
            # removeHardcoded step below - that can use a fast-path to create
            # winners when needed.
            print1(
                "# Empty winners - use fast initialization of hardcodedParameters"
            )
            resultingHardcodedParameterList = benchmarkStep.hardcodedParameters
        else:
            resultingHardcodedParameterList = \
                winners.wpdUpdate( benchmarkStep.hardcodedParameters )

        benchmarkStep.hardcodedParameters = resultingHardcodedParameterList
        numHardcoded = len(benchmarkStep.hardcodedParameters)
        stepName = str(benchmarkStep)
        shortName = benchmarkStep.abbreviation()
        print1("\n")
        print1(HR)
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("# BenchmarkStep: %s - %s %.3fs" %
               (problemSizeGroupName, stepName, elapsedTime))
        print1("# NumProblems: %u" %
               benchmarkStep.problemSizes.totalProblemSizes)
        print1("# BenchmarkParameters:")
        for paramName in benchmarkStep.benchmarkParameters:
            paramValues = benchmarkStep.benchmarkParameters[paramName]
            printStr = "#     %s = { %s" % (paramName, paramValues[0])
            for paramValueIdx in range(1, len(paramValues)):
                printStr += ", %s" % str(paramValues[paramValueIdx])
            printStr += " }"
            print1(printStr)

        if False:
            # print1(hardcoded parameters and their winners
            print1("# HardcodedParameters | WinningParameters:")
            paramDictIdx = 0
            hardcodedMinNaming = \
                Solution.getMinNaming(benchmarkStep.hardcodedParameters)
            for paramDict in benchmarkStep.hardcodedParameters:
                winningParameters = winners[paramDict]
                print1("#    (%u) %s | %s" % (paramDictIdx, \
                    Solution.getNameMin(paramDict, hardcodedMinNaming), \
                    Solution.getNameFull(winningParameters) ))
                paramDictIdx += 1
        pushWorkingPath(shortName)

        ############################################################################
        # Copy Files to Benchmark Source Directory
        ############################################################################
        stepBaseDir = globalParameters["WorkingPath"]
        sourceDir = \
          os.path.join(stepBaseDir, "source" )
        ensurePath(sourceDir)
        pushWorkingPath("sourceTmp")
        filesToCopy = [
            "SolutionMapper.h",
            "Client.cpp",
            "Client.h",
            "CMakeLists.txt",
            "DeviceStats.h",
            "TensorUtils.h",
            "MathTemplates.cpp",
            "MathTemplates.h",
            "TensileTypes.h",
            "tensile_bfloat16.h",
            "KernelHeader.h",
            "ReferenceCPU.h",
            "SolutionHelper.cpp",
            "SolutionHelper.h",
            "Tools.cpp",
            "Tools.h",
        ]

        for f in filesToCopy:
            shutil_copy(os.path.join(globalParameters["SourcePath"], f),
                        globalParameters["WorkingPath"])
        if globalParameters["RuntimeLanguage"] == "OCL":
            shutil_copy(
                os.path.join(globalParameters["SourcePath"],
                             "FindOpenCL.cmake"),
                globalParameters["WorkingPath"])
        else:
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"),
                globalParameters["WorkingPath"])
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"),
                globalParameters["WorkingPath"])

        ############################################################################
        # Enumerate Benchmark Permutations
        ############################################################################
        solutions = []
        totalBenchmarkPermutations = 1
        for benchmarkParamName in benchmarkStep.benchmarkParameters:
            totalBenchmarkPermutations *= len(
                benchmarkStep.benchmarkParameters[benchmarkParamName])
        maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded
        print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)" % \
            (maxPossibleSolutions, numHardcoded, totalBenchmarkPermutations))

        benchmarkPermutations = []
        for i in range(0, totalBenchmarkPermutations):
            permutation = {}
            pIdx = i
            for benchmarkParamName in benchmarkStep.benchmarkParameters:
                benchmarkParamValues = deepcopy( \
                    benchmarkStep.benchmarkParameters[benchmarkParamName])
                valueIdx = pIdx % len(benchmarkParamValues)
                permutation[benchmarkParamName] = benchmarkParamValues[
                    valueIdx]
                pIdx /= len(benchmarkParamValues)
            benchmarkPermutations.append(permutation)

        ############################################################################
        # Enumerate Solutions = Hardcoded * Benchmark
        ############################################################################
        print1("# Enumerating Solutions")
        if globalParameters["PrintLevel"] >= 1:
            progressBar = ProgressBar(maxPossibleSolutions)
        solutionSet = set()  # avoid duplicates for nlca=-1, 1
        for hardcodedIdx in range(0, numHardcoded):
            solutions.append([])
            hardcodedParamDict = benchmarkStep.hardcodedParameters[
                hardcodedIdx]
            for benchmarkIdx in range(0, len(benchmarkPermutations)):
                benchmarkPermutation = benchmarkPermutations[benchmarkIdx]
                solution = {
                    "ProblemType": deepcopy(benchmarkProcess.problemType.state)
                }
                solution.update(benchmarkPermutation)
                solution.update(hardcodedParamDict)
                if benchmarkStepIdx > 0:
                    winningParameters = winners[hardcodedParamDict]
                    if winningParameters == None:
                        # this is a joined parameter that didn't have a winner, that's okay
                        continue
                    solution.update(winningParameters)

                # append default parameters where necessary
                for initialSolutionParameterName in benchmarkStep.initialSolutionParameters:
                    if initialSolutionParameterName not in solution:
                        solution[initialSolutionParameterName] = \
                            benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
                # TODO check if solution matches problem size for exact tile kernels
                solutionObject = Solution(solution)
                if solutionObject["Valid"]:
                    if solutionObject not in solutionSet:
                        solutionSet.add(solutionObject)
                        solutions[hardcodedIdx].append(solutionObject)
                else:
                    if globalParameters["PrintSolutionRejectionReason"]:
                        print1("rejecting solution %s" % str(solutionObject))
                if globalParameters["PrintLevel"] >= 1:
                    progressBar.increment()

        # remove hardcoded that don't have any valid benchmarks
        removeHardcoded = []
        for hardcodedIdx in range(0, numHardcoded):
            if len(solutions[hardcodedIdx]) == 0:
                hardcodedParamDict = benchmarkStep.hardcodedParameters[
                    hardcodedIdx]
                removeHardcoded.append(hardcodedParamDict)
        removesExist = len(removeHardcoded) > 0
        for hardcodedParam in removeHardcoded:
            benchmarkStep.hardcodedParameters.remove(hardcodedParam)

        if removesExist:
            print1(
                "# Updating winners since enumeration removed unused hardcoded solutions.  removeHardcoded=%u winners=%u"
                % (len(removeHardcoded), len(winners.winners)))
            winners.wpdUpdate(benchmarkStep.hardcodedParameters)
            if globalParameters["PrintLevel"] >= 1:
                print1("")
            numHardcoded = len(benchmarkStep.hardcodedParameters)
            # remove from solution 2D list also
            for solutionList in shallowcopy(solutions):
                if len(solutionList) == 0:
                    solutions.remove(solutionList)
        elif winners.winners == {}:
            print1("# Populating initial winners (%u solutions)\n" %
                   len(benchmarkStep.hardcodedParameters))
            for hcParm in benchmarkStep.hardcodedParameters:
                winners.winners[FrozenDictionary(hcParm)] = [{}, -1]

        print1("# Actual Solutions: %u / %u\n" % ( len(solutions), \
            maxPossibleSolutions ))

        # create linear list
        solutionList = []
        for i in range(0, len(solutions)):
            solutionsForHardcoded = solutions[i]
            for j in range(0, len(solutionsForHardcoded)):
                solution = solutionsForHardcoded[j]
                solutionList.append(solution)
        if len(solutionList) == 0:
            msg = "Your parameters resulted in 0 valid solutions."
            if globalParameters["PrintSolutionRejectionReason"]:
                msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected."
            else:
                msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected."
            printExit(msg)
        if globalParameters["PrintLevel"] >= 1:
            for i in range(0, len(solutions)):
                solutionsForHardcoded = solutions[i]
                for j in range(0, len(solutionsForHardcoded)):
                    solution = solutionsForHardcoded[j]
                    print2("#    (%u:%u) %s" % (i, j, \
                        Solution.getNameFull(solution) ))
            print2(HR)

        # write benchmarkFiles
        writeBenchmarkFiles(stepBaseDir, solutionList, benchmarkStep.problemSizes, \
            shortName, filesToCopy)

        print1("# Copying files that differ from sourceTmp -> source")
        sourceTmp = globalParameters["WorkingPath"]
        files = os.listdir(sourceTmp)
        for f in files:
            f0 = os.path.join(sourceTmp, f)
            f1 = os.path.join(sourceDir, f)
            if os.path.isdir(f0):
                #print "cpDir:", f0, f1
                if os.path.isdir(f1):
                    shutil.rmtree(f1, True)
                shutil.copytree(f0, f1)
            elif not os.path.exists(f1) or not filecmp.cmp(f0, f1):
                #print "cp:", f0, f1
                shutil.copy(f0, f1)
        shutil.rmtree(sourceTmp, True)

        popWorkingPath()  # source

        ############################################################################
        # Run Benchmark Script
        ############################################################################
        resultsFileBase = os.path.normpath(os.path.join( \
            globalParameters["WorkingPath"], "../Data", shortName))
        if benchmarkStep.isFinal():
            resultsFileBaseFinal = resultsFileBase
        resultsFileName = resultsFileBase + ".csv"
        solutionsFileName = resultsFileBase + ".yaml"
        if not os.path.exists(resultsFileName) or \
            globalParameters["ForceRedoBenchmarkProblems"]:
            pushWorkingPath("build")

            # write runScript
            libraryLogicPath = None
            path = globalParameters["WorkingPath"]
            forBenchmark = True
            runScriptName = writeRunScript(path, libraryLogicPath,
                                           forBenchmark)

            # run runScript
            process = Popen(runScriptName, cwd=globalParameters["WorkingPath"])
            process.communicate()
            if process.returncode:
                benchmarkTestFails += 1
                printWarning(
                    "BenchmarkProblems: Benchmark Process exited with code %u"
                    % process.returncode)
            popWorkingPath()  # build
        else:
            print1("# Already benchmarked; skipping.")

        ############################################################################
        # Winners -> Determined Parameters
        ############################################################################
        results = getResults(resultsFileName, solutions)
        print2("CSV Results: %s" % results)
        winners.addResults(benchmarkStep.hardcodedParameters, \
            benchmarkPermutations, solutions, results)

        ############################################################################
        # Write Solutions YAML
        ############################################################################
        YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes, \
            solutions )

        # End Iteration
        popWorkingPath()  # stepName
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("%s\n# %s\n# %s: End - %.3fs\n%s\n" \
            % (HR, problemSizeGroupName, shortName, elapsedTime, HR))

    popWorkingPath()  # ProblemType
    return (resultsFileBaseFinal, benchmarkTestFails)
Beispiel #11
0
    def getSourceString(self, solution):
        kernels = solution.getKernels()
        kernelNames = []
        for kernel in kernels:
            kernelName = self.kernelWriter.getKernelName(kernel)
            kernelNames.append(kernelName)

        s = ""
        t = ""
        # includes

        if not globalParameters["MergeFiles"]:
            solutionName = self.getSolutionName(solution)
            s += "#include \"%s.h\"\n" % solutionName
            #s += "#include \"MathTemplates.h\"\n"
            s += "\n"

        # solution function signature
        s += self.getSolutionSignature(solution)
        s += " {\n"
        t += "  "
        s += "%sTensileStatus status;\n" % (t)

        # hipFunction Struct
        if solution["KernelLanguage"] == "Assembly":
            s += "\n"
            s += "%s/* module function args */\n" % (t)
            s += "%sstruct {\n" % t
            t += "  "
            if globalParameters["DebugKernel"]:
                s += "%sunsigned int *debugBuffer;\n" % t
            solutionArgs = self.getArgList(solution["ProblemType"], True,
                                           False, False)
            for arg in solutionArgs:
                if arg[0] == "TensileHalf":
                    s += "%s%s %s[2];\n" % (t, arg[0], arg[1])
                else:
                    s += "%s%s %s;\n" % (t, arg[0], arg[1])

            if solution["PersistentKernel"]:
                # pass in the number of groups since not available in WG
                s += "%sunsigned int numGroupTiles0;\n" % t
                s += "%sunsigned int numGroupTiles1;\n" % t

            s += "%sunsigned int pad;\n" % t  # FIXME can this be removed?
            t = t[2:]
            s += "%s} hipFunctionArgs;\n" % t
            #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
            s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t
            s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t
            #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t
            #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
            #for arg in solutionArgs:
            #  s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1])

        # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu

        # kernels
        s += "\n%s/* kernels */\n" % (t)
        s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (
            t, len(kernels))

        s += "%sint deviceId;\n" % (t)
        s += "%shipCtxGetDevice(&deviceId);\n" % (t)
        s += "%shipDeviceProp_t deviceProperties;\n" % (t)
        s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t)
        if solution["KernelLanguage"] == "Source" and globalParameters[
                "RuntimeLanguage"] == "OCL":
            s += "%sconst char *kernelSources[numKernels] = {\n" % (t)
            t += "  "
            for kernelIdx in range(0, len(kernelNames)):
                kernelName = kernelNames[kernelIdx]
                s += "%s%s_src%s\n" % (t, kernelName, \
                    "," if kernelIdx < len(kernels)-1 else "" )
            t = t[2:]
            s += "%s};\n" % (t)
            s += "%scl_kernel kernels[numKernels];\n" % (t)
            s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t)
            s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t)
            s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
            s += "%s      &kernels[i],\n" % (t)
            s += "%s      kernelSources[i],\n" % (t)
            s += "%s      stream,\n" % (t)
            s += "%s      buildOptions);\n" % (t)
            s += "%s}\n" % (t)

            if solution["GlobalSplitU"] > 1:
                for beta in solution.getKernelsBetaOnly():
                    kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
                    s += "%scl_kernel kernel_%s;\n" % (t, kernelName)
                    s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
                    s += "%s      &kernel_%s,\n" % (t, kernelName)
                    s += "%s      %s_src,\n" % (t, kernelName)
                    s += "%s      stream,\n" % (t)
                    s += "%s      buildOptions);\n" % (t)

        elif solution["KernelLanguage"] == "Assembly":
            localStatic = True
            kernel = kernels[0]
            s += "%sint isa = deviceProperties.gcnArch;\n" % (t)
            s += "%shipFunction_t hipFunction;\n" % (t)

            kernelName = self.kernelWriter.getKernelName(kernel)
            s += t
            if localStatic:
                s += "%sstatic hipFunction_t *hipFunctions = nullptr;\n" % (t)
                s += "%sif ( !hipFunctions ) {\n" % (
                    t
                )  # not locking here means array might be double allocated and memory leak
                t += "  "
                s += "%sstatic std::mutex initFunctionsMutex;\n" % (t)
                s += "%sstd::lock_guard<std::mutex> initFunctionsLock(initFunctionsMutex);\n" % (
                    t)
                s += "%sif ( !hipFunctions ) {\n" % (
                    t
                )  # not locking here means array might be double allocated and memory leak
                t += "  "
                s += "%sstatic int numDevices = -1;\n" % (t)
                s += "%sstatus = hipGetDeviceCount( &numDevices );\n" % (t)
                s += "%shipFunction_t *tmp = new hipFunction_t[numDevices];\n" % (
                    t)
                s += "%sfor ( int i = 0; i < numDevices; i++) {\n" % (t)
                s += "%s  tmp[i] = nullptr;\n" % (t)
                s += "%s}\n" % (t)
                s += "%shipFunctions = tmp;\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                s += "%sif ( !hipFunctions[deviceId] ) {\n" % (t)
                t += "  "
                s += "%sstatic std::mutex loadModuleMutex;\n" % (t)
                s += "%sstd::lock_guard<std::mutex> loadModuleLock(loadModuleMutex);\n" % (
                    t)
                s += "%sif (!hipFunctions[deviceId]) {\n" % (t)
                t += "  "
                s += "%shipModule_t module = nullptr;\n" % (t)
                s += "%shipModuleLoadData(&module, %s_coba);\n" % (t,
                                                                   kernelName)
                s += "%shipModuleGetFunction(&hipFunctions[deviceId], module, \"%s\");\n" % (
                    t, kernelName)
                t = t[2:]
                s += "%s}\n" % (t)
                t = t[2:]
                s += "%s}\n" % (t)
                s += "%shipFunction = hipFunctions[deviceId];\n" % (t)
            else:
                s += "%stensileGetHipFunctionFromCodeObjectByteArray(\n" % (t)
                s += "%s    &hipFunction,\n" % (t)
                s += "%s    \"%s\",\n" % (t, kernelName)
                s += "%s    %s_coba); // code object byte array\n" % (
                    t, kernelName)

        typeName = solution["ProblemType"]["DataType"].toCpp()

        # index assignments
        s += "\n%s/* index assignments */\n" % (t)
        s += "%sconst unsigned int indexD0 = %u;\n" \
            % (t, solution["ProblemType"]["Index0"])
        s += "%sconst unsigned int indexD1 = %u;\n" \
            % (t, solution["ProblemType"]["Index1"])
        s += "%sconst unsigned int indexDU = %u;\n" \
            % (t, solution["ProblemType"]["IndexUnroll"])

        # num enqueues
        s += "\n%s/* num kernels */\n" % (t)
        s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t)
        for i in range(1, len(kernels)):
            s += ", 1"
        s += " };\n"

        # grid size
        s += "\n%s/* grid sizes */\n" % (t)
        s += "%sconst unsigned int workDim = 3;\n" % (t)
        s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \
            % (t, solution["ThreadTile0"], solution["ThreadTile1"])
        s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \
            % (t, solution["SubGroup0"], solution["SubGroup1"])
        s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \
            % (t, solution["NumThreads"])
        s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t)
        # grid size [2]
        s += "%sglobalWorkSize[0][2] = 1;\n" % (t)
        for i in range(0, solution["ProblemType"]["NumIndicesC"]):
            if i != solution["ProblemType"]["Index0"] and i != solution[
                    "ProblemType"]["Index1"]:
                s += "%sglobalWorkSize[0][2] *= size%s;\n" % (
                    t, self.indexChars[i])

        # grid size [0,1]
        s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
            self.indexChars[solution["ProblemType"]["Index0"]])
        s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
            self.indexChars[solution["ProblemType"]["Index1"]])
        s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (
            t)
        s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (
            t)
        s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t)
        s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t)

        if kernel["EdgeType"] != "None":
            s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (
                t)
            s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (
                t)
            s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (
                t)
        if kernel["WorkGroupMappingType"] == "Z" and abs(
                kernel["WorkGroupMapping"]) == 2:
            s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (
                t)
            s += "%stotalWorkGroupsPow2--;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t)
            s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t)
            s += "%stotalWorkGroupsPow2++;\n" % (t)
            s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t)
            s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t)

        if solution["GlobalSplitU"] > 1:
            s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (
                t, solution["GlobalSplitU"])
        if solution["PersistentKernel"]:
            s += "%sglobalWorkSize[0][0] = deviceProperties.multiProcessorCount * %u;\n" \
                    % (t, solution["PersistentKernel"])
            s += "%sglobalWorkSize[0][1] = 1;\n" % t
        else:
            s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (
                t, 0 if kernel["WorkGroupMapping"] > 0 else 1,
                "*localWorkSize[0]" if self.language == "OCL" else "")
            s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (
                t, 1 if kernel["WorkGroupMapping"] > 0 else 0,
                "*localWorkSize[1]" if self.language == "OCL" else "")

        # offsets
        s += "\n%s/* offsets */\n" % (t)
        s += "%sunsigned int offsets[numKernels][1][3];\n" % (t)
        for kernelIdx in range(0, len(kernels)):
            s += "%soffsets[%u][0][0] = offsetC; // tensorC\n" % (t, kernelIdx)
            s += "%soffsets[%u][0][1] = offsetA; // tensorA\n" % (t, kernelIdx)
            s += "%soffsets[%u][0][2] = offsetB; // tensorB\n" % (t, kernelIdx)

        # index sizes
        s += "\n%s/* index sizes */\n" % (t)
        s += "%sunsigned int sizes[numKernels][1][%u];\n" \
            % (t, solution["ProblemType"]["TotalIndices"])
        for kernelIdx in range(0, len(kernels)):
            kernel = kernels[kernelIdx]
            kernelName = self.kernelWriter.getKernelName(kernel)
            # free index sizes
            for i in range(0,solution["ProblemType"]["NumIndicesFree"] \
                + solution["ProblemType"]["NumIndicesBatch"] ):
                s += "%ssizes[%u][0][%u] = size%s;\n" \
                    % (t, kernelIdx, i, self.indexChars[i])
            # summation index sizes
            for i in range(solution["ProblemType"]["NumIndicesC"], \
                    solution["ProblemType"]["TotalIndices"] ):
                lastParam = i == solution["ProblemType"]["TotalIndices"] - 1
                s += "%ssizes[%u][0][%u] = size%s;\n" \
                    % (t, kernelIdx, i, self.indexChars[i])

        #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n"
        s += "\n"

        ########################################
        # Enqueue Beta-Only Kernel
        ########################################
        if solution["GlobalSplitU"] > 1:
            kernelNamesBetaOnly = []
            numStridesC = solution["ProblemType"]["NumIndicesC"] - \
                (0 if solution["ProblemType"]["UseInitialStrides"] else 1)
            for beta in solution.getKernelsBetaOnly():
                kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
                kernelNamesBetaOnly.append(kernelName)
            s += "%s// enqueue Beta-Only kernel\n" % (t)

            # grid sizes
            s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t)
            s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t)
            #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
            #    self.indexChars[solution["ProblemType"]["Index0"]])
            #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
            #    self.indexChars[solution["ProblemType"]["Index1"]])
            s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (
                t)
            s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (
                t)
            s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (
                t)
            s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (
                t)
            s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (
                t)
            s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (
                t,
                "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "")
            s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (
                t,
                "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "")
            s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t)
            for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                if i != solution["ProblemType"]["Index0"] and i != solution[
                        "ProblemType"]["Index1"]:
                    s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (
                        t, self.indexChars[i])

            if solution["ProblemType"]["UseBeta"]:
                s += "%sbool betaZero = beta == 0;\n" % (t)
            if self.language == "OCL":
                if solution["ProblemType"]["UseBeta"]:
                    s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \
                        % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1])
                else:
                    #s += "%sbool betaZero = true;\n" % (t)
                    s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \
                        % (t, kernelNamesBetaOnly[0])
                argIdx = 0
                s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &offsetC ); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                # strides
                for i in range(0, numStridesC):
                    s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, self.strideList[i])
                    argIdx += 1
                # sizes
                for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                    s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, self.indexChars[i])
                    argIdx += 1
                # beta
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sif (!betaZero) {\n" % (t)
                    s += "%s  status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (
                        t, argIdx, typeName)
                    argIdx += 1
                    s += "%s}\n" % (t)
                # enqueue
                s += "%scl_event kernelEventBetaOnly;\n" % (t)
                s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
                t += "  "
                s += "%sstream,\n" % (t)
                s += "%skernelBetaOnly,\n" % (t)
                s += "%sworkDim,\n" % (t)
                s += "%sNULL, // globalWorkOffset\n" % (t)
                s += "%sglobalWorkSizeBetaOnly,\n" % (t)
                s += "%slocalWorkSizeBetaOnly,\n" % (t)
                s += "%snumInputEvents,\n" % (t)
                s += "%sinputEvents,\n" % (t)
                #s += "%soutputEvent );\n" % (t)
                s += "%s&kernelEventBetaOnly );\n" % (t)
                t = t[2:]
                s += "%stensileStatusCheck(status);\n" % (t)
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sbeta = %s;\n" % (
                        t, solution["ProblemType"]["DataType"].zeroString(
                            self.language, 1))
                #s += "%sreturn tensileStatusSuccess;\n" % (t)
                s += "%sstatus = clFinish(stream);\n" % (t)
                s += "%stensileStatusCheck(status);\n" % (t)
                #s += " float tmp[128*128];\n"
                #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n"
                #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n"

            else:
                s += "%sif( inputEvents != NULL )\n" % (t)
                t += "  "
                s += "%shipEventRecord(inputEvents[0], stream );\n" % (t)
                t += "  "
                s += "%stry {\n" % (t)
                if solution["ProblemType"]["UseBeta"]:
                    s += "%sif (betaZero) {\n" % (t)
                    t += "  "
                s += "%shipLaunchKernelGGL(\n" % (t)
                t += "  "
                s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0])
                s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (
                    t)
                s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (
                    t)
                s += "%s0, // groupMemBytes\n" % (t)
                s += "%sstream,\n" % (t)
                s += "%sdataC,\n" % (t)
                s += "%soffsetC,\n" % (t)
                # strides
                for i in range(0, numStridesC):
                    s += "%s%s,\n" % (t, self.strideList[i])
                # sizes
                for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                    s += "%ssize%s%s" % (
                        t, self.indexChars[i], ",\n" if i <
                        solution["ProblemType"]["NumIndicesC"] - 1 else ");\n")

                if solution["ProblemType"]["UseBeta"]:
                    s += "%s} else {\n" % (t)
                    s += "%shipLaunchKernelGGL(\n" % (t)
                    t += "  "
                    s += "%sHIP_KERNEL_NAME(%s),\n" % (t,
                                                       kernelNamesBetaOnly[1])
                    s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (
                        t)
                    s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (
                        t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sdataC,\n" % (t)
                    s += "%soffsetC,\n" % (t)
                    # strides
                    for i in range(0, numStridesC):
                        s += "%s%s,\n" % (t, self.strideList[i])
                    # sizes
                    for i in range(0, solution["ProblemType"]["NumIndicesC"]):
                        s += "%ssize%s,\n" % (t, self.indexChars[i])
                    s += "%sbeta);\n" % (t)
                    s += "%s}\n" % (t)

                s += "%s} catch (const std::exception& e) {\n" % (t)
                s += "#ifdef DEBUG\n"
                s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
                s += "#endif\n"
                s += "%s  return tensileStatusFailure;\n" % (t)
                s += "%s}\n" % (t)

        ########################################
        # Enqueue Kernels
        ########################################
        for kernelIdx in range(0, len(kernels)):
            kernel = kernels[kernelIdx]
            if kernel["KernelLanguage"] == "Source":
                kernel["ISA"] = (
                    0, 0, 0)  # HIP source kernels needs dummy ISA version
            kernelName = self.kernelWriter.getKernelName(kernel)
            s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName)
            s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx)
            if self.language == "OCL":
                # set kernel args same for all enqueues
                s += "%s// kernel args same for all enqueues\n" % (t)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (
                    t, 0)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (
                    t, 1)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (
                    t, 2)
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (
                    t, 3, typeName)
                s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \
                    "" if solution["ProblemType"]["UseBeta"] else "//", 4, typeName)
                argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4
                argIdx += 3  # skipping offsets here
                for stride in self.strideList:
                    s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (
                        t, argIdx, stride)
                    argIdx += 1
                for sizeIdx in range(0,
                                     solution["ProblemType"]["TotalIndices"]):
                    if sizeIdx not in [
                            solution["ProblemType"]["Index0"],
                            solution["ProblemType"]["Index1"],
                            solution["ProblemType"]["IndexUnroll"]
                    ]:
                        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                            t, argIdx, self.indexChars[sizeIdx])
                    argIdx += 1

            s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (
                t, kernelIdx)
            t += "  "
            # debug print kernel dimensions
            if globalParameters["LibraryPrintDebug"]:
                s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (
                    t, kernelName)
                # debug print kernel arguments
                # offsets
                for i in range(0, 3):
                    s += "%sprintf(\"  offset[%u] = %%u\\n\", offsets[kernelIdx][enqueueIdx][%u]);\n" % (
                        t, i, i)
                # strides
                for stride in self.strideList:
                    s += "%sprintf(\"  %s = %%u\\n\", %s);\n" % (t, stride,
                                                                 stride)
                # sizes
                for i in range(0, solution["ProblemType"]["TotalIndices"]):
                    s += "%sprintf(\"  sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (
                        t, i, i)

            ########################################
            # OpenCL Runtime
            ########################################
            if self.language == "OCL":
                # set kernel args different for all enqueues
                argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4
                # offsets
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][0]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][1]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][2]); tensileStatusCheck(status);\n" % (
                    t, argIdx)
                argIdx += 1
                argIdx += len(self.strideList)
                # sizes
                for sizeIdx in range(0,
                                     solution["ProblemType"]["TotalIndices"]):
                    if sizeIdx in [
                            solution["ProblemType"]["Index0"],
                            solution["ProblemType"]["Index1"],
                            solution["ProblemType"]["IndexUnroll"]
                    ]:
                        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (
                            t, argIdx, self.indexChars[sizeIdx])
                    argIdx += 1

                # enqueue
                s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
                t += "  "
                s += "%sstream,\n" % (t)
                s += "%skernels[kernelIdx],\n" % (t)
                s += "%sworkDim,\n" % (t)
                s += "%sNULL, // globalWorkOffset\n" % (t)
                s += "%sglobalWorkSize[kernelIdx],\n" % (t)
                s += "%slocalWorkSize,\n" % (t)
                if False:  # solution["GlobalSplitU"] > 1:
                    s += "%s1,\n" % (t)
                    s += "%s&kernelEventBetaOnly,\n" % (t)
                else:
                    s += "%snumInputEvents,\n" % (t)
                    s += "%sinputEvents,\n" % (t)
                s += "%soutputEvent );\n" % (t)
                s += "%stensileStatusCheck(status);\n" % (t)

            ########################################
            # HIP Runtime
            ########################################
            else:
                if not globalParameters["PreciseKernelTime"] or solution[
                        "KernelLanguage"] == "Source":
                    s += "%sif( inputEvents != NULL )\n" % (t)
                    t += "  "
                    s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (
                        t)
                t = t[2:]
                s += "%stry {\n" % (t)
                t += "  "
                # hip kernel
                if solution["KernelLanguage"] == "Source":
                    s += "%shipLaunchKernelGGL(\n" % (t)
                    t += "  "
                    s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName)
                    s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (
                        t)
                    s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (
                        t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sdataC,\n" % (t)
                    s += "%sdataA,\n" % (t)
                    s += "%sdataB,\n" % (t)
                    s += "%salpha,\n" % (t)
                    s += "%s%sbeta,\n" % (t, \
                        "" if solution["ProblemType"]["UseBeta"] else "//")
                    s += "%soffsets[kernelIdx][enqueueIdx][0],\n" % (t)
                    s += "%soffsets[kernelIdx][enqueueIdx][1],\n" % (t)
                    s += "%soffsets[kernelIdx][enqueueIdx][2],\n" % (t)
                    # strides
                    for stride in self.strideList:
                        s += "%s%s,\n" % (t, stride)
                    # sizes
                    for i in range(0, solution["ProblemType"]["TotalIndices"]):
                        lastParam = i == solution["ProblemType"][
                            "TotalIndices"] - 1
                        s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \
                            % (t, i, "" if lastParam else "," )
                    if solution["PersistentKernel"]:
                        s += "%s,totalWorkGroups%u\n" % (
                            t, 0 if kernel["WorkGroupMapping"] > 0 else 1)
                        s += "%s,totalWorkGroups%u\n" % (
                            t, 1 if kernel["WorkGroupMapping"] > 0 else 0)
                    s += "%s);\n" % (t)

                # assembly kernel
                else:
                    if globalParameters["DebugKernel"]:
                        s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t
                        s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (
                            t)
                        s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (
                            t)
                        s += "%s  printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (
                            t)
                        s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (
                            t)
                        s += "%shipDevice_t device;\n" % t
                        s += "%shipDeviceGet(&device, 0);\n" % t
                        s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t
                        s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (
                            t)
                        s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (
                            t)
                        s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (
                            t)
                        s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (
                            t)

                    # hip assembly function
                    s += "%shipFunctionArgs.dataC = dataC;\n" % (t)
                    s += "%shipFunctionArgs.dataA = dataA;\n" % (t)
                    s += "%shipFunctionArgs.dataB = dataB;\n" % (t)
                    if solution["ProblemType"]["DataType"].isHalf():
                        s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t)
                        s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t)
                    else:
                        s += "%shipFunctionArgs.alpha = alpha;\n" % (t)
                    if solution["ProblemType"]["UseBeta"]:
                        if solution["ProblemType"]["DataType"].isHalf():
                            s += "%shipFunctionArgs.beta[0] = beta;\n" % (t)
                            s += "%shipFunctionArgs.beta[1] = beta;\n" % (t)
                        else:
                            s += "%shipFunctionArgs.beta = beta;\n" % (t)
                    s += "%shipFunctionArgs.offsetC = offsets[kernelIdx][enqueueIdx][0];\n" % (
                        t)
                    s += "%shipFunctionArgs.offsetA = offsets[kernelIdx][enqueueIdx][1];\n" % (
                        t)
                    s += "%shipFunctionArgs.offsetB = offsets[kernelIdx][enqueueIdx][2];\n" % (
                        t)
                    # strides
                    for stride in self.strideList:
                        s += "%shipFunctionArgs.%s = %s;\n" % (t, stride,
                                                               stride)
                    # sizes
                    for i in range(0, solution["ProblemType"]["TotalIndices"]):
                        lastParam = i == solution["ProblemType"][
                            "TotalIndices"] - 1
                        s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \
                            % (t, globalParameters["IndexChars"][i], i )

                    if solution["PersistentKernel"]:
                        # pass in the number of groups since not available in WG
                        s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % (
                            t)
                        s += "%shipFunctionArgs.numGroupTiles1 = totalWorkGroups1;\n" % (
                            t)

                    s += "%shipHccModuleLaunchKernel(\n" % (t)
                    t += "  "
                    s += "%shipFunction,\n" % (t)
                    s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (
                        t)
                    s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (
                        t)
                    s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (
                        t)
                    s += "%slocalWorkSize[0],\n" % (t)
                    s += "%slocalWorkSize[1],\n" % (t)
                    s += "%slocalWorkSize[2],\n" % (t)
                    s += "%s0, // groupMemBytes\n" % (t)
                    s += "%sstream,\n" % (t)
                    s += "%sNULL,\n" % (t)
                    s += "%s(void**)hipLaunchParams\n" % (t)
                    if globalParameters["PreciseKernelTime"]:
                        s += "%s,inputEvents ? inputEvents[enqueueIdx]:nullptr\n" % (
                            t)
                        s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (
                            t)

                    s += "%s);\n" % (t)
                    t = t[2:]
                    if globalParameters["DebugKernel"]:
                        # copy debug buffer
                        s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (
                            t)
                        s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (
                            t)
                        s += "%s  printf(\"%%04i\", i);\n" % (t)
                        s += "%s  char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (
                            t)
                        #s += "%s  char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t)
                        #s += "%s  char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t)
                        s += "%s  for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (
                            t)
                        s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (
                            t)
                        s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (
                            t)

                        s += "%s  }\n" % (t)
                        s += "%s  printf(\"\\n\");\n" % (t)
                        s += "%s}\n" % (t)

                t = t[2:]
                s += "%s} catch (const std::exception& e) {\n" % (t)
                s += "#ifdef DEBUG\n"
                s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
                s += "#endif\n"
                s += "%s  return tensileStatusFailure;\n" % (t)
                s += "%s}\n" % (t)
                if not globalParameters["PreciseKernelTime"] or solution[
                        "KernelLanguage"] == "Source":
                    s += "%sif( outputEvent != NULL )\n" % (t)
                    s += "%s  hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (
                        t)
                s += "  }\n"
        s += "\n"
        s += "  return tensileStatusSuccess;\n"
        s += "}\n"
        s += "\n"
        s += "/* Solution Parameters\n"
        s += Solution.getParametersIndented(solution.state, "  ")
        s += "*/\n"
        s += "\n"

        return s
def writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName):
    print1("# Writing Custom CMake")
    ##############################################################################
    # Min Naming
    ##############################################################################
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    generatedFile = open(os.path.join(outputPath, "Generated.cmake"), "w")
    generatedFile.write(CMakeHeader)
    generatedFile.write("set( TensileClient_SOLUTIONS\n")

    # write solution names
    if globalParameters["MergeFiles"]:
        generatedFile.write("  ${CMAKE_SOURCE_DIR}/Solutions.h\n")
        generatedFile.write("  ${CMAKE_SOURCE_DIR}/Solutions.cpp\n")
    else:
        for solution in solutions:
            solutionName = solutionWriter.getSolutionName(solution)
            generatedFile.write("  ${CMAKE_SOURCE_DIR}/Solutions/%s.h\n" \
                % (solutionName) )
            generatedFile.write("  ${CMAKE_SOURCE_DIR}/Solutions/%s.cpp\n" \
                % (solutionName) )
    generatedFile.write("  )\n")

    # write kernel names
    generatedFile.write("set( TensileClient_KERNELS\n")
    if globalParameters["MergeFiles"]:
        generatedFile.write("  ${CMAKE_SOURCE_DIR}/Kernels.h\n")
        generatedFile.write("  ${CMAKE_SOURCE_DIR}/Kernels.cpp\n")
    else:
        for kernel in kernels:
            kernelName = kernelWriterSource.getKernelName(kernel) if kernel[
                "KernelLanguage"] == "Source" else kernelWriterAssembly.getKernelName(
                    kernel)
            generatedFile.write("  ${CMAKE_SOURCE_DIR}/Kernels/%s.h\n" %
                                (kernelName))
            generatedFile.write("  ${CMAKE_SOURCE_DIR}/Kernels/%s.cpp\n" %
                                kernelName)
    generatedFile.write("  )\n")

    generatedFile.write("set( TensileClient_SOURCE\n")
    for fileName in libraryStaticFiles:
        # copy file
        shutil_copy( os.path.join(globalParameters["SourcePath"], fileName), \
            outputPath )
        # add file to cmake
        generatedFile.write("  ${CMAKE_SOURCE_DIR}/%s\n" % fileName)
    generatedFile.write("  )\n\n")

    # close generated cmake
    generatedFile.close()
def TensileCreateLibrary():
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \
        choices=["OCL", "HIP", "HSA"])
    argParser.add_argument("--merge-files", dest="MergeFiles", \
        action="store_true")
    argParser.add_argument("--no-merge-files", dest="MergeFiles", \
        action="store_false")
    argParser.add_argument("--short-file-names", dest="ShortNames", \
        action="store_true")
    argParser.add_argument("--no-short-file-names", dest="ShortNames", \
        action="store_false")
    argParser.add_argument("--library-print-debug", dest="LibraryPrintDebug", \
        action="store_true")
    argParser.add_argument("--no-library-print-debug", dest="LibraryPrintDebug", \
        action="store_false")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    arguments["CodeFromFiles"] = False
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
        if (os.path.isfile(os.path.join(logicPath, f)) \
        and os.path.splitext(f)[1]==".yaml")]

    print1("# LibraryLogicFiles:" % logicFiles)
    for logicFile in logicFiles:
        print1("#   %s" % logicFile)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule, \
            indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames, \
            solutionsForSchedule, indexOrder, exactLogic, rangeLogic ))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # create solution writer and kernel writer
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # if any kernels are assembly, append every ISA supported

    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly)

    libraryStaticFiles = [
        "SolutionMapper.h", "TensileTypes.h", "KernelHeader.h",
        "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
Beispiel #14
0
def writeClientParameters(forBenchmark, solutions, problemSizes, stepName, \
    functionList):
    h = ""

    ##############################################################################
    # Min Naming
    ##############################################################################
    if forBenchmark:
        kernels = []
        for solution in solutions:
            solutionKernels = solution.getKernels()
            for kernel in solutionKernels:
                if kernel not in kernels:
                    kernels.append(kernel)

        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
        solutionMinNaming = Solution.getMinNaming(solutions)
        kernelMinNaming = Solution.getMinNaming(kernels)
        solutionWriter = SolutionWriter( \
            solutionMinNaming, solutionSerialNaming, \
            kernelMinNaming, kernelSerialNaming)

    if forBenchmark:
        if globalParameters["MergeFiles"]:
            h += "#include \"Solutions.h\"\n"
        else:
            for solution in solutions:
                solutionName = solutionWriter.getSolutionName(solution)
                h += "#include \"" + solutionName + ".h\"\n"
        h += "\n"
    else:
        h += "#include \"Tensile.h\"\n"

    h += "typedef enum {\n"
    h += "    enum_float,\n"
    h += "    enum_double,\n"
    h += "    enum_TensileComplexFloat,\n"
    h += "    enum_TensileComplexDouble\n"
    h += "#ifdef Tensile_ENABLE_HALF\n"
    h += "    ,enum_TensileHalf\n"
    h += "#endif\n"
    h += "} DataTypeEnum;\n"
    h += "\n"

    h += "// Debug Params\n"
    h += "const bool printTensorA=%s;\n" % toCppBool(
        globalParameters["PrintTensorA"])
    h += "const bool printTensorB=%s;\n" % toCppBool(
        globalParameters["PrintTensorB"])
    h += "const bool printTensorC=%s;\n" % toCppBool(
        globalParameters["PrintTensorC"])

    h += "const bool printWinnersOnly=%s;\n" % toCppBool(
        globalParameters["PrintWinnersOnly"])
    h += "\n"

    h += "const char indexChars[%u] = \"%s" \
        % (len(globalParameters["IndexChars"])+1, \
        globalParameters["IndexChars"][0])
    for i in range(1, len(globalParameters["IndexChars"])):
        h += globalParameters["IndexChars"][i]
    h += "\";\n"

    h += "unsigned int functionIdx;\n"
    h += "unsigned int dataTypeIdx;\n"
    h += "unsigned int problemTypeIdx;\n"
    h += "\n"

    ##############################################################################
    # Problem Types
    ##############################################################################
    #dataTypes = []
    #problemTypes = []
    #functionSerialToDataTypeAndIdx = []
    dataTypes = []
    problemTypes = []
    problemTypesForDataType = {}  # for data type
    schedulesForProblemType = {}  # for problem type
    functionInfo = [
    ]  # dataTypeIdx, problemTypeIdx, idxWithinDataType, idxWithinProblemType

    if forBenchmark:
        problemType = solutions[0]["ProblemType"]
        dataType = problemType["DataType"]
        dataTypes.append(dataType)
        problemTypes.append(problemType)
        problemTypesForDataType[dataType] = [problemType]
        schedulesForProblemType[problemType] = solutions
        numProblemTypes = 1
        for solution in solutions:
            functionInfo.append([0, 0, 0, 0, 0, 0])
    else:
        for functionIdx in range(0, len(functionList)):
            function = functionList[functionIdx]
            scheduleName = function[0]
            problemType = function[1]
            dataType = problemType["DataType"]
            if dataType not in dataTypes:
                dataTypes.append(dataType)
                problemTypesForDataType[dataType] = []
            if problemType not in problemTypesForDataType[dataType]:
                problemTypesForDataType[dataType].append(problemType)
                schedulesForProblemType[problemType] = []
            schedulesForProblemType[problemType].append(scheduleName)

        # sort
        dataTypes = sorted(dataTypes)
        for dataType in dataTypes:
            problemTypesForDataType[dataType] = \
                sorted(problemTypesForDataType[dataType])
            for problemType in problemTypesForDataType[dataType]:
                schedulesForProblemType[problemType] = \
                    sorted(schedulesForProblemType[problemType])

        # assign info
        functionIdxSerial = 0
        problemTypeIdxSerial = 0
        for dataTypeIdxSerial in range(0, len(dataTypes)):
            dataType = dataTypes[dataTypeIdxSerial]
            functionIdxForDataType = 0
            for problemTypeIdxForDataType in range(0, \
                len(problemTypesForDataType[dataType])):
                problemType = \
                    problemTypesForDataType[dataType][problemTypeIdxForDataType]
                problemTypes.append(problemType)
                functionIdxForProblemType = 0
                for functionIdxForProblemType in range(0, \
                    len(schedulesForProblemType[problemType])):
                    functionInfo.append([ \
                        dataTypeIdxSerial, \
                        problemTypeIdxForDataType, \
                        problemTypeIdxSerial, \
                        functionIdxSerial,\
                        functionIdxForDataType,\
                        functionIdxForProblemType, \
                        ])
                    functionIdxForProblemType += 1
                    functionIdxForDataType += 1
                    functionIdxSerial += 1
                problemTypeIdxSerial += 1
        numProblemTypes = problemTypeIdxSerial
        numFunctions = functionIdxSerial
        h += "const unsigned int numFunctions = %u;\n" % numFunctions

    ##############################################################################
    # Data Types
    ##############################################################################
    h += "/* data types */\n"
    numDataTypes = len(dataTypes)
    h += "const unsigned int numDataTypes = %u;\n" % numDataTypes
    h += "const DataTypeEnum dataTypeEnums[numDataTypes] = { enum_%s" \
        % dataTypes[0].toCpp()
    for dataTypeIdx in range(1, numDataTypes):
        h += ", enum_%s" % dataTypes[dataTypeIdx].toCpp()
    h += " };\n"
    # bytes per elements
    h += "const unsigned int bytesPerElement[numDataTypes] = { %u" \
        % (dataTypes[0].numBytes())
    for dataTypeIdx in range(1, numDataTypes):
        dataType = dataTypes[dataTypeIdx]
        h += ", %u" % dataType.numBytes()
    h += " };\n"
    # flops per mac
    h += "const unsigned int numFlopsPerMac[numDataTypes] = { %u" \
        % (2 if dataTypes[0].isReal() else 8)
    for dataTypeIdx in range(1, numDataTypes):
        dataType = dataTypes[dataTypeIdx]
        h += ", %u" % (2 if dataType.isReal() else 8)
    h += " };\n"
    for dataTypeIdx in range(0, numDataTypes):
        h += "#define Tensile_DATA_TYPE_%s\n" \
            % dataTypes[dataTypeIdx].toCpp().upper()

    ##############################################################################
    # Problem Types
    ##############################################################################
    h += "/* problem types */\n"
    h += "const unsigned int numProblemTypes = %u;\n" % numProblemTypes
    # Num C Indices
    h += "const unsigned int numIndicesC[numProblemTypes] = { %u" \
        % problemTypes[0]["NumIndicesC"]
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %u" % problemType["NumIndicesC"]
    h += " };\n"

    # Num AB Indices
    maxNumIndicesAB = len(problemTypes[0]["IndexAssignmentsA"])
    h += "const unsigned int numIndicesAB[numProblemTypes] = { %u" \
        % len(problemTypes[0]["IndexAssignmentsA"])
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        numIndicesAB = len(problemType["IndexAssignmentsA"])
        h += ", %u" % numIndicesAB
        maxNumIndicesAB = max(numIndicesAB, maxNumIndicesAB)
    h += " };\n"
    h += "const unsigned int maxNumIndicesAB = %u;\n" % maxNumIndicesAB
    # Index Assignments A
    h += "const unsigned int indexAssignmentsA[numProblemTypes][maxNumIndicesAB] = {\n"
    for problemTypeIdx in range(0, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        indices = problemType["IndexAssignmentsA"]
        h += "  { %u" % indices[0]
        for i in range(1, maxNumIndicesAB):
            if i < len(indices):
                h += ", %u" % indices[i]
            else:
                h += ", static_cast<unsigned int>(-1)"
        if problemTypeIdx < numProblemTypes - 1:
            h += " },\n"
        else:
            h += " }\n"
    h += "};\n"
    # Index Assignments B
    h += "const unsigned int indexAssignmentsB[numProblemTypes][maxNumIndicesAB] = {\n"
    for problemTypeIdx in range(0, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        indices = problemType["IndexAssignmentsB"]
        h += "  { %u" % indices[0]
        for i in range(1, maxNumIndicesAB):
            if i < len(indices):
                h += ", %u" % indices[i]
            else:
                h += ", static_cast<unsigned int>(-1)"
        if problemTypeIdx < numProblemTypes - 1:
            h += " },\n"
        else:
            h += " }\n"
    h += "};\n"
    # beta
    h += "bool useBeta[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["UseBeta"] else "false")
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true" if problemType["UseBeta"] else "false")
    h += " };\n"
    # Complex Conjugates
    h += "const bool complexConjugateA[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["ComplexConjugateA"] else "false" )
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true"
                       if problemTypes[0]["ComplexConjugateA"] else "false")
    h += " };\n"
    h += "const bool complexConjugateB[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["ComplexConjugateB"] else "false" )
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true"
                       if problemTypes[0]["ComplexConjugateB"] else "false")
    h += " };\n"
    h += "\n"

    if not forBenchmark:
        h += "// dataTypeIdxSerial, problemTypeIdxForDataType, problemTypeIdxSerial, functionIdxSerial, functionIdxForDataType, functionIdxForProblemType\n"
        first = True
        h += "const unsigned int functionInfo[numFunctions][6] = {\n"
        for info in functionInfo:
            h += "%s{ %u, %u, %u, %u, %u, %u }" % ("  " if first else ",\n  ", \
                info[0], info[1], info[2], info[3], info[4], info[5] )
            first = False
        h += " };\n"

    ##############################################################################
    # Problem Sizes
    ##############################################################################
    maxNumIndices = problemTypes[0]["TotalIndices"]
    if not forBenchmark:
        for problemType in problemTypes:
            maxNumIndices = max(problemType["TotalIndices"], maxNumIndices)
    h += "const unsigned int maxNumIndices = %u;\n" % maxNumIndices
    h += "const unsigned int totalIndices[numProblemTypes] = { %u" \
        % problemTypes[0]["TotalIndices"]
    for problemTypeIdx in range(1, numProblemTypes):
        h += ", %u" % problemTypes[problemTypeIdx]["TotalIndices"]
    h += " };\n"
    if forBenchmark:
        h += "const unsigned int numProblems = %u;\n" \
            % problemSizes.totalProblemSizes
        h += "const unsigned int problemSizes[numProblems][%u] = {\n" \
            % problemTypes[0]["TotalIndices"]
        for i in range(0, problemSizes.totalProblemSizes):
            line = "  {%5u" % problemSizes.sizes[i][0]
            for j in range(1, problemTypes[0]["TotalIndices"]):
                line += ",%5u" % problemSizes.sizes[i][j]
            line += " }"
            h += line
            if i < problemSizes.totalProblemSizes - 1:
                h += ","
            else:
                h += "};"
            h += "\n"
        h += "const unsigned int minStrides[%u] = {" \
            % problemTypes[0]["TotalIndices"]
        for i in range(0, len(problemSizes.minStrides)):
            if (i != 0):
                h += ", "
            h += str(problemSizes.minStrides[i])
        h += "};\n"
    else:
        h += "unsigned int userSizes[maxNumIndices];\n"
        h += "unsigned int minStrides[%u] = {" \
            % maxNumIndices
        for i in range(0, maxNumIndices):
            if (i != 0):
                h += ", "
            h += str(0)
            # always use 0 for minStrides in benchmark mode
        h += "};\n"

    if forBenchmark:
        h += "/* problem sizes */\n"
        """
    h += "const bool indexIsSized[maxNumIndices] = {"
    for i in range(0, problemSizes.totalIndices):
      h += " %s" % ("true" if problemSizes.indexIsSized[i] else "false")
      if i < problemSizes.totalIndices-1:
        h += ","
    h += " };\n"

    h += "const unsigned int numIndicesSized = %u;\n" \
        % len(problemSizes.indicesSized)
    h += "const unsigned int indicesSized[numIndicesSized][4] = {\n"
    h += "// { min, stride, stride_incr, max }\n"
    for i in range(0, len(problemSizes.indicesSized)):
      r = problemSizes.indicesSized[i]
      h += "  { %u, %u, %u, %u }" % (r[0], r[1], r[2], r[3])
      if i < len(problemSizes.indicesSized)-1:
        h += ","
      h += "\n"
    h += "  };\n"

    numIndicesMapped = len(problemSizes.indicesMapped)
    h += "const unsigned int numIndicesMapped = %u;\n" % numIndicesMapped
    if numIndicesMapped > 0:
      h += "#define Tensile_INDICES_MAPPED 1\n"
      h += "const unsigned int indicesMapped[numIndicesMapped] = {"
      for i in range(0, numIndicesMapped):
        h += " %u" % problemSizes.indicesMapped[i]
        if i < numIndicesMapped-1:
          h += ","
      h += " };\n"
    else:
      h += "#define Tensile_INDICES_MAPPED 0\n"
    """

    ##############################################################################
    # Max Problem Sizes
    ##############################################################################
    if forBenchmark:
        h += "size_t maxSizeC = %u;\n" % (problemSizes.maxC)
        h += "size_t maxSizeA = %u;\n" % (problemSizes.maxA)
        h += "size_t maxSizeB = %u;\n" % (problemSizes.maxB)
        h += "\n"
    else:
        h += "size_t maxSizeC;\n"
        h += "size_t maxSizeA;\n"
        h += "size_t maxSizeB;\n"
        h += "\n"

    ##############################################################################
    # Current Problem Size
    ##############################################################################
    h += "/* current problem size */\n"
    #h += "unsigned int fullSizes[maxNumIndices];\n"
    #h += "unsigned int currentSizedIndexSizes[numIndicesSized];\n"
    #h += "unsigned int currentSizedIndexIncrements[numIndicesSized];\n"
    h += "\n"

    ##############################################################################
    # Solutions
    ##############################################################################
    if forBenchmark:
        h += "/* solutions */\n"
        # Problem Type Indices
        h += "const unsigned int maxNumSolutions = %u;\n" % len(solutions)
        h += "float solutionPerf[numProblems][maxNumSolutions]; // milliseconds\n"
        h += "\n"
        # Solution Ptrs
        h += "typedef TensileStatus (*SolutionFunctionPointer)(\n"
        argList = solutionWriter.getArgList(solutions[0]["ProblemType"], True,
                                            True, True)
        for i in range(0, len(argList)):
            h += "  %s %s%s" % (argList[i][0], argList[i][1], \
                ",\n" if i < len(argList)-1 else ");\n\n")
        h += "const SolutionFunctionPointer solutions[maxNumSolutions] = {\n"
        for i in range(0, len(solutions)):
            solution = solutions[i]
            solutionName = solutionWriter.getSolutionName(solution)
            h += "  %s" % solutionName
            if i < len(solutions) - 1:
                h += ","
            h += "\n"
        h += " };\n"
        h += "\n"
        # Solution Names
        h += "const char *solutionNames[maxNumSolutions] = {\n"
        for i in range(0, len(solutions)):
            solution = solutions[i]
            solutionName = solutionWriter.getSolutionName(solution)
            h += "  \"%s\"" % solutionName
            if i < len(solutions) - 1:
                h += ","
            h += "\n"
        h += " };\n"
        h += "\n"
    else:
        # Function Names
        functionNames = []
        for dataType in dataTypes:
            for problemType in problemTypesForDataType[dataType]:
                for scheduleName in schedulesForProblemType[problemType]:
                    #functionNames.append("tensile_%s_%s" % (scheduleName, problemType))
                    functionNames.append("tensile_%s" % (problemType))
        h += "const char *functionNames[numFunctions] = {\n"
        for functionIdx in range(0, len(functionNames)):
            functionName = functionNames[functionIdx]
            h += "    \"%s\"%s\n" % (functionName, \
                "," if functionIdx < len(functionNames)-1 else "" )
        h += " };\n"

    ##############################################################################
    # Runtime Structures
    ##############################################################################
    h += "/* runtime structures */\n"
    h += "TensileStatus status;\n"
    if globalParameters["RuntimeLanguage"] == "OCL":
        h += "cl_platform_id platform;\n"
        h += "cl_device_id device;\n"
        h += "cl_context context;\n"
        h += "cl_command_queue stream;\n"
    else:
        h += "hipStream_t stream;\n"
        #h += "int deviceIdx = %u;\n" \
        #    % (globalParameters["Device"])
    h += "\n"
    h += "void *deviceC;\n"
    h += "void *deviceA;\n"
    h += "void *deviceB;\n"

    ##############################################################################
    # Benchmarking and Validation Parameters
    ##############################################################################
    h += "\n/* benchmarking parameters */\n"
    #h += "const bool measureKernelTime = %s;\n" \
    #    % ("true" if globalParameters["KernelTime"] else "false")
    #h += "const unsigned int numEnqueuesPerSync = %u;\n" \
    #    % (globalParameters["EnqueuesPerSync"])
    #h += "const unsigned int numSyncsPerBenchmark = %u;\n" \
    #    % (globalParameters["SyncsPerBenchmark"])
    #h += "unsigned int numElementsToValidate = %s;\n" \
    #    % (str(globalParameters["NumElementsToValidate"]) \
    #    if globalParameters["NumElementsToValidate"] >= 0 \
    #    else "0xFFFFFFFF" )
    #h += "unsigned int validationMaxToPrint = %u;\n" \
    #    % globalParameters["ValidationMaxToPrint"]
    #h += "bool validationPrintValids = %s;\n" \
    #    % ("true" if globalParameters["ValidationPrintValids"] else "false")
    h += "size_t validationStride;\n"
    if problemType["HighPrecisionAccumulate"]:
        h += "static bool useHighPrecisionAccumulate = true;\n"
    else:
        h += "static bool useHighPrecisionAccumulate = false;\n"
    #h += "unsigned int dataInitTypeC = %s;\n" % globalParameters["DataInitTypeC"]
    #h += "unsigned int dataInitTypeAB = %s;\n" % globalParameters["DataInitTypeAB"]
    h += "\n"

    ##############################################################################
    # Generated Call to Reference
    ##############################################################################
    h += "/* generated call to reference */\n"
    h += "template<typename DataType>\n"
    h += "TensileStatus generatedCallToReferenceCPU(\n"
    h += "    const unsigned int *sizes,\n"
    h += "    const unsigned int *minStrides,\n"
    h += "    DataType *referenceC,\n"
    h += "    DataType *initialA,\n"
    h += "    DataType *initialB,\n"
    h += "    const unsigned int stride_a,\n"
    h += "    const unsigned int stride_b,\n"
    h += "    const unsigned int stride_c,\n"
    h += "    DataType alpha,\n"
    h += "    DataType beta,\n"
    h += "    bool useHighPrecisionAccumulate) {\n"
    h += "  return tensileReferenceCPU(\n"
    h += "      referenceC,\n"
    h += "      initialA,\n"
    h += "      initialB,\n"
    h += "      stride_a,\n"
    h += "      stride_b,\n"
    h += "      stride_c,\n"
    h += "      alpha,\n"
    h += "      beta,\n"
    h += "      totalIndices[problemTypeIdx],\n"
    h += "      sizes,\n"
    h += "      minStrides,\n"
    h += "      numIndicesC[problemTypeIdx],\n"
    h += "      numIndicesAB[problemTypeIdx],\n"
    h += "      indexAssignmentsA[problemTypeIdx],\n"
    h += "      indexAssignmentsB[problemTypeIdx],\n"
    h += "      complexConjugateA[problemTypeIdx],\n"
    h += "      complexConjugateB[problemTypeIdx],\n"
    h += "      validationStride,\n"
    h += "      useHighPrecisionAccumulate);\n"
    h += "};\n"
    h += "\n"

    ##############################################################################
    # Generated Call to Solution
    ##############################################################################
    if forBenchmark:
        problemType = solutions[0]["ProblemType"]
        h += "/* generated call to solution */\n"
        h += "template<typename DataType>\n"
        h += "TensileStatus generatedCallToSolution(\n"
        h += "    unsigned int solutionIdx,\n"
        h += "    const unsigned int *sizes,\n"
        h += "    const unsigned int *minStrides,\n"
        h += "    DataType alpha,\n"
        h += "    DataType beta, \n"
        h += "    unsigned int numEvents = 0, \n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "    cl_event *event_wait_list = NULL,\n"
            h += "    cl_event *outputEvent = NULL ) {\n"
        else:
            h += "    hipEvent_t *startEvent = NULL,\n"
            h += "    hipEvent_t *stopEvent = NULL ) {\n"

        h += "  // calculate parameters assuming packed data\n"
        # strides
        indexChars = globalParameters["IndexChars"]
        firstStride = 1
        if problemType["UseInitialStrides"]:
            firstStride = 0
        lastStrideC = problemType["NumIndicesC"]
        lastStrideA = len(problemType["IndexAssignmentsA"])
        lastStrideB = len(problemType["IndexAssignmentsB"])

        # calculate strides
        for i in range(0, lastStrideC):
            h += "  unsigned int strideC%u%s = 1" % (i, indexChars[i])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % (j, j)
            h += ";\n"
        for i in range(0, lastStrideA):
            h += "  unsigned int strideA%u%s = 1" % (i, \
                indexChars[problemType["IndexAssignmentsA"][i]])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % \
                  (problemType["IndexAssignmentsA"][j],
                   problemType["IndexAssignmentsA"][j])
            h += ";\n"
        for i in range(0, lastStrideB):
            h += "  unsigned int strideB%u%s = 1" % (i, \
                indexChars[problemType["IndexAssignmentsB"][i]])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % \
                  (problemType["IndexAssignmentsB"][j],
                   problemType["IndexAssignmentsB"][j])
            h += ";\n"
        for i in range(0, problemType["TotalIndices"]):
            h += "  unsigned int size%s = sizes[%u];\n" % (indexChars[i], i)
        h += "\n"

        # function call
        h += "  // call solution function\n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "  return solutions[solutionIdx]( static_cast<cl_mem>(deviceC), static_cast<cl_mem>(deviceA), static_cast<cl_mem>(deviceB),\n"
        else:
            typeName = dataTypes[0].toCpp()
            h += "  return solutions[solutionIdx]( static_cast<%s *>(deviceC), static_cast<%s *>(deviceA), static_cast<%s *>(deviceB),\n" \
                % (typeName, typeName, typeName)
        h += "      alpha,\n"
        if problemType["UseBeta"]:
            h += "      beta,\n"
        h += "      0, 0, 0, // offsets\n"
        for i in range(firstStride, lastStrideC):
            h += "      strideC%u%s,\n" % (i, indexChars[i])
        for i in range(firstStride, lastStrideA):
            h += "      strideA%u%s,\n" % (i, \
                indexChars[problemType["IndexAssignmentsA"][i]])
        for i in range(firstStride, lastStrideB):
            h += "      strideB%u%s,\n" % (i, \
                indexChars[problemType["IndexAssignmentsB"][i]])
        for i in range(0, problemType["TotalIndices"]):
            h += "      size%s,\n" % indexChars[i]
        h += "      stream,\n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "      numEvents, event_wait_list, outputEvent ); // events\n"
        else:
            h += "      numEvents, startEvent, stopEvent); // events\n"

        h += "};\n"
        h += "\n"
    else:
        ############################################################################
        # Generated Call to Function
        ############################################################################
        for enqueue in [True, False]:
            functionName = "tensile" if enqueue else "tensileGetSolutionName"
            returnName = "TensileStatus" if enqueue else "const char *"
            h += "/* generated call to function */\n"
            h += "template<typename DataType>\n"
            h += "%s generatedCallTo_%s(\n" % (returnName, functionName)
            h += "    unsigned int *sizes,\n"
            h += "    unsigned int *minStrides,\n"
            h += "    DataType alpha,\n"
            h += "    DataType beta, \n"
            h += "    unsigned int strideA, \n"
            h += "    unsigned int strideB, \n"
            h += "    unsigned int strideC, \n"
            h += "    unsigned int numEvents = 0, \n"

            if globalParameters["RuntimeLanguage"] == "OCL":
                h += "    cl_event *event_wait_list = NULL,\n"
                h += "    cl_event *outputEvent = NULL );\n\n"
            else:
                h += "    hipEvent_t *startEvent = NULL,\n"
                h += "    hipEvent_t *stopEvent = NULL );\n\n"

            for dataType in dataTypes:
                typeName = dataType.toCpp()
                functionsForDataType = []
                for problemType in problemTypesForDataType[dataType]:
                    for scheduleName in schedulesForProblemType[problemType]:
                        functionsForDataType.append(
                            [scheduleName, problemType])
                h += "template<>\n"
                h += "inline %s generatedCallTo_%s<%s>(\n" \
                    % (returnName, functionName, typeName)
                h += "    unsigned int *sizes,\n"
                h += "    unsigned int *minStrides,\n"
                h += "    %s alpha,\n" % typeName
                h += "    %s beta,\n" % typeName
                h += "    unsigned int strideA, \n"
                h += "    unsigned int strideB, \n"
                h += "    unsigned int strideC, \n"
                h += "    unsigned int numEvents, \n"

                if globalParameters["RuntimeLanguage"] == "OCL":
                    h += "    cl_event *event_wait_list,\n"
                    h += "    cl_event *outputEvent ) {\n\n"
                else:
                    h += "    hipEvent_t *startEvent,\n"
                    h += "    hipEvent_t *stopEvent ) {\n\n"

                h += "  unsigned int functionIdxForDataType = functionInfo[functionIdx][4];\n"

                for functionIdx in range(0, len(functionsForDataType)):
                    function = functionsForDataType[functionIdx]
                    scheduleName = function[0]
                    problemType = function[1]
                    if len(functionsForDataType) > 1:
                        if functionIdx == 0:
                            h += "  if (functionIdxForDataType == %u) {\n" % functionIdx
                        elif functionIdx == len(functionsForDataType) - 1:
                            h += "  } else {\n"
                        else:
                            h += "  } else if (functionIdxForDataType == %u) {\n" \
                                % functionIdx

                    # strides
                    indexChars = globalParameters["IndexChars"]
                    firstStride = 1
                    if problemType["UseInitialStrides"]:
                        firstStride = 0
                    lastStrideC = problemType["NumIndicesC"]
                    lastStrideA = len(problemType["IndexAssignmentsA"])
                    lastStrideB = len(problemType["IndexAssignmentsB"])

                    # calculate strides
                    for i in range(0, lastStrideC):
                        h += "    unsigned int strideC%u%s = 1" % (
                            i, indexChars[i])
                        for j in range(0, i):
                            h += "*sizes[%i]" % j
                        h += ";\n"
                    h += "    if (strideC != std::numeric_limits<unsigned int>::max())  strideC%u%s = strideC;\n" % (
                        lastStrideC - 1, indexChars[lastStrideC - 1])

                    for i in range(0, lastStrideA):
                        h += "    unsigned int strideA%u%s = 1" % (i, \
                            indexChars[problemType["IndexAssignmentsA"][i]])
                        for j in range(0, i):
                            h += "*sizes[%i]" % \
                              problemType["IndexAssignmentsA"][j]
                        h += ";\n"
                    h += "    if (strideA != std::numeric_limits<unsigned int>::max())  strideA%u%s = strideA;\n" % (
                        lastStrideA - 1, indexChars[lastStrideA - 1])
                    for i in range(0, lastStrideB):
                        h += "    unsigned int strideB%u%s = 1" % (i, \
                            indexChars[problemType["IndexAssignmentsB"][i]])
                        for j in range(0, i):
                            h += "*sizes[%i]" % \
                              problemType["IndexAssignmentsB"][j]
                        h += ";\n"
                    h += "    if (strideB != std::numeric_limits<unsigned int>::max())  strideB%u%s = strideB;\n" % (
                        lastStrideB - 1, indexChars[lastStrideB - 1])
                    for i in range(0, problemType["TotalIndices"]):
                        h += "    unsigned int size%s = sizes[%u];\n" % (
                            indexChars[i], i)

                    # function call
                    h += "    // call solution function\n"
                    h += "    return %s_%s(\n" % (functionName, problemType)
                    if enqueue:
                        if globalParameters["RuntimeLanguage"] == "OCL":
                            h += "        static_cast<cl_mem>(deviceC),\n"
                            h += "        static_cast<cl_mem>(deviceA),\n"
                            h += "        static_cast<cl_mem>(deviceB),\n"
                        else:
                            h += "        static_cast<%s *>(deviceC),\n" % typeName
                            h += "        static_cast<%s *>(deviceA),\n" % typeName
                            h += "        static_cast<%s *>(deviceB),\n" % typeName
                        h += "        alpha,\n"
                        if problemType["UseBeta"]:
                            h += "        beta,\n"
                        h += "        0, 0, 0, // offsets\n"
                    for i in range(firstStride, lastStrideC):
                        h += "        strideC%u%s,\n" % (i, indexChars[i])
                    for i in range(firstStride, lastStrideA):
                        h += "        strideA%u%s,\n" % (i, \
                            indexChars[problemType["IndexAssignmentsA"][i]])
                    for i in range(firstStride, lastStrideB):
                        h += "        strideB%u%s,\n" % (i, \
                            indexChars[problemType["IndexAssignmentsB"][i]])
                    for i in range(0, problemType["TotalIndices"]):
                        h += "        size%s,\n" % indexChars[i]
                    h += "        stream"
                    if enqueue:
                        if globalParameters["RuntimeLanguage"] == "OCL":
                            h += ",\n        numEvents, event_wait_list, outputEvent"
                        else:
                            h += ",\n        numEvents, startEvent, stopEvent"
                    h += ");\n"

                if len(functionsForDataType) > 1:
                    h += "  }\n"  # close last if
                h += "};\n"  # close callToFunction

    ##############################################################################
    # Results File Name
    ##############################################################################
    if forBenchmark:
        h += "/* results file name */\n"
        resultsFileName = os.path.join(globalParameters["WorkingPath"], \
            "../../Data","%s.csv" % stepName)
        resultsFileName = resultsFileName.replace("\\", "\\\\")
        h += "const char *resultsFileName = \"%s\";\n" % resultsFileName

    ##############################################################################
    # Write File
    ##############################################################################
    clientParametersFile = open(os.path.join(globalParameters["WorkingPath"], \
        "ClientParameters.h"), "w")
    clientParametersFile.write(CHeader)
    clientParametersFile.write(h)
    clientParametersFile.close()
Beispiel #15
0
def TensileCreateLibrary():
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \
        choices=["OCL", "HIP", "HSA"])
    argParser.add_argument("--merge-files", dest="MergeFiles", \
        action="store_true")
    argParser.add_argument("--no-merge-files", dest="MergeFiles", \
        action="store_false")
    argParser.add_argument("--short-file-names", dest="ShortNames", \
        action="store_true")
    argParser.add_argument("--no-short-file-names", dest="ShortNames", \
        action="store_false")
    argParser.add_argument("--library-print-debug", dest="LibraryPrintDebug", \
        action="store_true")
    argParser.add_argument("--no-library-print-debug", dest="LibraryPrintDebug", \
        action="store_false")
    argParser.add_argument(
        "--isa",
        dest="isa",
        action="append",
        help="which architectures for assembly kernels to target")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    if args.isa:
        newISA = []
        for isa in args.isa:
            gfxIdx = isa.find("gfx")
            if gfxIdx >= 0:
                major = int(isa[gfxIdx + 3:gfxIdx + 4])
                minor = int(isa[gfxIdx + 4:gfxIdx + 5])
                step = int(isa[gfxIdx + 5:gfxIdx + 6])
                isaTuple = (major, minor, step)
                if isaTuple in globalParameters[
                        "SupportedISA"] and isaTuple not in newISA:
                    print1("# User-Specified ISA: gfx%u%u%u" %
                           (major, minor, step))
                    newISA.append(isaTuple)
            else:
                printWarning("isa parameter must be formed as: --isa gfx803")
        arguments["SupportedISA"] = newISA
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
        if (os.path.isfile(os.path.join(logicPath, f)) \
        and os.path.splitext(f)[1]==".yaml")]

    print1("# LibraryLogicFiles:" % logicFiles)
    for logicFile in logicFiles:
        print1("#   %s" % logicFile)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule, \
            indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames, \
            solutionsForSchedule, indexOrder, exactLogic, rangeLogic ))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # create solution writer and kernel writer
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # if any kernels are assembly, append every ISA supported
    if globalParameters["RuntimeLanguage"] == "HIP":
        newKernels = []
        for kernel in kernels:
            if kernel["KernelLanguage"] == "Assembly":
                kernel["ISA"] = globalParameters["SupportedISA"][0]
                for i in range(1, len(globalParameters["SupportedISA"])):
                    newKernel = deepcopy(kernel)
                    newKernel["ISA"] = globalParameters["SupportedISA"][i]
                    newKernels.append(newKernel)
            else:
                kernel["ISA"] = (0, 0, 0)
        kernels.extend(newKernels)

    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly)

    libraryStaticFiles = [
        "TensileTypes.h", "KernelHeader.h", "SolutionHelper.cpp",
        "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
Beispiel #16
0
  def getProblemSourceString(self, problemType, solution, kernelsWithBuildErrs):
    gsu = solution["GlobalSplitU"]
    persistent = solution["PersistentKernel"]
    kernelLanguage = solution["KernelLanguage"]
    tt0 = solution["ThreadTile0"]
    tt1 = solution["ThreadTile1"]
    sg0 = solution["SubGroup0"]
    sg1 = solution["SubGroup1"]
    nt  = solution["NumThreads"]

    kernels = solution.getKernels()
    kernelNames = []
    kernelBuildErr = 0
    for kernel in kernels:
      kernelName = self.kernelWriter.getKernelName(kernel)
      if kernelName in kernelsWithBuildErrs:
        kernelBuildErr = 1
      kernelNames.append( kernelName )


    s = ""
    t = ""
    # includes

    problemType = solution["ProblemType"] # shortcut

    if not globalParameters["MergeFiles"]:
      solutionName = self.getSolutionName(solution)
      s += "#include \"%s.h\"\n" % solutionName
      s += "\n"

    # problem function signature
    #argList = self.getArgList(problemType, True, True, True, True)
    #for i in range(0, len(argList)):
    #  argString = "%s %s" % argList[i]
    #  s += "%s%s%s" % (t, argString, ",\n" if i < len(argList)-1 else ")" )

    s += self.getSolutionSignature(solution)

    s += " {\n"
    if kernelBuildErr:
      s += "%s  return tensileStatusFailure; // One or more kernels had build failures (%s)\n" % (t, kernelNames)
      s += "%s}\n" % (t)
      return s

    t += "  "
    s += "%sTensileStatus status;\n" % (t)


    # hipFunction Struct
    if kernelLanguage == "Assembly":
      s += "\n"
      s += "%s/* module function args */\n" % (t)
      s += "%sstruct {\n" % t
      t += "  "
      if globalParameters["DebugKernel"]:
        s += "%sunsigned int *debugBuffer;\n" % t
      # Tensor sizes in elements, including only packed dims,
      # and accounting for zero or other strides < size
      # Place these first in the structure since they are 64-bits
      # and need to avoid any unneeded padding:
      s += "%s// Size of Tensor's packed dims, in elements\n" % t
      s += "%suint64_t tensor2dSizeC;\n" % t
      s += "%suint64_t tensor2dSizeA;\n" % t
      s += "%suint64_t tensor2dSizeB;\n" % t
      solutionArgs = self.getArgList(problemType, False, True, False, False)
      for arg in solutionArgs:
        if arg[0] == "TensileHalf":
          s += "%s%s %s[2];\n" % (t, arg[0], arg[1])
        else:
          s += "%s%s %s;\n" % (t, arg[0], arg[1])
      for idxChar in solution["PackedC0Indices"][:-1]:
        s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar)
        s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar)
      for idxChar in solution["PackedC1Indices"][:-1]:
        s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar)
        s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar)

      # number of unroll loop iterations to stagger the start in "U" dim.
      s += "%sint staggerUIter;\n" % t

      # persistent
      s += "%sunsigned int problemNumGroupTiles0;\n" % t
      s += "%sunsigned int problemNumGroupTiles1;\n" % t
      s += "%sunsigned int magicNumberProblemNumGroupTiles0;\n" % t
      s += "%sunsigned int gridNumWorkGroups0;\n" % t
      s += "%sunsigned int numFullBlocks;\n" % t
      s += "%sunsigned int wgmRemainder1;\n" % t
      s += "%sunsigned int magicNumberWgmRemainder1;\n" % t

      s += "%sunsigned int pad;\n" % t # FIXME can this be removed?
      t = t[2:]
      s += "%s} hipFunctionArgs;\n" % t
      #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
      s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t
      s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t
      #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t
      #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t
      #for arg in solutionArgs:
      #  s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1])

    # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu

    if globalParameters["RuntimeLanguage"] == "HIP":
      s += "%sint deviceId;\n" % (t)
      s += "%shipGetDevice(&deviceId);\n" % (t)

    # kernels
    s += "\n%s/* kernels */\n" % (t)
    s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels))

    if kernelLanguage == "Source" and globalParameters["RuntimeLanguage"] == "OCL":
      s += "%sconst char *kernelSources[numKernels] = {\n" % (t)
      t += "  "
      for kernelIdx in range(0, len(kernelNames)):
        kernelName = kernelNames[kernelIdx]
        s += "%s%s_src%s\n" % (t, kernelName, \
            "," if kernelIdx < len(kernels)-1 else "" )
      t = t[2:]
      s += "%s};\n" % (t)
      s += "%scl_kernel kernels[numKernels];\n" % (t)
      s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t)
      s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t)
      s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
      s += "%s      &kernels[i],\n" % (t)
      s += "%s      kernelSources[i],\n" % (t)
      s += "%s      stream,\n" % (t)
      s += "%s      buildOptions);\n" % (t)
      s += "%s}\n" % (t)

      if gsu > 1:
        for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu):
          kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
          s += "%scl_kernel kernel_%s;\n" % (t, kernelName)
          s += "%s  tensileGetCompiledOpenCLKernel(\n" % (t)
          s += "%s      &kernel_%s,\n" % (t, kernelName)
          s += "%s      %s_src,\n" % (t, kernelName)
          s += "%s      stream,\n" % (t)
          s += "%s      buildOptions);\n" % (t)

    elif kernelLanguage == "Assembly":
      kernel = kernels[0]
      s += "%shipFunction_t hipFunction;\n" % (t)
      # if !CodeFromFiles then pass global _coba that points to code object
      s += "%sstatus = solutionLock->getFunction(&hipFunction, deviceId, \"%s\", %s);;\n" \
              % (t, kernelName, "nullptr" if globalParameters["CodeFromFiles"] else kernelName+"_coba" )
      s += "%sif (status) return status;\n" % (t)

    typeName = problemType["DataType"].toCpp()

    # num enqueues
    s += "\n%s/* num kernels */\n" % (t)
    s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t)
    for i in range(1, len(kernels)):
      s += ", 1"
    s += " };\n"

    # grid size
    s += "\n%s/* grid sizes */\n" % (t)
    s += "%sconst unsigned int workDim = 3;\n" % (t)
    s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \
        % (t, tt0, tt1)
    s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \
        % (t, sg0, sg1)
    s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \
        % (t, nt)
    s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t)
    # grid size [2]
    s += "%sglobalWorkSize[0][2] = 1;\n" % (t)
    for i in range(0, problemType["NumIndicesC"]):
      if i != problemType["Index0"] and i != problemType["Index1"]:
        s += "%sglobalWorkSize[0][2] *= size%s;\n" % (t, self.indexChars[i])

    s += "%sunsigned int sizeOfC0 = " % (t)
    s += " * ".join(["size" + i for i in solution["PackedC0Indices"]])
    s += ";\n"

    s += "%sunsigned int sizeOfC1 = " % (t)
    s += " * ".join(["size" + i for i in solution["PackedC1Indices"]])
    s += ";\n"

    for idxChar in solution["PackedC0Indices"][:-1]:
      s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar)
      s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \
          % (t, idxChar, idxChar, idxChar)
    for idxChar in solution["PackedC1Indices"][:-1]:
      s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar)
      s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \
              % (t, idxChar, idxChar, idxChar)

    s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (t)
    s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (t)
    s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t)
    s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t)

    if kernel["EdgeType"] != "None":
      s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t)
      s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (t)
      s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (t)
    if kernel["WorkGroupMappingType"] == "Z" and abs(kernel["WorkGroupMapping"]) == 2:
      s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (t)
      s += "%stotalWorkGroupsPow2--;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t)
      s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t)
      s += "%stotalWorkGroupsPow2++;\n" % (t)
      s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t)
      s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t)

    # persistent:
    s += "%sunsigned int problemNumGroupTiles0 = totalWorkGroups0;\n" % (t)
    s += "%sunsigned int problemNumGroupTiles1 = totalWorkGroups1;\n" % (t)
    s += "%sconst unsigned smallNumMagicShift = 31; // bozo, review\n" % (t)
    s += "%sunsigned magicNumberProblemNumGroupTiles0 = (1L<<smallNumMagicShift) / problemNumGroupTiles0 + 1; // bozo, review\n"  % (t)
    s += "%sunsigned numFullBlocks =  problemNumGroupTiles1 / %u; // divide by WorkGroupMapping\n" \
            % (t, abs(kernel["WorkGroupMapping"]) if abs(kernel["WorkGroupMapping"])>0 else 1)
    s += "%sunsigned wgmRemainder1 =  %u ? (problemNumGroupTiles1 %% %u) : 0;\n" % \
            (t, abs(kernel["WorkGroupMapping"]), abs(kernel["WorkGroupMapping"]))
    s += "%sif (wgmRemainder1 == 0) wgmRemainder1 = %u;\n" % (t, abs(kernel["WorkGroupMapping"]))
    s += "%sunsigned magicNumberWgmRemainder1 = ((1L<<smallNumMagicShift) / wgmRemainder1 + 1);\n"  % (t)
    #s += '  printf ("wgmRemainder1=%u \\n", wgmRemainder1);'

    if gsu> 1:
      s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (t, gsu)
    if persistent:
      s += "%shipDeviceProp_t deviceProperties;\n" % (t)
      # TODO - should cache the device properties - expensive to call on each iteration here:
      s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t)
      s += "%sunsigned int numGroups = totalWorkGroups0 * totalWorkGroups1;\n" % (t)
      s += "%sglobalWorkSize[0][0] = (deviceProperties.multiProcessorCount * %u < numGroups) ? (deviceProperties.multiProcessorCount * %u) : numGroups;\n" \
              % (t, persistent, persistent)

      s += "%sglobalWorkSize[0][1] = 1;\n" % t
    else:
      s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (t, 0 if kernel["WorkGroupMapping"] >= 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "")
      s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (t, 1 if kernel["WorkGroupMapping"] >= 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "")

    # index sizes
    s += "\n%s/* index sizes */\n" % (t)
    s += "%sunsigned int sizes[numKernels][1][%u];\n" \
        % (t, problemType["TotalIndices"])
    for kernelIdx in range(0, len(kernels)):
      kernel = kernels[kernelIdx]
      kernelName = self.kernelWriter.getKernelName(kernel)
      # free index sizes
      for i in range(0,problemType["NumIndicesFree"] \
          + problemType["NumIndicesBatch"] ):
        s += "%ssizes[%u][0][%u] = size%s;\n" \
            % (t, kernelIdx, i, self.indexChars[i])
      # summation index sizes
      for i in range(problemType["NumIndicesC"], \
              problemType["TotalIndices"] ):
        lastParam = i == problemType["TotalIndices"]-1
        s += "%ssizes[%u][0][%u] = size%s;\n" \
            % (t, kernelIdx, i, self.indexChars[i])

      # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0
      #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"]
      firstStride = 0 if problemType["UseInitialStrides"] else 1
      del i

      numIdx = problemType["NumIndicesC"]
      printMe = 0
      s += "%suint64_t tensor2dSizeC = %s" % \
          (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0]))
      for idx in range(0,numIdx):
        # Multiply only by packed tensor dims
        if idx in problemType["IndicesFree"]:
          printMe = True
        else:
          printMe = False

        if printMe:
          if idx+1 < numIdx:
            strideIdx = idx+1
            s += " * std::max(size%s, strideC%u%s)" % \
                (self.indexChars[idx], idx+1, self.indexChars[strideIdx])
          else:
            s += " * size%s" % (self.indexChars[idx])
      s += ";\n"

      s += "%suint64_t tensor2dSizeA = 1;\n" % t
      numIdx = len(problemType["IndexAssignmentsA"])

      printMe = printedSum = False
      for i in range(0,numIdx):
        idx = problemType["IndexAssignmentsA"][i]

        # Multiply only by first free and first summation
        if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC0Indices"]]:
          printMe = True
        elif idx in problemType["IndicesSummation"] and not printedSum:
          printMe = printedSum = True
        else:
          printMe = False

        if printMe:
          s += "%stensor2dSizeA = " % t
          if i+1 < numIdx:
            strideIdx = problemType["IndexAssignmentsA"][i+1]
            s += "std::max(tensor2dSizeA*size%s, (uint64_t)strideA%u%s);\n" \
                % (self.indexChars[idx], i+1, self.indexChars[strideIdx])
          else:
            s += " tensor2dSizeA * size%s" % (self.indexChars[idx])
      s += ";\n"

      s += "%suint64_t tensor2dSizeB = 1;\n" % t
      numIdx = len(problemType["IndexAssignmentsB"])
      printMe = printedSum = False
      for i in range(0,numIdx):
        idx = problemType["IndexAssignmentsB"][i]

        # Multiply only by first free and first summation
        if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC1Indices"]]:
          printMe = True
        elif idx in problemType["IndicesSummation"] and not printedSum:
          printMe = printedSum = True
        else:
          printMe = False

        if printMe:
          s += "%stensor2dSizeB = " % t
          if i+1 < numIdx:
            strideIdx = problemType["IndexAssignmentsB"][i+1]
            s += "std::max(tensor2dSizeB*size%s, (uint64_t)strideB%u%s);\n" \
                % (self.indexChars[idx], i+1, self.indexChars[strideIdx])
          else:
            s += " tensor2dSizeB * size%s" % (self.indexChars[idx])
      s += ";\n"

    unrollChar = globalParameters["IndexChars"][problemType["IndexUnroll"]]

    s += "  unsigned int staggerUIter = %s; // how many stride-sized clicks to stagger start offset\n" \
        % (solution["StaggerU"])
    s += "  int unrollLoopIters = size%s/%u/%u; // /DepthU/GSU\n" % (unrollChar, solution["DepthU"], gsu)
    s += "  while (staggerUIter>1) {\n"
    s += "    if (unrollLoopIters >= (staggerUIter*%u)) {\n" % (1<<solution["_staggerStrideShift"])
    s += "      break;}\n"
    s += "    staggerUIter /= 2; // step down to smaller stagger\n"
    s += "  }\n"
    s += "  if (staggerUIter>=1) staggerUIter -= 1;\n" # convert to a mask
    #s += '  printf ("size%s=%%u StaggerU=%s unrollLoopIters=%%u, staggerUIter=%%d\\n", size%s, unrollLoopIters, staggerUIter);\n' % (unrollChar, solution["StaggerU"], unrollChar)



    #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n"
    s += "\n"
    s += "%sint kernelsLaunched=0;\n" % (t)

    ########################################
    # Enqueue Beta-Only Kernel
    ########################################
    if gsu > 1:
      kernelNamesBetaOnly = []
      numStridesC = problemType["NumIndicesC"] - \
          (0 if problemType["UseInitialStrides"] else 1)
      for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu):
        kernelName = self.kernelWriter.getKernelNameBetaOnly(beta)
        kernelNamesBetaOnly.append(kernelName)
      s += "%s// enqueue Beta-Only kernel\n" % (t)

      # grid sizes
      s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t)
      s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t)
      #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \
      #    self.indexChars[problemType["Index0"]])
      #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \
      #    self.indexChars[problemType["Index1"]])
      s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (t)
      s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (t)
      s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t)
      s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (t)
      s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (t)
      s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "")
      s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "")
      s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t)
      for i in range(0, problemType["NumIndicesC"]):
        if i != problemType["Index0"] and i != problemType["Index1"]:
          s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (t, self.indexChars[i])

      if problemType["UseBeta"]:
        s += "%sbool betaZero = beta == 0;\n" % (t)
      if self.language == "OCL":
        if problemType["UseBeta"]:
          s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \
              % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1])
        else:
          #s += "%sbool betaZero = true;\n" % (t)
          s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \
              % (t, kernelNamesBetaOnly[0])
        argIdx = 0
        s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, argIdx); argIdx+=1
        # strides
        for i in range(0,numStridesC):
          s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.strideList[i]); argIdx+=1
        # sizes
        for i in range(0, problemType["NumIndicesC"]):
          s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[i]); argIdx+=1
        # beta
        if problemType["UseBeta"]:
          s += "%sif (!betaZero) {\n" % (t)
          s += "%s  status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, argIdx, typeName); argIdx+=1
          s += "%s}\n" % (t)
        # enqueue
        s += "%scl_event kernelEventBetaOnly;\n" % (t)
        s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
        t += "  "
        s += "%sstream,\n" % (t)
        s += "%skernelBetaOnly,\n" % (t)
        s += "%sworkDim,\n" % (t)
        s += "%sNULL, // globalWorkOffset\n" % (t)
        s += "%sglobalWorkSizeBetaOnly,\n" % (t)
        s += "%slocalWorkSizeBetaOnly,\n" % (t)
        s += "%snumInputEvents,\n" % (t)
        s += "%sinputEvents,\n" % (t)
        #s += "%soutputEvent );\n" % (t)
        s += "%s&kernelEventBetaOnly );\n" % (t)
        t = t[2:]
        s += "%stensileStatusCheck(status);\n" % (t)
        if problemType["UseBeta"]:
          s += "%sbeta = %s;\n" % (t, problemType["DataType"].zeroString(self.language, 1) )
        #s += "%sreturn tensileStatusSuccess;\n" % (t)
        s += "%sstatus = clFinish(stream);\n" % (t)
        s += "%stensileStatusCheck(status);\n" % (t)
        #s += " float tmp[128*128];\n"
        #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n"
        #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n"
      else:
        s += "%stry {\n" % (t)
        # TODO - timing with beta kernels is somewhat pessimistic since it has this separate event only on the GSU path.
        # Introduces 2-3us of overhead ; may want to disable PreciseKernelTime so non-GSU have same overhead.
        # Long-term fix would be to launch the beta kernel with the hipHccModule* API and set start-event in that call
        if problemType["UseBeta"]:
          s += "%sif (betaZero) {\n" % (t)
          t += "  "
        s += "%sif( inputEvents != NULL )\n" % (t)
        s += "%s  hipEventRecord(inputEvents[0], stream );\n" % (t)
        s += "%skernelsLaunched++;\n" % (t)
        s += "%shipLaunchKernelGGL(\n" % (t)
        t += "  "
        s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0])
        s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t)
        s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t)
        s += "%s0, // groupMemBytes\n" % (t)
        s += "%sstream,\n" % (t)
        s += "%sdataD,\n" % (t)
        s += "%sdataC,\n" % (t)
        # strides
        for i in range(0,numStridesC*2):
          s += "%s%s,\n" % (t, self.strideList[i])
        # sizes
        for i in range(0, problemType["NumIndicesC"]):
          s += "%ssize%s%s" % (t, self.indexChars[i], ",\n" if i < problemType["NumIndicesC"]-1 else ");\n")

        if problemType["UseBeta"]:
          s += "%s} else {\n" % (t)
          t = t[:-2]
          s += "%sif( inputEvents != NULL )\n" % (t)
          s += "%s  hipEventRecord(inputEvents[0], stream );\n" % (t)
          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipLaunchKernelGGL(\n" % (t)
          t += "  "
          s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1])
          s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t)
          s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sdataD,\n" % (t)
          s += "%sdataC,\n" % (t)
          # strides
          for i in range(0,numStridesC*2):
            s += "%s%s,\n" % (t, self.strideList[i])
          # sizes
          for i in range(0, problemType["NumIndicesC"]):
            s += "%ssize%s,\n" % (t, self.indexChars[i])
          s += "%sbeta);\n" % (t)
          s += "}\n"

        t = "  "
        s += "%s} catch (const std::exception& e) {\n" % (t)
        s += "#ifdef DEBUG\n"
        s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
        s += "#endif\n"
        s += "%s  return tensileStatusFailure;\n" % (t)
        s += "%s}\n" % (t)

    ########################################
    # Enqueue Kernels
    ########################################
    for kernelIdx in range(0, len(kernels)):
      kernel = kernels[kernelIdx]
      if kernel["KernelLanguage"] == "Source":
        kernel["ISA"] = (0, 0, 0) # HIP source kernels needs dummy ISA version
      kernelName = self.kernelWriter.getKernelName(kernel)
      s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName)
      s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx)
      if self.language == "OCL":
        # set kernel args same for all enqueues
        s += "%s// kernel args same for all enqueues\n" % (t)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataD ); tensileStatusCheck(status);\n" % (t, 0)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, 1)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (t, 2)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (t, 3)
        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (t, 4, typeName)
        s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \
            "" if problemType["UseBeta"] else "//", 5, typeName)
        argIdx = 6 if problemType["UseBeta"] else 5
        for stride in self.strideList:
          s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, stride)
          argIdx += 1
        for sizeIdx in range(0, problemType["TotalIndices"]):
          if sizeIdx not in [ problemType["Index0"],  problemType["Index1"], problemType["IndexUnroll"] ]:
            s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx])
          argIdx += 1

        s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(staggerUIter), &staggerUIter ); tensileStatusCheck(status);\n" % (t, argIdx)
        argIdx += 1

      s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (t, kernelIdx)
      t += "  "
      # debug print kernel dimensions
      if globalParameters["LibraryPrintDebug"]:
        s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (t, kernelName)
        # debug print kernel arguments
        # strides
        for stride in self.strideList:
          s += "%sprintf(\"  %s = %%u\\n\", %s);\n" % (t, stride, stride)
        # sizes
        for i in range(0, problemType["TotalIndices"]):
          s += "%sprintf(\"  sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i )
        s += "%sprintf(\"  problemNumGroupTiles0== %%u\\n\", problemNumGroupTiles0 );\n" % (t)
        s += "%sprintf(\"  problemNumGroupTiles1== %%u\\n\", problemNumGroupTiles1 );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t)
        s += "%sprintf(\"  tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t)
        for idxChar in solution["PackedC0Indices"][:-1]:
          s += "%sprintf(\"  magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\",  magicNumberSize%s, magicShiftSize%s);\n" \
              % (t, idxChar, idxChar, idxChar, idxChar)
        for idxChar in solution["PackedC1Indices"][:-1]:
          s += "%sprintf(\"  magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\",  magicNumberSize%s, magicShiftSize%s);\n" \
              % (t, idxChar, idxChar, idxChar, idxChar)

      ########################################
      # OpenCL Runtime
      ########################################
      if self.language == "OCL":
        # set kernel args different for all enqueues
        argIdx = 6 if problemType["UseBeta"] else 5
        argIdx += len(self.strideList)
        # sizes
        for sizeIdx in range(0, problemType["TotalIndices"]):
          if sizeIdx in [ problemType["Index0"],  problemType["Index1"], problemType["IndexUnroll"] ]:
            s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx])
          argIdx += 1

        # enqueue
        s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t)
        t += "  "
        s += "%sstream,\n" % (t)
        s += "%skernels[kernelIdx],\n" % (t)
        s += "%sworkDim,\n" % (t)
        s += "%sNULL, // globalWorkOffset\n" % (t)
        s += "%sglobalWorkSize[kernelIdx],\n" % (t)
        s += "%slocalWorkSize,\n" % (t)
        if False: # gsu > 1:
          s += "%s1,\n" % (t)
          s += "%s&kernelEventBetaOnly,\n" % (t)
        else:
          s += "%snumInputEvents,\n" % (t)
          s += "%sinputEvents,\n" % (t)
        s += "%soutputEvent );\n" % (t)
        s += "%stensileStatusCheck(status);\n" % (t)
        s += "%s}\n" % (t)

      ########################################
      # HIP Runtime
      ########################################
      else:
        if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source":
          s += "%sif( inputEvents != NULL )\n" % (t)
          t += "  "
          s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (t)
        t = t[2:]
        s += "%stry {\n" % (t)
        t += "  "
        # hip kernel
        if kernelLanguage == "Source":
          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipLaunchKernelGGL(\n" % (t)
          t += "  "
          s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName)
          s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (t)
          s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sdataD,\n" % (t)
          s += "%sdataC,\n" % (t)
          s += "%sdataA,\n" % (t)
          s += "%sdataB,\n" % (t)
          s += "%salpha,\n" % (t)
          s += "%s%sbeta,\n" % (t, \
              "" if problemType["UseBeta"] else "//")
          # strides
          for stride in self.strideList:
            s += "%s%s,\n" % (t, stride)
          # sizes
          for i in range(0, problemType["TotalIndices"]):
            lastParam = i == problemType["TotalIndices"]-1
            s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \
                % (t, i, "" if lastParam else "," )
          for idxChar in solution["PackedC0Indices"][:-1]:
            s += "%s,magicNumberSize%s\n" % (t, idxChar)
            s += "%s,magicShiftSize%s\n" % (t, idxChar)
          for idxChar in solution["PackedC1Indices"][:-1]:
            s += "%s,magicNumberSize%s\n" % (t, idxChar)
            s += "%s,magicShiftSize%s\n" % (t, idxChar)
          s += "%s,staggerUIter\n" % (t)
          #persistent:
          s += "%s,problemNumGroupTiles0\n" % (t)
          s += "%s,problemNumGroupTiles1\n" % (t)
          s += "%s,magicNumberProblemNumGroupTiles0\n" % (t) # magic number to use when dividing by problemNumGroupTiles0
          s += "%s);\n" % (t)

        # assembly kernel
        else:
          if globalParameters["DebugKernel"]:
            s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t
            s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (t)
            s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (t)
            s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (t)
            s += "%s  printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (t)
            s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (t)
            s += "%shipDevice_t device;\n" % t
            s += "%shipDeviceGet(&device, 0);\n" % t
            s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t
            s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (t)
            s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (t)
            s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (t)
            s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (t)

          # hip assembly function
          s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t)

          s += "%shipFunctionArgs.dataD = dataD;\n" % (t)
          s += "%shipFunctionArgs.dataC = dataC;\n" % (t)
          s += "%shipFunctionArgs.dataA = dataA;\n" % (t)
          s += "%shipFunctionArgs.dataB = dataB;\n" % (t)

          if problemType["DataType"].isHalf():
            s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t)
            s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t)
          else:
            s += "%shipFunctionArgs.alpha = alpha;\n" % (t)
          if problemType["UseBeta"]:
            if problemType["DataType"].isHalf():
              s += "%shipFunctionArgs.beta[0] = beta;\n" % (t)
              s += "%shipFunctionArgs.beta[1] = beta;\n" % (t)
            else:
              s += "%shipFunctionArgs.beta = beta;\n" % (t)
          # strides
          for stride in self.strideList:
            s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride)
          # sizes
          for i in range(0, problemType["TotalIndices"]):
            lastParam = i == problemType["TotalIndices"]-1
            s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \
                % (t, globalParameters["IndexChars"][i], i )

          s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t)
          s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t)

          s += "%shipFunctionArgs.staggerUIter = staggerUIter;\n" % (t)
          # persistent - pass in the number of tiles in problem since not available in WG
          s += "\n"
          s += "%shipFunctionArgs.problemNumGroupTiles0 = problemNumGroupTiles0;\n" % (t)
          s += "%shipFunctionArgs.problemNumGroupTiles1 = problemNumGroupTiles1;\n" % (t)
          s += "%shipFunctionArgs.magicNumberProblemNumGroupTiles0 = magicNumberProblemNumGroupTiles0;\n" % (t)
          s += "%shipFunctionArgs.gridNumWorkGroups0 = globalWorkSize[kernelIdx][0];\n" % (t) #
          s += "%shipFunctionArgs.numFullBlocks = numFullBlocks;\n" % (t)
          s += "%shipFunctionArgs.wgmRemainder1 = wgmRemainder1;\n" % (t)
          s += "%shipFunctionArgs.magicNumberWgmRemainder1 = magicNumberWgmRemainder1;\n" % (t)

          # Magic numbers for packed indices:
          for idxChar in solution["PackedC0Indices"][:-1]:
            s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar)
            s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar)
          for idxChar in solution["PackedC1Indices"][:-1]:
            s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar)
            s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar)

          s += "%skernelsLaunched++;\n" % (t)
          s += "%shipHccModuleLaunchKernel(\n" % (t)
          t += "  "
          s += "%shipFunction,\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (t)
          s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (t)
          s += "%slocalWorkSize[0],\n" % (t)
          s += "%slocalWorkSize[1],\n" % (t)
          s += "%slocalWorkSize[2],\n" % (t)
          s += "%s0, // groupMemBytes\n" % (t)
          s += "%sstream,\n" % (t)
          s += "%sNULL,\n" % (t)
          s += "%s(void**)hipLaunchParams\n" % (t)
          if globalParameters["PreciseKernelTime"]:
            s += "%s,(inputEvents && kernelsLaunched==1) ? inputEvents[enqueueIdx]:nullptr\n" %(t)
            s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (t)

          s += "%s);\n" % (t)
          t = t[2:]
          if globalParameters["DebugKernel"]:
            # copy debug buffer
            s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (t)
            s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (t)
            s += "%s  printf(\"%%04i\", i);\n" % (t)
            s += "%s  char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (t)
            #s += "%s  char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t)
            #s += "%s  char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t)
            s += "%s  for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (t)
            s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (t)
            s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (t)

            s += "%s  }\n" % (t)
            s += "%s  printf(\"\\n\");\n" % (t)
            s += "%s}\n" % (t)


        t = t[2:]
        s += "%s} catch (const std::exception& e) {\n" % (t)
        s += "#ifdef DEBUG\n"
        s += "%s  std::cerr << e.what() << std::endl;\n" % (t)
        s += "#endif\n"
        s += "%s  return tensileStatusFailure;\n" % (t)
        s += "%s}\n" % (t)
        if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source":
          s += "%sif( outputEvent != NULL )\n" % (t)
          s += "%s  hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (t)
        s += "  }\n"
    s += "\n"
    s += "  return tensileStatusSuccess;\n"
    s += "}\n"
    s += "\n"
    s += "/* Solution Parameters\n"
    s += Solution.getParametersIndented(solution.getAttributes(), "  ")
    s += "*/\n"
    s += "\n"

    return s