def getSolutionName(self, solution): if globalParameters["ShortNames"]: solutionName = Solution.getNameSerial(solution, self.solutionSerialNaming) else: solutionName = Solution.getNameMin(solution, self.solutionMinNaming) return solutionName
def writeBenchmarkFiles(stepBaseDir, solutions, problemSizes, stepName, filesToCopy): if not globalParameters["MergeFiles"]: ensurePath(os.path.join(globalParameters["WorkingPath"], "Solutions")) ensurePath(os.path.join(globalParameters["WorkingPath"], "Kernels")) ############################################################################## # Min Naming ############################################################################## kernels = [] kernelsBetaOnly = [] for solution in solutions: solutionKernels = solution.getKernels() for kernel in solutionKernels: if kernel not in kernels: kernels.append(kernel) solutionKernelsBetaOnly = solution.getKernelsBetaOnly() for kernel in solutionKernelsBetaOnly: if kernel not in kernelsBetaOnly: kernelsBetaOnly.append(kernel) solutionSerialNaming = Solution.getSerialNaming(solutions) kernelSerialNaming = Solution.getSerialNaming(kernels) solutionMinNaming = Solution.getMinNaming(solutions) kernelMinNaming = Solution.getMinNaming(kernels) solutionWriter = SolutionWriter( \ solutionMinNaming, solutionSerialNaming, \ kernelMinNaming, kernelSerialNaming) kernelWriterSource = KernelWriterSource( \ kernelMinNaming, kernelSerialNaming) kernelWriterAssembly = KernelWriterAssembly( \ kernelMinNaming, kernelSerialNaming) # write solution, kernels and CMake problemType = solutions[0]["ProblemType"] writeSolutionsAndKernels( \ globalParameters["WorkingPath"], [problemType], solutions, kernels, kernelsBetaOnly, \ solutionWriter, kernelWriterSource, kernelWriterAssembly ) ############################################################################## # Write CMake ############################################################################## clientName = "TensileBenchmark_%s" % stepName writeCMake(globalParameters["WorkingPath"], solutions, kernels, filesToCopy, \ clientName) forBenchmark = True writeClientParameters(forBenchmark, solutions, problemSizes, stepName, \ filesToCopy, stepBaseDir)
def readSolutions(filename): try: stream = open(filename, "r") except IOError: printExit("Cannot open file: %s" % filename) solutionStates = yaml.load(stream, yaml.SafeLoader) stream.close() # verify if len(solutionStates) < 2: printExit("len(%s) %u < 2" % (filename, len(solutionStates))) versionString = solutionStates[0]["MinimumRequiredVersion"] if not versionIsCompatible(versionString): printWarning("File \"%s\" version=%s does not match current Tensile version=%s" \ % (filename, versionString, __version__) ) if "ProblemSizes" not in solutionStates[1]: printExit("%s doesn't begin with ProblemSizes" % filename) else: problemSizesConfig = solutionStates[1]["ProblemSizes"] solutions = [] for i in range(2, len(solutionStates)): solutionState = solutionStates[i] solutionObject = Solution(solutionState) solutions.append(solutionObject) problemType = solutions[0]["ProblemType"] problemSizes = ProblemSizes(problemType, problemSizesConfig) return (problemSizes, solutions)
def abbreviation(self): string = "%02u" % self.stepIdx if self.isFinal(): string += "_Final" else: for param in self.benchmarkParameters: string += "_%s" % Solution.getParameterNameAbbreviation(param) return string
def __str__(self): state = "" idx = 0 for hardcodedParameters in self.winners: winningParameters = self.winners[hardcodedParameters][0] score = self.winners[hardcodedParameters][1] state += " %2u: %s -> %s %f GFlop/s\n" % (idx, hardcodedParameters, \ Solution.getNameFull(winningParameters), score) idx += 1 return state
def get(lookupHardcodedParameters, winners): matches = [] # only 1 winner, when benchmarking 1 solution if len(winners) == 1: hardcodedFrozen = winners.keys()[0] winningParameters = winners[hardcodedFrozen][0] score = winners[hardcodedFrozen][1] matches.append([hardcodedFrozen, winningParameters, score]) return matches for hardcodedFrozen in winners: winningParameters = winners[hardcodedFrozen][0] score = winners[hardcodedFrozen][1] frozenMatch = True # a match if no key in hardcoded has a different value than lookup for paramName in hardcodedFrozen: if paramName in lookupHardcodedParameters: if lookupHardcodedParameters[paramName] != \ hardcodedFrozen[paramName]: frozenMatch = False break if frozenMatch: matchMacroTile = True matchUnion = {} matchUnion.update(hardcodedFrozen.parameters) matchUnion.update(winningParameters) if "MacroTile0" in lookupHardcodedParameters: lookupMacroTile0 = lookupHardcodedParameters["MacroTile0"] lookupMacroTile1 = lookupHardcodedParameters["MacroTile1"] Solution.assignProblemIndependentDerivedParameters( matchUnion) Solution.assignProblemIndependentDerivedParameters( hardcodedFrozen.parameters) if matchUnion["MacroTile0"] != lookupMacroTile0 \ or matchUnion["MacroTile1"] != lookupMacroTile1: matchMacroTile = False if matchMacroTile: matches.append([hardcodedFrozen, winningParameters, score]) else: pass return matches
def readLibraryLogicForSchedule(filename): print1("# Reading Library Logic: %s" % (filename)) try: stream = open(filename, "r") except IOError: printExit("Cannot open file: %s" % filename) data = yaml.load(stream, yaml.SafeLoader) stream.close() # verify if len(data) < 6: printExit("len(%s) %u < 7" % (filename, len(data))) # parse out objects versionString = data[0]["MinimumRequiredVersion"] scheduleName = data[1] architectureName = data[2] deviceNames = data[3] problemTypeState = data[4] solutionStates = data[5] indexOrder = data[6] exactLogic = data[7] rangeLogic = data[8] # does version match if not versionIsCompatible(versionString): printWarning("File \"%s\" version=%s does not match Tensile version=%s" \ % (filename, versionString, __version__) ) # unpack problemType problemType = ProblemType(problemTypeState) # unpack solutions solutions = [] for i in range(0, len(solutionStates)): solutionState = solutionStates[i] if solutionState["KernelLanguage"] == "Assembly": isa0 = int(architectureName[3]) isa1 = int(architectureName[4]) isa2 = int(architectureName[5]) solutionState["ISA"] = (isa0, isa1, isa2) else: solutionState["ISA"] = (0, 0, 0) solutionObject = Solution(solutionState) if solutionObject["ProblemType"] != problemType: printExit("ProblemType of file doesn't match solution: %s != %s" \ % (problemType, solutionObject["ProblemType"])) solutions.append(solutionObject) return (scheduleName, deviceNames, problemType, solutions, indexOrder, \ exactLogic, rangeLogic )
def __str__(self): return Solution.getNameFull(self.parameters)
def __init__(self, parameters): self.parameters = deepcopy(parameters) self.hashValue = hash(Solution.getNameFull(self.parameters))
def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \ problemSizeGroupIdx ): benchmarkTestFails = 0 # convert config to full benchmark process (resolves defaults) print1("") print1(HR) print1("# Converting Config to BenchmarkProcess Object") print1(HR) print1("") benchmarkProcess = BenchmarkProcess( problemTypeConfig, \ problemSizeGroupConfig ) problemTypeName = str(benchmarkProcess.problemType) problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx) pushWorkingPath(problemSizeGroupName) ensurePath(os.path.join(globalParameters["WorkingPath"], "Data")) totalBenchmarkSteps = len(benchmarkProcess) resultsFileBaseFinal = None winners = WinningParameterDict() print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps) print1("") print1(HR) print1("# Done Creating BenchmarkProcess Object") print1(HR) ############################################################################## # For Each Benchmark Step ############################################################################## for benchmarkStepIdx in range(0, totalBenchmarkSteps): benchmarkStep = benchmarkProcess[benchmarkStepIdx] if winners.winners == {}: # perf optimization to skip the initial winners creation # this helps a little here but really helps below with avoiding the super-expensive # removeHardcoded step below - that can use a fast-path to create # winners when needed. print1( "# Empty winners - use fast initialization of hardcodedParameters" ) resultingHardcodedParameterList = benchmarkStep.hardcodedParameters else: resultingHardcodedParameterList = \ winners.wpdUpdate( benchmarkStep.hardcodedParameters ) benchmarkStep.hardcodedParameters = resultingHardcodedParameterList numHardcoded = len(benchmarkStep.hardcodedParameters) stepName = str(benchmarkStep) shortName = benchmarkStep.abbreviation() print1("\n") print1(HR) currentTime = time.time() elapsedTime = currentTime - startTime print1("# BenchmarkStep: %s - %s %.3fs" % (problemSizeGroupName, stepName, elapsedTime)) print1("# NumProblems: %u" % benchmarkStep.problemSizes.totalProblemSizes) print1("# BenchmarkParameters:") for paramName in benchmarkStep.benchmarkParameters: paramValues = benchmarkStep.benchmarkParameters[paramName] printStr = "# %s = { %s" % (paramName, paramValues[0]) for paramValueIdx in range(1, len(paramValues)): printStr += ", %s" % str(paramValues[paramValueIdx]) printStr += " }" print1(printStr) if False: # print1(hardcoded parameters and their winners print1("# HardcodedParameters | WinningParameters:") paramDictIdx = 0 hardcodedMinNaming = \ Solution.getMinNaming(benchmarkStep.hardcodedParameters) for paramDict in benchmarkStep.hardcodedParameters: winningParameters = winners[paramDict] print1("# (%u) %s | %s" % (paramDictIdx, \ Solution.getNameMin(paramDict, hardcodedMinNaming), \ Solution.getNameFull(winningParameters) )) paramDictIdx += 1 pushWorkingPath(shortName) ############################################################################ # Copy Files to Benchmark Source Directory ############################################################################ stepBaseDir = globalParameters["WorkingPath"] sourceDir = \ os.path.join(stepBaseDir, "source" ) ensurePath(sourceDir) pushWorkingPath("sourceTmp") filesToCopy = [ "SolutionMapper.h", "Client.cpp", "Client.h", "CMakeLists.txt", "DeviceStats.h", "TensorUtils.h", "MathTemplates.cpp", "MathTemplates.h", "TensileTypes.h", "tensile_bfloat16.h", "KernelHeader.h", "ReferenceCPU.h", "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h", ] for f in filesToCopy: shutil_copy(os.path.join(globalParameters["SourcePath"], f), globalParameters["WorkingPath"]) if globalParameters["RuntimeLanguage"] == "OCL": shutil_copy( os.path.join(globalParameters["SourcePath"], "FindOpenCL.cmake"), globalParameters["WorkingPath"]) else: shutil_copy( os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"), globalParameters["WorkingPath"]) shutil_copy( os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"), globalParameters["WorkingPath"]) ############################################################################ # Enumerate Benchmark Permutations ############################################################################ solutions = [] totalBenchmarkPermutations = 1 for benchmarkParamName in benchmarkStep.benchmarkParameters: totalBenchmarkPermutations *= len( benchmarkStep.benchmarkParameters[benchmarkParamName]) maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)" % \ (maxPossibleSolutions, numHardcoded, totalBenchmarkPermutations)) benchmarkPermutations = [] for i in range(0, totalBenchmarkPermutations): permutation = {} pIdx = i for benchmarkParamName in benchmarkStep.benchmarkParameters: benchmarkParamValues = deepcopy( \ benchmarkStep.benchmarkParameters[benchmarkParamName]) valueIdx = pIdx % len(benchmarkParamValues) permutation[benchmarkParamName] = benchmarkParamValues[ valueIdx] pIdx /= len(benchmarkParamValues) benchmarkPermutations.append(permutation) ############################################################################ # Enumerate Solutions = Hardcoded * Benchmark ############################################################################ print1("# Enumerating Solutions") if globalParameters["PrintLevel"] >= 1: progressBar = ProgressBar(maxPossibleSolutions) solutionSet = set() # avoid duplicates for nlca=-1, 1 for hardcodedIdx in range(0, numHardcoded): solutions.append([]) hardcodedParamDict = benchmarkStep.hardcodedParameters[ hardcodedIdx] for benchmarkIdx in range(0, len(benchmarkPermutations)): benchmarkPermutation = benchmarkPermutations[benchmarkIdx] solution = { "ProblemType": deepcopy(benchmarkProcess.problemType.state) } solution.update(benchmarkPermutation) solution.update(hardcodedParamDict) if benchmarkStepIdx > 0: winningParameters = winners[hardcodedParamDict] if winningParameters == None: # this is a joined parameter that didn't have a winner, that's okay continue solution.update(winningParameters) # append default parameters where necessary for initialSolutionParameterName in benchmarkStep.initialSolutionParameters: if initialSolutionParameterName not in solution: solution[initialSolutionParameterName] = \ benchmarkStep.initialSolutionParameters[initialSolutionParameterName] # TODO check if solution matches problem size for exact tile kernels solutionObject = Solution(solution) if solutionObject["Valid"]: if solutionObject not in solutionSet: solutionSet.add(solutionObject) solutions[hardcodedIdx].append(solutionObject) else: if globalParameters["PrintSolutionRejectionReason"]: print1("rejecting solution %s" % str(solutionObject)) if globalParameters["PrintLevel"] >= 1: progressBar.increment() # remove hardcoded that don't have any valid benchmarks removeHardcoded = [] for hardcodedIdx in range(0, numHardcoded): if len(solutions[hardcodedIdx]) == 0: hardcodedParamDict = benchmarkStep.hardcodedParameters[ hardcodedIdx] removeHardcoded.append(hardcodedParamDict) removesExist = len(removeHardcoded) > 0 for hardcodedParam in removeHardcoded: benchmarkStep.hardcodedParameters.remove(hardcodedParam) if removesExist: print1( "# Updating winners since enumeration removed unused hardcoded solutions. removeHardcoded=%u winners=%u" % (len(removeHardcoded), len(winners.winners))) winners.wpdUpdate(benchmarkStep.hardcodedParameters) if globalParameters["PrintLevel"] >= 1: print1("") numHardcoded = len(benchmarkStep.hardcodedParameters) # remove from solution 2D list also for solutionList in shallowcopy(solutions): if len(solutionList) == 0: solutions.remove(solutionList) elif winners.winners == {}: print1("# Populating initial winners (%u solutions)\n" % len(benchmarkStep.hardcodedParameters)) for hcParm in benchmarkStep.hardcodedParameters: winners.winners[FrozenDictionary(hcParm)] = [{}, -1] print1("# Actual Solutions: %u / %u\n" % ( len(solutions), \ maxPossibleSolutions )) # create linear list solutionList = [] for i in range(0, len(solutions)): solutionsForHardcoded = solutions[i] for j in range(0, len(solutionsForHardcoded)): solution = solutionsForHardcoded[j] solutionList.append(solution) if len(solutionList) == 0: msg = "Your parameters resulted in 0 valid solutions." if globalParameters["PrintSolutionRejectionReason"]: msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected." else: msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected." printExit(msg) if globalParameters["PrintLevel"] >= 1: for i in range(0, len(solutions)): solutionsForHardcoded = solutions[i] for j in range(0, len(solutionsForHardcoded)): solution = solutionsForHardcoded[j] print2("# (%u:%u) %s" % (i, j, \ Solution.getNameFull(solution) )) print2(HR) # write benchmarkFiles writeBenchmarkFiles(stepBaseDir, solutionList, benchmarkStep.problemSizes, \ shortName, filesToCopy) print1("# Copying files that differ from sourceTmp -> source") sourceTmp = globalParameters["WorkingPath"] files = os.listdir(sourceTmp) for f in files: f0 = os.path.join(sourceTmp, f) f1 = os.path.join(sourceDir, f) if os.path.isdir(f0): #print "cpDir:", f0, f1 if os.path.isdir(f1): shutil.rmtree(f1, True) shutil.copytree(f0, f1) elif not os.path.exists(f1) or not filecmp.cmp(f0, f1): #print "cp:", f0, f1 shutil.copy(f0, f1) shutil.rmtree(sourceTmp, True) popWorkingPath() # source ############################################################################ # Run Benchmark Script ############################################################################ resultsFileBase = os.path.normpath(os.path.join( \ globalParameters["WorkingPath"], "../Data", shortName)) if benchmarkStep.isFinal(): resultsFileBaseFinal = resultsFileBase resultsFileName = resultsFileBase + ".csv" solutionsFileName = resultsFileBase + ".yaml" if not os.path.exists(resultsFileName) or \ globalParameters["ForceRedoBenchmarkProblems"]: pushWorkingPath("build") # write runScript libraryLogicPath = None path = globalParameters["WorkingPath"] forBenchmark = True runScriptName = writeRunScript(path, libraryLogicPath, forBenchmark) # run runScript process = Popen(runScriptName, cwd=globalParameters["WorkingPath"]) process.communicate() if process.returncode: benchmarkTestFails += 1 printWarning( "BenchmarkProblems: Benchmark Process exited with code %u" % process.returncode) popWorkingPath() # build else: print1("# Already benchmarked; skipping.") ############################################################################ # Winners -> Determined Parameters ############################################################################ results = getResults(resultsFileName, solutions) print2("CSV Results: %s" % results) winners.addResults(benchmarkStep.hardcodedParameters, \ benchmarkPermutations, solutions, results) ############################################################################ # Write Solutions YAML ############################################################################ YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes, \ solutions ) # End Iteration popWorkingPath() # stepName currentTime = time.time() elapsedTime = currentTime - startTime print1("%s\n# %s\n# %s: End - %.3fs\n%s\n" \ % (HR, problemSizeGroupName, shortName, elapsedTime, HR)) popWorkingPath() # ProblemType return (resultsFileBaseFinal, benchmarkTestFails)
def getSourceString(self, solution): kernels = solution.getKernels() kernelNames = [] for kernel in kernels: kernelName = self.kernelWriter.getKernelName(kernel) kernelNames.append(kernelName) s = "" t = "" # includes if not globalParameters["MergeFiles"]: solutionName = self.getSolutionName(solution) s += "#include \"%s.h\"\n" % solutionName #s += "#include \"MathTemplates.h\"\n" s += "\n" # solution function signature s += self.getSolutionSignature(solution) s += " {\n" t += " " s += "%sTensileStatus status;\n" % (t) # hipFunction Struct if solution["KernelLanguage"] == "Assembly": s += "\n" s += "%s/* module function args */\n" % (t) s += "%sstruct {\n" % t t += " " if globalParameters["DebugKernel"]: s += "%sunsigned int *debugBuffer;\n" % t solutionArgs = self.getArgList(solution["ProblemType"], True, False, False) for arg in solutionArgs: if arg[0] == "TensileHalf": s += "%s%s %s[2];\n" % (t, arg[0], arg[1]) else: s += "%s%s %s;\n" % (t, arg[0], arg[1]) if solution["PersistentKernel"]: # pass in the number of groups since not available in WG s += "%sunsigned int numGroupTiles0;\n" % t s += "%sunsigned int numGroupTiles1;\n" % t s += "%sunsigned int pad;\n" % t # FIXME can this be removed? t = t[2:] s += "%s} hipFunctionArgs;\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t #for arg in solutionArgs: # s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1]) # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu # kernels s += "\n%s/* kernels */\n" % (t) s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % ( t, len(kernels)) s += "%sint deviceId;\n" % (t) s += "%shipCtxGetDevice(&deviceId);\n" % (t) s += "%shipDeviceProp_t deviceProperties;\n" % (t) s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t) if solution["KernelLanguage"] == "Source" and globalParameters[ "RuntimeLanguage"] == "OCL": s += "%sconst char *kernelSources[numKernels] = {\n" % (t) t += " " for kernelIdx in range(0, len(kernelNames)): kernelName = kernelNames[kernelIdx] s += "%s%s_src%s\n" % (t, kernelName, \ "," if kernelIdx < len(kernels)-1 else "" ) t = t[2:] s += "%s};\n" % (t) s += "%scl_kernel kernels[numKernels];\n" % (t) s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t) s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernels[i],\n" % (t) s += "%s kernelSources[i],\n" % (t) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) s += "%s}\n" % (t) if solution["GlobalSplitU"] > 1: for beta in solution.getKernelsBetaOnly(): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) s += "%scl_kernel kernel_%s;\n" % (t, kernelName) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernel_%s,\n" % (t, kernelName) s += "%s %s_src,\n" % (t, kernelName) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) elif solution["KernelLanguage"] == "Assembly": localStatic = True kernel = kernels[0] s += "%sint isa = deviceProperties.gcnArch;\n" % (t) s += "%shipFunction_t hipFunction;\n" % (t) kernelName = self.kernelWriter.getKernelName(kernel) s += t if localStatic: s += "%sstatic hipFunction_t *hipFunctions = nullptr;\n" % (t) s += "%sif ( !hipFunctions ) {\n" % ( t ) # not locking here means array might be double allocated and memory leak t += " " s += "%sstatic std::mutex initFunctionsMutex;\n" % (t) s += "%sstd::lock_guard<std::mutex> initFunctionsLock(initFunctionsMutex);\n" % ( t) s += "%sif ( !hipFunctions ) {\n" % ( t ) # not locking here means array might be double allocated and memory leak t += " " s += "%sstatic int numDevices = -1;\n" % (t) s += "%sstatus = hipGetDeviceCount( &numDevices );\n" % (t) s += "%shipFunction_t *tmp = new hipFunction_t[numDevices];\n" % ( t) s += "%sfor ( int i = 0; i < numDevices; i++) {\n" % (t) s += "%s tmp[i] = nullptr;\n" % (t) s += "%s}\n" % (t) s += "%shipFunctions = tmp;\n" % (t) t = t[2:] s += "%s}\n" % (t) t = t[2:] s += "%s}\n" % (t) s += "%sif ( !hipFunctions[deviceId] ) {\n" % (t) t += " " s += "%sstatic std::mutex loadModuleMutex;\n" % (t) s += "%sstd::lock_guard<std::mutex> loadModuleLock(loadModuleMutex);\n" % ( t) s += "%sif (!hipFunctions[deviceId]) {\n" % (t) t += " " s += "%shipModule_t module = nullptr;\n" % (t) s += "%shipModuleLoadData(&module, %s_coba);\n" % (t, kernelName) s += "%shipModuleGetFunction(&hipFunctions[deviceId], module, \"%s\");\n" % ( t, kernelName) t = t[2:] s += "%s}\n" % (t) t = t[2:] s += "%s}\n" % (t) s += "%shipFunction = hipFunctions[deviceId];\n" % (t) else: s += "%stensileGetHipFunctionFromCodeObjectByteArray(\n" % (t) s += "%s &hipFunction,\n" % (t) s += "%s \"%s\",\n" % (t, kernelName) s += "%s %s_coba); // code object byte array\n" % ( t, kernelName) typeName = solution["ProblemType"]["DataType"].toCpp() # index assignments s += "\n%s/* index assignments */\n" % (t) s += "%sconst unsigned int indexD0 = %u;\n" \ % (t, solution["ProblemType"]["Index0"]) s += "%sconst unsigned int indexD1 = %u;\n" \ % (t, solution["ProblemType"]["Index1"]) s += "%sconst unsigned int indexDU = %u;\n" \ % (t, solution["ProblemType"]["IndexUnroll"]) # num enqueues s += "\n%s/* num kernels */\n" % (t) s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t) for i in range(1, len(kernels)): s += ", 1" s += " };\n" # grid size s += "\n%s/* grid sizes */\n" % (t) s += "%sconst unsigned int workDim = 3;\n" % (t) s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \ % (t, solution["ThreadTile0"], solution["ThreadTile1"]) s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \ % (t, solution["SubGroup0"], solution["SubGroup1"]) s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \ % (t, solution["NumThreads"]) s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t) # grid size [2] s += "%sglobalWorkSize[0][2] = 1;\n" % (t) for i in range(0, solution["ProblemType"]["NumIndicesC"]): if i != solution["ProblemType"]["Index0"] and i != solution[ "ProblemType"]["Index1"]: s += "%sglobalWorkSize[0][2] *= size%s;\n" % ( t, self.indexChars[i]) # grid size [0,1] s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ self.indexChars[solution["ProblemType"]["Index0"]]) s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ self.indexChars[solution["ProblemType"]["Index1"]]) s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % ( t) s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % ( t) s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t) s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t) if kernel["EdgeType"] != "None": s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % ( t) s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % ( t) s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % ( t) if kernel["WorkGroupMappingType"] == "Z" and abs( kernel["WorkGroupMapping"]) == 2: s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % ( t) s += "%stotalWorkGroupsPow2--;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t) s += "%stotalWorkGroupsPow2++;\n" % (t) s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t) s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t) if solution["GlobalSplitU"] > 1: s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % ( t, solution["GlobalSplitU"]) if solution["PersistentKernel"]: s += "%sglobalWorkSize[0][0] = deviceProperties.multiProcessorCount * %u;\n" \ % (t, solution["PersistentKernel"]) s += "%sglobalWorkSize[0][1] = 1;\n" % t else: s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % ( t, 0 if kernel["WorkGroupMapping"] > 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "") s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % ( t, 1 if kernel["WorkGroupMapping"] > 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "") # offsets s += "\n%s/* offsets */\n" % (t) s += "%sunsigned int offsets[numKernels][1][3];\n" % (t) for kernelIdx in range(0, len(kernels)): s += "%soffsets[%u][0][0] = offsetC; // tensorC\n" % (t, kernelIdx) s += "%soffsets[%u][0][1] = offsetA; // tensorA\n" % (t, kernelIdx) s += "%soffsets[%u][0][2] = offsetB; // tensorB\n" % (t, kernelIdx) # index sizes s += "\n%s/* index sizes */\n" % (t) s += "%sunsigned int sizes[numKernels][1][%u];\n" \ % (t, solution["ProblemType"]["TotalIndices"]) for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] kernelName = self.kernelWriter.getKernelName(kernel) # free index sizes for i in range(0,solution["ProblemType"]["NumIndicesFree"] \ + solution["ProblemType"]["NumIndicesBatch"] ): s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # summation index sizes for i in range(solution["ProblemType"]["NumIndicesC"], \ solution["ProblemType"]["TotalIndices"] ): lastParam = i == solution["ProblemType"]["TotalIndices"] - 1 s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n" s += "\n" ######################################## # Enqueue Beta-Only Kernel ######################################## if solution["GlobalSplitU"] > 1: kernelNamesBetaOnly = [] numStridesC = solution["ProblemType"]["NumIndicesC"] - \ (0 if solution["ProblemType"]["UseInitialStrides"] else 1) for beta in solution.getKernelsBetaOnly(): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) kernelNamesBetaOnly.append(kernelName) s += "%s// enqueue Beta-Only kernel\n" % (t) # grid sizes s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t) s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t) #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ # self.indexChars[solution["ProblemType"]["Index0"]]) #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ # self.indexChars[solution["ProblemType"]["Index1"]]) s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % ( t) s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % ( t) s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % ( t) s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % ( t) s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % ( t) s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % ( t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % ( t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t) for i in range(0, solution["ProblemType"]["NumIndicesC"]): if i != solution["ProblemType"]["Index0"] and i != solution[ "ProblemType"]["Index1"]: s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % ( t, self.indexChars[i]) if solution["ProblemType"]["UseBeta"]: s += "%sbool betaZero = beta == 0;\n" % (t) if self.language == "OCL": if solution["ProblemType"]["UseBeta"]: s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1]) else: #s += "%sbool betaZero = true;\n" % (t) s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0]) argIdx = 0 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &offsetC ); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 # strides for i in range(0, numStridesC): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.strideList[i]) argIdx += 1 # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[i]) argIdx += 1 # beta if solution["ProblemType"]["UseBeta"]: s += "%sif (!betaZero) {\n" % (t) s += "%s status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % ( t, argIdx, typeName) argIdx += 1 s += "%s}\n" % (t) # enqueue s += "%scl_event kernelEventBetaOnly;\n" % (t) s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernelBetaOnly,\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSizeBetaOnly,\n" % (t) s += "%slocalWorkSizeBetaOnly,\n" % (t) s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) #s += "%soutputEvent );\n" % (t) s += "%s&kernelEventBetaOnly );\n" % (t) t = t[2:] s += "%stensileStatusCheck(status);\n" % (t) if solution["ProblemType"]["UseBeta"]: s += "%sbeta = %s;\n" % ( t, solution["ProblemType"]["DataType"].zeroString( self.language, 1)) #s += "%sreturn tensileStatusSuccess;\n" % (t) s += "%sstatus = clFinish(stream);\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) #s += " float tmp[128*128];\n" #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n" #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n" else: s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[0], stream );\n" % (t) t += " " s += "%stry {\n" % (t) if solution["ProblemType"]["UseBeta"]: s += "%sif (betaZero) {\n" % (t) t += " " s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % ( t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%soffsetC,\n" % (t) # strides for i in range(0, numStridesC): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%ssize%s%s" % ( t, self.indexChars[i], ",\n" if i < solution["ProblemType"]["NumIndicesC"] - 1 else ");\n") if solution["ProblemType"]["UseBeta"]: s += "%s} else {\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % ( t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%soffsetC,\n" % (t) # strides for i in range(0, numStridesC): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%ssize%s,\n" % (t, self.indexChars[i]) s += "%sbeta);\n" % (t) s += "%s}\n" % (t) s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) ######################################## # Enqueue Kernels ######################################## for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] if kernel["KernelLanguage"] == "Source": kernel["ISA"] = ( 0, 0, 0) # HIP source kernels needs dummy ISA version kernelName = self.kernelWriter.getKernelName(kernel) s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName) s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx) if self.language == "OCL": # set kernel args same for all enqueues s += "%s// kernel args same for all enqueues\n" % (t) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % ( t, 0) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % ( t, 1) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % ( t, 2) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % ( t, 3, typeName) s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \ "" if solution["ProblemType"]["UseBeta"] else "//", 4, typeName) argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4 argIdx += 3 # skipping offsets here for stride in self.strideList: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % ( t, argIdx, stride) argIdx += 1 for sizeIdx in range(0, solution["ProblemType"]["TotalIndices"]): if sizeIdx not in [ solution["ProblemType"]["Index0"], solution["ProblemType"]["Index1"], solution["ProblemType"]["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % ( t, kernelIdx) t += " " # debug print kernel dimensions if globalParameters["LibraryPrintDebug"]: s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % ( t, kernelName) # debug print kernel arguments # offsets for i in range(0, 3): s += "%sprintf(\" offset[%u] = %%u\\n\", offsets[kernelIdx][enqueueIdx][%u]);\n" % ( t, i, i) # strides for stride in self.strideList: s += "%sprintf(\" %s = %%u\\n\", %s);\n" % (t, stride, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): s += "%sprintf(\" sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % ( t, i, i) ######################################## # OpenCL Runtime ######################################## if self.language == "OCL": # set kernel args different for all enqueues argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4 # offsets s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][0]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][1]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][2]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 argIdx += len(self.strideList) # sizes for sizeIdx in range(0, solution["ProblemType"]["TotalIndices"]): if sizeIdx in [ solution["ProblemType"]["Index0"], solution["ProblemType"]["Index1"], solution["ProblemType"]["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 # enqueue s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernels[kernelIdx],\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSize[kernelIdx],\n" % (t) s += "%slocalWorkSize,\n" % (t) if False: # solution["GlobalSplitU"] > 1: s += "%s1,\n" % (t) s += "%s&kernelEventBetaOnly,\n" % (t) else: s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) s += "%soutputEvent );\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) ######################################## # HIP Runtime ######################################## else: if not globalParameters["PreciseKernelTime"] or solution[ "KernelLanguage"] == "Source": s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % ( t) t = t[2:] s += "%stry {\n" % (t) t += " " # hip kernel if solution["KernelLanguage"] == "Source": s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName) s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % ( t) s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%sdataA,\n" % (t) s += "%sdataB,\n" % (t) s += "%salpha,\n" % (t) s += "%s%sbeta,\n" % (t, \ "" if solution["ProblemType"]["UseBeta"] else "//") s += "%soffsets[kernelIdx][enqueueIdx][0],\n" % (t) s += "%soffsets[kernelIdx][enqueueIdx][1],\n" % (t) s += "%soffsets[kernelIdx][enqueueIdx][2],\n" % (t) # strides for stride in self.strideList: s += "%s%s,\n" % (t, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): lastParam = i == solution["ProblemType"][ "TotalIndices"] - 1 s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \ % (t, i, "" if lastParam else "," ) if solution["PersistentKernel"]: s += "%s,totalWorkGroups%u\n" % ( t, 0 if kernel["WorkGroupMapping"] > 0 else 1) s += "%s,totalWorkGroups%u\n" % ( t, 1 if kernel["WorkGroupMapping"] > 0 else 0) s += "%s);\n" % (t) # assembly kernel else: if globalParameters["DebugKernel"]: s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % ( t) s += "%s printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % ( t) s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % ( t) s += "%shipDevice_t device;\n" % t s += "%shipDeviceGet(&device, 0);\n" % t s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % ( t) s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % ( t) s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % ( t) s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % ( t) # hip assembly function s += "%shipFunctionArgs.dataC = dataC;\n" % (t) s += "%shipFunctionArgs.dataA = dataA;\n" % (t) s += "%shipFunctionArgs.dataB = dataB;\n" % (t) if solution["ProblemType"]["DataType"].isHalf(): s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t) s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t) else: s += "%shipFunctionArgs.alpha = alpha;\n" % (t) if solution["ProblemType"]["UseBeta"]: if solution["ProblemType"]["DataType"].isHalf(): s += "%shipFunctionArgs.beta[0] = beta;\n" % (t) s += "%shipFunctionArgs.beta[1] = beta;\n" % (t) else: s += "%shipFunctionArgs.beta = beta;\n" % (t) s += "%shipFunctionArgs.offsetC = offsets[kernelIdx][enqueueIdx][0];\n" % ( t) s += "%shipFunctionArgs.offsetA = offsets[kernelIdx][enqueueIdx][1];\n" % ( t) s += "%shipFunctionArgs.offsetB = offsets[kernelIdx][enqueueIdx][2];\n" % ( t) # strides for stride in self.strideList: s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): lastParam = i == solution["ProblemType"][ "TotalIndices"] - 1 s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \ % (t, globalParameters["IndexChars"][i], i ) if solution["PersistentKernel"]: # pass in the number of groups since not available in WG s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % ( t) s += "%shipFunctionArgs.numGroupTiles1 = totalWorkGroups1;\n" % ( t) s += "%shipHccModuleLaunchKernel(\n" % (t) t += " " s += "%shipFunction,\n" % (t) s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % ( t) s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % ( t) s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % ( t) s += "%slocalWorkSize[0],\n" % (t) s += "%slocalWorkSize[1],\n" % (t) s += "%slocalWorkSize[2],\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sNULL,\n" % (t) s += "%s(void**)hipLaunchParams\n" % (t) if globalParameters["PreciseKernelTime"]: s += "%s,inputEvents ? inputEvents[enqueueIdx]:nullptr\n" % ( t) s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % ( t) s += "%s);\n" % (t) t = t[2:] if globalParameters["DebugKernel"]: # copy debug buffer s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % ( t) s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % ( t) s += "%s printf(\"%%04i\", i);\n" % (t) s += "%s char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % ( t) #s += "%s char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t) s += "%s for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % ( t) s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % ( t) s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % ( t) s += "%s }\n" % (t) s += "%s printf(\"\\n\");\n" % (t) s += "%s}\n" % (t) t = t[2:] s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) if not globalParameters["PreciseKernelTime"] or solution[ "KernelLanguage"] == "Source": s += "%sif( outputEvent != NULL )\n" % (t) s += "%s hipEventRecord(outputEvent[enqueueIdx], stream );\n" % ( t) s += " }\n" s += "\n" s += " return tensileStatusSuccess;\n" s += "}\n" s += "\n" s += "/* Solution Parameters\n" s += Solution.getParametersIndented(solution.state, " ") s += "*/\n" s += "\n" return s
def writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName): print1("# Writing Custom CMake") ############################################################################## # Min Naming ############################################################################## if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]: solutionSerialNaming = Solution.getSerialNaming(solutions) kernelSerialNaming = Solution.getSerialNaming(kernels) else: solutionSerialNaming = None kernelSerialNaming = None solutionMinNaming = Solution.getMinNaming(solutions) kernelMinNaming = Solution.getMinNaming(kernels) solutionWriter = SolutionWriter( \ solutionMinNaming, solutionSerialNaming, \ kernelMinNaming, kernelSerialNaming) kernelWriterSource = KernelWriterSource( \ kernelMinNaming, kernelSerialNaming) kernelWriterAssembly = KernelWriterAssembly( \ kernelMinNaming, kernelSerialNaming) generatedFile = open(os.path.join(outputPath, "Generated.cmake"), "w") generatedFile.write(CMakeHeader) generatedFile.write("set( TensileClient_SOLUTIONS\n") # write solution names if globalParameters["MergeFiles"]: generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions.h\n") generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions.cpp\n") else: for solution in solutions: solutionName = solutionWriter.getSolutionName(solution) generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions/%s.h\n" \ % (solutionName) ) generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions/%s.cpp\n" \ % (solutionName) ) generatedFile.write(" )\n") # write kernel names generatedFile.write("set( TensileClient_KERNELS\n") if globalParameters["MergeFiles"]: generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels.h\n") generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels.cpp\n") else: for kernel in kernels: kernelName = kernelWriterSource.getKernelName(kernel) if kernel[ "KernelLanguage"] == "Source" else kernelWriterAssembly.getKernelName( kernel) generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels/%s.h\n" % (kernelName)) generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels/%s.cpp\n" % kernelName) generatedFile.write(" )\n") generatedFile.write("set( TensileClient_SOURCE\n") for fileName in libraryStaticFiles: # copy file shutil_copy( os.path.join(globalParameters["SourcePath"], fileName), \ outputPath ) # add file to cmake generatedFile.write(" ${CMAKE_SOURCE_DIR}/%s\n" % fileName) generatedFile.write(" )\n\n") # close generated cmake generatedFile.close()
def TensileCreateLibrary(): print1("") print1(HR) print1("# Tensile Create Library") print2(HR) print2("") ############################################################################## # Parse Command Line Arguments ############################################################################## print2("Arguments: %s" % sys.argv) argParser = argparse.ArgumentParser() argParser.add_argument("LogicPath", help="Path to LibraryLogic.yaml files.") argParser.add_argument("OutputPath", help="Where to write library files?") argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \ choices=["OCL", "HIP", "HSA"]) argParser.add_argument("--merge-files", dest="MergeFiles", \ action="store_true") argParser.add_argument("--no-merge-files", dest="MergeFiles", \ action="store_false") argParser.add_argument("--short-file-names", dest="ShortNames", \ action="store_true") argParser.add_argument("--no-short-file-names", dest="ShortNames", \ action="store_false") argParser.add_argument("--library-print-debug", dest="LibraryPrintDebug", \ action="store_true") argParser.add_argument("--no-library-print-debug", dest="LibraryPrintDebug", \ action="store_false") args = argParser.parse_args() logicPath = args.LogicPath outputPath = args.OutputPath print2("OutputPath: %s" % outputPath) ensurePath(outputPath) arguments = {} arguments["RuntimeLanguage"] = args.RuntimeLanguage arguments["MergeFiles"] = args.MergeFiles arguments["ShortNames"] = args.ShortNames arguments["LibraryPrintDebug"] = args.LibraryPrintDebug arguments["CodeFromFiles"] = False assignGlobalParameters(arguments) if not os.path.exists(logicPath): printExit("LogicPath %s doesn't exist" % logicPath) logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \ if (os.path.isfile(os.path.join(logicPath, f)) \ and os.path.splitext(f)[1]==".yaml")] print1("# LibraryLogicFiles:" % logicFiles) for logicFile in logicFiles: print1("# %s" % logicFile) ############################################################################## # Parse config files ############################################################################## solutions = [] logicData = {} # keys are problemTypes, values are schedules for logicFileName in logicFiles: (scheduleName, deviceNames, problemType, solutionsForSchedule, \ indexOrder, exactLogic, rangeLogic) \ = YAMLIO.readLibraryLogicForSchedule(logicFileName) if problemType not in logicData: logicData[problemType] = [] logicData[problemType].append((scheduleName, deviceNames, \ solutionsForSchedule, indexOrder, exactLogic, rangeLogic )) for solution in solutionsForSchedule: if solution not in solutions: solutions.append(solution) # create solution writer and kernel writer kernels = [] kernelsBetaOnly = [] for solution in solutions: solutionKernels = solution.getKernels() for kernel in solutionKernels: if kernel not in kernels: kernels.append(kernel) solutionKernelsBetaOnly = solution.getKernelsBetaOnly() for kernel in solutionKernelsBetaOnly: if kernel not in kernelsBetaOnly: kernelsBetaOnly.append(kernel) # if any kernels are assembly, append every ISA supported if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]: solutionSerialNaming = Solution.getSerialNaming(solutions) kernelSerialNaming = Solution.getSerialNaming(kernels) else: solutionSerialNaming = None kernelSerialNaming = None solutionMinNaming = Solution.getMinNaming(solutions) kernelMinNaming = Solution.getMinNaming(kernels) solutionWriter = SolutionWriter( \ solutionMinNaming, solutionSerialNaming, \ kernelMinNaming, kernelSerialNaming) kernelWriterSource = KernelWriterSource( \ kernelMinNaming, kernelSerialNaming) kernelWriterAssembly = KernelWriterAssembly( \ kernelMinNaming, kernelSerialNaming) # write solutions and kernels writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \ solutionWriter, kernelWriterSource, kernelWriterAssembly) libraryStaticFiles = [ "SolutionMapper.h", "TensileTypes.h", "KernelHeader.h", "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h" ] # write cmake clientName = "LibraryClient" writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName) # write logic writeLogic(outputPath, logicData, solutionWriter) print1("# Tensile Library Writer DONE") print1(HR) print1("")
def writeClientParameters(forBenchmark, solutions, problemSizes, stepName, \ functionList): h = "" ############################################################################## # Min Naming ############################################################################## if forBenchmark: kernels = [] for solution in solutions: solutionKernels = solution.getKernels() for kernel in solutionKernels: if kernel not in kernels: kernels.append(kernel) solutionSerialNaming = Solution.getSerialNaming(solutions) kernelSerialNaming = Solution.getSerialNaming(kernels) solutionMinNaming = Solution.getMinNaming(solutions) kernelMinNaming = Solution.getMinNaming(kernels) solutionWriter = SolutionWriter( \ solutionMinNaming, solutionSerialNaming, \ kernelMinNaming, kernelSerialNaming) if forBenchmark: if globalParameters["MergeFiles"]: h += "#include \"Solutions.h\"\n" else: for solution in solutions: solutionName = solutionWriter.getSolutionName(solution) h += "#include \"" + solutionName + ".h\"\n" h += "\n" else: h += "#include \"Tensile.h\"\n" h += "typedef enum {\n" h += " enum_float,\n" h += " enum_double,\n" h += " enum_TensileComplexFloat,\n" h += " enum_TensileComplexDouble\n" h += "#ifdef Tensile_ENABLE_HALF\n" h += " ,enum_TensileHalf\n" h += "#endif\n" h += "} DataTypeEnum;\n" h += "\n" h += "// Debug Params\n" h += "const bool printTensorA=%s;\n" % toCppBool( globalParameters["PrintTensorA"]) h += "const bool printTensorB=%s;\n" % toCppBool( globalParameters["PrintTensorB"]) h += "const bool printTensorC=%s;\n" % toCppBool( globalParameters["PrintTensorC"]) h += "const bool printWinnersOnly=%s;\n" % toCppBool( globalParameters["PrintWinnersOnly"]) h += "\n" h += "const char indexChars[%u] = \"%s" \ % (len(globalParameters["IndexChars"])+1, \ globalParameters["IndexChars"][0]) for i in range(1, len(globalParameters["IndexChars"])): h += globalParameters["IndexChars"][i] h += "\";\n" h += "unsigned int functionIdx;\n" h += "unsigned int dataTypeIdx;\n" h += "unsigned int problemTypeIdx;\n" h += "\n" ############################################################################## # Problem Types ############################################################################## #dataTypes = [] #problemTypes = [] #functionSerialToDataTypeAndIdx = [] dataTypes = [] problemTypes = [] problemTypesForDataType = {} # for data type schedulesForProblemType = {} # for problem type functionInfo = [ ] # dataTypeIdx, problemTypeIdx, idxWithinDataType, idxWithinProblemType if forBenchmark: problemType = solutions[0]["ProblemType"] dataType = problemType["DataType"] dataTypes.append(dataType) problemTypes.append(problemType) problemTypesForDataType[dataType] = [problemType] schedulesForProblemType[problemType] = solutions numProblemTypes = 1 for solution in solutions: functionInfo.append([0, 0, 0, 0, 0, 0]) else: for functionIdx in range(0, len(functionList)): function = functionList[functionIdx] scheduleName = function[0] problemType = function[1] dataType = problemType["DataType"] if dataType not in dataTypes: dataTypes.append(dataType) problemTypesForDataType[dataType] = [] if problemType not in problemTypesForDataType[dataType]: problemTypesForDataType[dataType].append(problemType) schedulesForProblemType[problemType] = [] schedulesForProblemType[problemType].append(scheduleName) # sort dataTypes = sorted(dataTypes) for dataType in dataTypes: problemTypesForDataType[dataType] = \ sorted(problemTypesForDataType[dataType]) for problemType in problemTypesForDataType[dataType]: schedulesForProblemType[problemType] = \ sorted(schedulesForProblemType[problemType]) # assign info functionIdxSerial = 0 problemTypeIdxSerial = 0 for dataTypeIdxSerial in range(0, len(dataTypes)): dataType = dataTypes[dataTypeIdxSerial] functionIdxForDataType = 0 for problemTypeIdxForDataType in range(0, \ len(problemTypesForDataType[dataType])): problemType = \ problemTypesForDataType[dataType][problemTypeIdxForDataType] problemTypes.append(problemType) functionIdxForProblemType = 0 for functionIdxForProblemType in range(0, \ len(schedulesForProblemType[problemType])): functionInfo.append([ \ dataTypeIdxSerial, \ problemTypeIdxForDataType, \ problemTypeIdxSerial, \ functionIdxSerial,\ functionIdxForDataType,\ functionIdxForProblemType, \ ]) functionIdxForProblemType += 1 functionIdxForDataType += 1 functionIdxSerial += 1 problemTypeIdxSerial += 1 numProblemTypes = problemTypeIdxSerial numFunctions = functionIdxSerial h += "const unsigned int numFunctions = %u;\n" % numFunctions ############################################################################## # Data Types ############################################################################## h += "/* data types */\n" numDataTypes = len(dataTypes) h += "const unsigned int numDataTypes = %u;\n" % numDataTypes h += "const DataTypeEnum dataTypeEnums[numDataTypes] = { enum_%s" \ % dataTypes[0].toCpp() for dataTypeIdx in range(1, numDataTypes): h += ", enum_%s" % dataTypes[dataTypeIdx].toCpp() h += " };\n" # bytes per elements h += "const unsigned int bytesPerElement[numDataTypes] = { %u" \ % (dataTypes[0].numBytes()) for dataTypeIdx in range(1, numDataTypes): dataType = dataTypes[dataTypeIdx] h += ", %u" % dataType.numBytes() h += " };\n" # flops per mac h += "const unsigned int numFlopsPerMac[numDataTypes] = { %u" \ % (2 if dataTypes[0].isReal() else 8) for dataTypeIdx in range(1, numDataTypes): dataType = dataTypes[dataTypeIdx] h += ", %u" % (2 if dataType.isReal() else 8) h += " };\n" for dataTypeIdx in range(0, numDataTypes): h += "#define Tensile_DATA_TYPE_%s\n" \ % dataTypes[dataTypeIdx].toCpp().upper() ############################################################################## # Problem Types ############################################################################## h += "/* problem types */\n" h += "const unsigned int numProblemTypes = %u;\n" % numProblemTypes # Num C Indices h += "const unsigned int numIndicesC[numProblemTypes] = { %u" \ % problemTypes[0]["NumIndicesC"] for problemTypeIdx in range(1, numProblemTypes): problemType = problemTypes[problemTypeIdx] h += ", %u" % problemType["NumIndicesC"] h += " };\n" # Num AB Indices maxNumIndicesAB = len(problemTypes[0]["IndexAssignmentsA"]) h += "const unsigned int numIndicesAB[numProblemTypes] = { %u" \ % len(problemTypes[0]["IndexAssignmentsA"]) for problemTypeIdx in range(1, numProblemTypes): problemType = problemTypes[problemTypeIdx] numIndicesAB = len(problemType["IndexAssignmentsA"]) h += ", %u" % numIndicesAB maxNumIndicesAB = max(numIndicesAB, maxNumIndicesAB) h += " };\n" h += "const unsigned int maxNumIndicesAB = %u;\n" % maxNumIndicesAB # Index Assignments A h += "const unsigned int indexAssignmentsA[numProblemTypes][maxNumIndicesAB] = {\n" for problemTypeIdx in range(0, numProblemTypes): problemType = problemTypes[problemTypeIdx] indices = problemType["IndexAssignmentsA"] h += " { %u" % indices[0] for i in range(1, maxNumIndicesAB): if i < len(indices): h += ", %u" % indices[i] else: h += ", static_cast<unsigned int>(-1)" if problemTypeIdx < numProblemTypes - 1: h += " },\n" else: h += " }\n" h += "};\n" # Index Assignments B h += "const unsigned int indexAssignmentsB[numProblemTypes][maxNumIndicesAB] = {\n" for problemTypeIdx in range(0, numProblemTypes): problemType = problemTypes[problemTypeIdx] indices = problemType["IndexAssignmentsB"] h += " { %u" % indices[0] for i in range(1, maxNumIndicesAB): if i < len(indices): h += ", %u" % indices[i] else: h += ", static_cast<unsigned int>(-1)" if problemTypeIdx < numProblemTypes - 1: h += " },\n" else: h += " }\n" h += "};\n" # beta h += "bool useBeta[numProblemTypes] = { %s" \ % ("true" if problemTypes[0]["UseBeta"] else "false") for problemTypeIdx in range(1, numProblemTypes): problemType = problemTypes[problemTypeIdx] h += ", %s" % ("true" if problemType["UseBeta"] else "false") h += " };\n" # Complex Conjugates h += "const bool complexConjugateA[numProblemTypes] = { %s" \ % ("true" if problemTypes[0]["ComplexConjugateA"] else "false" ) for problemTypeIdx in range(1, numProblemTypes): problemType = problemTypes[problemTypeIdx] h += ", %s" % ("true" if problemTypes[0]["ComplexConjugateA"] else "false") h += " };\n" h += "const bool complexConjugateB[numProblemTypes] = { %s" \ % ("true" if problemTypes[0]["ComplexConjugateB"] else "false" ) for problemTypeIdx in range(1, numProblemTypes): problemType = problemTypes[problemTypeIdx] h += ", %s" % ("true" if problemTypes[0]["ComplexConjugateB"] else "false") h += " };\n" h += "\n" if not forBenchmark: h += "// dataTypeIdxSerial, problemTypeIdxForDataType, problemTypeIdxSerial, functionIdxSerial, functionIdxForDataType, functionIdxForProblemType\n" first = True h += "const unsigned int functionInfo[numFunctions][6] = {\n" for info in functionInfo: h += "%s{ %u, %u, %u, %u, %u, %u }" % (" " if first else ",\n ", \ info[0], info[1], info[2], info[3], info[4], info[5] ) first = False h += " };\n" ############################################################################## # Problem Sizes ############################################################################## maxNumIndices = problemTypes[0]["TotalIndices"] if not forBenchmark: for problemType in problemTypes: maxNumIndices = max(problemType["TotalIndices"], maxNumIndices) h += "const unsigned int maxNumIndices = %u;\n" % maxNumIndices h += "const unsigned int totalIndices[numProblemTypes] = { %u" \ % problemTypes[0]["TotalIndices"] for problemTypeIdx in range(1, numProblemTypes): h += ", %u" % problemTypes[problemTypeIdx]["TotalIndices"] h += " };\n" if forBenchmark: h += "const unsigned int numProblems = %u;\n" \ % problemSizes.totalProblemSizes h += "const unsigned int problemSizes[numProblems][%u] = {\n" \ % problemTypes[0]["TotalIndices"] for i in range(0, problemSizes.totalProblemSizes): line = " {%5u" % problemSizes.sizes[i][0] for j in range(1, problemTypes[0]["TotalIndices"]): line += ",%5u" % problemSizes.sizes[i][j] line += " }" h += line if i < problemSizes.totalProblemSizes - 1: h += "," else: h += "};" h += "\n" h += "const unsigned int minStrides[%u] = {" \ % problemTypes[0]["TotalIndices"] for i in range(0, len(problemSizes.minStrides)): if (i != 0): h += ", " h += str(problemSizes.minStrides[i]) h += "};\n" else: h += "unsigned int userSizes[maxNumIndices];\n" h += "unsigned int minStrides[%u] = {" \ % maxNumIndices for i in range(0, maxNumIndices): if (i != 0): h += ", " h += str(0) # always use 0 for minStrides in benchmark mode h += "};\n" if forBenchmark: h += "/* problem sizes */\n" """ h += "const bool indexIsSized[maxNumIndices] = {" for i in range(0, problemSizes.totalIndices): h += " %s" % ("true" if problemSizes.indexIsSized[i] else "false") if i < problemSizes.totalIndices-1: h += "," h += " };\n" h += "const unsigned int numIndicesSized = %u;\n" \ % len(problemSizes.indicesSized) h += "const unsigned int indicesSized[numIndicesSized][4] = {\n" h += "// { min, stride, stride_incr, max }\n" for i in range(0, len(problemSizes.indicesSized)): r = problemSizes.indicesSized[i] h += " { %u, %u, %u, %u }" % (r[0], r[1], r[2], r[3]) if i < len(problemSizes.indicesSized)-1: h += "," h += "\n" h += " };\n" numIndicesMapped = len(problemSizes.indicesMapped) h += "const unsigned int numIndicesMapped = %u;\n" % numIndicesMapped if numIndicesMapped > 0: h += "#define Tensile_INDICES_MAPPED 1\n" h += "const unsigned int indicesMapped[numIndicesMapped] = {" for i in range(0, numIndicesMapped): h += " %u" % problemSizes.indicesMapped[i] if i < numIndicesMapped-1: h += "," h += " };\n" else: h += "#define Tensile_INDICES_MAPPED 0\n" """ ############################################################################## # Max Problem Sizes ############################################################################## if forBenchmark: h += "size_t maxSizeC = %u;\n" % (problemSizes.maxC) h += "size_t maxSizeA = %u;\n" % (problemSizes.maxA) h += "size_t maxSizeB = %u;\n" % (problemSizes.maxB) h += "\n" else: h += "size_t maxSizeC;\n" h += "size_t maxSizeA;\n" h += "size_t maxSizeB;\n" h += "\n" ############################################################################## # Current Problem Size ############################################################################## h += "/* current problem size */\n" #h += "unsigned int fullSizes[maxNumIndices];\n" #h += "unsigned int currentSizedIndexSizes[numIndicesSized];\n" #h += "unsigned int currentSizedIndexIncrements[numIndicesSized];\n" h += "\n" ############################################################################## # Solutions ############################################################################## if forBenchmark: h += "/* solutions */\n" # Problem Type Indices h += "const unsigned int maxNumSolutions = %u;\n" % len(solutions) h += "float solutionPerf[numProblems][maxNumSolutions]; // milliseconds\n" h += "\n" # Solution Ptrs h += "typedef TensileStatus (*SolutionFunctionPointer)(\n" argList = solutionWriter.getArgList(solutions[0]["ProblemType"], True, True, True) for i in range(0, len(argList)): h += " %s %s%s" % (argList[i][0], argList[i][1], \ ",\n" if i < len(argList)-1 else ");\n\n") h += "const SolutionFunctionPointer solutions[maxNumSolutions] = {\n" for i in range(0, len(solutions)): solution = solutions[i] solutionName = solutionWriter.getSolutionName(solution) h += " %s" % solutionName if i < len(solutions) - 1: h += "," h += "\n" h += " };\n" h += "\n" # Solution Names h += "const char *solutionNames[maxNumSolutions] = {\n" for i in range(0, len(solutions)): solution = solutions[i] solutionName = solutionWriter.getSolutionName(solution) h += " \"%s\"" % solutionName if i < len(solutions) - 1: h += "," h += "\n" h += " };\n" h += "\n" else: # Function Names functionNames = [] for dataType in dataTypes: for problemType in problemTypesForDataType[dataType]: for scheduleName in schedulesForProblemType[problemType]: #functionNames.append("tensile_%s_%s" % (scheduleName, problemType)) functionNames.append("tensile_%s" % (problemType)) h += "const char *functionNames[numFunctions] = {\n" for functionIdx in range(0, len(functionNames)): functionName = functionNames[functionIdx] h += " \"%s\"%s\n" % (functionName, \ "," if functionIdx < len(functionNames)-1 else "" ) h += " };\n" ############################################################################## # Runtime Structures ############################################################################## h += "/* runtime structures */\n" h += "TensileStatus status;\n" if globalParameters["RuntimeLanguage"] == "OCL": h += "cl_platform_id platform;\n" h += "cl_device_id device;\n" h += "cl_context context;\n" h += "cl_command_queue stream;\n" else: h += "hipStream_t stream;\n" #h += "int deviceIdx = %u;\n" \ # % (globalParameters["Device"]) h += "\n" h += "void *deviceC;\n" h += "void *deviceA;\n" h += "void *deviceB;\n" ############################################################################## # Benchmarking and Validation Parameters ############################################################################## h += "\n/* benchmarking parameters */\n" #h += "const bool measureKernelTime = %s;\n" \ # % ("true" if globalParameters["KernelTime"] else "false") #h += "const unsigned int numEnqueuesPerSync = %u;\n" \ # % (globalParameters["EnqueuesPerSync"]) #h += "const unsigned int numSyncsPerBenchmark = %u;\n" \ # % (globalParameters["SyncsPerBenchmark"]) #h += "unsigned int numElementsToValidate = %s;\n" \ # % (str(globalParameters["NumElementsToValidate"]) \ # if globalParameters["NumElementsToValidate"] >= 0 \ # else "0xFFFFFFFF" ) #h += "unsigned int validationMaxToPrint = %u;\n" \ # % globalParameters["ValidationMaxToPrint"] #h += "bool validationPrintValids = %s;\n" \ # % ("true" if globalParameters["ValidationPrintValids"] else "false") h += "size_t validationStride;\n" if problemType["HighPrecisionAccumulate"]: h += "static bool useHighPrecisionAccumulate = true;\n" else: h += "static bool useHighPrecisionAccumulate = false;\n" #h += "unsigned int dataInitTypeC = %s;\n" % globalParameters["DataInitTypeC"] #h += "unsigned int dataInitTypeAB = %s;\n" % globalParameters["DataInitTypeAB"] h += "\n" ############################################################################## # Generated Call to Reference ############################################################################## h += "/* generated call to reference */\n" h += "template<typename DataType>\n" h += "TensileStatus generatedCallToReferenceCPU(\n" h += " const unsigned int *sizes,\n" h += " const unsigned int *minStrides,\n" h += " DataType *referenceC,\n" h += " DataType *initialA,\n" h += " DataType *initialB,\n" h += " const unsigned int stride_a,\n" h += " const unsigned int stride_b,\n" h += " const unsigned int stride_c,\n" h += " DataType alpha,\n" h += " DataType beta,\n" h += " bool useHighPrecisionAccumulate) {\n" h += " return tensileReferenceCPU(\n" h += " referenceC,\n" h += " initialA,\n" h += " initialB,\n" h += " stride_a,\n" h += " stride_b,\n" h += " stride_c,\n" h += " alpha,\n" h += " beta,\n" h += " totalIndices[problemTypeIdx],\n" h += " sizes,\n" h += " minStrides,\n" h += " numIndicesC[problemTypeIdx],\n" h += " numIndicesAB[problemTypeIdx],\n" h += " indexAssignmentsA[problemTypeIdx],\n" h += " indexAssignmentsB[problemTypeIdx],\n" h += " complexConjugateA[problemTypeIdx],\n" h += " complexConjugateB[problemTypeIdx],\n" h += " validationStride,\n" h += " useHighPrecisionAccumulate);\n" h += "};\n" h += "\n" ############################################################################## # Generated Call to Solution ############################################################################## if forBenchmark: problemType = solutions[0]["ProblemType"] h += "/* generated call to solution */\n" h += "template<typename DataType>\n" h += "TensileStatus generatedCallToSolution(\n" h += " unsigned int solutionIdx,\n" h += " const unsigned int *sizes,\n" h += " const unsigned int *minStrides,\n" h += " DataType alpha,\n" h += " DataType beta, \n" h += " unsigned int numEvents = 0, \n" if globalParameters["RuntimeLanguage"] == "OCL": h += " cl_event *event_wait_list = NULL,\n" h += " cl_event *outputEvent = NULL ) {\n" else: h += " hipEvent_t *startEvent = NULL,\n" h += " hipEvent_t *stopEvent = NULL ) {\n" h += " // calculate parameters assuming packed data\n" # strides indexChars = globalParameters["IndexChars"] firstStride = 1 if problemType["UseInitialStrides"]: firstStride = 0 lastStrideC = problemType["NumIndicesC"] lastStrideA = len(problemType["IndexAssignmentsA"]) lastStrideB = len(problemType["IndexAssignmentsB"]) # calculate strides for i in range(0, lastStrideC): h += " unsigned int strideC%u%s = 1" % (i, indexChars[i]) for j in range(0, i): h += "* std::max(minStrides[%i], sizes[%i])" % (j, j) h += ";\n" for i in range(0, lastStrideA): h += " unsigned int strideA%u%s = 1" % (i, \ indexChars[problemType["IndexAssignmentsA"][i]]) for j in range(0, i): h += "* std::max(minStrides[%i], sizes[%i])" % \ (problemType["IndexAssignmentsA"][j], problemType["IndexAssignmentsA"][j]) h += ";\n" for i in range(0, lastStrideB): h += " unsigned int strideB%u%s = 1" % (i, \ indexChars[problemType["IndexAssignmentsB"][i]]) for j in range(0, i): h += "* std::max(minStrides[%i], sizes[%i])" % \ (problemType["IndexAssignmentsB"][j], problemType["IndexAssignmentsB"][j]) h += ";\n" for i in range(0, problemType["TotalIndices"]): h += " unsigned int size%s = sizes[%u];\n" % (indexChars[i], i) h += "\n" # function call h += " // call solution function\n" if globalParameters["RuntimeLanguage"] == "OCL": h += " return solutions[solutionIdx]( static_cast<cl_mem>(deviceC), static_cast<cl_mem>(deviceA), static_cast<cl_mem>(deviceB),\n" else: typeName = dataTypes[0].toCpp() h += " return solutions[solutionIdx]( static_cast<%s *>(deviceC), static_cast<%s *>(deviceA), static_cast<%s *>(deviceB),\n" \ % (typeName, typeName, typeName) h += " alpha,\n" if problemType["UseBeta"]: h += " beta,\n" h += " 0, 0, 0, // offsets\n" for i in range(firstStride, lastStrideC): h += " strideC%u%s,\n" % (i, indexChars[i]) for i in range(firstStride, lastStrideA): h += " strideA%u%s,\n" % (i, \ indexChars[problemType["IndexAssignmentsA"][i]]) for i in range(firstStride, lastStrideB): h += " strideB%u%s,\n" % (i, \ indexChars[problemType["IndexAssignmentsB"][i]]) for i in range(0, problemType["TotalIndices"]): h += " size%s,\n" % indexChars[i] h += " stream,\n" if globalParameters["RuntimeLanguage"] == "OCL": h += " numEvents, event_wait_list, outputEvent ); // events\n" else: h += " numEvents, startEvent, stopEvent); // events\n" h += "};\n" h += "\n" else: ############################################################################ # Generated Call to Function ############################################################################ for enqueue in [True, False]: functionName = "tensile" if enqueue else "tensileGetSolutionName" returnName = "TensileStatus" if enqueue else "const char *" h += "/* generated call to function */\n" h += "template<typename DataType>\n" h += "%s generatedCallTo_%s(\n" % (returnName, functionName) h += " unsigned int *sizes,\n" h += " unsigned int *minStrides,\n" h += " DataType alpha,\n" h += " DataType beta, \n" h += " unsigned int strideA, \n" h += " unsigned int strideB, \n" h += " unsigned int strideC, \n" h += " unsigned int numEvents = 0, \n" if globalParameters["RuntimeLanguage"] == "OCL": h += " cl_event *event_wait_list = NULL,\n" h += " cl_event *outputEvent = NULL );\n\n" else: h += " hipEvent_t *startEvent = NULL,\n" h += " hipEvent_t *stopEvent = NULL );\n\n" for dataType in dataTypes: typeName = dataType.toCpp() functionsForDataType = [] for problemType in problemTypesForDataType[dataType]: for scheduleName in schedulesForProblemType[problemType]: functionsForDataType.append( [scheduleName, problemType]) h += "template<>\n" h += "inline %s generatedCallTo_%s<%s>(\n" \ % (returnName, functionName, typeName) h += " unsigned int *sizes,\n" h += " unsigned int *minStrides,\n" h += " %s alpha,\n" % typeName h += " %s beta,\n" % typeName h += " unsigned int strideA, \n" h += " unsigned int strideB, \n" h += " unsigned int strideC, \n" h += " unsigned int numEvents, \n" if globalParameters["RuntimeLanguage"] == "OCL": h += " cl_event *event_wait_list,\n" h += " cl_event *outputEvent ) {\n\n" else: h += " hipEvent_t *startEvent,\n" h += " hipEvent_t *stopEvent ) {\n\n" h += " unsigned int functionIdxForDataType = functionInfo[functionIdx][4];\n" for functionIdx in range(0, len(functionsForDataType)): function = functionsForDataType[functionIdx] scheduleName = function[0] problemType = function[1] if len(functionsForDataType) > 1: if functionIdx == 0: h += " if (functionIdxForDataType == %u) {\n" % functionIdx elif functionIdx == len(functionsForDataType) - 1: h += " } else {\n" else: h += " } else if (functionIdxForDataType == %u) {\n" \ % functionIdx # strides indexChars = globalParameters["IndexChars"] firstStride = 1 if problemType["UseInitialStrides"]: firstStride = 0 lastStrideC = problemType["NumIndicesC"] lastStrideA = len(problemType["IndexAssignmentsA"]) lastStrideB = len(problemType["IndexAssignmentsB"]) # calculate strides for i in range(0, lastStrideC): h += " unsigned int strideC%u%s = 1" % ( i, indexChars[i]) for j in range(0, i): h += "*sizes[%i]" % j h += ";\n" h += " if (strideC != std::numeric_limits<unsigned int>::max()) strideC%u%s = strideC;\n" % ( lastStrideC - 1, indexChars[lastStrideC - 1]) for i in range(0, lastStrideA): h += " unsigned int strideA%u%s = 1" % (i, \ indexChars[problemType["IndexAssignmentsA"][i]]) for j in range(0, i): h += "*sizes[%i]" % \ problemType["IndexAssignmentsA"][j] h += ";\n" h += " if (strideA != std::numeric_limits<unsigned int>::max()) strideA%u%s = strideA;\n" % ( lastStrideA - 1, indexChars[lastStrideA - 1]) for i in range(0, lastStrideB): h += " unsigned int strideB%u%s = 1" % (i, \ indexChars[problemType["IndexAssignmentsB"][i]]) for j in range(0, i): h += "*sizes[%i]" % \ problemType["IndexAssignmentsB"][j] h += ";\n" h += " if (strideB != std::numeric_limits<unsigned int>::max()) strideB%u%s = strideB;\n" % ( lastStrideB - 1, indexChars[lastStrideB - 1]) for i in range(0, problemType["TotalIndices"]): h += " unsigned int size%s = sizes[%u];\n" % ( indexChars[i], i) # function call h += " // call solution function\n" h += " return %s_%s(\n" % (functionName, problemType) if enqueue: if globalParameters["RuntimeLanguage"] == "OCL": h += " static_cast<cl_mem>(deviceC),\n" h += " static_cast<cl_mem>(deviceA),\n" h += " static_cast<cl_mem>(deviceB),\n" else: h += " static_cast<%s *>(deviceC),\n" % typeName h += " static_cast<%s *>(deviceA),\n" % typeName h += " static_cast<%s *>(deviceB),\n" % typeName h += " alpha,\n" if problemType["UseBeta"]: h += " beta,\n" h += " 0, 0, 0, // offsets\n" for i in range(firstStride, lastStrideC): h += " strideC%u%s,\n" % (i, indexChars[i]) for i in range(firstStride, lastStrideA): h += " strideA%u%s,\n" % (i, \ indexChars[problemType["IndexAssignmentsA"][i]]) for i in range(firstStride, lastStrideB): h += " strideB%u%s,\n" % (i, \ indexChars[problemType["IndexAssignmentsB"][i]]) for i in range(0, problemType["TotalIndices"]): h += " size%s,\n" % indexChars[i] h += " stream" if enqueue: if globalParameters["RuntimeLanguage"] == "OCL": h += ",\n numEvents, event_wait_list, outputEvent" else: h += ",\n numEvents, startEvent, stopEvent" h += ");\n" if len(functionsForDataType) > 1: h += " }\n" # close last if h += "};\n" # close callToFunction ############################################################################## # Results File Name ############################################################################## if forBenchmark: h += "/* results file name */\n" resultsFileName = os.path.join(globalParameters["WorkingPath"], \ "../../Data","%s.csv" % stepName) resultsFileName = resultsFileName.replace("\\", "\\\\") h += "const char *resultsFileName = \"%s\";\n" % resultsFileName ############################################################################## # Write File ############################################################################## clientParametersFile = open(os.path.join(globalParameters["WorkingPath"], \ "ClientParameters.h"), "w") clientParametersFile.write(CHeader) clientParametersFile.write(h) clientParametersFile.close()
def TensileCreateLibrary(): print1("") print1(HR) print1("# Tensile Create Library") print2(HR) print2("") ############################################################################## # Parse Command Line Arguments ############################################################################## print2("Arguments: %s" % sys.argv) argParser = argparse.ArgumentParser() argParser.add_argument("LogicPath", help="Path to LibraryLogic.yaml files.") argParser.add_argument("OutputPath", help="Where to write library files?") argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \ choices=["OCL", "HIP", "HSA"]) argParser.add_argument("--merge-files", dest="MergeFiles", \ action="store_true") argParser.add_argument("--no-merge-files", dest="MergeFiles", \ action="store_false") argParser.add_argument("--short-file-names", dest="ShortNames", \ action="store_true") argParser.add_argument("--no-short-file-names", dest="ShortNames", \ action="store_false") argParser.add_argument("--library-print-debug", dest="LibraryPrintDebug", \ action="store_true") argParser.add_argument("--no-library-print-debug", dest="LibraryPrintDebug", \ action="store_false") argParser.add_argument( "--isa", dest="isa", action="append", help="which architectures for assembly kernels to target") args = argParser.parse_args() logicPath = args.LogicPath outputPath = args.OutputPath print2("OutputPath: %s" % outputPath) ensurePath(outputPath) arguments = {} arguments["RuntimeLanguage"] = args.RuntimeLanguage arguments["MergeFiles"] = args.MergeFiles arguments["ShortNames"] = args.ShortNames arguments["LibraryPrintDebug"] = args.LibraryPrintDebug if args.isa: newISA = [] for isa in args.isa: gfxIdx = isa.find("gfx") if gfxIdx >= 0: major = int(isa[gfxIdx + 3:gfxIdx + 4]) minor = int(isa[gfxIdx + 4:gfxIdx + 5]) step = int(isa[gfxIdx + 5:gfxIdx + 6]) isaTuple = (major, minor, step) if isaTuple in globalParameters[ "SupportedISA"] and isaTuple not in newISA: print1("# User-Specified ISA: gfx%u%u%u" % (major, minor, step)) newISA.append(isaTuple) else: printWarning("isa parameter must be formed as: --isa gfx803") arguments["SupportedISA"] = newISA assignGlobalParameters(arguments) if not os.path.exists(logicPath): printExit("LogicPath %s doesn't exist" % logicPath) logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \ if (os.path.isfile(os.path.join(logicPath, f)) \ and os.path.splitext(f)[1]==".yaml")] print1("# LibraryLogicFiles:" % logicFiles) for logicFile in logicFiles: print1("# %s" % logicFile) ############################################################################## # Parse config files ############################################################################## solutions = [] logicData = {} # keys are problemTypes, values are schedules for logicFileName in logicFiles: (scheduleName, deviceNames, problemType, solutionsForSchedule, \ indexOrder, exactLogic, rangeLogic) \ = YAMLIO.readLibraryLogicForSchedule(logicFileName) if problemType not in logicData: logicData[problemType] = [] logicData[problemType].append((scheduleName, deviceNames, \ solutionsForSchedule, indexOrder, exactLogic, rangeLogic )) for solution in solutionsForSchedule: if solution not in solutions: solutions.append(solution) # create solution writer and kernel writer kernels = [] kernelsBetaOnly = [] for solution in solutions: solutionKernels = solution.getKernels() for kernel in solutionKernels: if kernel not in kernels: kernels.append(kernel) solutionKernelsBetaOnly = solution.getKernelsBetaOnly() for kernel in solutionKernelsBetaOnly: if kernel not in kernelsBetaOnly: kernelsBetaOnly.append(kernel) # if any kernels are assembly, append every ISA supported if globalParameters["RuntimeLanguage"] == "HIP": newKernels = [] for kernel in kernels: if kernel["KernelLanguage"] == "Assembly": kernel["ISA"] = globalParameters["SupportedISA"][0] for i in range(1, len(globalParameters["SupportedISA"])): newKernel = deepcopy(kernel) newKernel["ISA"] = globalParameters["SupportedISA"][i] newKernels.append(newKernel) else: kernel["ISA"] = (0, 0, 0) kernels.extend(newKernels) if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]: solutionSerialNaming = Solution.getSerialNaming(solutions) kernelSerialNaming = Solution.getSerialNaming(kernels) else: solutionSerialNaming = None kernelSerialNaming = None solutionMinNaming = Solution.getMinNaming(solutions) kernelMinNaming = Solution.getMinNaming(kernels) solutionWriter = SolutionWriter( \ solutionMinNaming, solutionSerialNaming, \ kernelMinNaming, kernelSerialNaming) kernelWriterSource = KernelWriterSource( \ kernelMinNaming, kernelSerialNaming) kernelWriterAssembly = KernelWriterAssembly( \ kernelMinNaming, kernelSerialNaming) # write solutions and kernels writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \ solutionWriter, kernelWriterSource, kernelWriterAssembly) libraryStaticFiles = [ "TensileTypes.h", "KernelHeader.h", "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h" ] # write cmake clientName = "LibraryClient" writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName) # write logic writeLogic(outputPath, logicData, solutionWriter) print1("# Tensile Library Writer DONE") print1(HR) print1("")
def getProblemSourceString(self, problemType, solution, kernelsWithBuildErrs): gsu = solution["GlobalSplitU"] persistent = solution["PersistentKernel"] kernelLanguage = solution["KernelLanguage"] tt0 = solution["ThreadTile0"] tt1 = solution["ThreadTile1"] sg0 = solution["SubGroup0"] sg1 = solution["SubGroup1"] nt = solution["NumThreads"] kernels = solution.getKernels() kernelNames = [] kernelBuildErr = 0 for kernel in kernels: kernelName = self.kernelWriter.getKernelName(kernel) if kernelName in kernelsWithBuildErrs: kernelBuildErr = 1 kernelNames.append( kernelName ) s = "" t = "" # includes problemType = solution["ProblemType"] # shortcut if not globalParameters["MergeFiles"]: solutionName = self.getSolutionName(solution) s += "#include \"%s.h\"\n" % solutionName s += "\n" # problem function signature #argList = self.getArgList(problemType, True, True, True, True) #for i in range(0, len(argList)): # argString = "%s %s" % argList[i] # s += "%s%s%s" % (t, argString, ",\n" if i < len(argList)-1 else ")" ) s += self.getSolutionSignature(solution) s += " {\n" if kernelBuildErr: s += "%s return tensileStatusFailure; // One or more kernels had build failures (%s)\n" % (t, kernelNames) s += "%s}\n" % (t) return s t += " " s += "%sTensileStatus status;\n" % (t) # hipFunction Struct if kernelLanguage == "Assembly": s += "\n" s += "%s/* module function args */\n" % (t) s += "%sstruct {\n" % t t += " " if globalParameters["DebugKernel"]: s += "%sunsigned int *debugBuffer;\n" % t # Tensor sizes in elements, including only packed dims, # and accounting for zero or other strides < size # Place these first in the structure since they are 64-bits # and need to avoid any unneeded padding: s += "%s// Size of Tensor's packed dims, in elements\n" % t s += "%suint64_t tensor2dSizeC;\n" % t s += "%suint64_t tensor2dSizeA;\n" % t s += "%suint64_t tensor2dSizeB;\n" % t solutionArgs = self.getArgList(problemType, False, True, False, False) for arg in solutionArgs: if arg[0] == "TensileHalf": s += "%s%s %s[2];\n" % (t, arg[0], arg[1]) else: s += "%s%s %s;\n" % (t, arg[0], arg[1]) for idxChar in solution["PackedC0Indices"][:-1]: s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar) s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar) s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar) # number of unroll loop iterations to stagger the start in "U" dim. s += "%sint staggerUIter;\n" % t # persistent s += "%sunsigned int problemNumGroupTiles0;\n" % t s += "%sunsigned int problemNumGroupTiles1;\n" % t s += "%sunsigned int magicNumberProblemNumGroupTiles0;\n" % t s += "%sunsigned int gridNumWorkGroups0;\n" % t s += "%sunsigned int numFullBlocks;\n" % t s += "%sunsigned int wgmRemainder1;\n" % t s += "%sunsigned int magicNumberWgmRemainder1;\n" % t s += "%sunsigned int pad;\n" % t # FIXME can this be removed? t = t[2:] s += "%s} hipFunctionArgs;\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t #for arg in solutionArgs: # s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1]) # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu if globalParameters["RuntimeLanguage"] == "HIP": s += "%sint deviceId;\n" % (t) s += "%shipGetDevice(&deviceId);\n" % (t) # kernels s += "\n%s/* kernels */\n" % (t) s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels)) if kernelLanguage == "Source" and globalParameters["RuntimeLanguage"] == "OCL": s += "%sconst char *kernelSources[numKernels] = {\n" % (t) t += " " for kernelIdx in range(0, len(kernelNames)): kernelName = kernelNames[kernelIdx] s += "%s%s_src%s\n" % (t, kernelName, \ "," if kernelIdx < len(kernels)-1 else "" ) t = t[2:] s += "%s};\n" % (t) s += "%scl_kernel kernels[numKernels];\n" % (t) s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t) s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernels[i],\n" % (t) s += "%s kernelSources[i],\n" % (t) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) s += "%s}\n" % (t) if gsu > 1: for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) s += "%scl_kernel kernel_%s;\n" % (t, kernelName) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernel_%s,\n" % (t, kernelName) s += "%s %s_src,\n" % (t, kernelName) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) elif kernelLanguage == "Assembly": kernel = kernels[0] s += "%shipFunction_t hipFunction;\n" % (t) # if !CodeFromFiles then pass global _coba that points to code object s += "%sstatus = solutionLock->getFunction(&hipFunction, deviceId, \"%s\", %s);;\n" \ % (t, kernelName, "nullptr" if globalParameters["CodeFromFiles"] else kernelName+"_coba" ) s += "%sif (status) return status;\n" % (t) typeName = problemType["DataType"].toCpp() # num enqueues s += "\n%s/* num kernels */\n" % (t) s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t) for i in range(1, len(kernels)): s += ", 1" s += " };\n" # grid size s += "\n%s/* grid sizes */\n" % (t) s += "%sconst unsigned int workDim = 3;\n" % (t) s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \ % (t, tt0, tt1) s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \ % (t, sg0, sg1) s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \ % (t, nt) s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t) # grid size [2] s += "%sglobalWorkSize[0][2] = 1;\n" % (t) for i in range(0, problemType["NumIndicesC"]): if i != problemType["Index0"] and i != problemType["Index1"]: s += "%sglobalWorkSize[0][2] *= size%s;\n" % (t, self.indexChars[i]) s += "%sunsigned int sizeOfC0 = " % (t) s += " * ".join(["size" + i for i in solution["PackedC0Indices"]]) s += ";\n" s += "%sunsigned int sizeOfC1 = " % (t) s += " * ".join(["size" + i for i in solution["PackedC1Indices"]]) s += ";\n" for idxChar in solution["PackedC0Indices"][:-1]: s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar) s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \ % (t, idxChar, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar) s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \ % (t, idxChar, idxChar, idxChar) s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (t) s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (t) s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t) s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t) if kernel["EdgeType"] != "None": s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t) s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (t) s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (t) if kernel["WorkGroupMappingType"] == "Z" and abs(kernel["WorkGroupMapping"]) == 2: s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (t) s += "%stotalWorkGroupsPow2--;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t) s += "%stotalWorkGroupsPow2++;\n" % (t) s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t) s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t) # persistent: s += "%sunsigned int problemNumGroupTiles0 = totalWorkGroups0;\n" % (t) s += "%sunsigned int problemNumGroupTiles1 = totalWorkGroups1;\n" % (t) s += "%sconst unsigned smallNumMagicShift = 31; // bozo, review\n" % (t) s += "%sunsigned magicNumberProblemNumGroupTiles0 = (1L<<smallNumMagicShift) / problemNumGroupTiles0 + 1; // bozo, review\n" % (t) s += "%sunsigned numFullBlocks = problemNumGroupTiles1 / %u; // divide by WorkGroupMapping\n" \ % (t, abs(kernel["WorkGroupMapping"]) if abs(kernel["WorkGroupMapping"])>0 else 1) s += "%sunsigned wgmRemainder1 = %u ? (problemNumGroupTiles1 %% %u) : 0;\n" % \ (t, abs(kernel["WorkGroupMapping"]), abs(kernel["WorkGroupMapping"])) s += "%sif (wgmRemainder1 == 0) wgmRemainder1 = %u;\n" % (t, abs(kernel["WorkGroupMapping"])) s += "%sunsigned magicNumberWgmRemainder1 = ((1L<<smallNumMagicShift) / wgmRemainder1 + 1);\n" % (t) #s += ' printf ("wgmRemainder1=%u \\n", wgmRemainder1);' if gsu> 1: s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (t, gsu) if persistent: s += "%shipDeviceProp_t deviceProperties;\n" % (t) # TODO - should cache the device properties - expensive to call on each iteration here: s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t) s += "%sunsigned int numGroups = totalWorkGroups0 * totalWorkGroups1;\n" % (t) s += "%sglobalWorkSize[0][0] = (deviceProperties.multiProcessorCount * %u < numGroups) ? (deviceProperties.multiProcessorCount * %u) : numGroups;\n" \ % (t, persistent, persistent) s += "%sglobalWorkSize[0][1] = 1;\n" % t else: s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (t, 0 if kernel["WorkGroupMapping"] >= 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "") s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (t, 1 if kernel["WorkGroupMapping"] >= 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "") # index sizes s += "\n%s/* index sizes */\n" % (t) s += "%sunsigned int sizes[numKernels][1][%u];\n" \ % (t, problemType["TotalIndices"]) for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] kernelName = self.kernelWriter.getKernelName(kernel) # free index sizes for i in range(0,problemType["NumIndicesFree"] \ + problemType["NumIndicesBatch"] ): s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # summation index sizes for i in range(problemType["NumIndicesC"], \ problemType["TotalIndices"] ): lastParam = i == problemType["TotalIndices"]-1 s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0 #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"] firstStride = 0 if problemType["UseInitialStrides"] else 1 del i numIdx = problemType["NumIndicesC"] printMe = 0 s += "%suint64_t tensor2dSizeC = %s" % \ (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0])) for idx in range(0,numIdx): # Multiply only by packed tensor dims if idx in problemType["IndicesFree"]: printMe = True else: printMe = False if printMe: if idx+1 < numIdx: strideIdx = idx+1 s += " * std::max(size%s, strideC%u%s)" % \ (self.indexChars[idx], idx+1, self.indexChars[strideIdx]) else: s += " * size%s" % (self.indexChars[idx]) s += ";\n" s += "%suint64_t tensor2dSizeA = 1;\n" % t numIdx = len(problemType["IndexAssignmentsA"]) printMe = printedSum = False for i in range(0,numIdx): idx = problemType["IndexAssignmentsA"][i] # Multiply only by first free and first summation if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC0Indices"]]: printMe = True elif idx in problemType["IndicesSummation"] and not printedSum: printMe = printedSum = True else: printMe = False if printMe: s += "%stensor2dSizeA = " % t if i+1 < numIdx: strideIdx = problemType["IndexAssignmentsA"][i+1] s += "std::max(tensor2dSizeA*size%s, (uint64_t)strideA%u%s);\n" \ % (self.indexChars[idx], i+1, self.indexChars[strideIdx]) else: s += " tensor2dSizeA * size%s" % (self.indexChars[idx]) s += ";\n" s += "%suint64_t tensor2dSizeB = 1;\n" % t numIdx = len(problemType["IndexAssignmentsB"]) printMe = printedSum = False for i in range(0,numIdx): idx = problemType["IndexAssignmentsB"][i] # Multiply only by first free and first summation if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC1Indices"]]: printMe = True elif idx in problemType["IndicesSummation"] and not printedSum: printMe = printedSum = True else: printMe = False if printMe: s += "%stensor2dSizeB = " % t if i+1 < numIdx: strideIdx = problemType["IndexAssignmentsB"][i+1] s += "std::max(tensor2dSizeB*size%s, (uint64_t)strideB%u%s);\n" \ % (self.indexChars[idx], i+1, self.indexChars[strideIdx]) else: s += " tensor2dSizeB * size%s" % (self.indexChars[idx]) s += ";\n" unrollChar = globalParameters["IndexChars"][problemType["IndexUnroll"]] s += " unsigned int staggerUIter = %s; // how many stride-sized clicks to stagger start offset\n" \ % (solution["StaggerU"]) s += " int unrollLoopIters = size%s/%u/%u; // /DepthU/GSU\n" % (unrollChar, solution["DepthU"], gsu) s += " while (staggerUIter>1) {\n" s += " if (unrollLoopIters >= (staggerUIter*%u)) {\n" % (1<<solution["_staggerStrideShift"]) s += " break;}\n" s += " staggerUIter /= 2; // step down to smaller stagger\n" s += " }\n" s += " if (staggerUIter>=1) staggerUIter -= 1;\n" # convert to a mask #s += ' printf ("size%s=%%u StaggerU=%s unrollLoopIters=%%u, staggerUIter=%%d\\n", size%s, unrollLoopIters, staggerUIter);\n' % (unrollChar, solution["StaggerU"], unrollChar) #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n" s += "\n" s += "%sint kernelsLaunched=0;\n" % (t) ######################################## # Enqueue Beta-Only Kernel ######################################## if gsu > 1: kernelNamesBetaOnly = [] numStridesC = problemType["NumIndicesC"] - \ (0 if problemType["UseInitialStrides"] else 1) for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) kernelNamesBetaOnly.append(kernelName) s += "%s// enqueue Beta-Only kernel\n" % (t) # grid sizes s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t) s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t) #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ # self.indexChars[problemType["Index0"]]) #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ # self.indexChars[problemType["Index1"]]) s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (t) s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (t) s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t) s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (t) s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (t) s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t) for i in range(0, problemType["NumIndicesC"]): if i != problemType["Index0"] and i != problemType["Index1"]: s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (t, self.indexChars[i]) if problemType["UseBeta"]: s += "%sbool betaZero = beta == 0;\n" % (t) if self.language == "OCL": if problemType["UseBeta"]: s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1]) else: #s += "%sbool betaZero = true;\n" % (t) s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0]) argIdx = 0 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, argIdx); argIdx+=1 # strides for i in range(0,numStridesC): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.strideList[i]); argIdx+=1 # sizes for i in range(0, problemType["NumIndicesC"]): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[i]); argIdx+=1 # beta if problemType["UseBeta"]: s += "%sif (!betaZero) {\n" % (t) s += "%s status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, argIdx, typeName); argIdx+=1 s += "%s}\n" % (t) # enqueue s += "%scl_event kernelEventBetaOnly;\n" % (t) s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernelBetaOnly,\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSizeBetaOnly,\n" % (t) s += "%slocalWorkSizeBetaOnly,\n" % (t) s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) #s += "%soutputEvent );\n" % (t) s += "%s&kernelEventBetaOnly );\n" % (t) t = t[2:] s += "%stensileStatusCheck(status);\n" % (t) if problemType["UseBeta"]: s += "%sbeta = %s;\n" % (t, problemType["DataType"].zeroString(self.language, 1) ) #s += "%sreturn tensileStatusSuccess;\n" % (t) s += "%sstatus = clFinish(stream);\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) #s += " float tmp[128*128];\n" #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n" #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n" else: s += "%stry {\n" % (t) # TODO - timing with beta kernels is somewhat pessimistic since it has this separate event only on the GSU path. # Introduces 2-3us of overhead ; may want to disable PreciseKernelTime so non-GSU have same overhead. # Long-term fix would be to launch the beta kernel with the hipHccModule* API and set start-event in that call if problemType["UseBeta"]: s += "%sif (betaZero) {\n" % (t) t += " " s += "%sif( inputEvents != NULL )\n" % (t) s += "%s hipEventRecord(inputEvents[0], stream );\n" % (t) s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) # strides for i in range(0,numStridesC*2): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, problemType["NumIndicesC"]): s += "%ssize%s%s" % (t, self.indexChars[i], ",\n" if i < problemType["NumIndicesC"]-1 else ");\n") if problemType["UseBeta"]: s += "%s} else {\n" % (t) t = t[:-2] s += "%sif( inputEvents != NULL )\n" % (t) s += "%s hipEventRecord(inputEvents[0], stream );\n" % (t) s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) # strides for i in range(0,numStridesC*2): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, problemType["NumIndicesC"]): s += "%ssize%s,\n" % (t, self.indexChars[i]) s += "%sbeta);\n" % (t) s += "}\n" t = " " s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) ######################################## # Enqueue Kernels ######################################## for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] if kernel["KernelLanguage"] == "Source": kernel["ISA"] = (0, 0, 0) # HIP source kernels needs dummy ISA version kernelName = self.kernelWriter.getKernelName(kernel) s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName) s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx) if self.language == "OCL": # set kernel args same for all enqueues s += "%s// kernel args same for all enqueues\n" % (t) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataD ); tensileStatusCheck(status);\n" % (t, 0) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, 1) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (t, 2) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (t, 3) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (t, 4, typeName) s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \ "" if problemType["UseBeta"] else "//", 5, typeName) argIdx = 6 if problemType["UseBeta"] else 5 for stride in self.strideList: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, stride) argIdx += 1 for sizeIdx in range(0, problemType["TotalIndices"]): if sizeIdx not in [ problemType["Index0"], problemType["Index1"], problemType["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(staggerUIter), &staggerUIter ); tensileStatusCheck(status);\n" % (t, argIdx) argIdx += 1 s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (t, kernelIdx) t += " " # debug print kernel dimensions if globalParameters["LibraryPrintDebug"]: s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (t, kernelName) # debug print kernel arguments # strides for stride in self.strideList: s += "%sprintf(\" %s = %%u\\n\", %s);\n" % (t, stride, stride) # sizes for i in range(0, problemType["TotalIndices"]): s += "%sprintf(\" sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i ) s += "%sprintf(\" problemNumGroupTiles0== %%u\\n\", problemNumGroupTiles0 );\n" % (t) s += "%sprintf(\" problemNumGroupTiles1== %%u\\n\", problemNumGroupTiles1 );\n" % (t) s += "%sprintf(\" tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t) s += "%sprintf(\" tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t) s += "%sprintf(\" tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t) for idxChar in solution["PackedC0Indices"][:-1]: s += "%sprintf(\" magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\", magicNumberSize%s, magicShiftSize%s);\n" \ % (t, idxChar, idxChar, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sprintf(\" magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\", magicNumberSize%s, magicShiftSize%s);\n" \ % (t, idxChar, idxChar, idxChar, idxChar) ######################################## # OpenCL Runtime ######################################## if self.language == "OCL": # set kernel args different for all enqueues argIdx = 6 if problemType["UseBeta"] else 5 argIdx += len(self.strideList) # sizes for sizeIdx in range(0, problemType["TotalIndices"]): if sizeIdx in [ problemType["Index0"], problemType["Index1"], problemType["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 # enqueue s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernels[kernelIdx],\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSize[kernelIdx],\n" % (t) s += "%slocalWorkSize,\n" % (t) if False: # gsu > 1: s += "%s1,\n" % (t) s += "%s&kernelEventBetaOnly,\n" % (t) else: s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) s += "%soutputEvent );\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) s += "%s}\n" % (t) ######################################## # HIP Runtime ######################################## else: if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source": s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (t) t = t[2:] s += "%stry {\n" % (t) t += " " # hip kernel if kernelLanguage == "Source": s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName) s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (t) s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) s += "%sdataA,\n" % (t) s += "%sdataB,\n" % (t) s += "%salpha,\n" % (t) s += "%s%sbeta,\n" % (t, \ "" if problemType["UseBeta"] else "//") # strides for stride in self.strideList: s += "%s%s,\n" % (t, stride) # sizes for i in range(0, problemType["TotalIndices"]): lastParam = i == problemType["TotalIndices"]-1 s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \ % (t, i, "" if lastParam else "," ) for idxChar in solution["PackedC0Indices"][:-1]: s += "%s,magicNumberSize%s\n" % (t, idxChar) s += "%s,magicShiftSize%s\n" % (t, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%s,magicNumberSize%s\n" % (t, idxChar) s += "%s,magicShiftSize%s\n" % (t, idxChar) s += "%s,staggerUIter\n" % (t) #persistent: s += "%s,problemNumGroupTiles0\n" % (t) s += "%s,problemNumGroupTiles1\n" % (t) s += "%s,magicNumberProblemNumGroupTiles0\n" % (t) # magic number to use when dividing by problemNumGroupTiles0 s += "%s);\n" % (t) # assembly kernel else: if globalParameters["DebugKernel"]: s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (t) s += "%s printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (t) s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (t) s += "%shipDevice_t device;\n" % t s += "%shipDeviceGet(&device, 0);\n" % t s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (t) s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (t) s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (t) s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (t) # hip assembly function s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t) s += "%shipFunctionArgs.dataD = dataD;\n" % (t) s += "%shipFunctionArgs.dataC = dataC;\n" % (t) s += "%shipFunctionArgs.dataA = dataA;\n" % (t) s += "%shipFunctionArgs.dataB = dataB;\n" % (t) if problemType["DataType"].isHalf(): s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t) s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t) else: s += "%shipFunctionArgs.alpha = alpha;\n" % (t) if problemType["UseBeta"]: if problemType["DataType"].isHalf(): s += "%shipFunctionArgs.beta[0] = beta;\n" % (t) s += "%shipFunctionArgs.beta[1] = beta;\n" % (t) else: s += "%shipFunctionArgs.beta = beta;\n" % (t) # strides for stride in self.strideList: s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride) # sizes for i in range(0, problemType["TotalIndices"]): lastParam = i == problemType["TotalIndices"]-1 s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \ % (t, globalParameters["IndexChars"][i], i ) s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t) s += "%shipFunctionArgs.staggerUIter = staggerUIter;\n" % (t) # persistent - pass in the number of tiles in problem since not available in WG s += "\n" s += "%shipFunctionArgs.problemNumGroupTiles0 = problemNumGroupTiles0;\n" % (t) s += "%shipFunctionArgs.problemNumGroupTiles1 = problemNumGroupTiles1;\n" % (t) s += "%shipFunctionArgs.magicNumberProblemNumGroupTiles0 = magicNumberProblemNumGroupTiles0;\n" % (t) s += "%shipFunctionArgs.gridNumWorkGroups0 = globalWorkSize[kernelIdx][0];\n" % (t) # s += "%shipFunctionArgs.numFullBlocks = numFullBlocks;\n" % (t) s += "%shipFunctionArgs.wgmRemainder1 = wgmRemainder1;\n" % (t) s += "%shipFunctionArgs.magicNumberWgmRemainder1 = magicNumberWgmRemainder1;\n" % (t) # Magic numbers for packed indices: for idxChar in solution["PackedC0Indices"][:-1]: s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar) s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar) s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar) s += "%skernelsLaunched++;\n" % (t) s += "%shipHccModuleLaunchKernel(\n" % (t) t += " " s += "%shipFunction,\n" % (t) s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (t) s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (t) s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (t) s += "%slocalWorkSize[0],\n" % (t) s += "%slocalWorkSize[1],\n" % (t) s += "%slocalWorkSize[2],\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sNULL,\n" % (t) s += "%s(void**)hipLaunchParams\n" % (t) if globalParameters["PreciseKernelTime"]: s += "%s,(inputEvents && kernelsLaunched==1) ? inputEvents[enqueueIdx]:nullptr\n" %(t) s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (t) s += "%s);\n" % (t) t = t[2:] if globalParameters["DebugKernel"]: # copy debug buffer s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (t) s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (t) s += "%s printf(\"%%04i\", i);\n" % (t) s += "%s char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t) s += "%s for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (t) s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (t) s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (t) s += "%s }\n" % (t) s += "%s printf(\"\\n\");\n" % (t) s += "%s}\n" % (t) t = t[2:] s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source": s += "%sif( outputEvent != NULL )\n" % (t) s += "%s hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (t) s += " }\n" s += "\n" s += " return tensileStatusSuccess;\n" s += "}\n" s += "\n" s += "/* Solution Parameters\n" s += Solution.getParametersIndented(solution.getAttributes(), " ") s += "*/\n" s += "\n" return s