def addResults(self, hardcodedParameterList, benchmarkPermutations,
               solutions, results):
    """Record the best benchmark result for every hardcoded parameter set.

    For each index of ``results`` the benchmark entry whose maximum score is
    highest is selected, the values of the keys listed in
    ``benchmarkPermutations[0]`` are copied from the matching solution, and
    the single matching entry of ``self.winners`` is updated in place with
    those parameters and the winning score.

    Calls ``printExit`` when ``WinningParameterDict.get`` does not return
    exactly one match for a hardcoded parameter set.
    """
    showProgress = globalParameters["PrintLevel"] >= 1
    if showProgress:
        print1("# Adding Results to Solution Database")
        progressBar = ProgressBar(len(results))

    for hardcodedIdx, resultsForHardcoded in enumerate(results):
        hardcodedParameters = hardcodedParameterList[hardcodedIdx]

        # -1 is the score of an invalid result, so start well below it.
        bestIdx = -1
        bestScore = -9999
        for candidateIdx, candidateResult in enumerate(resultsForHardcoded):
            # take fastest regardless of size
            candidateScore = max(candidateResult)
            if candidateScore > bestScore:
                bestScore = candidateScore
                bestIdx = candidateIdx

        bestSolution = solutions[hardcodedIdx][bestIdx]
        bestParameters = {
            name: bestSolution[name] for name in benchmarkPermutations[0]
        }

        matches = WinningParameterDict.get(hardcodedParameters, self.winners)
        if len(matches) != 1:
            printExit("Didn't find exactly 1 match")

        winnerEntry = self.winners[matches[0][0]]
        winnerEntry[0].update(bestParameters)
        winnerEntry[1] = bestScore

        if globalParameters["PrintLevel"] >= 1:
            progressBar.increment()
def main(config):
    """Benchmark every problem type / problem-size group listed in *config*.

    Each entry of *config* is a sequence whose first element is the problem
    type configuration and whose remaining elements (if any) are problem-size
    group configurations; an entry with no groups is benchmarked once with an
    empty group configuration.

    Results are copied to ``<WorkingPath>/<BenchmarkDataPath>`` as
    ``<problemType>_<groupIdx>.csv`` / ``.yaml``. A group is skipped when its
    ``.csv`` already exists, unless ``ForceRedoBenchmarkProblems`` is set.

    Exits the process with status 1 when ``ExitOnFails`` is set and any
    benchmark reported errors.
    """
    dataPath = os.path.join(globalParameters["WorkingPath"],
                            globalParameters["BenchmarkDataPath"])
    pushWorkingPath(globalParameters["BenchmarkProblemsPath"])
    ensurePath(dataPath)
    totalTestFails = 0

    for benchmarkProblemTypeConfig in config:
        problemTypeConfig = benchmarkProblemTypeConfig[0]
        if len(benchmarkProblemTypeConfig) < 2:
            problemSizeGroupConfigs = [{}]
        else:
            problemSizeGroupConfigs = benchmarkProblemTypeConfig[1:]

        for problemSizeGroupIdx, problemSizeGroupConfig \
                in enumerate(problemSizeGroupConfigs):
            print2("ProblemTypeConfig: %s" % problemTypeConfig)
            problemTypeObj = ProblemType(problemTypeConfig)
            globalParameters["EnableHalf"] = \
                problemTypeObj["DataType"].isHalf()

            # results files will be named
            newResultsFileName = os.path.join(
                dataPath,
                "%s_%02u.csv" % (str(problemTypeObj), problemSizeGroupIdx))
            newSolutionsFileName = os.path.join(
                dataPath,
                "%s_%02u.yaml" % (str(problemTypeObj), problemSizeGroupIdx))

            # skip if possible
            if globalParameters["ForceRedoBenchmarkProblems"] \
                    or not os.path.exists(newResultsFileName):

                # Benchmark Problem Size Group
                (resultsFileBaseFinal, benchmarkErrors) = benchmarkProblemType(
                    problemTypeConfig, problemSizeGroupConfig,
                    problemSizeGroupIdx)
                totalTestFails += benchmarkErrors
                # Parenthesised single-argument form: prints the same text
                # whether `print` is the Python 2 statement or the function.
                print("clientExit=%u %s for %s" % (
                    totalTestFails,
                    "(ERROR)" if totalTestFails else "(PASS)",
                    globalParameters["ConfigPath"]))

                # Copy Data
                resultsFileBase = resultsFileBaseFinal
                resultsFileName = "%s.csv" % (resultsFileBase)
                solutionsFileName = "%s.yaml" % (resultsFileBase)
                shutil_copy(resultsFileName, newResultsFileName)
                shutil_copy(solutionsFileName, newSolutionsFileName)
            else:
                print1("# %s_%02u already benchmarked; skipping."
                       % (str(problemTypeObj), problemSizeGroupIdx))

    popWorkingPath()

    if globalParameters["ExitOnFails"] and totalTestFails:
        sys.exit(1)
def wpdUpdate(self, newHardcodedParameterList):
    """Rebuild ``self.winners`` for a new list of hardcoded parameter sets.

    ``self.winners`` maps ``FrozenDictionary`` keys to
    ``[winningParameters, score]`` lists.

    * If there were no previous winners, every new parameter set gets the
      placeholder entry ``[{}, -1]``.
    * Otherwise each new parameter set is looked up in the previous winners
      via ``WinningParameterDict.get``. With one match the set is merged
      with the matched parameters and inherits its winner; with several
      matches the one with the highest score (above -1) is used; with no
      match the set is dropped.

    The dictionaries in *newHardcodedParameterList* are updated in place.

    Returns the list of ``.parameters`` of every key now in ``self.winners``.
    """
    # TODO when new list is joining, we need to choose the fastest
    previousWinners = self.winners
    self.winners = {}

    if len(previousWinners) == 0:
        # first time: populate with dummies
        for hardcoded in newHardcodedParameterList:
            self.winners[FrozenDictionary(hardcoded)] = [{}, -1]
    else:
        if globalParameters["PrintLevel"] >= 1:
            print1("# Updating Solution Database")
            progressBar = ProgressBar(len(newHardcodedParameterList))

        for hardcoded in newHardcodedParameterList:
            matches = WinningParameterDict.get(hardcoded, previousWinners)
            matchCount = len(matches)

            if matchCount == 1:
                # plain update
                onlyMatch = matches[0]
                inheritedParameters = onlyMatch[1]
                inheritedScore = onlyMatch[2]
                hardcoded.update(onlyMatch[0].parameters)
                self.winners[FrozenDictionary(hardcoded)] = \
                    [inheritedParameters, inheritedScore]
            elif matchCount > 1:
                # join: keep the fastest of the matching winners
                bestScore = -1
                bestHardcoded = {}
                bestWinning = {}
                for candidate in matches:
                    candidateScore = candidate[2]
                    if candidateScore > bestScore:
                        bestScore = candidateScore
                        bestWinning = candidate[1]
                        bestHardcoded = candidate[0].parameters
                hardcoded.update(bestHardcoded)
                self.winners[FrozenDictionary(hardcoded)] = \
                    [bestWinning, bestScore]

            if globalParameters["PrintLevel"] >= 1:
                progressBar.increment()

    # return resulting hardcodedParameterList
    return [frozenKey.parameters for frozenKey in self.winners]
def readLibraryLogicForSchedule(filename):
    """Load one library-logic YAML file and unpack it.

    The file must be a YAML sequence with at least nine entries, read
    positionally: version dict, schedule name, architecture name, device
    names, problem type state, solution states, index order, exact logic and
    range logic.

    Calls ``printExit`` if the file cannot be opened, has too few entries, or
    contains a solution whose problem type differs from the file's. Emits a
    warning when the file's minimum required version is not compatible.

    Returns the tuple ``(scheduleName, deviceNames, problemType, solutions,
    indexOrder, exactLogic, rangeLogic)``.
    """
    print1("# Reading Library Logic: %s" % (filename))
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    # Close the file even when the YAML parser raises.
    with stream:
        data = yaml.load(stream, yaml.SafeLoader)

    # verify: entries 0..8 are indexed below, so nine are required
    if len(data) < 9:
        printExit("len(%s) %u < 9" % (filename, len(data)))

    # parse out objects
    versionString = data[0]["MinimumRequiredVersion"]
    scheduleName = data[1]
    architectureName = data[2]
    deviceNames = data[3]
    problemTypeState = data[4]
    solutionStates = data[5]
    indexOrder = data[6]
    exactLogic = data[7]
    rangeLogic = data[8]

    # does version match
    if not versionIsCompatible(versionString):
        printWarning("File \"%s\" version=%s does not match Tensile version=%s"
                     % (filename, versionString, __version__))

    # unpack problemType
    problemType = ProblemType(problemTypeState)

    # unpack solutions
    solutions = []
    for solutionState in solutionStates:
        if solutionState["KernelLanguage"] == "Assembly":
            # The ISA digits are characters 3, 4 and 5 of the architecture
            # name.
            solutionState["ISA"] = (int(architectureName[3]),
                                    int(architectureName[4]),
                                    int(architectureName[5]))
        else:
            solutionState["ISA"] = (0, 0, 0)
        solutionObject = Solution(solutionState)
        if solutionObject["ProblemType"] != problemType:
            printExit("ProblemType of file doesn't match solution: %s != %s"
                      % (problemType, solutionObject["ProblemType"]))
        solutions.append(solutionObject)

    return (scheduleName, deviceNames, problemType, solutions, indexOrder,
            exactLogic, rangeLogic)
def assignProblemIndependentDerivedParameters(state):
    """Fill *state* with the parameters derivable without a problem size.

    Reads ``WorkGroup`` (three entries) and ``ThreadTile`` (two entries) and
    writes ``SubGroup0``, ``SubGroup1``, ``LocalSplitU``, ``NumThreads``,
    ``ThreadTile0``, ``ThreadTile1``, ``MacroTile0`` and ``MacroTile1``.
    ``Valid`` is created as True if absent and set to False when

    * an explicit ``MacroTile`` disagrees with the computed macro tile,
    * the macro tile shape ratio lies outside
      ``MacroTileShapeMin``..``MacroTileShapeMax`` (when both are present), or
    * ``WorkGroupMappingType`` is "Z" and ``abs(WorkGroupMapping)`` exceeds 2.

    The work is skipped when
    ``state["AssignedProblemIndependentDerivedParameters"]`` is already
    truthy; that flag is set to True on completion. Returns None.
    """
    doneKey = "AssignedProblemIndependentDerivedParameters"
    if doneKey in state and state[doneKey]:
        return
    state[doneKey] = False

    if "Valid" not in state:
        state["Valid"] = True

    workGroup = state["WorkGroup"]
    state["SubGroup0"] = workGroup[0]
    state["SubGroup1"] = workGroup[1]
    state["LocalSplitU"] = workGroup[2]
    state["NumThreads"] = \
        state["SubGroup0"] * state["SubGroup1"] * state["LocalSplitU"]

    threadTile = state["ThreadTile"]
    state["ThreadTile0"] = threadTile[0]
    state["ThreadTile1"] = threadTile[1]

    # macro tile sizes
    state["MacroTile0"] = state["SubGroup0"] * state["ThreadTile0"]
    state["MacroTile1"] = state["SubGroup1"] * state["ThreadTile1"]

    if "MacroTile" in state:
        requested = state["MacroTile"]
        if state["MacroTile0"] != requested[0] \
                or state["MacroTile1"] != requested[1]:
            state["Valid"] = False

    hasShapeLimits = \
        "MacroTileShapeMax" in state and "MacroTileShapeMin" in state
    if state["Valid"] and hasShapeLimits:
        tile0 = state["MacroTile0"]
        tile1 = state["MacroTile1"]
        shapeRatio = max(tile0 / tile1, tile1 / tile0)
        withinLimits = shapeRatio <= state["MacroTileShapeMax"] \
            and shapeRatio >= state["MacroTileShapeMin"]
        if not withinLimits:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("rejecting MacroTile Shape %u:%u for Min:Max %u:%u"
                       % (state["MacroTile0"], state["MacroTile1"],
                          state["MacroTileShapeMin"],
                          state["MacroTileShapeMax"]))
            state["Valid"] = False

    if "WorkGroupMappingType" in state \
            and state["WorkGroupMappingType"] == "Z" \
            and abs(state["WorkGroupMapping"]) > 2:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("WorkGroupMappingType=Z only supports WorkGroupMapping=1, 2")
        state["Valid"] = False

    # done
    state[doneKey] = True
def executeStepsInConfig(config):
    """Run the steps named by the keys of *config*, in a fixed order.

    * ``BenchmarkProblems``: always run when present.
    * ``LibraryLogic``: run when the library-logic directory is missing or
      empty, or when ``ForceRedoLibraryLogic`` is set; otherwise reported as
      already done.
    * ``LibraryClient``: always run when present.

    A ``None`` value for ``LibraryLogic`` or ``LibraryClient`` is replaced by
    an empty dict before being handed to the step.
    """
    ##########################################################################
    # Benchmark Problems
    ##########################################################################
    # Computed but not used below; kept so the same keys are still looked up.
    benchmarkDataPath = os.path.join(globalParameters["WorkingPath"],
                                     globalParameters["BenchmarkDataPath"])
    if "BenchmarkProblems" in config:
        BenchmarkProblems.main(config["BenchmarkProblems"])
        print1("")

    ##########################################################################
    # Library Logic
    ##########################################################################
    libraryLogicDataPath = os.path.join(globalParameters["WorkingPath"],
                                        globalParameters["LibraryLogicPath"])
    if "LibraryLogic" in config:
        existingLogicFiles = []
        if os.path.exists(libraryLogicDataPath):
            existingLogicFiles = os.listdir(libraryLogicDataPath)

        if len(existingLogicFiles) >= 1 \
                and not globalParameters["ForceRedoLibraryLogic"]:
            print1("# LibraryLogic already done.")
        else:
            logicStepConfig = config["LibraryLogic"]
            if logicStepConfig is None:
                logicStepConfig = {}
            LibraryLogic.main(logicStepConfig)
        print1("")

    ##########################################################################
    # Write Client
    ##########################################################################
    if "LibraryClient" in config:
        clientStepConfig = config["LibraryClient"]
        if clientStepConfig is None:
            clientStepConfig = {}
        ClientWriter.main(clientStepConfig)
        print1("")
def _declareArgs(argList, terminator):
    """Render (type, name) pairs as C parameter declarations, one per line.

    Entries are separated by a comma and newline; the last entry is followed
    by *terminator*. An empty list yields an empty string.
    """
    lastIdx = len(argList) - 1
    return "".join(
        " %s %s%s" % (arg[0], arg[1], ",\n" if idx < lastIdx else terminator)
        for idx, arg in enumerate(argList))


def _passArgs(argList, prefix, separator, terminator):
    """Render the names of (type, name) pairs as C call arguments.

    Each name is preceded by *prefix* and followed by *separator*, except the
    last one, which is followed by *terminator*. An empty list yields an
    empty string.
    """
    lastIdx = len(argList) - 1
    return "".join(
        "%s%s%s" % (prefix, arg[1], separator if idx < lastIdx else terminator)
        for idx, arg in enumerate(argList))


def _scheduleCall(returnType, scheduleName, problemType, argListSizes):
    """Return the C statement forwarding to one schedule's lookup function."""
    return (" return tensileGetSolution%s_%s_%s("
            % (returnType, scheduleName, problemType)) \
        + _passArgs(argListSizes, "", ", ", ");\n")


def _isFallbackSchedule(schedule):
    """Tell whether a schedule tuple's device list marks it as the fallback."""
    deviceNames = schedule[1]
    return deviceNames == ["fallback"] or deviceNames == ["Device 0000"]


def _scheduleLookupBody(schedProbName, problemType, indexOrder,
                        solutionsForSchedule, solutionNamesForSchedule,
                        exactLogic, rangeLogic, returnPointer):
    """Return the body (through the closing brace) of a per-schedule lookup.

    *returnPointer* is forwarded as the last argument of ``writeExactLogic``
    and ``writeRangeLogicRec``.
    """
    body = writeSolutionAssertionCheckHeader(problemType)
    exactLogicStr = writeExactLogic(
        schedProbName, problemType, indexOrder, solutionsForSchedule,
        exactLogic, solutionNamesForSchedule, returnPointer)
    if rangeLogic is not None:
        rangeLogicStr = writeRangeLogicRec(
            0, indexOrder, rangeLogic, solutionsForSchedule,
            solutionNamesForSchedule, problemType, returnPointer)
    else:
        rangeLogicStr = " return NULL; // none\n"
    body += " /* exact mappings */\n"
    body += exactLogicStr
    body += "\n /* range mappings */\n"
    body += rangeLogicStr
    body += "\n}\n"
    return body


def writeLogic(outputPath, logicData, solutionWriter):
    """Generate the C++ solution-selection sources for every problem type.

    *logicData* maps each problem type to a list of schedule tuples indexed
    as ``(scheduleName, deviceNames, solutions, indexOrder, exactLogic,
    rangeLogic)``. *solutionWriter* supplies ``getArgList`` and
    ``getSolutionName``.

    Files written under *outputPath*:

    * ``Tensile.h`` with the public declarations,
    * ``TensileInternal.h`` with the problem-size key, hasher and map types,
    * ``Tensile.cpp`` when ``MergeFiles`` is set, otherwise one
      ``Logic/Tensile_<problemType>.cpp`` per problem type.
    """
    print1("# Writing Library Logic")

    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"
    ih += "#include \"SolutionHelper.h\"\n"
    if globalParameters["SolutionMapHash"]:
        ih += "#include <unordered_map>\n"
    else:
        ih += "#include <map>\n"
    ih += "#include <tuple>\n"

    # problem type Key
    problemSizeTemplate = "unsigned int, unsigned int, unsigned int"
    if globalParameters["RuntimeLanguage"] == "OCL":
        problemSizeTemplate += ", cl_command_queue"
    ih += "typedef std::tuple<%s> ProblemSizeKey;\n" % (problemSizeTemplate)

    # hash function
    # NOTE(review): the generated masks are (1<<22)-1, i.e. 22 bits, while
    # the generated comments and the 21-bit shifts say 21 bits. Left as is
    # because it only affects hash distribution - confirm the intent.
    ih += "\n"
    ih += "size_t tensileProblemSizeHasher( const ProblemSizeKey & problemSize ) {\n"
    ih += " size_t hash = 0;\n"
    ih += " // ignore lowest 4 bits; keep next 21 bits\n"
    ih += " size_t hash0 = (std::get<0>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size0\n"
    ih += " size_t hash1 = (std::get<1>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size1\n"
    ih += " size_t hashU = (std::get<2>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of sizeU\n"
    ih += " // 21+21+21 = 63 bit hash\n"
    ih += " hash |= hash0;\n"
    ih += " hash |= hash1<<21;\n"
    ih += " hash |= hashU<<42;\n"
    ih += " return hash;\n"
    ih += "}\n"
    ih += "\n"

    # Tensile.cpp
    s = ""
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"Solutions.h\"\n"
    s += "#include \"SolutionMapper.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:
        schedules = logicData[problemType]

        # function argument list
        argListSizes = solutionWriter.getArgList(problemType, False, False, False)
        argListStream = solutionWriter.getArgList(problemType, False, False, True)
        argListData = solutionWriter.getArgList(problemType, True, True, True)

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        h += _declareArgs(argListData, ");\n\n")

        # declare TensileSolutionPointer_ProblemType
        h += "\n// solution pointer\n"
        h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
            % problemType
        h += _declareArgs(argListData, ");\n\n")

        numSizes = problemType["TotalIndices"]
        h += "typedef ProblemSizes<%u, %u, %u> ProblemSizes_%s;\n" \
            % (numSizes, problemType["IndicesSummation"][-1],
               problemType["IndicesFree"][0], problemType)

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        h += _declareArgs(argListStream, ");\n\n")

        # declare tensileName_
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" % (problemType)
        h += _declareArgs(argListStream, ");\n\n")

        # unique solutions of this problem type, in first-seen order
        solutionsForProblemType = []
        for scheduleTuple in schedules:
            for solution in scheduleTuple[2]:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = [
            solutionWriter.getSolutionName(solution)
            for solution in solutionsForProblemType]

        # reset problemType source
        if not mergeFiles:
            filePrefix = "Tensile_%s" % (problemType)
            # Only the first line may assign: the following ones used "="
            # as well, which discarded the includes written before them.
            s = "#include \"TensileTypes.h\"\n"
            s += "#include \"Tensile.h\"\n"
            s += "#include \"SolutionMapper.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # implement per-Schedule functions in source
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in schedules:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = [
                solutionWriter.getSolutionName(solution)
                for solution in solutionsForSchedule]

            s += "\n\n"
            schedProbName = "%s_%s" % (scheduleName, problemType)
            s += writeSolutionAndExactTable(
                schedProbName, problemType, solutionsForSchedule,
                solutionNamesForSchedule, exactLogic)

            # function tensileGetSolutionPointerUncached_Schedule_ProblemType
            s += "\n// problem size -> solution logic\n"
            s += "TensileSolutionPointer_%s tensileGetSolutionPointerUncached_%s(\n" \
                % (problemType, schedProbName)
            s += _declareArgs(argListSizes, ") {\n\n")
            s += _scheduleLookupBody(
                schedProbName, problemType, indexOrder, solutionsForSchedule,
                solutionNamesForSchedule, exactLogic, rangeLogic, True)

            # function tensileGetSolutionName_Schedule_ProblemType
            s += "\n// get solution name for problem size\n"
            s += "const char * tensileGetSolutionName_%s(\n" % (schedProbName)
            s += _declareArgs(argListSizes, ") {\n\n")
            s += _scheduleLookupBody(
                schedProbName, problemType, indexOrder, solutionsForSchedule,
                solutionNamesForSchedule, exactLogic, rangeLogic, False)

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        if globalParameters["SolutionMapHash"]:
            ih += "typedef std::unordered_map<ProblemSizeKey, TensileSolutionPointer_%s, std::function<size_t (ProblemSizeKey)>> Map_%s;\n" \
                % (problemType, problemType)
        else:
            ih += "typedef std::map<ProblemSizeKey, TensileSolutionPointer_%s> Map_%s;\n" \
                % (problemType, problemType)
        ih += "extern Map_%s solutionMap_%s;\n" % (problemType, problemType)

        # implement tensileGetSolutionPointerUncached_ProblemType
        for ptr in [True, False]:
            returnType = "PointerUncached" if ptr else "Name"
            s += "\n// return solution %s\n" % returnType
            s += ("TensileSolutionPointer_%s " % problemType) if ptr \
                else "const char *"
            s += "tensileGetSolution%s_%s(\n" % (returnType, problemType)
            s += _declareArgs(argListStream, ") {\n")

            # choose from schedules based on device name
            numSchedules = len(schedules)
            if numSchedules > 1:
                # device-specific schedules first, fallback schedules last
                reordered_schedules = \
                    [sched for sched in schedules
                     if not _isFallbackSchedule(sched)] \
                    + [sched for sched in schedules
                       if _isFallbackSchedule(sched)]
                lastScheduleName = reordered_schedules[numSchedules - 1][0]

                # get device name
                if globalParameters["RuntimeLanguage"] == "OCL":
                    s += "get device name opencl;\n"
                else:
                    s += "\n// get device name hip;\n"
                    s += " int deviceId;\n"
                    s += " hipGetDevice(&deviceId);\n"
                    s += " hipDeviceProp_t deviceProperties;\n"
                    s += " hipGetDeviceProperties(&deviceProperties, deviceId);\n"
                    s += " std::string name = deviceProperties.name;\n"

                if problemType["DataType"].isDouble():
                    s += "\n"
                    s += "// intercept schedule selection and call HIP (source) kernel\n"
                    s += " if((strideA2K == 0) || (strideB2K == 0))\n"
                    s += " {\n"
                    s += _scheduleCall(returnType, lastScheduleName,
                                       problemType, argListSizes)
                    s += " }\n"
                    s += "\n"

                if problemType["DataType"].isHalf():
                    # "first" free index, usually the letter "I"
                    free0Index = problemType["IndicesFree"][0]
                    free0Char = globalParameters["IndexChars"][free0Index]
                    # "second" free index, usually the letter "J"
                    free1Index = problemType["IndicesFree"][1]
                    free1Char = globalParameters["IndexChars"][free1Index]
                    s += "\n"
                    s += "// intercept schedule selection and call HIP (source) kernel\n"
                    s += "// if either the summation size or the 'first' free index size\n"
                    s += "// is odd or the 'second' free index size is 1\n"
                    s += " if (((sizeL & 1) == 1) || ((size%s & 1) == 1)" % (
                        free0Char)
                    s += " || (size%s == 1))\n" % (free1Char)
                    s += " {\n"
                    s += _scheduleCall(returnType, lastScheduleName,
                                       problemType, argListSizes)
                    s += " }\n"
                    s += "\n"

                # if / else-if chain on the device name; the last schedule
                # is the unconditional else branch
                for scheduleIdx, schedule in enumerate(reordered_schedules):
                    if scheduleIdx > 0:
                        s += " else "
                    if scheduleIdx < numSchedules - 1:
                        s += "if ("
                        s += " || ".join("name == \"%s\"" % deviceName
                                         for deviceName in schedule[1])
                        s += ")"
                    s += "\n {\n"
                    s += _scheduleCall(returnType, schedule[0], problemType,
                                       argListSizes)
                    s += " }\n"
            else:  # == 1
                s += _scheduleCall(returnType, schedules[0][0], problemType,
                                   argListSizes)
            s += "\n}\n"

        # implement tensileGetSolutionPointer_ProblemType
        s += "\n// return solution pointer; user calls it\n"
        s += "Map_%s solutionMap_%s%s;\n" % (
            problemType, problemType,
            "(1024, tensileProblemSizeHasher)"
            if globalParameters["SolutionMapHash"] else "")
        s += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        s += _declareArgs(argListStream, ") {\n")

        # create key
        indexChars = globalParameters["IndexChars"]
        s += " ProblemSizeKey key = std::make_tuple( size%s, size%s, size%s%s );\n" \
            % (indexChars[problemType["Index0"]],
               indexChars[problemType["Index1"]],
               indexChars[problemType["IndexUnroll"]],
               ", stream" if globalParameters["RuntimeLanguage"] == "OCL"
               else "")

        # check for key in map
        s += " static std::mutex findKernelMutex;\n"
        s += " std::lock_guard<std::mutex> findKernelLock(findKernelMutex);\n"
        s += " Map_%s::iterator iter = solutionMap_%s.find(key);\n" \
            % (problemType, problemType)
        s += " if (iter != solutionMap_%s.end()) {\n" % problemType
        s += " return iter->second;\n"
        s += " } else {\n"
        s += " TensileSolutionPointer_%s ptr = tensileGetSolutionPointerUncached_%s(\n" \
            % (problemType, problemType)
        s += _passArgs(argListStream, " ", ",", ");")
        s += "\n"
        s += " solutionMap_%s[key] = ptr;\n" % problemType
        s += " return ptr;\n"
        s += " }\n"
        s += "}\n"

        # declare tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        s += _declareArgs(argListData, ") {\n")
        s += " TensileSolutionPointer_%s ptr = tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        s += _passArgs(argListStream, " ", ", ", ");")
        s += "\n"
        s += " if ( ptr ) {\n"
        s += " return ptr("
        s += _passArgs(argListData, "", ", ", ");\n")
        s += " } else {\n"
        s += " return tensileStatusFailure; // no solution found\n"
        s += " }\n"
        s += "}\n"

        # write this problem type's own source file
        if not mergeFiles:
            with open(os.path.join(outputPath, "Logic",
                                   "%s.cpp" % filePrefix), "w") \
                    as logicSourceFile:
                logicSourceFile.write(s)

    # write merged source
    if mergeFiles:
        with open(os.path.join(outputPath, "Tensile.cpp"), "w") \
                as logicSourceFile:
            logicSourceFile.write(s)

    with open(os.path.join(outputPath, "Tensile.h"), "w") as logicHeaderFile:
        logicHeaderFile.write(h)

    with open(os.path.join(outputPath, "TensileInternal.h"), "w") \
            as internalHeaderFile:
        internalHeaderFile.write(ih)
def writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName):
    """Write ``Generated.cmake`` listing the generated and static sources.

    Three CMake lists are emitted: ``TensileClient_SOLUTIONS``,
    ``TensileClient_KERNELS`` and ``TensileClient_SOURCE``. With
    ``MergeFiles`` set the first two name the merged ``Solutions`` /
    ``Kernels`` files; otherwise they name one ``.h``/``.cpp`` pair per
    solution and per kernel. Every file in *libraryStaticFiles* is copied
    from ``SourcePath`` into *outputPath* and listed in the third.

    *clientName* is accepted but not used here.
    """
    print1("# Writing Custom CMake")

    ##########################################################################
    # Min Naming
    ##########################################################################
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter(
        solutionMinNaming, solutionSerialNaming,
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource(
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly(
        kernelMinNaming, kernelSerialNaming)

    # The context manager closes the file even if a write or copy raises.
    with open(os.path.join(outputPath, "Generated.cmake"), "w") \
            as generatedFile:
        generatedFile.write(CMakeHeader)

        # write solution names
        generatedFile.write("set( TensileClient_SOLUTIONS\n")
        if globalParameters["MergeFiles"]:
            generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions.h\n")
            generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions.cpp\n")
        else:
            for solution in solutions:
                solutionName = solutionWriter.getSolutionName(solution)
                generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions/%s.h\n"
                                    % (solutionName))
                generatedFile.write(" ${CMAKE_SOURCE_DIR}/Solutions/%s.cpp\n"
                                    % (solutionName))
        generatedFile.write(" )\n")

        # write kernel names
        generatedFile.write("set( TensileClient_KERNELS\n")
        if globalParameters["MergeFiles"]:
            generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels.h\n")
            generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels.cpp\n")
        else:
            for kernel in kernels:
                # the writer is chosen by the kernel's language
                if kernel["KernelLanguage"] == "Source":
                    kernelName = kernelWriterSource.getKernelName(kernel)
                else:
                    kernelName = kernelWriterAssembly.getKernelName(kernel)
                generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels/%s.h\n"
                                    % (kernelName))
                generatedFile.write(" ${CMAKE_SOURCE_DIR}/Kernels/%s.cpp\n"
                                    % kernelName)
        generatedFile.write(" )\n")

        # static library files
        generatedFile.write("set( TensileClient_SOURCE\n")
        for fileName in libraryStaticFiles:
            # copy file
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], fileName),
                outputPath)
            # add file to cmake
            generatedFile.write(" ${CMAKE_SOURCE_DIR}/%s\n" % fileName)
        generatedFile.write(" )\n\n")
def TensileCreateLibrary():
    """Command-line entry point: build library sources from logic files.

    Parses ``LogicPath``, ``OutputPath``, ``RuntimeLanguage`` and the
    merge-files / short-file-names / library-print-debug toggles from
    ``sys.argv``, assigns them to the global parameters, reads every
    ``.yaml`` file directly inside ``LogicPath``, collects the unique
    solutions and kernels, and then writes the solutions and kernels, the
    CMake file and the selection logic into ``OutputPath``.

    Calls ``printExit`` when ``LogicPath`` does not exist.
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##########################################################################
    # Parse Command Line Arguments
    ##########################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath",
                           help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?",
                           choices=["OCL", "HIP", "HSA"])
    # each toggle is an enabling flag and a disabling flag on one destination
    toggleFlags = (
        ("--merge-files", "--no-merge-files", "MergeFiles"),
        ("--short-file-names", "--no-short-file-names", "ShortNames"),
        ("--library-print-debug", "--no-library-print-debug",
         "LibraryPrintDebug"),
    )
    for enableFlag, disableFlag, destination in toggleFlags:
        argParser.add_argument(enableFlag, dest=destination,
                               action="store_true")
        argParser.add_argument(disableFlag, dest=destination,
                               action="store_false")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {
        "RuntimeLanguage": args.RuntimeLanguage,
        "MergeFiles": args.MergeFiles,
        "ShortNames": args.ShortNames,
        "LibraryPrintDebug": args.LibraryPrintDebug,
        "CodeFromFiles": False,
    }
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = []
    for entry in os.listdir(logicPath):
        candidate = os.path.join(logicPath, entry)
        if os.path.isfile(candidate) \
                and os.path.splitext(entry)[1] == ".yaml":
            logicFiles.append(candidate)

    print1("# LibraryLogicFiles:")
    for logicFile in logicFiles:
        print1("# %s" % logicFile)

    ##########################################################################
    # Parse config files
    ##########################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule,
         indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append(
            (scheduleName, deviceNames, solutionsForSchedule, indexOrder,
             exactLogic, rangeLogic))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # unique kernels of all solutions, in first-seen order
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        for kernel in solution.getKernels():
            if kernel not in kernels:
                kernels.append(kernel)
        for kernel in solution.getKernelsBetaOnly():
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # create solution writer and kernel writer
    useSerialNaming = \
        globalParameters["ShortNames"] and not globalParameters["MergeFiles"]
    if useSerialNaming:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter(
        solutionMinNaming, solutionSerialNaming,
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource(
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly(
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
                             solutionWriter, kernelWriterSource,
                             kernelWriterAssembly)

    libraryStaticFiles = [
        "SolutionMapper.h",
        "TensileTypes.h",
        "KernelHeader.h",
        "SolutionHelper.cpp",
        "SolutionHelper.h",
        "Tools.cpp",
        "Tools.h",
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
        solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution source/header files below outputPath.

    When globalParameters["MergeFiles"] is set, all kernels go into
    Kernels.cpp / Kernels.h and all solutions into Solutions.cpp /
    Solutions.h. Otherwise every kernel and solution gets its own .cpp/.h
    pair inside the "Kernels" and "Solutions" sub-directories.

    Kernel source is generated by processKernelSourceChunk. With
    globalParameters["CpuThreads"] == 0 it is called in this process;
    otherwise chunks of kernels are handed to multiprocessing worker
    processes and the (err, src, header, kernelName) tuples they send back
    through a pipe are written out here. Kernels reported with an error are
    collected and passed on to solutionWriter.getSourceFileString.

    Calls printExit if any worker process ends with a non-zero exit code,
    and also when globalParameters["ExitAfterKernelGen"] is set.
    """
    start = time.time()
    print1("# Writing Kernels...")
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if showProgress:
        progressBar = ProgressBar(len(kernels))

    ##########################################################################
    # Write Kernels
    ##########################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
            kernelHeaderFile.write("\n\n")
            # device helper emitted into the header: packed 4 x int8 dot
            # product accumulated into c
            kernelHeaderFile.write(
                "__device__ inline int GenDot4(int a, int b, int c) { \n")
            kernelHeaderFile.write(
                " typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
            kernelHeaderFile.write(
                " typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
            kernelHeaderFile.write(" PkInt8x4 va, vb; va.i = a; vb.i = b;\n")
            kernelHeaderFile.write(
                " return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n"
            )
            kernelHeaderFile.write("\n\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    prepAsm()

    # Number of worker processes; 0 means "generate in this process".
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count * 4 if globalParameters["CpuThreads"] == -1 \
            else globalParameters["CpuThreads"]
    else:
        # Original note: without CodeFromFiles the code is merged into the
        # same file, which is not safe to do concurrently.
        cpus = 1

    # "//" keeps the chunk size an integer (same result as "/" on ints in
    # Python 2) so it can be used as an index bound.
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)"
          % (cpus, workPerCpu))

    kiStart = 0
    threads = []
    if cpus and showProgress:
        processLaunchProgressBar = ProgressBar(len(kernels))
    else:
        processLaunchProgressBar = None
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        if cpus:
            parentConn, child = multiprocessing.Pipe()
            args = (kernels, kernelWriterSource, kernelWriterAssembly,
                    kiStart, kiStop, child)
            t = multiprocessing.Process(
                target=processKernelSourceChunk, args=args)
            t.start()
            child.close()  # close child pipe in the parent process
            threads.append((t, kiStart, kiStop, parentConn))
            if processLaunchProgressBar:
                processLaunchProgressBar.increment(kiStop - kiStart)
            else:
                sys.stderr.write(
                    " # launched process %s for kernels %d..%d\n"
                    % (t, kiStart, kiStop - 1))
        else:
            # non-threaded version
            # NOTE(review): whatever processKernelSourceChunk returns is not
            # used on this path, so no kernel text is written here for these
            # kernels -- confirm this is intended.
            processKernelSourceChunk(kernels, kernelWriterSource,
                                     kernelWriterAssembly,
                                     kiStart, kiStop, None)
        kiStart += workPerCpu
    sys.stderr.write("# Waiting for kernel compilation processes...\n")

    someError = 0
    for (t, chunkStart, chunkStop, parentConn) in threads:
        # Start from an empty list so a failed recv() cannot re-use (and
        # re-write) the results of the previous chunk.
        results = []
        try:
            results = parentConn.recv()
        except EOFError as pipeErr:
            print("*** warning: process %s returned pipe EOF %s %s"
                  % (t, t, pipeErr))

        t.join()
        e = t.exitcode
        if e != 0:
            print("*** warning: process %s returned %s %s" % (t, t, e))
            someError = 1
            results = []

        if showProgress:
            progressBar.increment(chunkStop - chunkStart)
        for (err, src, header, kernelName) in results:
            if err:
                kernelsWithBuildErrs[kernelName] = err

            if mergeFiles:
                kernelSourceFile.write(src)
                kernelHeaderFile.write(header)
            else:
                # write kernel.cpp
                cppPath = os.path.join(outputPath, "Kernels",
                                       kernelName + ".cpp")
                with open(cppPath, "w") as cppFile:
                    cppFile.write(CHeader)
                    cppFile.write(src)
                # write kernel.h
                hPath = os.path.join(outputPath, "Kernels",
                                     kernelName + ".h")
                with open(hPath, "w") as hFile:
                    hFile.write(CHeader)
                    hFile.write(header)

    if someError:
        print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
        printExit("** kernel compilation failure **")

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(
                outputPath, "Kernels", kernelName + ".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string (it is concatenated with ".cpp" above),
            # so it has to be formatted with %s
            print("*** warning: invalid kernel#%s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(
                outputPath, "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (both files, so buffered text reaches disk)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    stop = time.time()
    print("# Kernel Building elapsed time = %.1f secs" % (stop - start))

    print1("# Writing Solutions")
    if showProgress:
        progressBar = ProgressBar(len(solutions))

    ##########################################################################
    # Write Solutions
    ##########################################################################
    if mergeFiles:
        solutionSourceFile = open(
            os.path.join(outputPath, "Solutions.cpp"), "w")
        solutionHeaderFile = open(
            os.path.join(outputPath, "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionSourceFile.write("#include <algorithm>\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")

    for solution in solutions:
        if mergeFiles:
            solutionSourceFile.write(
                solutionWriter.getSourceFileString(
                    solution, kernelsWithBuildErrs))
            solutionHeaderFile.write(
                solutionWriter.getHeaderFileString(solution))
        else:
            # get solution name
            solutionFileName = solutionWriter.getSolutionName(solution)
            # write solution.cpp
            cppPath = os.path.join(outputPath, "Solutions",
                                   solutionFileName + ".cpp")
            with open(cppPath, "w") as cppFile:
                cppFile.write(CHeader)
                cppFile.write(solutionWriter.getSourceFileString(
                    solution, kernelsWithBuildErrs))
            # write solution.h
            hPath = os.path.join(outputPath, "Solutions",
                                 solutionFileName + ".h")
            with open(hPath, "w") as hFile:
                hFile.write(CHeader)
                hFile.write(solutionWriter.getHeaderFileString(solution))
        if showProgress:
            progressBar.increment()

    # close merged; per-solution files were already closed inside the loop
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
def TensileCreateLibrary():
    """Command-line entry point: turn library logic files into a library.

    Parses LogicPath, OutputPath and RuntimeLanguage plus the optional
    flags from sys.argv, stores them through assignGlobalParameters, reads
    every *.yaml file directly inside LogicPath with
    YAMLIO.readLibraryLogicForSchedule, and then writes the solutions,
    kernels, CMake file and logic into OutputPath.

    Calls printExit when LogicPath does not exist. A --isa value that does
    not contain "gfx" followed by three digits only produces a warning.
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##########################################################################
    # Parse Command Line Arguments
    ##########################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?",
                           choices=["OCL", "HIP", "HSA"])
    # paired on/off switches that share one destination
    for (dest, onFlag, offFlag) in (
            ("MergeFiles", "--merge-files", "--no-merge-files"),
            ("ShortNames", "--short-file-names", "--no-short-file-names"),
            ("LibraryPrintDebug", "--library-print-debug",
             "--no-library-print-debug"),
    ):
        argParser.add_argument(onFlag, dest=dest, action="store_true")
        argParser.add_argument(offFlag, dest=dest, action="store_false")
    argParser.add_argument(
        "--isa", dest="isa", action="append",
        help="which architectures for assembly kernels to target")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    if args.isa:
        isaFormatWarning = "isa parameter must be formed as: --isa gfx803"
        newISA = []
        for isa in args.isa:
            gfxIdx = isa.find("gfx")
            if gfxIdx < 0:
                printWarning(isaFormatWarning)
                continue
            # the three characters after "gfx" are the one-digit major,
            # minor and step numbers
            digits = [isa[gfxIdx + offset:gfxIdx + offset + 1]
                      for offset in (3, 4, 5)]
            try:
                isaTuple = tuple([int(digit) for digit in digits])
            except ValueError:
                # missing or non-numeric digit (e.g. "gfx9", "gfx90a"):
                # warn like any other malformed value instead of aborting
                # with a traceback
                printWarning(isaFormatWarning)
                continue
            if isaTuple in globalParameters["SupportedISA"] \
                    and isaTuple not in newISA:
                print1("# User-Specified ISA: gfx%u%u%u" % isaTuple)
                newISA.append(isaTuple)
        arguments["SupportedISA"] = newISA
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath)
                  if (os.path.isfile(os.path.join(logicPath, f))
                      and os.path.splitext(f)[1] == ".yaml")]

    # the heading has no placeholder; the files are listed one per line below
    print1("# LibraryLogicFiles:")
    for logicFile in logicFiles:
        print1("# %s" % logicFile)

    ##########################################################################
    # Parse config files
    ##########################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule,
         indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames,
                                       solutionsForSchedule, indexOrder,
                                       exactLogic, rangeLogic))
        # collect each distinct solution once, in first-seen order
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # collect the distinct kernels used by the solutions
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # if any kernels are assembly, append every ISA supported
    if globalParameters["RuntimeLanguage"] == "HIP":
        supportedISA = globalParameters["SupportedISA"]
        newKernels = []
        for kernel in kernels:
            if kernel["KernelLanguage"] == "Assembly":
                # the existing kernel takes the first ISA; one copy is
                # added for every further ISA
                kernel["ISA"] = supportedISA[0]
                for i in range(1, len(supportedISA)):
                    newKernel = deepcopy(kernel)
                    newKernel["ISA"] = supportedISA[i]
                    newKernels.append(newKernel)
            else:
                kernel["ISA"] = (0, 0, 0)
        kernels.extend(newKernels)

    # create solution writer and kernel writer
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter(
        solutionMinNaming, solutionSerialNaming,
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource(
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly(
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
                             solutionWriter, kernelWriterSource,
                             kernelWriterAssembly)

    libraryStaticFiles = [
        "TensileTypes.h",
        "KernelHeader.h",
        "SolutionHelper.cpp",
        "SolutionHelper.h",
        "Tools.cpp",
        "Tools.h",
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
def main(config):
    """Set up, build and run the library client; return its exit code.

    Copies the client sources into <LibraryClientPath>/source, reads every
    *.yaml logic file found directly in the library logic directory, writes
    the generated client parameters, then writes the run script into
    <LibraryClientPath>/build and executes it. The config argument is not
    used by this function.
    """
    libraryLogicPath = os.path.join(globalParameters["WorkingPath"],
                                    globalParameters["LibraryLogicPath"])
    pushWorkingPath(globalParameters["LibraryClientPath"])

    ##########################################################################
    # Copy Source Files
    ##########################################################################
    pushWorkingPath("source")
    clientSources = [
        "Client.cpp",
        "Client.h",
        "DeviceStats.h",
        "ReferenceCPU.h",
        "TensorUtils.h",
        "MathTemplates.cpp",
        "MathTemplates.h",
        "KernelHeader.h",
        "Tools.h",
        "CMakeLists.txt",
        "TensileConfig.cmake",
        "TensileConfigVersion.cmake",
    ]
    # the CMake find-modules depend on the runtime language
    if globalParameters["RuntimeLanguage"] == "OCL":
        findModules = ["FindOpenCL.cmake"]
    else:
        findModules = ["FindHIP.cmake", "FindHCC.cmake"]
    sourceDir = globalParameters["SourcePath"]
    destinationDir = globalParameters["WorkingPath"]
    for fileName in clientSources + findModules:
        shutil_copy(os.path.join(sourceDir, fileName), destinationDir)

    ##########################################################################
    # Read Logic Files
    ##########################################################################
    logicFiles = []
    for entry in os.listdir(libraryLogicPath):
        candidate = os.path.join(libraryLogicPath, entry)
        if not os.path.isfile(candidate):
            continue
        if os.path.splitext(entry)[1] != ".yaml":
            continue
        logicFiles.append(candidate)
    print1("LogicFiles: %s" % logicFiles)

    functions = []
    functionNames = []
    halfFlags = []
    for logicFileName in logicFiles:
        (scheduleName, _deviceNames, problemType, _solutionsForType,
         _indexOrder, _exactLogic, _rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        halfFlags.append(problemType["DataType"].isHalf())
        functions.append((scheduleName, problemType))
        functionNames.append("tensile_%s" % (problemType))
    # half support is enabled as soon as one problem type uses it
    globalParameters["EnableHalf"] = True if any(halfFlags) else False

    ##########################################################################
    # Write Generated Header
    ##########################################################################
    # positional arguments: forBenchmark, solutions, problemSizes, stepName
    writeClientParameters(False, None, None, None, functions)
    popWorkingPath()  # source

    ##########################################################################
    # Run Build Script
    ##########################################################################
    # if redo=true, clobber the build directory
    if globalParameters["ForceRedoLibraryClient"]:
        staleBuildDir = os.path.join(globalParameters["WorkingPath"], "build")
        rmtree(staleBuildDir, ignore_errors=True)
    pushWorkingPath("build")

    # write runScript (forBenchmark=False)
    buildDir = globalParameters["WorkingPath"]
    runScriptName = writeRunScript(buildDir, libraryLogicPath, False)

    # run runScript
    process = Popen(runScriptName, cwd=globalParameters["WorkingPath"])
    process.communicate()
    exitCode = process.returncode
    if exitCode:
        printWarning(
            "ClientWriter Benchmark Process exited with code %u" % exitCode)
    popWorkingPath()  # build

    popWorkingPath()  # LibraryClient

    return process.returncode
def writeSolutionsAndKernels(outputPath, problemTypes, solutions, kernels,
        kernelsBetaOnly, solutionWriter, kernelWriterSource,
        kernelWriterAssembly):
    """Write kernel and solution source/header files below outputPath.

    When globalParameters["MergeFiles"] is set, all kernels go into
    Kernels.cpp / Kernels.h and all solutions into Solutions.cpp /
    Solutions.h, and Solutions.h additionally receives one
    TensileSolutionPointer_<problemType> typedef per entry of problemTypes.
    Otherwise per-kernel and per-solution files are written into the
    "Kernels" and "Solutions" sub-directories.

    Kernel source is generated by processKernelSourceChunk, either in this
    process (globalParameters["CpuThreads"] == 0) or in multiprocessing
    worker processes whose results are read back through a pipe. Each batch
    of results is handed to processResults together with the merged kernel
    files (None when files are not merged).

    Calls printExit if any worker process ends with a non-zero exit code,
    and also when globalParameters["ExitAfterKernelGen"] is set.
    """
    start = time.time()
    print1("# Writing Kernels...")
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if showProgress:
        progressBar = ProgressBar(len(kernels))

    ##########################################################################
    # Write Kernels
    ##########################################################################
    # Bound up front: both names are passed to processResults even when no
    # merged file is opened.
    kernelSourceFile = None
    kernelHeaderFile = None
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            # device helper emitted into the header: packed 4 x int8 dot
            # product accumulated into c, with a compiler-version switch
            for headerLine in (
                    "// Also set env var HCC_ENABLE_PRINTF=1 for printf\n",
                    "#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n",
                    "#include <hip/hip_runtime.h>\n",
                    "#include \"TensileTypes.h\"\n",
                    "#include \"KernelHeader.h\"\n",
                    "\n\n",
                    "__device__ inline int GenDot4(int a, int b, int c) { \n",
                    "#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n",
                    " typedef union { int32_t i; char4 z; } PkInt8x4;\n",
                    "#else\n",
                    " typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n",
                    " typedef union { int32_t i; C4I8 z; } PkInt8x4;\n",
                    "#endif\n",
                    " PkInt8x4 va, vb; va.i = a; vb.i = b;\n",
                    "#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n",
                    " return amd_mixed_dot(va.z, vb.z, c, true); }\n",
                    "#else\n",
                    " return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n",
                    "#endif\n",
                    "\n\n",
            ):
                kernelHeaderFile.write(headerLine)
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    prepAsm()

    # Number of worker processes; 0 means "generate in this process".
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpuThreads = globalParameters["CpuThreads"]
        # negative value: multiple of the core count; positive value:
        # absolute count capped at the core count
        cpus = cpu_count * abs(cpuThreads) if cpuThreads < 0 \
            else min(cpu_count, cpuThreads)
    else:
        # Original note: without CodeFromFiles the code is merged into the
        # same file, which is not safe to do concurrently.
        cpus = 1

    # "//" keeps the chunk size an integer (same result as "/" on ints in
    # Python 2) so it can be used as an index bound.
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1

    kiStart = 0
    threads = []
    if cpus and showProgress:
        print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)"
              % (cpus, workPerCpu))
        processLaunchProgressBar = ProgressBar(len(kernels))
    else:
        print("# Compiling kernels (no multiprocessing, cpus=%u #kernels=%u)"
              % (cpus, workPerCpu))
        processLaunchProgressBar = None

    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        if cpus:
            parentConn, child = multiprocessing.Pipe()
            args = (kernels, kernelWriterSource, kernelWriterAssembly,
                    kiStart, kiStop, child)
            t = multiprocessing.Process(
                target=processKernelSourceChunk, args=args)
            t.start()
            child.close()  # close child pipe in the parent process
            threads.append((t, kiStart, kiStop, parentConn))
            if processLaunchProgressBar:
                processLaunchProgressBar.increment(kiStop - kiStart)
            else:
                sys.stderr.write(
                    " # launched process %s for kernels %d..%d\n"
                    % (t, kiStart, kiStop - 1))
        else:
            # non-threaded version
            results = processKernelSourceChunk(
                kernels, kernelWriterSource, kernelWriterAssembly,
                kiStart, kiStop, None)
            if showProgress:
                progressBar.increment(kiStop - kiStart)
            processResults(results, outputPath, kernelsWithBuildErrs,
                           kernelSourceFile, kernelHeaderFile)
        kiStart += workPerCpu
    sys.stderr.write("# Waiting for kernel compilation processes...\n")

    someError = 0
    if cpus:
        for (t, chunkStart, chunkStop, parentConn) in threads:
            # Start from an empty list so a failed recv() cannot re-use
            # (and re-write) the results of the previous chunk.
            results = []
            try:
                results = parentConn.recv()
            except EOFError as pipeErr:
                print("*** warning: process %s returned pipe EOF %s %s"
                      % (t, t, pipeErr))

            t.join()
            e = t.exitcode
            if e != 0:
                print("*** warning: process %s returned %s %s" % (t, t, e))
                someError = 1
                results = []

            if showProgress:
                progressBar.increment(chunkStop - chunkStart)
            processResults(results, outputPath, kernelsWithBuildErrs,
                           kernelSourceFile, kernelHeaderFile)

    if someError:
        print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
        printExit("** kernel compilation failure **")

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(
                outputPath, "Kernels", kernelName + ".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string (it is concatenated with ".cpp" above),
            # so it has to be formatted with %s
            print("*** warning: invalid kernel#%s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(
                outputPath, "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (both files, so buffered text reaches disk)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    stop = time.time()
    print("# Kernel Building elapsed time = %.1f secs" % (stop - start))

    print1("# Writing Solutions")
    if showProgress:
        progressBar = ProgressBar(len(solutions))

    ##########################################################################
    # Write Solutions
    ##########################################################################
    if mergeFiles:
        solutionSourceFile = open(
            os.path.join(outputPath, "Solutions.cpp"), "w")
        solutionHeaderFile = open(
            os.path.join(outputPath, "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionSourceFile.write("#include <algorithm>\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")

    # Write a solution pointer typedef for each problemType:
    typedefParts = []
    for problemType in problemTypes:
        argListAll = solutionWriter.getArgList(
            problemType, True, True, True, True)
        # declare TensileSolutionPointer_ProblemType
        typedefParts.append("\n// solution pointer\n")
        typedefParts.append(
            "typedef TensileStatus (*TensileSolutionPointer_%s)(\n"
            % problemType)
        lastIdx = len(argListAll) - 1
        for i, arg in enumerate(argListAll):
            # every argument but the last ends with a comma; the last one
            # closes the typedef
            typedefParts.append(
                " %s %s%s" % (arg[0], arg[1],
                              ",\n" if i < lastIdx else ");\n\n"))
        typedefParts.append("\n")
    if mergeFiles:
        # Only the merged Solutions.h is open at this point.
        # NOTE(review): with MergeFiles off there is no header to receive
        # the typedefs here; the write is skipped rather than failing on an
        # unbound name -- confirm where non-merged builds should get them.
        solutionHeaderFile.write("".join(typedefParts))

    for solution in solutions:
        if mergeFiles:
            solutionSourceFile.write(
                solutionWriter.getProblemSourceString(
                    solution["ProblemType"], solution, kernelsWithBuildErrs))
            solutionHeaderFile.write(
                solutionWriter.getHeaderFileString(solution))
        else:
            # get solution name
            solutionFileName = solutionWriter.getSolutionName(solution)
            # write solution.cpp
            cppPath = os.path.join(outputPath, "Solutions",
                                   solutionFileName + ".cpp")
            with open(cppPath, "w") as cppFile:
                cppFile.write(CHeader)
                cppFile.write(solutionWriter.getProblemSourceString(
                    solution["ProblemType"], solution, kernelsWithBuildErrs))
            # write solution.h
            hPath = os.path.join(outputPath, "Solutions",
                                 solutionFileName + ".h")
            with open(hPath, "w") as hFile:
                hFile.write(CHeader)
                hFile.write(solutionWriter.getHeaderFileString(solution))
        if showProgress:
            progressBar.increment()

    # close merged; per-solution files were already closed inside the loop
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
        solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write solution and kernel source/header files below outputPath.

    When globalParameters["MergeFiles"] is set, all solutions go into
    Solutions.cpp / Solutions.h and all kernels into Kernels.cpp /
    Kernels.h. Otherwise every solution and kernel gets its own .cpp/.h
    pair inside the "Solutions" and "Kernels" sub-directories.

    Kernels whose "KernelLanguage" is "Source" are rendered with
    kernelWriterSource, all others with kernelWriterAssembly; beta-only
    kernels always use kernelWriterSource. One progress bar covers the
    solutions and the regular kernels.
    """
    print1("# Writing Solutions and Kernels")
    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    progressBar = ProgressBar(len(solutions) + len(kernels))

    ##########################################################################
    # Write Solutions
    ##########################################################################
    if mergeFiles:
        solutionSourceFile = open(
            os.path.join(outputPath, "Solutions.cpp"), "w")
        solutionHeaderFile = open(
            os.path.join(outputPath, "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")

    for solution in solutions:
        if mergeFiles:
            solutionSourceFile.write(
                solutionWriter.getSourceFileString(solution))
            solutionHeaderFile.write(
                solutionWriter.getHeaderFileString(solution))
        else:
            # get solution name
            solutionFileName = solutionWriter.getSolutionName(solution)
            # write solution.cpp
            cppPath = os.path.join(outputPath, "Solutions",
                                   solutionFileName + ".cpp")
            with open(cppPath, "w") as cppFile:
                cppFile.write(CHeader)
                cppFile.write(solutionWriter.getSourceFileString(solution))
            # write solution.h
            hPath = os.path.join(outputPath, "Solutions",
                                 solutionFileName + ".h")
            with open(hPath, "w") as hFile:
                hFile.write(CHeader)
                hFile.write(solutionWriter.getHeaderFileString(solution))
        progressBar.increment()

    # close merged; per-solution files were already closed inside the loop
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    ##########################################################################
    # Write Kernels
    ##########################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for kernel in kernels:
        if kernel["KernelLanguage"] == "Source":
            kernelWriter = kernelWriterSource
        else:
            kernelWriter = kernelWriterAssembly

        if mergeFiles:
            kernelSourceFile.write(kernelWriter.getSourceFileString(kernel))
            kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        else:
            # get kernel name
            kernelName = kernelWriter.getKernelName(kernel)
            # write kernel.cpp
            cppPath = os.path.join(outputPath, "Kernels",
                                   kernelName + ".cpp")
            with open(cppPath, "w") as cppFile:
                cppFile.write(CHeader)
                cppFile.write(kernelWriter.getSourceFileString(kernel))
            # write kernel.h
            hPath = os.path.join(outputPath, "Kernels", kernelName + ".h")
            with open(hPath, "w") as hFile:
                hFile.write(CHeader)
                hFile.write(kernelWriter.getHeaderFileString(kernel))
        progressBar.increment()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        # the name is looked up in both modes, as before
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        if mergeFiles:
            kernelSourceFile.write(
                kernelWriter.getSourceFileStringBetaOnly(kernel))
            kernelHeaderFile.write(
                kernelWriter.getHeaderFileStringBetaOnly(kernel))
        else:
            # write kernel.cpp
            cppPath = os.path.join(outputPath, "Kernels",
                                   kernelName + ".cpp")
            with open(cppPath, "w") as cppFile:
                cppFile.write(CHeader)
                cppFile.write(
                    kernelWriter.getSourceFileStringBetaOnly(kernel))
            # write kernel.h
            hPath = os.path.join(outputPath, "Kernels", kernelName + ".h")
            with open(hPath, "w") as hFile:
                hFile.write(CHeader)
                hFile.write(
                    kernelWriter.getHeaderFileStringBetaOnly(kernel))

    # close merged (both files, so buffered text reaches disk)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()
def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig,
                         problemSizeGroupIdx):
    """Run every benchmark step for one problem type / problem-size group.

    A BenchmarkProcess is built from the two configs. For each of its steps
    this function:
      1. refreshes the step's hardcoded parameters from the winners so far,
      2. copies the client sources into a temporary source directory,
      3. enumerates candidate solutions (hardcoded x benchmark permutations,
         merged with earlier winners and default parameters),
      4. writes the benchmark files and runs the generated run script, unless
         the step's results CSV already exists and
         globalParameters["ForceRedoBenchmarkProblems"] is false,
      5. reads the results back and records the winners and solutions YAML.

    Parameters:
      problemTypeConfig: problem type config, passed to BenchmarkProcess.
      problemSizeGroupConfig: size-group config, passed to BenchmarkProcess.
      problemSizeGroupIdx: integer index, used in the working directory name.

    Returns:
      (resultsFileBaseFinal, benchmarkTestFails): the results-file base path
      of the step for which isFinal() is true (None if there was none) and
      the number of steps whose run script exited with a non-zero code.
    """
    benchmarkTestFails = 0

    # convert config to full benchmark process (resolves defaults)
    print1("")
    print1(HR)
    print1("# Converting Config to BenchmarkProcess Object")
    print1(HR)
    print1("")
    benchmarkProcess = BenchmarkProcess(problemTypeConfig,
                                        problemSizeGroupConfig)

    problemTypeName = str(benchmarkProcess.problemType)
    problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx)
    pushWorkingPath(problemSizeGroupName)
    ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))

    totalBenchmarkSteps = len(benchmarkProcess)
    resultsFileBaseFinal = None
    winners = WinningParameterDict()
    print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
    print1("")
    print1(HR)
    print1("# Done Creating BenchmarkProcess Object")
    print1(HR)

    ##########################################################################
    # For Each Benchmark Step
    ##########################################################################
    for benchmarkStepIdx in range(0, totalBenchmarkSteps):

        benchmarkStep = benchmarkProcess[benchmarkStepIdx]
        if winners.winners == {}:
            # perf optimization to skip the initial winners creation
            # this helps a little here but really helps below with avoiding
            # the super-expensive removeHardcoded step below - that can use a
            # fast-path to create winners when needed.
            print1(
                "# Empty winners - use fast initialization of hardcodedParameters"
            )
            resultingHardcodedParameterList = benchmarkStep.hardcodedParameters
        else:
            resultingHardcodedParameterList = \
                winners.wpdUpdate(benchmarkStep.hardcodedParameters)

        benchmarkStep.hardcodedParameters = resultingHardcodedParameterList
        numHardcoded = len(benchmarkStep.hardcodedParameters)
        stepName = str(benchmarkStep)
        shortName = benchmarkStep.abbreviation()
        print1("\n")
        print1(HR)
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("# BenchmarkStep: %s - %s %.3fs"
               % (problemSizeGroupName, stepName, elapsedTime))
        print1("# NumProblems: %u"
               % benchmarkStep.problemSizes.totalProblemSizes)
        print1("# BenchmarkParameters:")
        for paramName in benchmarkStep.benchmarkParameters:
            paramValues = benchmarkStep.benchmarkParameters[paramName]
            printStr = "# %s = { %s" % (paramName, paramValues[0])
            for paramValueIdx in range(1, len(paramValues)):
                printStr += ", %s" % str(paramValues[paramValueIdx])
            printStr += " }"
            print1(printStr)

        if False:
            # Debug aid, intentionally disabled:
            # print hardcoded parameters and their winners
            print1("# HardcodedParameters | WinningParameters:")
            paramDictIdx = 0
            hardcodedMinNaming = \
                Solution.getMinNaming(benchmarkStep.hardcodedParameters)
            for paramDict in benchmarkStep.hardcodedParameters:
                winningParameters = winners[paramDict]
                print1("# (%u) %s | %s" % (
                    paramDictIdx,
                    Solution.getNameMin(paramDict, hardcodedMinNaming),
                    Solution.getNameFull(winningParameters)))
                paramDictIdx += 1

        pushWorkingPath(shortName)

        ######################################################################
        # Copy Files to Benchmark Source Directory
        ######################################################################
        stepBaseDir = globalParameters["WorkingPath"]
        sourceDir = os.path.join(stepBaseDir, "source")
        ensurePath(sourceDir)
        pushWorkingPath("sourceTmp")
        filesToCopy = [
            "SolutionMapper.h",
            "Client.cpp",
            "Client.h",
            "CMakeLists.txt",
            "DeviceStats.h",
            "TensorUtils.h",
            "MathTemplates.cpp",
            "MathTemplates.h",
            "TensileTypes.h",
            "tensile_bfloat16.h",
            "KernelHeader.h",
            "ReferenceCPU.h",
            "SolutionHelper.cpp",
            "SolutionHelper.h",
            "Tools.cpp",
            "Tools.h",
        ]

        for f in filesToCopy:
            shutil_copy(os.path.join(globalParameters["SourcePath"], f),
                        globalParameters["WorkingPath"])
        if globalParameters["RuntimeLanguage"] == "OCL":
            shutil_copy(
                os.path.join(globalParameters["SourcePath"],
                             "FindOpenCL.cmake"),
                globalParameters["WorkingPath"])
        else:
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"),
                globalParameters["WorkingPath"])
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"),
                globalParameters["WorkingPath"])

        ######################################################################
        # Enumerate Benchmark Permutations
        ######################################################################
        solutions = []
        totalBenchmarkPermutations = 1
        for benchmarkParamName in benchmarkStep.benchmarkParameters:
            totalBenchmarkPermutations *= len(
                benchmarkStep.benchmarkParameters[benchmarkParamName])
        maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded
        print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)"
               % (maxPossibleSolutions, numHardcoded,
                  totalBenchmarkPermutations))

        # Decode each permutation index as a mixed-radix number, one digit
        # per benchmark parameter.
        benchmarkPermutations = []
        for i in range(0, totalBenchmarkPermutations):
            permutation = {}
            pIdx = i
            for benchmarkParamName in benchmarkStep.benchmarkParameters:
                benchmarkParamValues = deepcopy(
                    benchmarkStep.benchmarkParameters[benchmarkParamName])
                valueIdx = pIdx % len(benchmarkParamValues)
                permutation[benchmarkParamName] = \
                    benchmarkParamValues[valueIdx]
                # Explicit floor division: pIdx must stay an integer so it
                # can keep being used to index lists.
                pIdx //= len(benchmarkParamValues)
            benchmarkPermutations.append(permutation)

        ######################################################################
        # Enumerate Solutions = Hardcoded * Benchmark
        ######################################################################
        print1("# Enumerating Solutions")
        if globalParameters["PrintLevel"] >= 1:
            progressBar = ProgressBar(maxPossibleSolutions)
        solutionSet = set()  # avoid duplicates for nlca=-1, 1
        for hardcodedIdx in range(0, numHardcoded):
            solutions.append([])
            hardcodedParamDict = benchmarkStep.hardcodedParameters[
                hardcodedIdx]
            for benchmarkPermutation in benchmarkPermutations:
                solution = {
                    "ProblemType": deepcopy(benchmarkProcess.problemType.state)
                }
                solution.update(benchmarkPermutation)
                solution.update(hardcodedParamDict)
                if benchmarkStepIdx > 0:
                    winningParameters = winners[hardcodedParamDict]
                    if winningParameters is None:
                        # this is a joined parameter that didn't have a
                        # winner, that's okay
                        continue
                    solution.update(winningParameters)

                # append default parameters where necessary
                for initialSolutionParameterName \
                        in benchmarkStep.initialSolutionParameters:
                    if initialSolutionParameterName not in solution:
                        solution[initialSolutionParameterName] = \
                            benchmarkStep.initialSolutionParameters[
                                initialSolutionParameterName]
                # TODO check if solution matches problem size for exact tile
                # kernels
                solutionObject = Solution(solution)
                if solutionObject["Valid"]:
                    if solutionObject not in solutionSet:
                        solutionSet.add(solutionObject)
                        solutions[hardcodedIdx].append(solutionObject)
                else:
                    if globalParameters["PrintSolutionRejectionReason"]:
                        print1("rejecting solution %s" % str(solutionObject))
                if globalParameters["PrintLevel"] >= 1:
                    progressBar.increment()

        # remove hardcoded that don't have any valid benchmarks
        removeHardcoded = []
        for hardcodedIdx in range(0, numHardcoded):
            if len(solutions[hardcodedIdx]) == 0:
                hardcodedParamDict = benchmarkStep.hardcodedParameters[
                    hardcodedIdx]
                removeHardcoded.append(hardcodedParamDict)
        removesExist = len(removeHardcoded) > 0
        for hardcodedParam in removeHardcoded:
            benchmarkStep.hardcodedParameters.remove(hardcodedParam)

        if removesExist:
            print1(
                "# Updating winners since enumeration removed unused hardcoded solutions. removeHardcoded=%u winners=%u"
                % (len(removeHardcoded), len(winners.winners)))
            winners.wpdUpdate(benchmarkStep.hardcodedParameters)
            if globalParameters["PrintLevel"] >= 1:
                print1("")
            numHardcoded = len(benchmarkStep.hardcodedParameters)
            # remove from solution 2D list also; filter in place in a single
            # pass so the remaining groups keep their relative order
            solutions[:] = [
                solutionGroup for solutionGroup in solutions
                if len(solutionGroup) != 0
            ]
        elif winners.winners == {}:
            print1("# Populating initial winners (%u solutions)\n"
                   % len(benchmarkStep.hardcodedParameters))
            for hcParm in benchmarkStep.hardcodedParameters:
                winners.winners[FrozenDictionary(hcParm)] = [{}, -1]

        # NOTE(review): this prints the number of hardcoded groups, not the
        # flattened solution count - confirm which one is intended.
        print1("# Actual Solutions: %u / %u\n"
               % (len(solutions), maxPossibleSolutions))

        # create linear list
        solutionList = []
        for solutionsForHardcoded in solutions:
            solutionList.extend(solutionsForHardcoded)
        if len(solutionList) == 0:
            msg = "Your parameters resulted in 0 valid solutions."
            if globalParameters["PrintSolutionRejectionReason"]:
                msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected."
            else:
                msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected."
            printExit(msg)
        if globalParameters["PrintLevel"] >= 1:
            for i, solutionsForHardcoded in enumerate(solutions):
                for j, solution in enumerate(solutionsForHardcoded):
                    print2("# (%u:%u) %s"
                           % (i, j, Solution.getNameFull(solution)))
            print2(HR)

        # write benchmarkFiles
        writeBenchmarkFiles(stepBaseDir, solutionList,
                            benchmarkStep.problemSizes, shortName,
                            filesToCopy)

        # Only touch files in "source" whose contents changed.
        print1("# Copying files that differ from sourceTmp -> source")
        sourceTmp = globalParameters["WorkingPath"]
        for f in os.listdir(sourceTmp):
            f0 = os.path.join(sourceTmp, f)
            f1 = os.path.join(sourceDir, f)
            if os.path.isdir(f0):
                if os.path.isdir(f1):
                    shutil.rmtree(f1, True)
                shutil.copytree(f0, f1)
            elif not os.path.exists(f1) or not filecmp.cmp(f0, f1):
                shutil.copy(f0, f1)
        shutil.rmtree(sourceTmp, True)

        popWorkingPath()  # source

        ######################################################################
        # Run Benchmark Script
        ######################################################################
        resultsFileBase = os.path.normpath(os.path.join(
            globalParameters["WorkingPath"], "../Data", shortName))
        if benchmarkStep.isFinal():
            resultsFileBaseFinal = resultsFileBase
        resultsFileName = resultsFileBase + ".csv"
        solutionsFileName = resultsFileBase + ".yaml"
        if not os.path.exists(resultsFileName) or \
                globalParameters["ForceRedoBenchmarkProblems"]:
            pushWorkingPath("build")

            # write runScript
            libraryLogicPath = None
            path = globalParameters["WorkingPath"]
            forBenchmark = True
            runScriptName = writeRunScript(path, libraryLogicPath,
                                           forBenchmark)

            # run runScript
            process = Popen(runScriptName,
                            cwd=globalParameters["WorkingPath"])
            process.communicate()
            if process.returncode:
                benchmarkTestFails += 1
                printWarning(
                    "BenchmarkProblems: Benchmark Process exited with code %u"
                    % process.returncode)
            popWorkingPath()  # build
        else:
            print1("# Already benchmarked; skipping.")

        ######################################################################
        # Winners -> Determined Parameters
        ######################################################################
        results = getResults(resultsFileName, solutions)
        print2("CSV Results: %s" % results)
        winners.addResults(benchmarkStep.hardcodedParameters,
                           benchmarkPermutations, solutions, results)

        ######################################################################
        # Write Solutions YAML
        ######################################################################
        YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes,
                              solutions)

        # End Iteration
        popWorkingPath()  # stepName
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("%s\n# %s\n# %s: End - %.3fs\n%s\n"
               % (HR, problemSizeGroupName, shortName, elapsedTime, HR))

    popWorkingPath()  # ProblemType
    return (resultsFileBaseFinal, benchmarkTestFails)
def convertParametersToSteps(self):
    """Turn the parsed parameter groups into an ordered list of steps.

    Runs the stages in order:
      (II-1) benchmark common parameters,
      (II-2) fork: enumerate every combination of the fork parameter values
             and hand them to forkHardcodedParameters,
      (II-3) benchmark fork parameters,
      (II-4) join: enumerate the combinations of the joined parameters
             (forked parameters and/or the synthetic "MacroTile" parameter)
             and hand them to joinHardcodedParameters,
      (II-5) benchmark join parameters,
      (II-6) append one final BenchmarkStep per entry in
             self.benchmarkFinalParameters.

    Exits through printExit when a join parameter is neither single-valued,
    forked, nor "MacroTile".
    """
    banner = \
        "####################################################################"

    print2("")
    print2(banner)
    print1("# Convert Parameters to Steps")
    print2(banner)
    print2("")

    ##########################################################################
    # (II-1) benchmark common parameters
    print2("")
    print2(banner)
    print1("# Benchmark Common Parameters")
    self.addStepsForParameters(self.benchmarkCommonParameters)

    ##########################################################################
    # (II-2) fork parameters
    # calculate permutations of
    print2("")
    print2(banner)
    print1("# Fork Parameters")
    print2(self.forkParameters)
    totalPermutations = 1
    for param in self.forkParameters:
        for name in param:  # only 1
            values = param[name]
            totalPermutations *= len(values)
    forkPermutations = []
    for i in range(0, totalPermutations):
        permutation = {}
        forkPermutations.append(permutation)
        # Decode i as a mixed-radix number, one digit per fork parameter.
        pIdx = i
        for param in self.forkParameters:
            for name in param:
                values = param[name]
                valueIdx = pIdx % len(values)
                permutation[name] = values[valueIdx]
                # Explicit floor division keeps pIdx usable as an index.
                pIdx //= len(values)
    if len(forkPermutations) > 0:
        self.forkHardcodedParameters(forkPermutations)

    ##########################################################################
    # (II-3) benchmark fork parameters
    print2("")
    print2(banner)
    print1("# Benchmark Fork Parameters")
    self.addStepsForParameters(self.benchmarkForkParameters)

    ##########################################################################
    # (II-4.1) join parameters
    # answer should go in hard-coded parameters
    # does it remove the prior forks? Yes.
    print2("")
    print2(banner)
    print1("# Join Parameters")
    macroTileJoinSet = set()
    totalPermutations = 1
    if len(self.joinParameters) > 0:
        for joinName in self.joinParameters:
            # joining a parameter with only a single value
            if hasParam(joinName, self.singleValueParameters):
                pass
            elif hasParam(joinName, self.forkParameters):
                # count permutations
                for param in self.forkParameters:
                    for name in param:  # only 1
                        if name == joinName:
                            values = param[name]
                            localPermutations = len(values)
                            print2("JoinParameter %s has %u possibilities"
                                   % (joinName, localPermutations))
                            totalPermutations *= localPermutations

            ##################################################################
            # (II-4.2) Join MacroTile
            elif joinName == "MacroTile":
                print2("JoinParam: MacroTile")
                # get possible WorkGroupEdges from forked
                print2("currentForkParameters = %s"
                       % str(self.forkParameters))
                threadTileValues = []
                workGroupValues = []
                # todo having MacroTile as join parameter causes trouble if
                # one parameter is benchmarked rather than forked
                # however, this may still be the right way to do it

                # count permutations; a later list in this sequence
                # overrides values found in an earlier one
                for paramList in [self.benchmarkCommonParameters,
                                  self.forkParameters,
                                  self.benchmarkForkParameters,
                                  self.benchmarkJoinParameters,
                                  self.singleValueParameters]:
                    if hasParam("ThreadTile", paramList):
                        threadTileValues = getParamValues(
                            "ThreadTile", paramList)
                    if hasParam("WorkGroup", paramList):
                        workGroupValues = getParamValues(
                            "WorkGroup", paramList)
                macroTilePermutations = \
                    len(workGroupValues) * len(threadTileValues)
                print2("# Total JoinMacroTile Permutations: %u"
                       % macroTilePermutations)

                # enumerate permutations; the set collapses WorkGroup x
                # ThreadTile pairs that yield the same macro tile
                for i in range(0, macroTilePermutations):
                    pIdx = i
                    workGroupIdx = pIdx % len(workGroupValues)
                    pIdx //= len(workGroupValues)
                    threadTileIdx = pIdx % len(threadTileValues)

                    workGroup = workGroupValues[workGroupIdx]
                    threadTile = threadTileValues[threadTileIdx]

                    macroTile0 = workGroup[0] * threadTile[0]
                    macroTile1 = workGroup[1] * threadTile[1]
                    macroTileJoinSet.add((macroTile0, macroTile1))
                totalPermutations *= len(macroTileJoinSet)
                print2("JoinMacroTileSet(%u): %s"
                       % (len(macroTileJoinSet), macroTileJoinSet))

            # invalid join parameter
            else:
                validJoinNames = ["MacroTile"]
                for validParam in self.forkParameters:
                    for validName in validParam:  # only 1
                        validJoinNames.append(validName)
                printExit("JoinParameter \"%s\" not in %s"
                          % (joinName, validJoinNames))

        ######################################################################
        # (II-4.4) Enumerate Permutations Other * MacroTile * DepthU
        macroTiles = list(macroTileJoinSet)
        print2("# TotalJoinPermutations = %u" % (totalPermutations))
        joinPermutations = []
        for i in range(0, totalPermutations):
            permutation = {}
            joinPermutations.append(permutation)
            pIdx = i
            for joinName in self.joinParameters:
                if hasParam(joinName, self.forkParameters):
                    for paramDict in self.forkParameters:
                        # hardcodedPermutations
                        if joinName in paramDict:
                            paramValues = paramDict[joinName]
                            valueIdx = pIdx % len(paramValues)
                            permutation[joinName] = paramValues[valueIdx]
                            pIdx //= len(paramValues)
                            break
                elif joinName == "MacroTile":
                    valueIdx = pIdx % len(macroTiles)
                    pIdx //= len(macroTiles)
                    permutation["MacroTile0"] = macroTiles[valueIdx][0]
                    permutation["MacroTile1"] = macroTiles[valueIdx][1]
        if len(joinPermutations) > 0:
            self.joinHardcodedParameters(joinPermutations)

    ##########################################################################
    # (II-5) benchmark join parameters
    print2("")
    print2(banner)
    print1("# Benchmark Join Parameters")
    self.addStepsForParameters(self.benchmarkJoinParameters)

    ##########################################################################
    # (II-6) benchmark final
    print2("")
    print2(banner)
    print1("# Benchmark Final")
    for problemSizesDict in self.benchmarkFinalParameters:
        problemSizes = problemSizesDict["ProblemSizes"]
        self.currentProblemSizes = ProblemSizes(self.problemType,
                                                problemSizes)
        # The final step benchmarks no parameters of its own.
        currentBenchmarkParameters = {}
        benchmarkStep = BenchmarkStep(
            self.hardcodedParameters,
            currentBenchmarkParameters,
            self.initialSolutionParameters,
            self.currentProblemSizes,
            self.benchmarkStepIdx)
        self.benchmarkSteps.append(benchmarkStep)
        self.benchmarkStepIdx += 1
def assignDerivedParameters(state):
    """Derive and validate the dependent solution parameters in `state`.

    `state` is a dict-like solution description that is updated in place.
    The function returns None in every case; the outcome is reported through
    two keys:

      state["Valid"]  set to False as soon as a parameter combination is
                      rejected. The reason is printed when
                      globalParameters["PrintSolutionRejectionReason"] is set.
      state["AssignedDerivedParameters"]
                      set to True only when the function runs to the end;
                      when it is already True on entry nothing after the
                      problem-independent step is recomputed. Early
                      rejections leave it False. The final LoopUnroll checks
                      can clear "Valid" while still marking it True.

    Derived in order: A/B aliases of the 0/1 tile parameters, VectorWidth,
    global-write vector widths, DepthU (searched when the requested value is
    negative), global-load vector widths and load counts, the coalesced /
    perpendicular load split for A and B, load sizes, LDS layout and size,
    and LoopUnroll.

    All quotients here are integer quantities, so floor division (`//`) is
    spelled out instead of relying on Python 2's classic `/`.
    """
    Solution.assignProblemIndependentDerivedParameters(state)
    if "AssignedDerivedParameters" in state:
        if state["AssignedDerivedParameters"]:
            return
    state["AssignedDerivedParameters"] = False

    ProblemType.assignDerivedParameters(state["ProblemType"])
    if not state["Valid"]:
        return

    # Map the 0/1 tile parameters onto A/B according to Tensor0.
    if state["ProblemType"]["Tensor0"] == 0:
        state["ThreadTileA"] = state["ThreadTile0"]
        state["ThreadTileB"] = state["ThreadTile1"]
        state["SubGroupA"] = state["SubGroup0"]
        state["SubGroupB"] = state["SubGroup1"]
        state["MacroTileA"] = state["MacroTile0"]
        state["MacroTileB"] = state["MacroTile1"]
    else:
        state["ThreadTileB"] = state["ThreadTile0"]
        state["ThreadTileA"] = state["ThreadTile1"]
        state["SubGroupB"] = state["SubGroup0"]
        state["SubGroupA"] = state["SubGroup1"]
        state["MacroTileB"] = state["MacroTile0"]
        state["MacroTileA"] = state["MacroTile1"]

    # VectorWidth: a value < 1 asks for an automatic choice, starting from
    # 4 / numRegisters and halving until it divides both thread tiles.
    if state["VectorWidth"] < 1:
        # true division on purpose: the result is passed through int()
        state["VectorWidth"] = int(
            4 / state["ProblemType"]["DataType"].numRegisters())
        while state["ThreadTile0"] % state["VectorWidth"] != 0 \
                or state["ThreadTile1"] % state["VectorWidth"] != 0:
            state["VectorWidth"] //= 2
    # TT0,1 both must be multiples of VW, b/c of rC, rA, rB
    if state["ThreadTile0"] % state["VectorWidth"] != 0 \
            or state["ThreadTile1"] % state["VectorWidth"] != 0:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("ThreadTile0 %u or ThreadTile1 %u not a multiple of VectorWidth %u"
                   % (state["ThreadTile0"], state["ThreadTile1"],
                      state["VectorWidth"]))
        state["Valid"] = False
        return

    # LocalSplitU too large?
    numElementsPerWorkGroup = state["MacroTile0"] * state["MacroTile1"]
    if numElementsPerWorkGroup < state["NumThreads"]:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("NumElementsPerWorkGroup %u < NumThreads %u; reduce LocalSplitU"
                   % (numElementsPerWorkGroup, state["NumThreads"]))
        state["Valid"] = False
        return
    state["NumElementsPerThread"] = \
        numElementsPerWorkGroup // state["NumThreads"]
    state["GlobalWriteVectorWidth"] = min(state["VectorWidth"],
                                          state["NumElementsPerThread"])
    if state["NumElementsPerThread"] % state["GlobalWriteVectorWidth"] != 0:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("LSU NumElementsPerThread %u not divisible into GWVW %u"
                   % (state["NumElementsPerThread"],
                      state["GlobalWriteVectorWidth"]))
        state["Valid"] = False
        return
    state["NumGlobalWriteVectorsPerThread"] = \
        state["NumElementsPerThread"] // state["GlobalWriteVectorWidth"]

    # LocalSplitU but can't NumThreads%MacroTile doesn't support sideways
    # store
    if state["LocalSplitU"] > 1:
        if state["NumThreads"] % state["MacroTile0"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("LocalSplitU but NumThreads=%u not divisible by MT0=%u for sideways store"
                       % (state["NumThreads"], state["MacroTile0"]))
            state["Valid"] = False
            return
        if state["MacroTile0"] * state["MacroTile1"] \
                % state["NumThreads"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("LocalSplitU but MT0*MT1=%u elements doesn't divide into NumThreads=%u"
                       % (state["MacroTile0"] * state["MacroTile1"],
                          state["NumThreads"]))
            state["Valid"] = False
            return

    # GlobalSplitU restrictions
    if state["GlobalSplitU"] > 1:
        if not state["GlobalSplitUSummationAssignmentRoundRobin"] \
                and state["LoopTail"]:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("GlobalSplitU and LoopTail require SummationAssignmentRoundRobin=True since strongly breaks Tensile kernel architecture")
            state["Valid"] = False
            return
        if not state["ProblemType"]["DataType"].isSingle():
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("GlobalSplitU only compatible with single precision")
            state["Valid"] = False
            return

    ########################################
    # Initial DepthU
    ########################################
    userDepthU = state["DepthU"]
    # DepthU == -1 means glvw=1
    if state["DepthU"] == -1:
        if state["MacroTile0"] != state["MacroTile1"]:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("DepthU=-1 requires square MacroTile")
            state["Valid"] = False
            return

    if userDepthU < 0:
        # negative request: search upward from 2 to MaxDepthU
        depthU = 2
        maxDepthU = globalParameters["MaxDepthU"]
    else:
        # explicit request: only that single value is tried
        depthU = userDepthU
        maxDepthU = userDepthU

    ########################################
    # Search DepthU
    ########################################
    while True:  # exit criteria at end
        validDepthU = True

        # how many elements to load
        if state["ProblemType"]["TLUA"]:
            totalElementsCoalescedA = state["MacroTile0"]
            totalElementsPerpA = depthU
        else:
            totalElementsCoalescedA = depthU
            totalElementsPerpA = state["MacroTile0"]

        if state["ProblemType"]["TLUB"]:
            totalElementsCoalescedB = state["MacroTile1"]
            totalElementsPerpB = depthU
        else:
            totalElementsCoalescedB = depthU
            totalElementsPerpB = state["MacroTile1"]

        totalElementsA = totalElementsCoalescedA * totalElementsPerpA
        totalElementsB = totalElementsCoalescedB * totalElementsPerpB

        # convert elements to vectors based on VectorWidth
        totalVectorsCoalescedA = \
            totalElementsCoalescedA // state["VectorWidth"]
        totalVectorsCoalescedB = \
            totalElementsCoalescedB // state["VectorWidth"]
        totalVectorsA = totalElementsA // state["VectorWidth"]
        totalVectorsB = totalElementsB // state["VectorWidth"]

        if totalVectorsA < state["NumThreads"]:
            # fewer vectors than threads: split each vector across PVA
            # threads
            state["PVA"] = state["NumThreads"] // totalVectorsA  # partial vector
            if state["NumThreads"] % totalVectorsA != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("NumThreads %u %% totalVectorsA %u != 0"
                           % (state["NumThreads"], totalVectorsA))
                validDepthU = False
            if state["PVA"] * totalVectorsA != state["NumThreads"]:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("PVA %u * totalVectorsA %u != NumThreads %u"
                           % (state["PVA"], totalVectorsA,
                              state["NumThreads"]))
                validDepthU = False
            if state["VectorWidth"] % state["PVA"] != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("VectorWidth %u %% PVA %u != 0"
                           % (state["VectorWidth"], state["PVA"]))
                validDepthU = False
        else:
            state["PVA"] = 1  # partial vector
            if totalVectorsA % state["NumThreads"] != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("totalVectorsA %u %% NumThreads %u != 0"
                           % (totalVectorsA, state["NumThreads"]))
                validDepthU = False
            if state["VectorWidth"] % state["PVA"] != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("VectorWidth %u %% PVA %u != 0"
                           % (state["VectorWidth"], state["PVA"]))
                validDepthU = False
        state["GlobalLoadVectorWidthA"] = \
            state["VectorWidth"] // state["PVA"]
        state["NumLoadsA"] = \
            totalVectorsA * state["PVA"] // state["NumThreads"]

        if totalVectorsB < state["NumThreads"]:
            state["PVB"] = state["NumThreads"] // totalVectorsB  # partial vector
            if state["NumThreads"] % totalVectorsB != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("NumThreads %u %% totalVectorsB %u != 0"
                           % (state["NumThreads"], totalVectorsB))
                validDepthU = False
            if state["PVB"] * totalVectorsB != state["NumThreads"]:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("PVB %u * totalVectorsB %u != NumThreads %u"
                           % (state["PVB"], totalVectorsB,
                              state["NumThreads"]))
                validDepthU = False
            if state["VectorWidth"] % state["PVB"] != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("VectorWidth %u %% PVB %u != 0"
                           % (state["VectorWidth"], state["PVB"]))
                validDepthU = False
        else:
            state["PVB"] = 1  # partial vector
            if totalVectorsB % state["NumThreads"] != 0 \
                    or state["VectorWidth"] % state["PVB"] != 0:
                if globalParameters["PrintSolutionRejectionReason"]:
                    print1("totalVectorsB %u %% NumThreads %u != 0"
                           % (totalVectorsB, state["NumThreads"]))
                validDepthU = False
        state["GlobalLoadVectorWidthB"] = \
            state["VectorWidth"] // state["PVB"]
        state["NumLoadsB"] = \
            totalVectorsB * state["PVB"] // state["NumThreads"]

        # The negative DepthU request selects which load vector widths are
        # acceptable.
        if userDepthU == -1:  # no vectors
            if state["GlobalLoadVectorWidthA"] != 1 \
                    or state["GlobalLoadVectorWidthB"] != 1:
                validDepthU = False
        elif userDepthU == -2:
            # at least one of A/B must load full vectors
            if max(state["GlobalLoadVectorWidthA"],
                   state["GlobalLoadVectorWidthB"]) \
                    < state["VectorWidth"]:
                validDepthU = False
        elif userDepthU <= -3:
            # both A and B must load full vectors
            if min(state["GlobalLoadVectorWidthA"],
                   state["GlobalLoadVectorWidthB"]) \
                    < state["VectorWidth"]:
                validDepthU = False

        if not state["ProblemType"]["TLUA"]:
            if depthU < state["GlobalLoadVectorWidthA"]:
                validDepthU = False

        if not state["ProblemType"]["TLUB"]:
            if depthU < state["GlobalLoadVectorWidthB"]:
                validDepthU = False

        # this depthU is valid, done unless user wants to double (for TN)
        if validDepthU:
            if userDepthU < -3:
                # for every int below -3, use next doubled value
                userDepthU += 1
                depthU *= 2
                continue
            else:  # use this found value
                state["DepthU"] = depthU
                break

        # this depthU not valid
        else:
            # keep looking
            if depthU < maxDepthU:
                depthU += 2
                continue
            # give up
            else:
                state["Valid"] = False
                return
    ########################################
    # end DepthU loop
    ########################################

    # nlca = 1: pick the smallest coalesced load count that divides evenly
    if state["NumLoadsCoalescedA"] == 1:
        foundValid = False
        for nlca in range(1, state["NumLoadsA"] + 1):
            nlpa = state["NumLoadsA"] // nlca
            if state["NumLoadsA"] % nlca == 0 \
                    and totalVectorsCoalescedA % nlca == 0 \
                    and totalElementsPerpA % nlpa == 0:
                state["NumLoadsCoalescedA"] = nlca
                state["NumLoadsPerpendicularA"] = nlpa
                foundValid = True
                break
        if not foundValid:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("No NumLoadsCoalescedA=1 found")
            state["Valid"] = False
            return

    # nlca = -1: pick the largest coalesced load count that divides evenly
    elif state["NumLoadsCoalescedA"] == -1:
        foundValid = False
        for nlca in range(state["NumLoadsA"], 0, -1):
            nlpa = state["NumLoadsA"] // nlca
            if state["NumLoadsA"] % nlca == 0 \
                    and totalVectorsCoalescedA % nlca == 0 \
                    and totalElementsPerpA % nlpa == 0:
                state["NumLoadsCoalescedA"] = nlca
                state["NumLoadsPerpendicularA"] = nlpa
                foundValid = True
                break
        if not foundValid:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("No NumLoadsCoalescedA=-1 found")
            state["Valid"] = False
            return

    # nlca = other: the caller's explicit value must divide evenly
    else:
        if state["NumLoadsCoalescedA"] > state["NumLoadsA"]:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("NLCA > NLA")
            state["Valid"] = False
            return
        state["NumLoadsPerpendicularA"] = \
            state["NumLoadsA"] // state["NumLoadsCoalescedA"]

        if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("numLoadsA %u %% numLoadsParaA %u != 0"
                       % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
            state["Valid"] = False
            # stop here like every other rejection (and like the B branch)
            return
        if totalVectorsCoalescedA % state["NumLoadsCoalescedA"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("totalVectorsCoalescedA %u %% numLoadsParaA %u != 0"
                       % (totalVectorsCoalescedA,
                          state["NumLoadsCoalescedA"]))
            state["Valid"] = False
            return
        if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("totalElementsPerpA %u %% numLoadsPerpA %u != 0"
                       % (totalElementsPerpA,
                          state["NumLoadsPerpendicularA"]))
            state["Valid"] = False
            return

    # nlcb = 1
    if state["NumLoadsCoalescedB"] == 1:
        foundValid = False
        for nlcb in range(1, state["NumLoadsB"] + 1):
            nlpb = state["NumLoadsB"] // nlcb
            if state["NumLoadsB"] % nlcb == 0 \
                    and totalVectorsCoalescedB % nlcb == 0 \
                    and totalElementsPerpB % nlpb == 0:
                state["NumLoadsCoalescedB"] = nlcb
                state["NumLoadsPerpendicularB"] = nlpb
                foundValid = True
                break
        if not foundValid:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("No NumLoadsCoalescedB=1 found")
            state["Valid"] = False
            return

    # nlcb = -1
    elif state["NumLoadsCoalescedB"] == -1:
        foundValid = False
        for nlcb in range(state["NumLoadsB"], 0, -1):
            nlpb = state["NumLoadsB"] // nlcb
            if state["NumLoadsB"] % nlcb == 0 \
                    and totalVectorsCoalescedB % nlcb == 0 \
                    and totalElementsPerpB % nlpb == 0:
                state["NumLoadsCoalescedB"] = nlcb
                state["NumLoadsPerpendicularB"] = nlpb
                foundValid = True
                break
        if not foundValid:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("No NumLoadsCoalescedB=-1 found")
            state["Valid"] = False
            return

    # nlcb = other
    else:
        if state["NumLoadsCoalescedB"] > state["NumLoadsB"]:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("NLCB > NLB")
            state["Valid"] = False
            return

        state["NumLoadsPerpendicularB"] = \
            state["NumLoadsB"] // state["NumLoadsCoalescedB"]

        if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("numLoadsB %u %% numLoadsParaB %u != 0"
                       % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
            state["Valid"] = False
            return
        if totalVectorsCoalescedB % state["NumLoadsCoalescedB"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("totalVectorsCoalescedB %u %% numLoadsParaB %u != 0"
                       % (totalVectorsCoalescedB,
                          state["NumLoadsCoalescedB"]))
            state["Valid"] = False
            return
        if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
            if globalParameters["PrintSolutionRejectionReason"]:
                print1("totalElementsPerpB %u %% numLoadsPerpB %u != 0"
                       % (totalElementsPerpB,
                          state["NumLoadsPerpendicularB"]))
            state["Valid"] = False
            return

    # Load sizes along the coalesced (LSC*) and perpendicular (LSP*)
    # directions.
    if state["ProblemType"]["TLUA"]:
        state["LSCA"] = state["MacroTileA"] // state["NumLoadsCoalescedA"]
        state["LSPA"] = state["DepthU"] // state["NumLoadsPerpendicularA"]
    else:
        state["LSCA"] = state["DepthU"] // state["NumLoadsCoalescedA"]
        state["LSPA"] = \
            state["MacroTileA"] // state["NumLoadsPerpendicularA"]

    if state["ProblemType"]["TLUB"]:
        state["LSCB"] = state["MacroTileB"] // state["NumLoadsCoalescedB"]
        state["LSPB"] = state["DepthU"] // state["NumLoadsPerpendicularB"]
    else:
        state["LSCB"] = state["DepthU"] // state["NumLoadsCoalescedB"]
        state["LSPB"] = \
            state["MacroTileB"] // state["NumLoadsPerpendicularB"]

    state["LVCA"] = state["LSCA"] // state["GlobalLoadVectorWidthA"]
    state["LVCB"] = state["LSCB"] // state["GlobalLoadVectorWidthB"]
    state["LVPA"] = state["LSPA"] // state["GlobalLoadVectorWidthA"]
    state["LVPB"] = state["LSPB"] // state["GlobalLoadVectorWidthB"]

    # lds buffer size for A, B
    # true division on purpose: the result is passed through int()
    ldsAlign = int(64 / state["ProblemType"]["DataType"].numRegisters())
    ldsNumElementsA = \
        state["DepthU"] * (state["MacroTile0"] + state["LdsPad"])
    # round up to a multiple of ldsAlign
    ldsNumElementsAlignedA = \
        ((ldsNumElementsA + ldsAlign - 1) // ldsAlign) * ldsAlign
    ldsNumElementsB = \
        state["DepthU"] * (state["MacroTile1"] + state["LdsPad"])
    ldsNumElementsAlignedB = \
        ((ldsNumElementsB + ldsAlign - 1) // ldsAlign) * ldsAlign
    # todo, can the alignment be a power of 2?
    if state["PrefetchGlobalRead"]:
        state["LdsNumElementsAlignedA"] = ldsNumElementsAlignedA
        state["LdsNumElementsAlignedB"] = ldsNumElementsAlignedB
        state["LdsOffsetA"] = 0
        state["LdsOffsetB"] = \
            state["LdsOffsetA"] + state["LdsNumElementsAlignedA"]

        # second A/B block starts at the next power of two past the first
        offsetBlk = state["LdsOffsetB"] + state["LdsNumElementsAlignedB"]
        offsetBlk = int(2**(ceil(log(offsetBlk, 2))))

        state["LdsOffsetA_Blk"] = offsetBlk
        state["LdsOffsetB_Blk"] = \
            state["LdsOffsetA_Blk"] + state["LdsNumElementsAlignedA"]
        ldsNumElementsAB = state["LdsOffsetB_Blk"] + ldsNumElementsB
    else:
        state["LdsOffsetB"] = ldsNumElementsAlignedA
        ldsNumElementsAB = ldsNumElementsAlignedA + ldsNumElementsB

    # lds buffer size for reduction
    if state["LocalSplitU"] > 1:
        ldsNumElementsReduction = \
            state["LocalSplitU"] * state["MacroTile0"] * state["MacroTile1"]
    else:
        ldsNumElementsReduction = 0

    # lds max occupancy
    ldsSizeOccupancy = globalParameters["DeviceLDS"] // state["MaxOccupancy"]
    ldsNumElementsOccupancy = \
        ldsSizeOccupancy // state["ProblemType"]["DataType"].numBytes()

    # lds size is the greatest of the three requirements
    ldsNumElements = max(ldsNumElementsAB, ldsNumElementsReduction,
                         ldsNumElementsOccupancy)
    state["LdsNumElements"] = ldsNumElements
    ldsSize = ldsNumElements * state["ProblemType"]["DataType"].numBytes()
    if ldsSize > globalParameters["MaxLDS"]:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("Kernel Uses %u > %u bytes of LDS"
                   % (ldsSize, globalParameters["MaxLDS"]))
        state["Valid"] = False
        return

    # LoopUnroll = DepthU / LocalSplitU
    if "LocalSplitU" in state and "DepthU" in state:
        state["LoopUnroll"] = state["DepthU"] // state["LocalSplitU"]
    # DepthU must be an exact multiple of LocalSplitU
    if state["LoopUnroll"] * state["LocalSplitU"] != state["DepthU"]:
        state["Valid"] = False

    # LoopUnroll too small
    if state["LoopUnroll"] < 2:
        if globalParameters["PrintSolutionRejectionReason"]:
            print1("LoopUnroll %u is less than 2"
                   % (state["LoopUnroll"]))
        state["Valid"] = False

    state["AssignedDerivedParameters"] = True
def writeLogic(outputPath, logicData, solutionWriter ):
    """Generate the C++ library-logic sources and write them under outputPath.

    Three texts are accumulated as strings and written at the end:
      * ``h``  -> ``Tensile.h``          (public declarations per problem type)
      * ``ih`` -> ``TensileInternal.h``  (currently only includes Tensile.h)
      * ``s``  -> ``Tensile.cpp`` when globalParameters["MergeFiles"] is set,
                  otherwise one ``Logic/Tensile_<problemType>.cpp`` per
                  problem type (``s`` is restarted for every problem type).

    Args:
        outputPath: directory that receives the generated files.
        logicData: mapping iterated by problem type; each value is a sequence
            of schedule tuples read positionally as (scheduleName,
            deviceNames, solutions, indexOrder, exactLogic, rangeLogic).
        solutionWriter: object providing getArgList() and getSolutionName().
    """

    def declareArgs(argList, closing):
        """Format (type, name) pairs one per line, ending with `closing`."""
        lastIdx = len(argList) - 1
        return "".join(
            " %s %s%s" % (arg[0], arg[1], ",\n" if idx < lastIdx else closing)
            for idx, arg in enumerate(argList))

    print1("# Writing Library Logic")

    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"
    h += "#include \"SolutionHelper.h\"\n"
    h += "#include \"SolutionMapper.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"

    # Tensile.cpp
    s = ""
    s += "#include \"Solutions.h\"\n"
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"SolutionMapper.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:

        # function argument list
        argListSizes = solutionWriter.getArgList(problemType, False, False, False, False)
        argListData = solutionWriter.getArgList(problemType, False, True, True, True)
        argListAll = solutionWriter.getArgList(problemType, True, True, True, True)

        # tensile initializer
        h += "\nvoid tensileInitialize();\n\n"

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        h += declareArgs(argListData, ");\n\n")

        numSizes = problemType["TotalIndices"]
        firstStride = 0 if problemType["UseInitialStrides"] else 1
        lastStrideA = len(problemType["IndexAssignmentsA"])
        lastStrideB = len(problemType["IndexAssignmentsB"])
        lastStrideC = problemType["NumIndicesC"]
        h += "typedef ProblemKey<%u> ProblemKey_%s;\n" % (numSizes,problemType)
        h += "typedef ProblemDims<%u,%u,%u,%u,%u> ProblemDims_%s;\n" \
            % (firstStride, lastStrideC, lastStrideA, lastStrideB, numSizes, problemType)
        h += "typedef SolutionMapper<ProblemDims_%s, ProblemKey_%s> SolutionMapper_%s;\n" \
            % (problemType, problemType, problemType)

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "SolutionMapper_%s::SolutionRuntime *\n" % (problemType)
        h += "tensileGetSolutionPointer_%s(\n" % (problemType)
        h += declareArgs(argListSizes, ");\n\n")

        # declare tensileName_
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        h += declareArgs(argListSizes, ");\n\n")

        # unique solutions used by any schedule of this problem type
        solutionsForProblemType = []
        for scheduleTuple in logicData[problemType]:
            solutionsForSchedule = scheduleTuple[2]
            for solution in solutionsForSchedule:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = [
            solutionWriter.getSolutionName(solution)
            for solution in solutionsForProblemType]

        # reset problemType source: without MergeFiles every problem type
        # gets its own translation unit, so `s` starts over here
        if not globalParameters["MergeFiles"]:
            filePrefix = "Tensile_%s" % (problemType)
            s = "#include \"TensileTypes.h\"\n"
            # was `s = ...`, which silently dropped the TensileTypes.h include
            s += "#include \"Tensile.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # Per-problem constants here:
        # These are common for all schedules and thus do not include schedule name (vega,hip,etc)
        s += "\n"
        s += "/*******************************************************************************\n"
        s += "* Per-Problem Functions for %s\n" % problemType
        s += "*******************************************************************************/\n"

        s += "// Problem type include the index assignments for free, summation, batch:\n"
        s += "static const ProblemType problemType_%s( " % problemType
        s += listToInitializer(problemType["IndicesFree"]) + ", "
        s += listToInitializer(problemType["IndicesSummation"]) + ", "
        s += listToInitializer(problemType["IndicesBatch"])
        s += ");\n"

        s += "\n"
        s += "// Master solution mapper is the entry point for problem->solution mapping\n"
        s += "// There is one master solution mapper per problem type\n"
        s += "// The master solution mapper contains pointers to the solution mappers for each device\n"
        s += "static MasterSolutionMapper<ProblemDims_%s> masterSolutionMapper_%s;\n " % (problemType,problemType)

        ########################################
        # implement per-Schedule functions in source
        s += "\n"
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in logicData[problemType]:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            deviceNames = scheduleTuple[1]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = [
                solutionWriter.getSolutionName(solution)
                for solution in solutionsForSchedule]

            s += "\n\n"
            schedProbName = "%s_%s" % (scheduleName, problemType)
            s += writeSolutionAndExactTable(scheduleName, deviceNames, schedProbName, problemType, \
                solutionsForSchedule, solutionNamesForSchedule, exactLogic)

        # Per-problem function here:
        # function tensileGetSolutionPointer_ProblemType
        # The per-schedule names are dropped so they cannot be used by accident
        # below; indexOrder, exactLogic, rangeLogic and the solution lists
        # intentionally keep the values of the LAST schedule processed.
        del schedProbName
        del scheduleName
        s += "\n// problem dims -> solution logic\n"
        s += "SolutionMapper_%s::SolutionRuntime *\n" % (problemType)
        s += "tensileGetSolutionPointer_%s(\n" % (problemType)
        s += declareArgs(argListSizes, ") {\n\n")

        exactLogicStr = writeExactLogic(problemType, indexOrder, \
            solutionsForSchedule, exactLogic, \
            solutionNamesForSchedule, True)
        if rangeLogic is not None:
            print("** warning: ignored ranges in logic file, these should have been expanded with ExpandRanges=1 during Tensile phase 3")
        s += " /* exact mappings */\n"
        s += exactLogicStr
        s += "\n return nullptr;\n"
        s += "\n}\n"

        # function tensileGetSolutionName_Schedule_ProblemType
        s += "\n// get solution name for problem dims\n"
        s += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        s += declareArgs(argListSizes, ") {\n\n")

        exactLogicStr = writeExactLogic(problemType, indexOrder, \
            solutionsForSchedule, exactLogic, \
            solutionNamesForSchedule, False)
        s += " /* exact mappings */\n"
        s += exactLogicStr
        #s += " return NULL; // none\n"
        s += "\n}\n"

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        # declare tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        s += declareArgs(argListData, ") {\n")
        s += " auto solution = tensileGetSolutionPointer_%s(\n" % (problemType)
        lastSizeIdx = len(argListSizes) - 1
        for i, arg in enumerate(argListSizes):
            s += " %s%s" \
                % (arg[1], ", " if i < lastSizeIdx else ");")
        s += "\n"
        s += " if (solution) {\n"
        s += " TensileSolutionPointer_%s f = reinterpret_cast<TensileSolutionPointer_%s> (solution->_info->_functionPtr);\n" \
            % (problemType, problemType)
        s += " auto solutionLock = &solution->_lock;\n"
        s += " return f("
        lastAllIdx = len(argListAll) - 1
        for i, arg in enumerate(argListAll):
            s += "%s%s" \
                % (arg[1], ", " if i < lastAllIdx else ");\n")
        s += " } else {\n"
        #s += " printf(\"solution not valid, returning fail\\n\");"
        s += " return tensileStatusFailure; // no solution found\n"
        s += " }\n"
        s += "}\n"

        # write this problem type's own source file
        if not globalParameters["MergeFiles"]:
            with open(os.path.join(outputPath, "Logic", \
                    "%s.cpp" % filePrefix), "w") as logicSourceFile:
                logicSourceFile.write(s)

    s += "\n"
    s += writeTensileInitialize(logicData)

    # merged source: everything accumulated in `s` goes to a single file
    if globalParameters["MergeFiles"]:
        with open(os.path.join(outputPath, "Tensile.cpp"), "w") as logicSourceFile:
            logicSourceFile.write(s)

    with open(os.path.join(outputPath, "Tensile.h"), "w") as logicHeaderFile:
        logicHeaderFile.write(h)

    with open(os.path.join(outputPath, "TensileInternal.h"), "w") as internalHeaderFile:
        internalHeaderFile.write(ih)
def Tensile(userArgs):
    """Entry point: parse arguments, set up globalParameters, run the config.

    Steps, in order:
      1. print the splash banner,
      2. reset globalParameters to defaultGlobalParameters,
      3. parse `userArgs` and read the YAML config it names,
      4. apply the config's "GlobalParameters" section (or an empty one),
      5. apply command-line overrides on top of the config,
      6. call executeStepsInConfig(config).

    Args:
        userArgs: list of command-line argument strings (without the program
            name), handed to argparse.
    """
    # 1st half of splash
    print1("")
    print1(HR)
    print1("#")
    print1("# Tensile v%s" % (__version__))

    # setup argument parser
    argParser = argparse.ArgumentParser()
    argParser.add_argument("config_file", \
        help="benchmark config.yaml file")
    argParser.add_argument("output_path", \
        help="path where to conduct benchmark")
    argParser.add_argument("--version", action="version", \
        version="%(prog)s {version}".format(version=__version__))
    argParser.add_argument("-d", "--device", dest="device", type=int, \
        help="override which device to benchmark")
    argParser.add_argument("-p", "--platform", dest="platform", type=int, \
        help="override which OpenCL platform to benchmark")
    argParser.add_argument("--runtime-language", dest="RuntimeLanguage", \
        choices=["HIP", "OCL"], help="override which runtime language to use")
    argParser.add_argument("--code-object-version", dest="CodeObjectVersion", \
        choices=["V2", "V3"], help="HSA code-object version")
    argParser.add_argument("-v", "--verbose", action="store_true", \
        help="set PrintLevel=2")
    argParser.add_argument("--debug", dest="debug", action="store_true", \
        help="set PrintLevel=2 and CMakeBuildType=Debug")
    argParser.add_argument("--short-names", dest="shortNames", action="store_true", \
        help="use serial kernel and solution names")
    argParser.add_argument("--no-merge-files", dest="noMergeFiles", action="store_true", \
        help="kernels and solutions written to individual files")
    # argParser.add_argument("--hcc-version", dest="HccVersion", \
    #     help="This can affect what opcodes are emitted by the assembler")

    print1("# Restoring default globalParameters")
    for key in defaultGlobalParameters:
        globalParameters[key] = defaultGlobalParameters[key]

    # parse arguments
    args = argParser.parse_args(userArgs)
    configPath = os.path.realpath(args.config_file)

    # 2nd half of splash
    print1("# Config: %s" % (configPath))
    print1("#")
    print1(HR)
    print1("")

    # read config
    config = YAMLIO.readConfig(configPath)
    globalParameters["ConfigPath"] = configPath

    # assign global parameters
    if "GlobalParameters" in config:
        assignGlobalParameters(config["GlobalParameters"])
    else:
        assignGlobalParameters({})

    globalParameters["WorkingPath"] = os.path.abspath(args.output_path)
    ensurePath(globalParameters["WorkingPath"])

    # override config with command-line options
    # device/platform are integer indices and argparse leaves them as None when
    # absent; test against None so that an explicit index 0 is still applied.
    if args.device is not None:
        print1("# Command-line override: Device")
        globalParameters["Device"] = args.device
    if args.platform is not None:
        print1("# Command-line override: Platform")
        globalParameters["Platform"] = args.platform
    if args.RuntimeLanguage:
        print1("# Command-line override: RuntimeLanguage")
        globalParameters["RuntimeLanguage"] = args.RuntimeLanguage
    if args.CodeObjectVersion:
        print1("# Command-line override: CodeObjectVersion")
        globalParameters["CodeObjectVersion"] = args.CodeObjectVersion
    if args.verbose:
        print1("# Command-line override: PrintLevel")
        globalParameters["PrintLevel"] = 2
    if args.debug:
        print1("# Command-line override: Debug")
        globalParameters["PrintLevel"] = 2
        globalParameters["CMakeBuildType"] = "Debug"
    if args.shortNames:
        globalParameters["ShortNames"] = True
    if args.noMergeFiles:
        globalParameters["MergeFiles"] = False

    print1("")

    # Execute Steps in the config script
    executeStepsInConfig(config)
def fillInMissingStepsWithDefaults(self, config):
    """Populate the benchmark phases from `config`, filling gaps with defaults.

    Sets self.benchmarkCommonParameters, self.forkParameters,
    self.benchmarkForkParameters, self.joinParameters,
    self.benchmarkJoinParameters and self.benchmarkFinalParameters. A default
    parameter is added to a phase only when hasParam() does not find it in
    the phases listed for that step. Parameters left with fewer than two
    values are moved out of the phases into self.hardcodedParameters[0],
    self.singleValueParameters and self.initialSolutionParameters.

    Note that a leading {"ProblemSizes": ...} step is deleted from the
    corresponding list taken from `config` (the list is modified in place).

    Args:
        config: mapping that may contain "BenchmarkCommonParameters",
            "ForkParameters", "BenchmarkForkParameters", "JoinParameters",
            "BenchmarkJoinParameters" and "BenchmarkFinalParameters".
    """

    def claimProblemSizes(steps, fallback):
        """Pop a leading ProblemSizes step from `steps`; else return fallback."""
        if steps is not None and len(steps) > 0 \
                and "ProblemSizes" in steps[0]:
            # user specified, so use it, remove it from config and insert later
            sizes = steps[0]["ProblemSizes"]
            del steps[0]
            return sizes
        return fallback

    print2("")
    print2( "####################################################################" )
    print1("# Filling in Parameters With Defaults")
    print2( "####################################################################" )
    print2("")

    ############################################################################
    # (I-0) get 6 phases from config
    if "BenchmarkCommonParameters" in config:
        commonCfg = config["BenchmarkCommonParameters"]
    else:
        commonCfg = [{"ProblemSizes": defaultProblemSizes}]

    forkCfg = config["ForkParameters"] if "ForkParameters" in config else []

    if "BenchmarkForkParameters" in config:
        benchForkCfg = config["BenchmarkForkParameters"]
    else:
        benchForkCfg = []

    joinCfg = config["JoinParameters"] if "JoinParameters" in config else []

    if "BenchmarkJoinParameters" in config:
        benchJoinCfg = config["BenchmarkJoinParameters"]
    else:
        benchJoinCfg = []

    # a missing, None or empty final phase falls back to the default sizes
    finalCfg = None
    if "BenchmarkFinalParameters" in config:
        finalCfg = config["BenchmarkFinalParameters"]
    if finalCfg is None or len(finalCfg) == 0:
        finalCfg = [{"ProblemSizes": defaultBenchmarkFinalProblemSizes}]

    ############################################################################
    # Ensure only valid solution parameters were requested
    validParameterNames = validParameters.keys()
    for requestedSteps in [commonCfg, forkCfg, benchForkCfg, benchJoinCfg]:
        if requestedSteps is None:
            continue
        for requested in requestedSteps:
            for name in requested:
                if name == "ProblemSizes":
                    continue
                if name not in validParameterNames:
                    printExit("Invalid parameter name: %s\nValid parameters are %s." \
                        % (name, validParameterNames))
                for value in requested[name]:
                    if value not in validParameters[name]:
                        printExit("Invalid parameter value: %s = %s\nValid values for %s are %s." \
                            % (name, value, name, validParameters[name]))

    ############################################################################
    # (I-1) get current problem sizes
    currentProblemSizes = claimProblemSizes(commonCfg, defaultProblemSizes)
    # into common we put in all Dcommon that
    # don't show up in Ccommon/Cfork/CBfork/Cjoin/CBjoin
    # followed by Ccommon
    self.benchmarkCommonParameters = [{"ProblemSizes": currentProblemSizes}]
    for defaults in defaultBenchmarkCommonParameters:
        for name in defaults:
            if not hasParam(name, [commonCfg, forkCfg, benchForkCfg, \
                    joinCfg, benchJoinCfg]) \
                    or name == "ProblemSizes":
                self.benchmarkCommonParameters.append(defaults)
    if commonCfg is not None:
        self.benchmarkCommonParameters.extend(commonCfg)
    else:  # make empty
        self.benchmarkCommonParameters = [{"ProblemSizes": currentProblemSizes}]

    ############################################################################
    # (I-2) into fork we put in all Dfork that
    # don't show up in Bcommon/Cfork/CBfork/Cjoin/CBjoin
    # followed by Cfork
    self.forkParameters = []
    for defaults in defaultForkParameters:
        for name in defaults:
            if not hasParam(name, [self.benchmarkCommonParameters, \
                    forkCfg, benchForkCfg, joinCfg, benchJoinCfg]) \
                    or name == "ProblemSizes":
                self.forkParameters.append(defaults)
    if forkCfg is not None:
        self.forkParameters.extend(forkCfg)
    else:  # make empty
        self.forkParameters = []

    ############################################################################
    # (I-3) get current problem sizes
    currentProblemSizes = claimProblemSizes(benchForkCfg, currentProblemSizes)
    # into Bfork we put in all DBfork that
    # don't show up in Bcommon/Bfork/CBfork/Cjoin/CBjoin
    # followed by CBforked
    self.benchmarkForkParameters = [{"ProblemSizes": currentProblemSizes}]
    for defaults in defaultBenchmarkForkParameters:
        for name in defaults:
            if not hasParam(name, [self.benchmarkCommonParameters, \
                    self.forkParameters, benchForkCfg, \
                    joinCfg, benchJoinCfg]) \
                    or name == "ProblemSizes":
                self.benchmarkForkParameters.append(defaults)
    if benchForkCfg is not None:
        self.benchmarkForkParameters.extend(benchForkCfg)
    else:  # make empty
        self.benchmarkForkParameters = [{"ProblemSizes": currentProblemSizes}]

    ############################################################################
    # (I-4) into join we put in all non-derived Djoin that
    # don't show up in Bcommon/Bfork/CBfork/Cjoin/CBjoin
    # followed by CBforked
    self.joinParameters = []
    for name in defaultJoinParameters:
        if not hasParam(name, [self.benchmarkCommonParameters, \
                self.forkParameters, self.benchmarkForkParameters, \
                joinCfg, benchJoinCfg]) \
                or name == "ProblemSizes":
            # MacroTile is only added by default when the config has no
            # JoinParameters section at all
            if "JoinParameters" not in config or name != "MacroTile":
                self.joinParameters.append(name)
    if joinCfg is not None:
        self.joinParameters.extend(joinCfg)
    else:  # make empty
        self.joinParameters = []

    ############################################################################
    # (I-5) benchmark join
    currentProblemSizes = claimProblemSizes(benchJoinCfg, currentProblemSizes)
    # into Bjoin we put in all DBjoin that
    # don't show up in Bcommon/Bfork/BBfork/Bjoin/CBjoin
    # followed by CBjoin
    self.benchmarkJoinParameters = [{"ProblemSizes": currentProblemSizes}]
    for defaults in defaultBenchmarkJoinParameters:
        for name in defaults:
            if not hasParam(name, [self.benchmarkCommonParameters, \
                    self.forkParameters, self.benchmarkForkParameters, \
                    self.joinParameters, benchJoinCfg]) \
                    or name == "ProblemSizes":
                self.benchmarkJoinParameters.append(defaults)
    if benchJoinCfg is not None:
        self.benchmarkJoinParameters.extend(benchJoinCfg)
    else:  # make empty
        self.benchmarkJoinParameters = [{"ProblemSizes": currentProblemSizes}]

    ############################################################################
    # (I-6) benchmark final sizes
    # no other parameters besides problem sizes
    self.benchmarkFinalParameters = finalCfg

    ############################################################################
    # (I-7) any default param with 1 value will be hardcoded; move to beginning
    for phase in [self.benchmarkCommonParameters, \
            self.forkParameters, self.benchmarkForkParameters, \
            self.benchmarkJoinParameters]:
        # iterate over copies because both the phase list and its dicts are
        # modified while walking them
        for step in copy(phase):
            for name in copy(step):
                values = step[name]
                if values is None:
                    printExit("You must specify value for parameters \"%s\"" % name)
                if len(values) >= 2 or name == "ProblemSizes":
                    continue
                step.pop(name)
                onlyValue = values[0]
                self.hardcodedParameters[0][name] = onlyValue
                self.singleValueParameters[name] = [onlyValue]
                self.initialSolutionParameters[name] = onlyValue
                if len(step) == 0:
                    phase.remove(step)

    ############################################################################
    # (I-8) if fork and join, but no benchmark fork, append dummy benchmarkFork
    if len(self.forkParameters) > 0 and len(self.joinParameters) > 0:
        numBenchmarkForkSteps = len(self.benchmarkForkParameters)
        if numBenchmarkForkSteps == 0 \
                or (numBenchmarkForkSteps == 1 \
                and hasParam("ProblemSizes", self.benchmarkForkParameters)):
            self.benchmarkForkParameters.append({"BenchmarkFork": [0]})

    ############################################################################
    # (I-9) if join, but no benchmark join: no dummy benchmarkJoin is appended,
    # this case is handled by the Final Benchmark

    ############################################################################
    # (I-10) Parameter Lists
    print2("HardcodedParameters:")
    hardcoded = self.hardcodedParameters[0]
    for name in hardcoded:
        print2(" %s: %s" % (name, hardcoded[name]))

    summary = (
        ("BenchmarkCommonParameters:", self.benchmarkCommonParameters),
        ("ForkParameters:", self.forkParameters),
        ("BenchmarkForkParameters:", self.benchmarkForkParameters),
        ("JoinParameters:", self.joinParameters),
        ("BenchmarkJoinParameters:", self.benchmarkJoinParameters),
        ("BenchmarkFinalParameters:", self.benchmarkFinalParameters),
    )
    for title, entries in summary:
        print2(title)
        for entry in entries:
            print2(" %s" % entry)
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution sources/headers under outputPath.

    With globalParameters["MergeFiles"] set, everything goes into
    Kernels.cpp/.h and Solutions.cpp/.h; otherwise one .cpp/.h pair per
    beta-only kernel is written under Kernels/ and one pair per solution
    under Solutions/. The tensor-contraction kernels themselves are handed
    to processKernelSourceChunk in chunks, on worker threads unless
    globalParameters["CpuThreads"] is 0.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions to write.
        kernels: tensor-contraction kernels, processed in chunks.
        kernelsBetaOnly: beta-only kernels, written with kernelWriterSource.
        solutionWriter: provides getSolutionName / getSourceFileString /
            getHeaderFileString.
        kernelWriterSource: kernel writer passed to the chunk processor and
            used for the beta-only kernels.
        kernelWriterAssembly: kernel writer passed to the chunk processor.
    """
    print1("# Writing Kernels")

    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]

    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    # These names are always handed to processKernelSourceChunk, so they must
    # be bound even when the progress bar is off or files are not merged
    # (previously that combination raised NameError).
    progressBar = ProgressBar(len(kernels)) if showProgress else None
    kernelSourceFile = None
    kernelHeaderFile = None

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    # tensor contraction kernels - dispatch as multiple threads:
    kLock = threading.Lock()
    pLock = threading.Lock()

    prepAsm()

    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count if globalParameters["CpuThreads"] == -1 \
            else min(cpu_count, globalParameters["CpuThreads"])
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # ceil(len(kernels)/cpus), but never fewer than 10 kernels per chunk;
    # `//` keeps this an integer regardless of the Python version
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("info: cpus=%u kernelsPerCpu=%u" % (cpus, workPerCpu))

    kiStart = 0
    threads = []
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)

        if cpus:
            args = (outputPath, kernels, kernelSourceFile, kernelHeaderFile, \
                kernelWriterSource, kernelWriterAssembly, \
                kernelsWithBuildErrs, progressBar, kLock, pLock, kiStart, kiStop)
            t = threading.Thread(target=processKernelSourceChunk, args=args)
            t.start()
            threads.append(t)
        else:
            # NOTE(review): this direct call passes 11 arguments (no
            # progressBar) while the threaded call above passes 12 - one of
            # the two cannot match processKernelSourceChunk's signature;
            # confirm against its definition.
            processKernelSourceChunk(outputPath, kernels, kernelSourceFile, kernelHeaderFile, \
                kernelWriterSource, kernelWriterAssembly, \
                kernelsWithBuildErrs, kLock, pLock, kiStart, kiStop)
        kiStart += workPerCpu

    for t in threads:
        t.join()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string: the former %u conversion raised TypeError
            print("*** warning: invalid kernel#%s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write( kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (the merged source file used to be left open)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if showProgress:
        progressBar = ProgressBar(len(solutions))

    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")

    for solution in solutions:
        # get solution name
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)

        # write solution.cpp
        if not mergeFiles:
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
        if not mergeFiles:
            solutionSourceFile.close()

        # write solution.h
        if not mergeFiles:
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()

        if showProgress:
            progressBar.increment()

    # close merged: the condition used to be inverted, so the merged files
    # were never closed
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit( "** Exiting after kernel generation due to ExitAfterKernelGen=1")
def _formatArgList(argList, withTypes, indent, separator, terminator):
    """Render an argument list as C source text.

    Args:
        argList: sequence of entries whose [0] is emitted as the C type and
            whose [1] is emitted as the argument name.
        withTypes: True emits "type name" (a declaration/definition);
            False emits only the name (a call site).
        indent: text placed before every argument.
        separator: text placed after every argument except the last.
        terminator: text placed after the last argument.

    Returns:
        The rendered text; "" when argList is empty.
    """
    lastIdx = len(argList) - 1
    text = ""
    for idx, arg in enumerate(argList):
        if withTypes:
            item = "%s %s" % (arg[0], arg[1])
        else:
            item = "%s" % (arg[1],)
        text += indent + item + (separator if idx < lastIdx else terminator)
    return text


def writeLogic(outputPath, logicData, solutionWriter):
    """Generate the C++ library logic that maps problem sizes to solutions.

    Three pieces of text are built and written below outputPath:
      * Tensile.h          - public declarations for every problem type.
      * TensileInternal.h  - ProblemSizeKey, its hash function and the
                             per-problem-type solution map typedefs.
      * the implementation - a single Tensile.cpp when
                             globalParameters["MergeFiles"] is set, otherwise
                             one Logic/Tensile_<problemType>.cpp per problem
                             type.

    Args:
        outputPath: directory that receives the generated files.
        logicData: mapping of problemType -> list of schedule tuples, read
            here as (scheduleName, deviceNames, solutions, indexOrder,
            exactLogic, rangeLogic).
        solutionWriter: object providing getArgList() and getSolutionName().
    """
    print1("# Writing Library Logic")

    mergeFiles = globalParameters["MergeFiles"]
    useHashMap = globalParameters["SolutionMapHash"]
    isOpenCL = globalParameters["RuntimeLanguage"] == "OCL"

    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"
    ih += "#include \"SolutionHelper.h\"\n"
    if useHashMap:
        ih += "#include <unordered_map>\n"
    else:
        ih += "#include <map>\n"
    ih += "#include <tuple>\n"

    # problem type Key
    problemSizeTemplate = "unsigned int, unsigned int, unsigned int"
    if isOpenCL:
        problemSizeTemplate += ", cl_command_queue"
    ih += "typedef std::tuple<%s> ProblemSizeKey;\n" % (problemSizeTemplate)

    # hash function
    # NOTE(review): the emitted mask (1<<22)-1 keeps 22 bits while the emitted
    # comments say 21, so adjacent fields overlap by one bit. That is harmless
    # for a hash, but confirm which was intended before changing it.
    ih += "\n"
    ih += "size_t tensileProblemSizeHasher( const ProblemSizeKey & problemSize ) {\n"
    ih += " size_t hash = 0;\n"
    ih += " // ignore lowest 4 bits; keep next 21 bits\n"
    ih += " size_t hash0 = (std::get<0>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size0\n"
    ih += " size_t hash1 = (std::get<1>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size1\n"
    ih += " size_t hashU = (std::get<2>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of sizeU\n"
    ih += " // 21+21+21 = 63 bit hash\n"
    ih += " hash |= hash0;\n"
    ih += " hash |= hash1<<21;\n"
    ih += " hash |= hashU<<42;\n"
    ih += " return hash;\n"
    ih += "}\n"
    ih += "\n"

    # Tensile.cpp
    s = ""
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"Solutions.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:
        schedules = logicData[problemType]

        # function argument lists
        argListSizes = solutionWriter.getArgList(problemType, False, False, False)
        argListStream = solutionWriter.getArgList(problemType, False, False, True)
        argListData = solutionWriter.getArgList(problemType, True, True, True)

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        h += _formatArgList(argListData, True, " ", ",\n", ");\n\n")

        # declare TensileSolutionPointer_ProblemType
        h += "\n// solution pointer\n"
        h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
            % problemType
        h += _formatArgList(argListData, True, " ", ",\n", ");\n\n")

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        h += _formatArgList(argListStream, True, " ", ",\n", ");\n\n")

        # declare tensileName_
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        h += _formatArgList(argListStream, True, " ", ",\n", ");\n\n")

        # unique solutions used by any schedule of this problem type,
        # in first-seen order
        solutionsForProblemType = []
        for scheduleTuple in schedules:
            for solution in scheduleTuple[2]:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = [
            solutionWriter.getSolutionName(solution)
            for solution in solutionsForProblemType]

        # reset problemType source: every problem type gets its own file
        if not mergeFiles:
            filePrefix = "Tensile_%s" % (problemType)
            s = "#include \"Tensile.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # implement per-Schedule functions in source
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in schedules:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = [
                solutionWriter.getSolutionName(solution)
                for solution in solutionsForSchedule]

            # function tensileGetSolutionPointerUncached_Schedule_ProblemType
            s += "\n// problem size -> solution logic\n"
            s += "TensileSolutionPointer_%s tensileGetSolutionPointerUncached_%s_%s(\n" \
                % (problemType, scheduleName, problemType)
            s += _formatArgList(argListSizes, True, " ", ",\n", ") {\n\n")

            exactLogicStr = writeExactLogic(exactLogic,
                                            solutionNamesForSchedule, True)
            if rangeLogic is not None:
                rangeLogicStr = writeRangeLogicRec(
                    0, indexOrder, rangeLogic,
                    solutionNamesForSchedule, problemType, True)
            else:
                rangeLogicStr = " return NULL; // none\n"
            s += " /* exact mappings */\n"
            s += exactLogicStr
            s += "\n /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

            # function tensileGetSolutionName_Schedule_ProblemType
            s += "\n// get solution name for problem size\n"
            s += "const char * tensileGetSolutionName_%s_%s(\n" \
                % (scheduleName, problemType)
            s += _formatArgList(argListSizes, True, " ", ",\n", ") {\n\n")

            exactLogicStr = writeExactLogic(exactLogic,
                                            solutionNamesForSchedule, False)
            if rangeLogic is not None:
                rangeLogicStr = writeRangeLogicRec(
                    0, indexOrder, rangeLogic,
                    solutionNamesForSchedule, problemType, False)
            else:
                rangeLogicStr = " return NULL; // none\n"
            s += " /* exact mappings */\n"
            s += exactLogicStr
            s += "\n /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        if useHashMap:
            ih += "typedef std::unordered_map<ProblemSizeKey, TensileSolutionPointer_%s, std::function<size_t (ProblemSizeKey)>> Map_%s;\n" \
                % (problemType, problemType)
        else:
            ih += "typedef std::map<ProblemSizeKey, TensileSolutionPointer_%s> Map_%s;\n" \
                % (problemType, problemType)
        ih += "extern Map_%s solutionMap_%s;\n" % (problemType, problemType)

        # implement tensileGetSolutionPointerUncached_ProblemType and
        # tensileGetSolutionName_ProblemType
        numSchedules = len(schedules)
        for ptr in [True, False]:
            returnType = "PointerUncached" if ptr else "Name"
            s += "\n// return solution %s\n" % returnType
            s += ("TensileSolutionPointer_%s " % problemType) if ptr else "const char *"
            s += "tensileGetSolution%s_%s(\n" \
                % (returnType, problemType)
            s += _formatArgList(argListStream, True, " ", ",\n", ") {\n")

            # choose from schedules based on device name
            if numSchedules > 1:

                # get device name
                # NOTE(review): the two lines emitted here are placeholders,
                # not valid C++; the multi-schedule path still needs a real
                # device-name query that defines `name`.
                if isOpenCL:
                    s += "get device name opencl;\n"
                else:
                    s += "get device name hip;\n"

                for scheduleIdx, schedule in enumerate(schedules):
                    # Each branch must dispatch to its own schedule's
                    # function, so the name is taken from this schedule
                    # rather than left over from the per-schedule loop.
                    scheduleName = schedule[0]
                    deviceNames = schedule[1]
                    if scheduleIdx > 0:
                        s += "else "
                    # the last schedule is the unconditional fallback
                    if scheduleIdx < numSchedules - 1:
                        s += "if ("
                        # A device has exactly one name, so alternatives are
                        # OR-ed; AND-ing them could never be true.
                        s += " || ".join(
                            "name == \"%s\"" % deviceName
                            for deviceName in deviceNames)
                        s += ")"
                    s += "{"
                    s += " return tensileGetSolution%s_%s_%s(" \
                        % (returnType, scheduleName, problemType)
                    s += _formatArgList(argListSizes, False, "", ", ", ");\n")
                    s += "}"
            else:  # == 1
                scheduleName = schedules[0][0]
                s += " return tensileGetSolution%s_%s_%s(" \
                    % (returnType, scheduleName, problemType)
                s += _formatArgList(argListSizes, False, "", ", ", ");\n")

            s += "\n}\n"

        # implement tensileGetSolutionPointer_ProblemType
        s += "\n// return solution pointer; user calls it\n"
        s += "Map_%s solutionMap_%s%s;\n" % (
            problemType, problemType,
            "(1024, tensileProblemSizeHasher)" if useHashMap else "")
        s += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        s += _formatArgList(argListStream, True, " ", ",\n", ") {\n")

        # create key
        s += " ProblemSizeKey key = std::make_tuple( size%s, size%s, size%s%s );\n" \
            % (globalParameters["IndexChars"][problemType["Index0"]],
               globalParameters["IndexChars"][problemType["Index1"]],
               globalParameters["IndexChars"][problemType["IndexUnroll"]],
               ", stream" if isOpenCL else "")

        # check for key in map
        s += " static std::mutex findKernelMutex;\n"
        s += " std::lock_guard<std::mutex> findKernelLock(findKernelMutex);\n"
        s += " Map_%s::iterator iter = solutionMap_%s.find(key);\n" \
            % (problemType, problemType)
        s += " if (iter != solutionMap_%s.end()) {\n" % problemType
        s += " return iter->second;\n"
        s += " } else {\n"
        s += " TensileSolutionPointer_%s ptr = tensileGetSolutionPointerUncached_%s(\n" \
            % (problemType, problemType)
        # one argument per emitted line
        s += _formatArgList(argListStream, False, " ", ",\n", ");\n")
        s += " solutionMap_%s[key] = ptr;\n" % problemType
        s += " return ptr;\n"
        s += " }\n"
        s += "}\n"

        # declare tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        s += _formatArgList(argListData, True, " ", ",\n", ") {\n")
        s += " TensileSolutionPointer_%s ptr = tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        # one argument per emitted line
        s += _formatArgList(argListStream, False, " ", ", \n", ");\n")
        s += " if ( ptr ) {\n"
        s += " return ptr("
        s += _formatArgList(argListData, False, "", ", ", ");\n")
        s += " } else {\n"
        s += " return tensileStatusFailure; // no solution found\n"
        s += " }\n"
        s += "}\n"

        # write this problem type's own source file
        if not mergeFiles:
            logicSourcePath = os.path.join(outputPath, "Logic",
                                           "%s.cpp" % filePrefix)
            with open(logicSourcePath, "w") as logicSourceFile:
                logicSourceFile.write(s)

    # write the merged source file
    if mergeFiles:
        with open(os.path.join(outputPath, "Tensile.cpp"), "w") as logicSourceFile:
            logicSourceFile.write(s)

    with open(os.path.join(outputPath, "Tensile.h"), "w") as logicHeaderFile:
        logicHeaderFile.write(h)

    with open(os.path.join(outputPath, "TensileInternal.h"), "w") as internalHeaderFile:
        internalHeaderFile.write(ih)
def _writeSeparateFile(path, code):
    """Create the file at path containing CHeader followed by code."""
    with open(path, "w") as separateFile:
        separateFile.write(CHeader)
        separateFile.write(code)


def _closeOpened(files):
    """Close every entry of files that was actually opened (is not None)."""
    for openedFile in files:
        if openedFile is not None:
            openedFile.close()


def _writeKernelFiles(outputPath, kernels, kernelsBetaOnly,
                      kernelWriterSource, kernelWriterAssembly):
    """Write source/header text for all kernels and beta-only kernels.

    With globalParameters["MergeFiles"] everything goes into Kernels.cpp and
    Kernels.h; otherwise each kernel gets Kernels/<name>.cpp and
    Kernels/<name>.h, each starting with CHeader.
    """
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]
    kernelsDir = os.path.join(outputPath, "Kernels")

    if showProgress:
        progressBar = ProgressBar(len(kernels))

    kernelSourceFile = None
    kernelHeaderFile = None
    try:
        if mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
            kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
            kernelSourceFile.write(CHeader)
            kernelHeaderFile.write(CHeader)
            kernelSourceFile.write("#include \"Kernels.h\"\n")
            kernelHeaderFile.write("#pragma once\n")
            if globalParameters["RuntimeLanguage"] == "HIP":
                kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
                kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
                kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
                kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
            else:
                kernelHeaderFile.write("#include <string>\n")

        # tensor contraction kernels
        for ki, kernel in enumerate(kernels):
            if kernel["KernelLanguage"] == "Source":
                kernelWriter = kernelWriterSource
            else:
                kernelWriter = kernelWriterAssembly

            # the name is only needed to build per-kernel file paths
            if not mergeFiles:
                kernelName = kernelWriter.getKernelName(kernel)

            # write kernel.cpp
            (err, src) = kernelWriter.getSourceFileString(kernel)
            if mergeFiles:
                kernelSourceFile.write(src)
            else:
                _writeSeparateFile(
                    os.path.join(kernelsDir, kernelName + ".cpp"), src)
            if err:
                print("*** warning: invalid kernel#%u" % ki)

            # write kernel.h
            headerText = kernelWriter.getHeaderFileString(kernel)
            if mergeFiles:
                kernelHeaderFile.write(headerText)
            else:
                _writeSeparateFile(
                    os.path.join(kernelsDir, kernelName + ".h"), headerText)

            if showProgress:
                progressBar.increment()

        # beta-only kernels
        for kernel in kernelsBetaOnly:
            kernelWriter = kernelWriterSource
            kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

            # write kernel.cpp
            (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
            if mergeFiles:
                kernelSourceFile.write(src)
            else:
                _writeSeparateFile(
                    os.path.join(kernelsDir, kernelName + ".cpp"), src)
            if err:
                # Identify the beta-only kernel by name; the index of the
                # tensor-kernel loop above does not refer to this kernel.
                print("*** warning: invalid kernel %s" % kernelName)

            # write kernel.h
            headerText = kernelWriter.getHeaderFileStringBetaOnly(kernel)
            if mergeFiles:
                kernelHeaderFile.write(headerText)
            else:
                _writeSeparateFile(
                    os.path.join(kernelsDir, kernelName + ".h"), headerText)
    finally:
        # close merged files (both of them), even if a writer raised
        _closeOpened((kernelSourceFile, kernelHeaderFile))


def _writeSolutionFiles(outputPath, solutions, solutionWriter):
    """Write source/header text for all solutions.

    With globalParameters["MergeFiles"] everything goes into Solutions.cpp
    and Solutions.h; otherwise each solution gets Solutions/<name>.cpp and
    Solutions/<name>.h, each starting with CHeader.
    """
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]
    solutionsDir = os.path.join(outputPath, "Solutions")

    if showProgress:
        progressBar = ProgressBar(len(solutions))

    solutionSourceFile = None
    solutionHeaderFile = None
    try:
        if mergeFiles:
            solutionSourceFile = open(os.path.join(outputPath, "Solutions.cpp"), "w")
            solutionHeaderFile = open(os.path.join(outputPath, "Solutions.h"), "w")
            solutionSourceFile.write(CHeader)
            solutionHeaderFile.write(CHeader)
            solutionSourceFile.write("#include \"Solutions.h\"\n")
            solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
            solutionHeaderFile.write("#include \"Kernels.h\"\n")
            solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
            solutionHeaderFile.write("#include \"Tools.h\"\n")

        for solution in solutions:
            # the name is only needed to build per-solution file paths
            if not mergeFiles:
                solutionFileName = solutionWriter.getSolutionName(solution)

            # write solution.cpp
            sourceText = solutionWriter.getSourceFileString(solution)
            if mergeFiles:
                solutionSourceFile.write(sourceText)
            else:
                _writeSeparateFile(
                    os.path.join(solutionsDir, solutionFileName + ".cpp"),
                    sourceText)

            # write solution.h
            headerText = solutionWriter.getHeaderFileString(solution)
            if mergeFiles:
                solutionHeaderFile.write(headerText)
            else:
                _writeSeparateFile(
                    os.path.join(solutionsDir, solutionFileName + ".h"),
                    headerText)

            if showProgress:
                progressBar.increment()
    finally:
        # Only the merged files are still open here; per-solution files are
        # closed as soon as they are written.
        _closeOpened((solutionSourceFile, solutionHeaderFile))


def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write all kernel and solution source/header files below outputPath.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions passed to solutionWriter.
        kernels: tensor contraction kernels; each kernel's "KernelLanguage"
            selects kernelWriterSource ("Source") or kernelWriterAssembly.
        kernelsBetaOnly: beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: provides getSolutionName(), getSourceFileString()
            and getHeaderFileString().
        kernelWriterSource: kernel writer used for "Source" kernels.
        kernelWriterAssembly: kernel writer used for all other kernels.

    Calls printExit() after generation when
    globalParameters["ExitAfterKernelGen"] is set.
    """
    print1("# Writing Kernels")
    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    _writeKernelFiles(outputPath, kernels, kernelsBetaOnly,
                      kernelWriterSource, kernelWriterAssembly)

    ##############################################################################
    # Write Solutions
    ##############################################################################
    print1("# Writing Solutions")
    _writeSolutionFiles(outputPath, solutions, solutionWriter)

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")