Exemple #1
0
def writeBenchmarkFiles(stepBaseDir, solutions, problemSizes, stepName, filesToCopy):
  """Write solution/kernel sources, the CMake file and client parameters for one benchmark step.

  solutions is the flat list of solutions to benchmark; it must be non-empty
  because the problem type is taken from its first element.  stepBaseDir,
  problemSizes, stepName and filesToCopy are forwarded to the writers below.
  """
  # The Solutions/ and Kernels/ directories are only prepared when files are
  # not merged.
  if not globalParameters["MergeFiles"]:
    for subDirectory in ("Solutions", "Kernels"):
      ensurePath(os.path.join(globalParameters["WorkingPath"], subDirectory))

  ##############################################################################
  # Collect unique kernels (first-seen order is preserved)
  ##############################################################################
  def appendUnique(collected, candidates):
    # list membership (not a set) so that only equality is required
    for candidate in candidates:
      if candidate not in collected:
        collected.append(candidate)

  kernels = []
  kernelsBetaOnly = []
  for solution in solutions:
    appendUnique(kernels, solution.getKernels())
    appendUnique(kernelsBetaOnly, solution.getKernelsBetaOnly())

  ##############################################################################
  # Min Naming
  ##############################################################################
  solutionSerialNaming = Solution.getSerialNaming(solutions)
  kernelSerialNaming = Solution.getSerialNaming(kernels)
  solutionMinNaming = Solution.getMinNaming(solutions)
  kernelMinNaming = Solution.getMinNaming(kernels)

  solutionWriter = SolutionWriter(
      solutionMinNaming, solutionSerialNaming,
      kernelMinNaming, kernelSerialNaming)
  kernelWriterSource = KernelWriterSource(kernelMinNaming, kernelSerialNaming)
  kernelWriterAssembly = KernelWriterAssembly(kernelMinNaming, kernelSerialNaming)

  ##############################################################################
  # Write solutions and kernels
  ##############################################################################
  problemTypes = [solutions[0]["ProblemType"]]
  writeSolutionsAndKernels(
      globalParameters["WorkingPath"], problemTypes, solutions, kernels,
      kernelsBetaOnly, solutionWriter, kernelWriterSource,
      kernelWriterAssembly)

  ##############################################################################
  # Write CMake
  ##############################################################################
  writeCMake(globalParameters["WorkingPath"], solutions, kernels, filesToCopy,
      "TensileBenchmark_%s" % stepName)

  ##############################################################################
  # Write client parameters (first argument True == forBenchmark)
  ##############################################################################
  writeClientParameters(True, solutions, problemSizes, stepName,
      filesToCopy, stepBaseDir)
Exemple #2
0
def main( config ):
  """Benchmark every problem type / problem-size group listed in config.

  Each entry of config is a sequence whose first element is a problem-type
  configuration and whose remaining elements, if any, are problem-size group
  configurations.  An entry without size groups is benchmarked once with an
  empty group configuration.

  For each group the final results (.csv) and solutions (.yaml) are copied
  into <WorkingPath>/<BenchmarkDataPath>.  A group whose .csv already exists
  there is skipped unless ForceRedoBenchmarkProblems is set.

  Calls sys.exit(1) when ExitOnFails is set and at least one benchmark run
  reported a failure.
  """
  dataPath = os.path.join(globalParameters["WorkingPath"], \
      globalParameters["BenchmarkDataPath"])
  pushWorkingPath(globalParameters["BenchmarkProblemsPath"])
  ensurePath(dataPath)
  totalTestFails = 0
  for benchmarkProblemTypeConfig in config:
    problemTypeConfig = benchmarkProblemTypeConfig[0]
    if len(benchmarkProblemTypeConfig) < 2:
      problemSizeGroupConfigs = [{}]
    else:
      problemSizeGroupConfigs = benchmarkProblemTypeConfig[1:]
    for problemSizeGroupIdx, problemSizeGroupConfig \
        in enumerate(problemSizeGroupConfigs):
      print2("ProblemTypeConfig: %s" % problemTypeConfig)
      problemTypeObj = ProblemType(problemTypeConfig)
      globalParameters["EnableHalf"] = problemTypeObj["DataType"].isHalf()

      # results files will be named <problemType>_<groupIdx>.{csv,yaml}
      groupName = "%s_%02u" % (str(problemTypeObj), problemSizeGroupIdx)
      newResultsFileName = os.path.join(dataPath, "%s.csv" % groupName)
      newSolutionsFileName = os.path.join(dataPath, "%s.yaml" % groupName)

      # skip if possible
      if globalParameters["ForceRedoBenchmarkProblems"] or \
          not os.path.exists(newResultsFileName):

        # Benchmark Problem Size Group
        (resultsFileBaseFinal, benchmarkErrors) = benchmarkProblemType( \
            problemTypeConfig, problemSizeGroupConfig, problemSizeGroupIdx)
        totalTestFails += benchmarkErrors
        # Single-argument call form: prints the same text under Python 2
        # (where it parses as a print statement) and Python 3.
        print("clientExit=%u %s for %s" % \
            (totalTestFails, "(ERROR)" if totalTestFails else "(PASS)", \
            globalParameters["ConfigPath"]))

        # Copy Data
        # NOTE(review): resultsFileBaseFinal appears to be None when no
        # benchmark step is final, which would make these names "None.csv" /
        # "None.yaml" - confirm that every process has a final step.
        resultsFileName = "%s.csv" % (resultsFileBaseFinal)
        solutionsFileName = "%s.yaml" % (resultsFileBaseFinal)
        shutil_copy( resultsFileName, newResultsFileName )
        shutil_copy( solutionsFileName, newSolutionsFileName )
      else:
        print1("# %s already benchmarked; skipping." % groupName )

  popWorkingPath()

  if globalParameters["ExitOnFails"] and totalTestFails:
    sys.exit(1)
def prepAsm():
    """Write the assembler helper script into <WorkingPath>/assembly.

    The script is named asm.bat when os.name is "nt" and asm.sh otherwise.
    The Windows variant only copies <name>.s to <name>.o and <name>.co.  The
    shell variant runs the configured AssemblerPath twice: first to assemble
    $f.s into $f.o, then to turn $f.o into $f.co.  The file mode is then set
    to 0o777.
    """
    asmPath = ensurePath(
        os.path.join(globalParameters["WorkingPath"], "assembly"))
    isWindows = os.name == "nt"
    assemblerFileName = os.path.join(asmPath, \
        "asm.%s"%("bat" if isWindows else "sh"))
    # "with" guarantees the handle is closed even if a globalParameters lookup
    # below raises part-way through writing the script.
    with open(assemblerFileName, "w") as assemblerFile:
        if isWindows:
            assemblerFile.write("echo Windows: Copying instead of Assembling\n")
            assemblerFile.write("copy %1.s %1.o\n")
            assemblerFile.write("copy %1.o %1.co\n")
        else:
            # trace the script's commands (-x) at PrintLevel 2 and above
            assemblerFile.write(
                "#!/bin/sh %s\n" %
                ("-x" if globalParameters["PrintLevel"] >= 2 else ""))
            assemblerFile.write("# usage: asm.sh kernelName ASM_ARGS\n")
            assemblerFile.write("# example: asm.sh kernelName -mcpu=gfx900\n")
            assemblerFile.write("f=$1\n")
            assemblerFile.write("shift\n")
            assemblerFile.write("ASM=%s\n" % globalParameters["AssemblerPath"])
            # cannot use globalParameters["CurrentISA"] because it might be (0,0,0)
            defaultIsa = (9, 0, 0)
            # -mno-code-object-v3 is passed only when the capability table
            # for defaultIsa reports HasCodeObjectV3.
            assemblerFile.write( \
              "${ASM} -x assembler -target amdgcn--amdhsa %s $@ -c -o $f.o $f.s\n" % \
              ("-mno-code-object-v3" if \
              globalParameters["AsmCaps"][defaultIsa]["HasCodeObjectV3"] else ""))
            assemblerFile.write("${ASM} -target amdgcn--amdhsa $f.o -o $f.co\n")
    # 0o777 is the octal spelling accepted by both Python 2.6+ and Python 3
    # (the bare 0777 form is a syntax error on Python 3).
    os.chmod(assemblerFileName, 0o777)
def prepAsm():
  """Write the assembler helper script into <WorkingPath>/assembly.

  The script is named asm.bat when os.name is "nt" and asm.sh otherwise.
  The Windows variant only copies <name>.s to <name>.o and <name>.co.  The
  shell variant runs the configured AssemblerPath twice: first to assemble
  $f.s into $f.o, then to turn $f.o into $f.co.  The file mode is then set
  to 0o777.
  """
  asmPath = ensurePath(os.path.join(globalParameters["WorkingPath"], "assembly") )
  isWindows = os.name == "nt"
  assemblerFileName = os.path.join(asmPath, \
      "asm.%s"%("bat" if isWindows else "sh"))
  # "with" guarantees the handle is closed even if a globalParameters lookup
  # below raises part-way through writing the script.
  with open(assemblerFileName, "w") as assemblerFile:
    if isWindows:
      assemblerFile.write("echo Windows: Copying instead of Assembling\n")
      assemblerFile.write("copy %1.s %1.o\n")
      assemblerFile.write("copy %1.o %1.co\n")
    else:
      # trace the script's commands (-x) at PrintLevel 2 and above
      assemblerFile.write("#!/bin/sh %s\n" % ("-x" if globalParameters["PrintLevel"] >=2  else ""))
      assemblerFile.write("# usage: asm.sh kernelName ASM_ARGS\n")
      assemblerFile.write("# example: asm.sh kernelName -mcpu=gfx900\n")
      assemblerFile.write("f=$1\n")
      assemblerFile.write("shift\n")
      assemblerFile.write("ASM=%s\n"%globalParameters["AssemblerPath"])
      assemblerFile.write("${ASM} -x assembler -target amdgcn--amdhsa $@ -c -o $f.o $f.s\n")
      assemblerFile.write("${ASM} -target amdgcn--amdhsa $f.o -o $f.co\n")
  # 0o777 is the octal spelling accepted by both Python 2.6+ and Python 3
  # (the bare 0777 form is a syntax error on Python 3).
  os.chmod(assemblerFileName, 0o777)
Exemple #5
0
def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \
    problemSizeGroupIdx ):
    """Run all benchmark steps for one problem type and problem-size group.

    For every step of the BenchmarkProcess built from the two configs this
    enumerates candidate solutions (hardcoded parameters x benchmark
    permutations, merged with the winners of earlier steps), writes the
    benchmark client sources, runs the generated run script unless results
    already exist, and feeds the results back into the winners table.

    Args:
        problemTypeConfig: problem-type configuration handed to
            BenchmarkProcess.
        problemSizeGroupConfig: problem-size group configuration handed to
            BenchmarkProcess.
        problemSizeGroupIdx: index of the size group; used to name the
            working directory "<problemType>_<idx>".

    Returns:
        (resultsFileBaseFinal, benchmarkTestFails): the extension-less path
        of the results of the step for which isFinal() is true (None if no
        step is final), and the number of steps whose run script exited with
        a non-zero code.
    """

    benchmarkTestFails = 0

    # convert config to full benchmark process (resolves defaults)
    print1("")
    print1(HR)
    print1("# Converting Config to BenchmarkProcess Object")
    print1(HR)
    print1("")
    benchmarkProcess = BenchmarkProcess( problemTypeConfig, \
        problemSizeGroupConfig )

    problemTypeName = str(benchmarkProcess.problemType)
    problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx)
    pushWorkingPath(problemSizeGroupName)
    ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))

    totalBenchmarkSteps = len(benchmarkProcess)
    resultsFileBaseFinal = None
    winners = WinningParameterDict()
    print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
    print1("")
    print1(HR)
    print1("# Done Creating BenchmarkProcess Object")
    print1(HR)

    ##############################################################################
    # For Each Benchmark Step
    ##############################################################################
    for benchmarkStepIdx in range(0, totalBenchmarkSteps):

        benchmarkStep = benchmarkProcess[benchmarkStepIdx]
        if winners.winners == {}:
            # perf optimization to skip the initial winners creation
            # this helps a little here but really helps below with avoiding the super-expensive
            # removeHardcoded step below - that can use a fast-path to create
            # winners when needed.
            print1(
                "# Empty winners - use fast initialization of hardcodedParameters"
            )
            resultingHardcodedParameterList = benchmarkStep.hardcodedParameters
        else:
            resultingHardcodedParameterList = \
                winners.wpdUpdate( benchmarkStep.hardcodedParameters )

        benchmarkStep.hardcodedParameters = resultingHardcodedParameterList
        numHardcoded = len(benchmarkStep.hardcodedParameters)
        stepName = str(benchmarkStep)
        shortName = benchmarkStep.abbreviation()
        print1("\n")
        print1(HR)
        # NOTE(review): startTime is not defined in this function; presumably
        # a module-level timestamp - confirm.
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("# BenchmarkStep: %s - %s %.3fs" %
               (problemSizeGroupName, stepName, elapsedTime))
        print1("# NumProblems: %u" %
               benchmarkStep.problemSizes.totalProblemSizes)
        print1("# BenchmarkParameters:")
        for paramName in benchmarkStep.benchmarkParameters:
            paramValues = benchmarkStep.benchmarkParameters[paramName]
            printStr = "#     %s = { %s" % (paramName, paramValues[0])
            for paramValue in paramValues[1:]:
                printStr += ", %s" % str(paramValue)
            printStr += " }"
            print1(printStr)

        pushWorkingPath(shortName)

        ############################################################################
        # Copy Files to Benchmark Source Directory
        ############################################################################
        stepBaseDir = globalParameters["WorkingPath"]
        sourceDir = os.path.join(stepBaseDir, "source")
        ensurePath(sourceDir)
        # Files are generated into sourceTmp and only the ones that differ are
        # copied into source (see "Copying files that differ" below).
        pushWorkingPath("sourceTmp")
        filesToCopy = [
            "SolutionMapper.h",
            "Client.cpp",
            "Client.h",
            "CMakeLists.txt",
            "DeviceStats.h",
            "TensorUtils.h",
            "MathTemplates.cpp",
            "MathTemplates.h",
            "TensileTypes.h",
            "tensile_bfloat16.h",
            "KernelHeader.h",
            "ReferenceCPU.h",
            "SolutionHelper.cpp",
            "SolutionHelper.h",
            "Tools.cpp",
            "Tools.h",
        ]

        for f in filesToCopy:
            shutil_copy(os.path.join(globalParameters["SourcePath"], f),
                        globalParameters["WorkingPath"])
        if globalParameters["RuntimeLanguage"] == "OCL":
            shutil_copy(
                os.path.join(globalParameters["SourcePath"],
                             "FindOpenCL.cmake"),
                globalParameters["WorkingPath"])
        else:
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"),
                globalParameters["WorkingPath"])
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"),
                globalParameters["WorkingPath"])

        ############################################################################
        # Enumerate Benchmark Permutations
        ############################################################################
        solutions = []
        totalBenchmarkPermutations = 1
        for benchmarkParamName in benchmarkStep.benchmarkParameters:
            totalBenchmarkPermutations *= len(
                benchmarkStep.benchmarkParameters[benchmarkParamName])
        maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded
        print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)" % \
            (maxPossibleSolutions, numHardcoded, totalBenchmarkPermutations))

        # Permutation i is decoded as a mixed-radix number: each parameter
        # contributes one "digit" whose base is its number of values.
        benchmarkPermutations = []
        for i in range(0, totalBenchmarkPermutations):
            permutation = {}
            pIdx = i
            for benchmarkParamName in benchmarkStep.benchmarkParameters:
                benchmarkParamValues = deepcopy( \
                    benchmarkStep.benchmarkParameters[benchmarkParamName])
                valueIdx = pIdx % len(benchmarkParamValues)
                permutation[benchmarkParamName] = benchmarkParamValues[
                    valueIdx]
                # floor division keeps pIdx an int on Python 3 as well; true
                # division would yield a float and break the list index above
                pIdx //= len(benchmarkParamValues)
            benchmarkPermutations.append(permutation)

        ############################################################################
        # Enumerate Solutions = Hardcoded * Benchmark
        ############################################################################
        print1("# Enumerating Solutions")
        showProgress = globalParameters["PrintLevel"] >= 1
        if showProgress:
            progressBar = ProgressBar(maxPossibleSolutions)
        solutionSet = set()  # avoid duplicates for nlca=-1, 1
        for hardcodedIdx in range(0, numHardcoded):
            # solutions is a 2D list: solutions[hardcodedIdx] holds the valid
            # solutions derived from that hardcoded parameter set
            solutions.append([])
            hardcodedParamDict = benchmarkStep.hardcodedParameters[
                hardcodedIdx]
            for benchmarkPermutation in benchmarkPermutations:
                solution = {
                    "ProblemType": deepcopy(benchmarkProcess.problemType.state)
                }
                solution.update(benchmarkPermutation)
                solution.update(hardcodedParamDict)
                if benchmarkStepIdx > 0:
                    winningParameters = winners[hardcodedParamDict]
                    if winningParameters is None:
                        # this is a joined parameter that didn't have a winner, that's okay
                        continue
                    solution.update(winningParameters)

                # append default parameters where necessary
                for initialSolutionParameterName in benchmarkStep.initialSolutionParameters:
                    if initialSolutionParameterName not in solution:
                        solution[initialSolutionParameterName] = \
                            benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
                # TODO check if solution matches problem size for exact tile kernels
                solutionObject = Solution(solution)
                if solutionObject["Valid"]:
                    if solutionObject not in solutionSet:
                        solutionSet.add(solutionObject)
                        solutions[hardcodedIdx].append(solutionObject)
                else:
                    if globalParameters["PrintSolutionRejectionReason"]:
                        print1("rejecting solution %s" % str(solutionObject))
                if showProgress:
                    progressBar.increment()

        # remove hardcoded that don't have any valid benchmarks
        removeHardcoded = []
        for hardcodedIdx in range(0, numHardcoded):
            if len(solutions[hardcodedIdx]) == 0:
                removeHardcoded.append(
                    benchmarkStep.hardcodedParameters[hardcodedIdx])
        removesExist = len(removeHardcoded) > 0
        for hardcodedParam in removeHardcoded:
            benchmarkStep.hardcodedParameters.remove(hardcodedParam)

        if removesExist:
            print1(
                "# Updating winners since enumeration removed unused hardcoded solutions.  removeHardcoded=%u winners=%u"
                % (len(removeHardcoded), len(winners.winners)))
            winners.wpdUpdate(benchmarkStep.hardcodedParameters)
            if globalParameters["PrintLevel"] >= 1:
                print1("")
            numHardcoded = len(benchmarkStep.hardcodedParameters)
            # remove from solution 2D list also, keeping it aligned with
            # benchmarkStep.hardcodedParameters
            solutions[:] = [solutionsForHardcoded \
                for solutionsForHardcoded in solutions \
                if len(solutionsForHardcoded) > 0]
        elif winners.winners == {}:
            print1("# Populating initial winners (%u solutions)\n" %
                   len(benchmarkStep.hardcodedParameters))
            for hcParm in benchmarkStep.hardcodedParameters:
                winners.winners[FrozenDictionary(hcParm)] = [{}, -1]

        # create linear list
        solutionList = []
        for solutionsForHardcoded in solutions:
            solutionList.extend(solutionsForHardcoded)

        # Report the number of solutions, not the number of hardcoded groups
        # (solutions is 2D, so len(solutions) would under-report).
        print1("# Actual Solutions: %u / %u\n" % ( len(solutionList), \
            maxPossibleSolutions ))

        if not solutionList:
            msg = "Your parameters resulted in 0 valid solutions."
            if globalParameters["PrintSolutionRejectionReason"]:
                msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected."
            else:
                msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected."
            printExit(msg)
        if globalParameters["PrintLevel"] >= 1:
            for i, solutionsForHardcoded in enumerate(solutions):
                for j, solution in enumerate(solutionsForHardcoded):
                    print2("#    (%u:%u) %s" % (i, j, \
                        Solution.getNameFull(solution) ))
            print2(HR)

        # write benchmarkFiles
        writeBenchmarkFiles(stepBaseDir, solutionList, benchmarkStep.problemSizes, \
            shortName, filesToCopy)

        print1("# Copying files that differ from sourceTmp -> source")
        sourceTmp = globalParameters["WorkingPath"]
        for f in os.listdir(sourceTmp):
            f0 = os.path.join(sourceTmp, f)
            f1 = os.path.join(sourceDir, f)
            if os.path.isdir(f0):
                # directories are always replaced wholesale
                if os.path.isdir(f1):
                    shutil.rmtree(f1, True)
                shutil.copytree(f0, f1)
            elif not os.path.exists(f1) or not filecmp.cmp(f0, f1):
                # identical files are left untouched in source
                shutil.copy(f0, f1)
        shutil.rmtree(sourceTmp, True)

        popWorkingPath()  # source

        ############################################################################
        # Run Benchmark Script
        ############################################################################
        resultsFileBase = os.path.normpath(os.path.join( \
            globalParameters["WorkingPath"], "../Data", shortName))
        if benchmarkStep.isFinal():
            resultsFileBaseFinal = resultsFileBase
        resultsFileName = resultsFileBase + ".csv"
        solutionsFileName = resultsFileBase + ".yaml"
        if not os.path.exists(resultsFileName) or \
            globalParameters["ForceRedoBenchmarkProblems"]:
            pushWorkingPath("build")

            # write runScript
            libraryLogicPath = None
            path = globalParameters["WorkingPath"]
            forBenchmark = True
            runScriptName = writeRunScript(path, libraryLogicPath,
                                           forBenchmark)

            # run runScript
            process = Popen(runScriptName, cwd=globalParameters["WorkingPath"])
            process.communicate()
            if process.returncode:
                benchmarkTestFails += 1
                printWarning(
                    "BenchmarkProblems: Benchmark Process exited with code %u"
                    % process.returncode)
            popWorkingPath()  # build
        else:
            print1("# Already benchmarked; skipping.")

        ############################################################################
        # Winners -> Determined Parameters
        ############################################################################
        results = getResults(resultsFileName, solutions)
        print2("CSV Results: %s" % results)
        winners.addResults(benchmarkStep.hardcodedParameters, \
            benchmarkPermutations, solutions, results)

        ############################################################################
        # Write Solutions YAML
        ############################################################################
        YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes, \
            solutions )

        # End Iteration
        popWorkingPath()  # stepName
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("%s\n# %s\n# %s: End - %.3fs\n%s\n" \
            % (HR, problemSizeGroupName, shortName, elapsedTime, HR))

    popWorkingPath()  # ProblemType
    return (resultsFileBaseFinal, benchmarkTestFails)
def writeLogic(outputPath, logicData, solutionWriter):
    print1("# Writing Library Logic")

    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"
    ih += "#include \"SolutionHelper.h\"\n"
    if globalParameters["SolutionMapHash"]:
        ih += "#include <unordered_map>\n"
    else:
        ih += "#include <map>\n"
    ih += "#include <tuple>\n"

    # problem type Key
    problemSizeTemplate = "unsigned int, unsigned int, unsigned int"
    if globalParameters["RuntimeLanguage"] == "OCL":
        problemSizeTemplate += ", cl_command_queue"
    ih += "typedef std::tuple<%s> ProblemSizeKey;\n" \
        % (problemSizeTemplate)

    # hash function
    ih += "\n"
    ih += "size_t tensileProblemSizeHasher( const ProblemSizeKey & problemSize ) {\n"
    ih += "  size_t hash = 0;\n"
    ih += "  // ignore lowest 4 bits; keep next 21 bits\n"
    ih += "  size_t hash0 = (std::get<0>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size0\n"
    ih += "  size_t hash1 = (std::get<1>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size1\n"
    ih += "  size_t hashU = (std::get<2>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of sizeU\n"
    ih += "  // 21+21+21 = 63 bit hash\n"
    ih += "  hash |= hash0;\n"
    ih += "  hash |= hash1<<21;\n"
    ih += "  hash |= hashU<<42;\n"
    ih += "  return hash;\n"
    ih += "}\n"
    ih += "\n"

    # Tensile.cpp
    s = ""
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"Solutions.h\"\n"
    s += "#include \"SolutionMapper.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:

        # function argument list
        argListSizes = solutionWriter.getArgList(problemType, False, False,
                                                 False)
        argListStream = solutionWriter.getArgList(problemType, False, False,
                                                  True)
        argListData = solutionWriter.getArgList(problemType, True, True, True)

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        for i in range(0, len(argListData)):
            h += "    %s %s%s" \
                % (argListData[i][0], argListData[i][1], \
                ",\n" if i < len(argListData)-1 else ");\n\n")

        # declare TensileSolutionPointer_ProblemType
        h += "\n// solution pointer\n"
        h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
            % problemType
        for i in range(0, len(argListData)):
            h += "    %s %s%s" % (argListData[i][0], argListData[i][1], ",\n" \
                if i < len(argListData)-1 else ");\n\n")

        numSizes = problemType["TotalIndices"]
        h += "typedef ProblemSizes<%u, %u, %u> ProblemSizes_%s;\n" \
            % (numSizes, problemType["IndicesSummation"][-1], problemType["IndicesFree"][0], problemType)
        if 0:
            lastStrideC = problemType["NumIndicesC"]
            lastStrideA = len(problemType["IndexAssignmentsA"])
            lastStrideB = len(problemType["IndexAssignmentsB"])
            h += "typedef ProblemParms<%u, %u, %u, %u> ProblemSizes_%s;\n" % \
                  (lastStrideA, lastStrideB, lastStrideC, numSizes, problemType)

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            h += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ");\n\n")

        # declare tensileName_
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        for i in range(0, len(argListStream)):
            h += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ");\n\n")

        # get solution naming for problem type
        solutionsForProblemType = []
        for scheduleTuple in logicData[problemType]:
            solutionsForSchedule = scheduleTuple[2]
            for solution in solutionsForSchedule:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = []
        for solution in solutionsForProblemType:
            solutionName = solutionWriter.getSolutionName(solution)
            solutionNamesForProblemType.append(solutionName)

        # reset problemType source
        if not globalParameters["MergeFiles"]:
            filePrefix = "Tensile_%s" % (problemType)
            s = "#include \"TensileTypes.h\"\n"
            s = "#include \"Tensile.h\"\n"
            s = "#include \"SolutionMapper.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # implement per-Schedule functions in source
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in logicData[problemType]:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            deviceNames = scheduleTuple[1]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = []
            for solution in solutionsForSchedule:
                solutionName = solutionWriter.getSolutionName(solution)
                solutionNamesForSchedule.append(solutionName)

            s += "\n\n"
            schedProbName = "%s_%s" % (scheduleName, problemType)
            s += writeSolutionAndExactTable(schedProbName, problemType, \
                    solutionsForSchedule, solutionNamesForSchedule, exactLogic)

            # function tensileGetSolutionPointerUncached_Schedule_ProblemType
            s += "\n// problem size -> solution logic\n"
            s += "TensileSolutionPointer_%s tensileGetSolutionPointerUncached_%s(\n" \
                % (problemType, schedProbName)
            for i in range(0, len(argListSizes)):
                s += "    %s %s%s" \
                    % (argListSizes[i][0], argListSizes[i][1], \
                    ",\n" if i < len(argListSizes)-1 else ") {\n\n")
            s += writeSolutionAssertionCheckHeader(problemType)

            exactLogicStr = writeExactLogic(schedProbName, problemType, indexOrder, \
                                            solutionsForSchedule, exactLogic, \
                                            solutionNamesForSchedule, True)
            if rangeLogic != None:
                rangeLogicStr = writeRangeLogicRec(0, indexOrder, rangeLogic, \
                    solutionsForSchedule, solutionNamesForSchedule, problemType, True)
            else:
                rangeLogicStr = "  return NULL; // none\n"
            s += "  /* exact mappings */\n"
            s += exactLogicStr
            s += "\n  /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

            # function tensileGetSolutionName_Schedule_ProblemType
            s += "\n// get solution name for problem size\n"
            s += "const char * tensileGetSolutionName_%s(\n" \
                % (schedProbName)
            for i in range(0, len(argListSizes)):
                s += "    %s %s%s" \
                    % (argListSizes[i][0], argListSizes[i][1], \
                    ",\n" if i < len(argListSizes)-1 else ") {\n\n")
            s += writeSolutionAssertionCheckHeader(problemType)

            exactLogicStr = writeExactLogic(schedProbName, problemType, indexOrder, \
                                            solutionsForSchedule, exactLogic, \
                                            solutionNamesForSchedule, False)
            if rangeLogic != None:
                rangeLogicStr = writeRangeLogicRec(0, indexOrder, rangeLogic, \
                    solutionsForSchedule, solutionNamesForSchedule, problemType, False)
            else:
                rangeLogicStr = "  return NULL; // none\n"
            s += "  /* exact mappings */\n"
            s += exactLogicStr
            s += "\n  /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        if globalParameters["SolutionMapHash"]:
            ih += "typedef std::unordered_map<ProblemSizeKey, TensileSolutionPointer_%s, std::function<size_t (ProblemSizeKey)>> Map_%s;\n" \
                % (problemType, problemType )
        else:
            ih += "typedef std::map<ProblemSizeKey, TensileSolutionPointer_%s> Map_%s;\n" \
                % (problemType, problemType)

        ih += "extern Map_%s solutionMap_%s;\n" % (problemType, problemType)

        # implement tensileGetSolutionPointerUncached_ProblemType
        for ptr in [True, False]:
            returnType = "PointerUncached" if ptr else "Name"
            s += "\n// return solution %s\n" % returnType
            s += ("TensileSolutionPointer_%s " %
                  problemType) if ptr else "const char *"
            s += "tensileGetSolution%s_%s(\n" \
                % (returnType, problemType)
            for i in range(0, len(argListStream)):
                s += "    %s %s%s" \
                    % (argListStream[i][0], argListStream[i][1], \
                    ",\n" if i < len(argListStream)-1 else ") {\n")

            # choose from schedules based on device name


#     print logicData
            schedules = logicData[problemType]
            numSchedules = len(schedules)
            if numSchedules > 1:

                reordered_schedules = []
                for scheduleIdx in range(0, numSchedules):
                    schedule = schedules[scheduleIdx]
                    deviceNames = schedule[1]
                    if deviceNames != ["fallback"
                                       ] and deviceNames != ["Device 0000"]:
                        reordered_schedules.append(schedule)
                for scheduleIdx in range(0, numSchedules):
                    schedule = schedules[scheduleIdx]
                    deviceNames = schedule[1]
                    if deviceNames == ["fallback"
                                       ] or deviceNames == ["Device 0000"]:
                        reordered_schedules.append(schedule)

                # get device name
                if globalParameters["RuntimeLanguage"] == "OCL":
                    s += "get device name opencl;\n"
                else:
                    s += "\n//  get device name hip;\n"
                    s += "    int deviceId;\n"
                    s += "    hipGetDevice(&deviceId);\n"
                    s += "    hipDeviceProp_t deviceProperties;\n"
                    s += "    hipGetDeviceProperties(&deviceProperties, deviceId);\n"
                    s += "    std::string name = deviceProperties.name;\n"

                if problemType["DataType"].isDouble():
                    s += "\n"
                    s += "//  intercept schedule selection and call HIP (source) kernel\n"
                    s += "    if((strideA2K == 0) || (strideB2K == 0))\n"
                    s += "    {\n"
                    numSchedules = len(schedules)
                    schedule = reordered_schedules[numSchedules - 1]
                    scheduleName = schedule[0]
                    s += "        return tensileGetSolution%s_%s_%s(" \
                          % ( returnType, scheduleName, problemType)
                    for i in range(0, len(argListSizes)):
                        s += "%s%s" \
                            % (argListSizes[i][1],
                                ", " if i < len(argListSizes)-1 else ");\n")
                    s += "    }\n"
                    s += "\n"

                if problemType["DataType"].isHalf():
                    # "first" free index, usually the letter "I"
                    free0Index = problemType["IndicesFree"][0]
                    free0Char = globalParameters["IndexChars"][free0Index]
                    # "second" free index, usually the letter "J"
                    free1Index = problemType["IndicesFree"][1]
                    free1Char = globalParameters["IndexChars"][free1Index]
                    s += "\n"
                    s += "//  intercept schedule selection and call HIP (source) kernel\n"
                    s += "//  if either the summation size or the 'first' free index size\n"
                    s += "//  is odd or the 'second' free index size is 1\n"
                    s += "    if (((sizeL & 1) == 1) || ((size%s & 1) == 1)" % (
                        free0Char)
                    s += " || (size%s == 1))\n" % (free1Char)
                    s += "    {\n"
                    numSchedules = len(schedules)
                    schedule = reordered_schedules[numSchedules - 1]
                    scheduleName = schedule[0]
                    s += "        return tensileGetSolution%s_%s_%s(" \
                          % ( returnType, scheduleName, problemType)
                    for i in range(0, len(argListSizes)):
                        s += "%s%s" \
                            % (argListSizes[i][1],
                                ", " if i < len(argListSizes)-1 else ");\n")
                    s += "    }\n"
                    s += "\n"

                for scheduleIdx in range(0, numSchedules):
                    schedule = reordered_schedules[scheduleIdx]
                    scheduleName = schedule[0]
                    deviceNames = schedule[1]
                    if scheduleIdx > 0:
                        s += "    else "
                    if scheduleIdx < numSchedules - 1:
                        s += "if ("
                        for deviceNameIdx in range(0, len(deviceNames)):
                            deviceName = deviceNames[deviceNameIdx]
                            if deviceNameIdx > 0:
                                s += " || "
                            s += "name == \"%s\"" % deviceName
                        s += ")"
                    s += "\n    {\n"
                    s += "        return tensileGetSolution%s_%s_%s(" \
                        % ( returnType, scheduleName, problemType)
                    for i in range(0, len(argListSizes)):
                        s += "%s%s" \
                            % (argListSizes[i][1],
                                ", " if i < len(argListSizes)-1 else ");\n")
                    s += "    }\n"
            else:  # == 1
                schedule = schedules[0]
                scheduleName = schedule[0]
                s += "  return tensileGetSolution%s_%s_%s(" \
                    % ( returnType, scheduleName, problemType)
                for i in range(0, len(argListSizes)):
                    s += "%s%s" \
                        % (argListSizes[i][1],
                            ", " if i < len(argListSizes)-1 else ");\n")
            s += "\n}\n"

        # implement tensileGetSolutionPointer_ProblemType
        s += "\n// return solution pointer; user calls it\n"
        s += "Map_%s solutionMap_%s%s;\n" % (
            problemType, problemType, "(1024, tensileProblemSizeHasher)"
            if globalParameters["SolutionMapHash"] else "")
        s += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ") {\n")
        # create key
        s += "  ProblemSizeKey key = std::make_tuple( size%s, size%s, size%s%s );\n" \
            % ( \
            globalParameters["IndexChars"][problemType["Index0"]], \
            globalParameters["IndexChars"][problemType["Index1"]], \
            globalParameters["IndexChars"][problemType["IndexUnroll"]], \
            ", stream" if globalParameters["RuntimeLanguage"] == "OCL" else "")
        # check for key in map
        s += "  static std::mutex findKernelMutex;\n"
        s += "  std::lock_guard<std::mutex> findKernelLock(findKernelMutex);\n"
        s += "  Map_%s::iterator iter = solutionMap_%s.find(key);\n" \
            % (problemType, problemType)
        s += "  if (iter != solutionMap_%s.end()) {\n" % problemType
        s += "    return iter->second;\n"
        s += "  } else {\n"
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointerUncached_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "        %s%s" \
                % (argListStream[i][1], "," if i < len(argListStream)-1 else ");")
            s += "\n"
        s += "    solutionMap_%s[key] = ptr;\n" % problemType
        s += "    return ptr;\n"
        s += "  }\n"
        s += "}\n"

        # declare tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        for i in range(0, len(argListData)):
            s += "    %s %s%s" \
                % (argListData[i][0], argListData[i][1], \
                ",\n" if i < len(argListData)-1 else ") {\n")
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "        %s%s" \
                % (argListStream[i][1], ", " if i < len(argListStream)-1 else ");")
            s += "\n"
        s += "    if ( ptr ) {\n"
        s += "      return ptr("
        for i in range(0, len(argListData)):
            s += "%s%s" \
                % (argListData[i][1], ", " if i < len(argListData)-1 else ");\n")
        s += "    } else {\n"
        s += "      return tensileStatusFailure; // no solution found\n"
        s += "    }\n"
        s += "}\n"

        # open and close problemType files
        if not globalParameters["MergeFiles"]:
            logicSourceFile = open(os.path.join(outputPath, "Logic", \
                "%s.cpp" % filePrefix), "w")
            logicSourceFile.write(s)
            logicSourceFile.close()

    # close merged files
    if globalParameters["MergeFiles"]:
        logicSourceFile = open(os.path.join(outputPath, \
            "Tensile.cpp"), "w")
        logicSourceFile.write(s)
        logicSourceFile.close()

    logicHeaderFile = open(os.path.join(outputPath, \
        "Tensile.h"), "w")
    logicHeaderFile.write(h)
    logicHeaderFile.close()

    internalHeaderFile = open(os.path.join(outputPath, \
        "TensileInternal.h"), "w")
    internalHeaderFile.write(ih)
    internalHeaderFile.close()
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution source/header files under outputPath.

    When globalParameters["MergeFiles"] is set, all kernels go into
    Kernels.cpp / Kernels.h and all solutions into Solutions.cpp /
    Solutions.h.  Otherwise every kernel and every solution gets its own
    .cpp/.h pair inside the "Kernels" and "Solutions" sub-directories.

    Kernel text is produced by processKernelSourceChunk, which is either
    called inline (CpuThreads == 0) or run in multiprocessing.Process workers
    that send a list of (err, src, header, kernelName) tuples back over a pipe.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions rendered through solutionWriter.
        kernels: kernels handed to processKernelSourceChunk in index ranges.
        kernelsBetaOnly: kernels rendered through the *BetaOnly methods of
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: forwarded to processKernelSourceChunk and used
            directly for the beta-only kernels.
        kernelWriterAssembly: forwarded to processKernelSourceChunk.

    Calls printExit if any worker process exits with a non-zero code, or if
    globalParameters["ExitAfterKernelGen"] is set.
    """
    start = time.time()
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]

    print1("# Writing Kernels...")
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if showProgress:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
            kernelHeaderFile.write("\n\n")
            kernelHeaderFile.write(
                "__device__ inline int GenDot4(int a, int b, int c) { \n")
            kernelHeaderFile.write(
                "  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
            kernelHeaderFile.write(
                "  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
            kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")
            kernelHeaderFile.write(
                "  return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n"
            )
            kernelHeaderFile.write("\n\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # kernelName -> error value, later passed to solutionWriter.getSourceFileString
    kernelsWithBuildErrs = {}

    prepAsm()

    # Number of worker processes; 0 means "run inline in this process".
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count*4 if globalParameters["CpuThreads"] == -1 \
               else globalParameters["CpuThreads"]
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # Ceiling division, but never fewer than 10 kernels per worker.
    # '//' keeps this an integer on every Python version.
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (
        cpus, workPerCpu))

    threads = []
    if cpus and showProgress:
        processLaunchProgressBar = ProgressBar(len(kernels))
    else:
        processLaunchProgressBar = None

    kiStart = 0
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        if cpus:
            parentConn, child = multiprocessing.Pipe()
            args = (kernels, kernelWriterSource, kernelWriterAssembly, \
                    kiStart, kiStop, child)
            t = multiprocessing.Process(target=processKernelSourceChunk,
                                        args=args)
            t.start()
            child.close()  # close child pipe in the parent process
            threads.append([t, kiStart, kiStop, parentConn])
            if processLaunchProgressBar:
                processLaunchProgressBar.increment(kiStop - kiStart)
            else:
                sys.stderr.write(
                    "  # launched process %s for kernels %d..%d\n" %
                    (t, kiStart, kiStop - 1))
        else:  # non-threaded version
            # NOTE(review): the return value is discarded here; presumably
            # processKernelSourceChunk handles its own output when the pipe
            # argument is None -- confirm against its definition.
            processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly, \
                                     kiStart, kiStop, None)
        kiStart += workPerCpu
    sys.stderr.write("# Waiting for kernel compilation processes...\n")

    someError = 0
    for (t, kiStart, kiStop, parentConn) in threads:
        try:
            results = parentConn.recv()
        except EOFError as pipeErr:
            print("*** warning: process %s returned pipe EOF %s %s" % (
                t, t, pipeErr))
            # Nothing was received from this worker: do not fall through with
            # the previous chunk's results and write them a second time.
            results = []

        t.join()
        exitCode = t.exitcode
        if exitCode != 0:
            print("*** warning: process %s returned %s %s" % (t, t, exitCode))
            someError = 1
            results = []

        if showProgress:
            progressBar.increment(kiStop - kiStart)
        for (err, src, header, kernelName) in results:
            if err:
                kernelsWithBuildErrs[kernelName] = err

            if mergeFiles:
                kernelSourceFile.write(src)
                kernelHeaderFile.write(header)
            else:
                # one kernel.cpp / kernel.h pair per kernel
                with open(os.path.join(outputPath, \
                        "Kernels", kernelName+".cpp"), "w") as perKernelSource:
                    perKernelSource.write(CHeader)
                    perKernelSource.write(src)
                with open(os.path.join(outputPath, \
                        "Kernels", kernelName+".h"), "w") as perKernelHeader:
                    perKernelHeader.write(CHeader)
                    perKernelHeader.write(header)

    if someError:
        print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
        printExit("** kernel compilation failure **")

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        if mergeFiles:
            kernelSourceFile.write(src)
        else:
            with open(os.path.join(outputPath, \
                    "Kernels", kernelName+".cpp"), "w") as perKernelSource:
                perKernelSource.write(CHeader)
                perKernelSource.write(src)
        if err:
            # kernelName is a string (it is concatenated into the file name),
            # so it must be formatted with %s.
            print("*** warning: invalid kernel#%s" % kernelName)

        header = kernelWriter.getHeaderFileStringBetaOnly(kernel)
        if mergeFiles:
            kernelHeaderFile.write(header)
        else:
            with open(os.path.join(outputPath, \
                    "Kernels", kernelName + ".h"), "w") as perKernelHeader:
                perKernelHeader.write(CHeader)
                perKernelHeader.write(header)

    # close merged: both the source and the header were opened above
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    stop = time.time()
    print("# Kernel Building elapsed time = %.1f secs" % (stop - start))

    print1("# Writing Solutions")
    if showProgress:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionSourceFile.write("#include <algorithm>\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")

    for solution in solutions:
        if mergeFiles:
            solutionSourceFile.write( \
                solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
            solutionHeaderFile.write( \
                solutionWriter.getHeaderFileString(solution))
        else:
            # one solution.cpp / solution.h pair per solution
            solutionFileName = solutionWriter.getSolutionName(solution)
            with open(os.path.join(outputPath, \
                    "Solutions", solutionFileName+".cpp"), "w") as perSolutionSource:
                perSolutionSource.write(CHeader)
                perSolutionSource.write( \
                    solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
            with open(os.path.join(outputPath, \
                    "Solutions", solutionFileName+".h"), "w") as perSolutionHeader:
                perSolutionHeader.write(CHeader)
                perSolutionHeader.write( \
                    solutionWriter.getHeaderFileString(solution))
        if showProgress:
            progressBar.increment()

    # close merged: only the merged files are still open at this point
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
def TensileCreateLibrary():
    """Command-line entry point that builds library sources from logic files.

    Reads the positional arguments LogicPath, OutputPath and RuntimeLanguage
    plus three on/off switch pairs, loads every ".yaml" file found directly
    in LogicPath through YAMLIO.readLibraryLogicForSchedule, and then writes
    solutions, kernels, the CMake file and the logic files into OutputPath.
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("LogicPath", help="Path to LibraryLogic.yaml files.")
    parser.add_argument("OutputPath", help="Where to write library files?")
    parser.add_argument("RuntimeLanguage", help="Which runtime language?",
                        choices=["OCL", "HIP", "HSA"])
    # Every boolean option comes as an enabling switch and a "--no-" switch
    # that both store into the same destination.
    switchPairs = (
        ("merge-files", "MergeFiles"),
        ("short-file-names", "ShortNames"),
        ("library-print-debug", "LibraryPrintDebug"),
    )
    for switchName, destName in switchPairs:
        parser.add_argument("--" + switchName, dest=destName,
                            action="store_true")
        parser.add_argument("--no-" + switchName, dest=destName,
                            action="store_false")
    parsed = parser.parse_args()

    logicPath = parsed.LogicPath
    outputPath = parsed.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    assignGlobalParameters({
        "RuntimeLanguage": parsed.RuntimeLanguage,
        "MergeFiles": parsed.MergeFiles,
        "ShortNames": parsed.ShortNames,
        "LibraryPrintDebug": parsed.LibraryPrintDebug,
        "CodeFromFiles": False,
    })

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    # Regular files directly inside logicPath whose extension is ".yaml".
    logicFiles = []
    for entry in os.listdir(logicPath):
        candidate = os.path.join(logicPath, entry)
        if os.path.isfile(candidate) and os.path.splitext(entry)[1] == ".yaml":
            logicFiles.append(candidate)

    # NOTE(review): the format string has no conversion specifier, so only
    # the heading is printed here; the files are listed by the loop below.
    print1("# LibraryLogicFiles:" % logicFiles)
    for path in logicFiles:
        print1("#   %s" % path)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        parsedLogic = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        (scheduleName, deviceNames, problemType, solutionsForSchedule,
         indexOrder, exactLogic, rangeLogic) = parsedLogic
        scheduleEntry = (scheduleName, deviceNames, solutionsForSchedule,
                         indexOrder, exactLogic, rangeLogic)
        logicData.setdefault(problemType, []).append(scheduleEntry)
        for candidateSolution in solutionsForSchedule:
            if candidateSolution not in solutions:
                solutions.append(candidateSolution)

    # Collect each distinct kernel / beta-only kernel once, in first-seen order.
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        for k in solution.getKernels():
            if k not in kernels:
                kernels.append(k)
        for k in solution.getKernelsBetaOnly():
            if k not in kernelsBetaOnly:
                kernelsBetaOnly.append(k)

    # Serial naming is only requested for short names with unmerged files.
    useSerialNaming = globalParameters["ShortNames"] \
        and not globalParameters["MergeFiles"]
    solutionSerialNaming = \
        Solution.getSerialNaming(solutions) if useSerialNaming else None
    kernelSerialNaming = \
        Solution.getSerialNaming(kernels) if useSerialNaming else None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)

    # create solution writer and kernel writers
    solutionWriter = SolutionWriter(solutionMinNaming, solutionSerialNaming,
                                    kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource(kernelMinNaming,
                                            kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly(kernelMinNaming,
                                                kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
                             solutionWriter, kernelWriterSource,
                             kernelWriterAssembly)

    # write cmake
    libraryStaticFiles = [
        "SolutionMapper.h", "TensileTypes.h", "KernelHeader.h",
        "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles,
               "LibraryClient")

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
Exemple #9
0
def TensileCreateLibrary():
    """Command-line entry point that builds library sources from logic files.

    Reads the positional arguments LogicPath, OutputPath and RuntimeLanguage,
    three on/off switch pairs and a repeatable --isa option, loads every
    ".yaml" file found directly in LogicPath through
    YAMLIO.readLibraryLogicForSchedule, and then writes solutions, kernels,
    the CMake file and the logic files into OutputPath.

    For the HIP runtime, every kernel whose "KernelLanguage" is "Assembly" is
    replicated once per entry of globalParameters["SupportedISA"]; all other
    kernels get ISA (0, 0, 0).
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \
        choices=["OCL", "HIP", "HSA"])
    argParser.add_argument("--merge-files", dest="MergeFiles", \
        action="store_true")
    argParser.add_argument("--no-merge-files", dest="MergeFiles", \
        action="store_false")
    argParser.add_argument("--short-file-names", dest="ShortNames", \
        action="store_true")
    argParser.add_argument("--no-short-file-names", dest="ShortNames", \
        action="store_false")
    argParser.add_argument("--library-print-debug", dest="LibraryPrintDebug", \
        action="store_true")
    argParser.add_argument("--no-library-print-debug", dest="LibraryPrintDebug", \
        action="store_false")
    argParser.add_argument(
        "--isa",
        dest="isa",
        action="append",
        help="which architectures for assembly kernels to target")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    if args.isa:
        # Keep the user-specified ISAs that are already listed in
        # globalParameters["SupportedISA"], without duplicates.
        newISA = []
        for isa in args.isa:
            gfxIdx = isa.find("gfx")
            if gfxIdx < 0:
                printWarning("isa parameter must be formed as: --isa gfx803")
                continue
            try:
                # one digit each for major, minor and step, e.g. gfx803
                major = int(isa[gfxIdx + 3:gfxIdx + 4])
                minor = int(isa[gfxIdx + 4:gfxIdx + 5])
                step = int(isa[gfxIdx + 5:gfxIdx + 6])
            except ValueError:
                # "gfx" followed by missing or non-digit characters is the
                # same kind of malformed input as a missing "gfx" prefix.
                printWarning("isa parameter must be formed as: --isa gfx803")
                continue
            isaTuple = (major, minor, step)
            if isaTuple in globalParameters[
                    "SupportedISA"] and isaTuple not in newISA:
                print1("# User-Specified ISA: gfx%u%u%u" %
                       (major, minor, step))
                newISA.append(isaTuple)
        arguments["SupportedISA"] = newISA
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    # regular files directly inside logicPath whose extension is ".yaml"
    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
        if (os.path.isfile(os.path.join(logicPath, f)) \
        and os.path.splitext(f)[1]==".yaml")]

    # heading only; the individual files are listed by the loop below
    print1("# LibraryLogicFiles:")
    for logicFile in logicFiles:
        print1("#   %s" % logicFile)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule, \
            indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames, \
            solutionsForSchedule, indexOrder, exactLogic, rangeLogic ))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # collect each distinct kernel / beta-only kernel once, in first-seen order
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # if any kernels are assembly, append every ISA supported
    if globalParameters["RuntimeLanguage"] == "HIP":
        newKernels = []
        for kernel in kernels:
            if kernel["KernelLanguage"] == "Assembly":
                # the existing kernel takes the first ISA; deep copies are
                # made for each remaining one
                kernel["ISA"] = globalParameters["SupportedISA"][0]
                for i in range(1, len(globalParameters["SupportedISA"])):
                    newKernel = deepcopy(kernel)
                    newKernel["ISA"] = globalParameters["SupportedISA"][i]
                    newKernels.append(newKernel)
            else:
                kernel["ISA"] = (0, 0, 0)
        kernels.extend(newKernels)

    # serial naming is only requested for short names with unmerged files
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly)

    libraryStaticFiles = [
        "TensileTypes.h", "KernelHeader.h", "SolutionHelper.cpp",
        "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
Exemple #10
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write the source and header text of every solution and kernel.

    When globalParameters["MergeFiles"] is set, all solutions go into
    Solutions.cpp / Solutions.h and all kernels (including the beta-only
    ones) go into Kernels.cpp / Kernels.h directly under outputPath.
    Otherwise every solution and kernel gets its own .cpp/.h pair under
    outputPath/Solutions and outputPath/Kernels.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions to emit through solutionWriter.
        kernels: tensor-contraction kernels; each is indexed with
            "KernelLanguage" to choose the source or assembly writer.
        kernelsBetaOnly: beta-only kernels; always emitted through
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: writer used for "Source" kernels and for the
            beta-only kernels.
        kernelWriterAssembly: writer used for every kernel whose
            "KernelLanguage" is not "Source".
    """
    print1("# Writing Solutions and Kernels")
    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    # one tick per solution and per kernel; beta-only kernels are not counted
    progressBar = ProgressBar(len(solutions) + len(kernels))

    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
    for solution in solutions:
        # per-file mode: each solution is written to <name>.cpp / <name>.h
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)

        # write solution.cpp
        if not mergeFiles:
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution))
        if not mergeFiles:
            solutionSourceFile.close()

        # write solution.h
        if not mergeFiles:
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()
        progressBar.increment()
    # close merged: only the merged files are still open here; the per-file
    # handles were already closed inside the loop
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for kernel in kernels:
        if kernel["KernelLanguage"] == "Source":
            kernelWriter = kernelWriterSource
        else:
            kernelWriter = kernelWriterAssembly
        # get kernel name
        if not mergeFiles:
            kernelName = kernelWriter.getKernelName(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        kernelSourceFile.write(kernelWriter.getSourceFileString(kernel))
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()
        progressBar.increment()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        kernelSourceFile.write(
            kernelWriter.getSourceFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged: both the source and the header must be flushed to disk
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()
Exemple #11
0
def writeLogic(outputPath, logicData, solutionWriter):
    """Generate the library-logic C++ sources and headers.

    Three pieces of text are built:
      * Tensile.h         - public declarations for every problem type,
      * TensileInternal.h - the problem-size key, its hasher and the
                            per-problem-type solution-map typedefs,
      * the logic source  - Tensile.cpp when globalParameters["MergeFiles"]
                            is set, otherwise one Logic/Tensile_<type>.cpp
                            per problem type.

    Args:
        outputPath: directory that receives the generated files.
        logicData: mapping from problem type to a sequence of schedule
            tuples; each tuple is read as (scheduleName, deviceNames,
            solutions, indexOrder, exactLogic, rangeLogic).
        solutionWriter: provides getArgList and getSolutionName.
    """
    print1("# Writing Library Logic")

    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"
    ih += "#include \"SolutionHelper.h\"\n"
    if globalParameters["SolutionMapHash"]:
        ih += "#include <unordered_map>\n"
    else:
        ih += "#include <map>\n"
    ih += "#include <tuple>\n"

    # problem type Key
    problemSizeTemplate = "unsigned int, unsigned int, unsigned int"
    if globalParameters["RuntimeLanguage"] == "OCL":
        problemSizeTemplate += ", cl_command_queue"
    ih += "typedef std::tuple<%s> ProblemSizeKey;\n" \
        % (problemSizeTemplate)

    # hash function
    # NOTE(review): the emitted masks are (1<<22)-1, i.e. 22 bits, while the
    # emitted comments and the 21/42-bit shifts talk about 21 bits - confirm
    # which one is intended.
    ih += "\n"
    ih += "size_t tensileProblemSizeHasher( const ProblemSizeKey & problemSize ) {\n"
    ih += "  size_t hash = 0;\n"
    ih += "  // ignore lowest 4 bits; keep next 21 bits\n"
    ih += "  size_t hash0 = (std::get<0>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size0\n"
    ih += "  size_t hash1 = (std::get<1>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size1\n"
    ih += "  size_t hashU = (std::get<2>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of sizeU\n"
    ih += "  // 21+21+21 = 63 bit hash\n"
    ih += "  hash |= hash0;\n"
    ih += "  hash |= hash1<<21;\n"
    ih += "  hash |= hashU<<42;\n"
    ih += "  return hash;\n"
    ih += "}\n"
    ih += "\n"

    # Tensile.cpp
    s = ""
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"Solutions.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:

        # function argument list
        argListSizes = solutionWriter.getArgList(problemType, False, False,
                                                 False)
        argListStream = solutionWriter.getArgList(problemType, False, False,
                                                  True)
        argListData = solutionWriter.getArgList(problemType, True, True, True)

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        for i in range(0, len(argListData)):
            h += "    %s %s%s" \
                % (argListData[i][0], argListData[i][1], \
                ",\n" if i < len(argListData)-1 else ");\n\n")

        # declare TensileSolutionPointer_ProblemType
        h += "\n// solution pointer\n"
        h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
            % problemType
        for i in range(0, len(argListData)):
            h += "    %s %s%s" % (argListData[i][0], argListData[i][1], ",\n" \
                if i < len(argListData)-1 else ");\n\n")

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            h += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ");\n\n")

        # declare tensileName_
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        for i in range(0, len(argListStream)):
            h += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ");\n\n")

        # get solution naming for problem type
        solutionsForProblemType = []
        for scheduleTuple in logicData[problemType]:
            solutionsForSchedule = scheduleTuple[2]
            for solution in solutionsForSchedule:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = []
        for solution in solutionsForProblemType:
            solutionName = solutionWriter.getSolutionName(solution)
            solutionNamesForProblemType.append(solutionName)

        # reset problemType source: in per-file mode every problem type
        # starts a fresh translation unit that includes only its solutions
        if not globalParameters["MergeFiles"]:
            filePrefix = "Tensile_%s" % (problemType)
            s = "#include \"Tensile.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # implement per-Schedule functions in source
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in logicData[problemType]:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = []
            for solution in solutionsForSchedule:
                solutionName = solutionWriter.getSolutionName(solution)
                solutionNamesForSchedule.append(solutionName)

            # function tensileGetSolutionPointerUncached_Schedule_ProblemType
            s += "\n// problem size -> solution logic\n"
            s += "TensileSolutionPointer_%s tensileGetSolutionPointerUncached_%s_%s(\n" \
                % (problemType, scheduleName, problemType)
            for i in range(0, len(argListSizes)):
                s += "    %s %s%s" \
                    % (argListSizes[i][0], argListSizes[i][1], \
                    ",\n" if i < len(argListSizes)-1 else ") {\n\n")

            exactLogicStr = writeExactLogic(exactLogic, \
                solutionNamesForSchedule, True)
            if rangeLogic is not None:
                rangeLogicStr = writeRangeLogicRec(0, indexOrder, rangeLogic, \
                    solutionNamesForSchedule, problemType, True)
            else:
                rangeLogicStr = "  return NULL; // none\n"
            s += "  /* exact mappings */\n"
            s += exactLogicStr
            s += "\n  /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

            # function tensileGetSolutionName_Schedule_ProblemType
            s += "\n// get solution name for problem size\n"
            s += "const char * tensileGetSolutionName_%s_%s(\n" \
                % (scheduleName, problemType)
            for i in range(0, len(argListSizes)):
                s += "    %s %s%s" \
                    % (argListSizes[i][0], argListSizes[i][1], \
                    ",\n" if i < len(argListSizes)-1 else ") {\n\n")
            exactLogicStr = writeExactLogic(exactLogic, \
                solutionNamesForSchedule, False)
            if rangeLogic is not None:
                rangeLogicStr = writeRangeLogicRec(0, indexOrder, rangeLogic, \
                    solutionNamesForSchedule, problemType, False)
            else:
                rangeLogicStr = "  return NULL; // none\n"
            s += "  /* exact mappings */\n"
            s += exactLogicStr
            s += "\n  /* range mappings */\n"
            s += rangeLogicStr
            s += "\n}\n"

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        if globalParameters["SolutionMapHash"]:
            ih += "typedef std::unordered_map<ProblemSizeKey, TensileSolutionPointer_%s, std::function<size_t (ProblemSizeKey)>> Map_%s;\n" \
                % (problemType, problemType )
        else:
            ih += "typedef std::map<ProblemSizeKey, TensileSolutionPointer_%s> Map_%s;\n" \
                % (problemType, problemType)

        ih += "extern Map_%s solutionMap_%s;\n" % (problemType, problemType)

        # implement tensileGetSolutionPointerUncached_ProblemType and
        # tensileGetSolutionName_ProblemType (ptr=True, then ptr=False)
        for ptr in [True, False]:
            returnType = "PointerUncached" if ptr else "Name"
            s += "\n// return solution %s\n" % returnType
            s += ("TensileSolutionPointer_%s " %
                  problemType) if ptr else "const char *"
            s += "tensileGetSolution%s_%s(\n" \
                % (returnType, problemType)
            for i in range(0, len(argListStream)):
                s += "    %s %s%s" \
                    % (argListStream[i][0], argListStream[i][1], \
                    ",\n" if i < len(argListStream)-1 else ") {\n")

            # choose from schedules based on device name
            schedules = logicData[problemType]
            numSchedules = len(schedules)
            if numSchedules > 1:

                # get device name
                # NOTE(review): these two lines emit placeholder text, not
                # compilable C++ - the device-name query still has to be
                # implemented for the multi-schedule case.
                if globalParameters["RuntimeLanguage"] == "OCL":
                    s += "get device name opencl;\n"
                else:
                    s += "get device name hip;\n"

                for scheduleIdx in range(0, numSchedules):
                    schedule = schedules[scheduleIdx]
                    # dispatch to THIS schedule's function, not to whatever
                    # name the per-schedule loop above left behind
                    scheduleName = schedule[0]
                    deviceNames = schedule[1]
                    if scheduleIdx > 0:
                        s += "else "
                    # the last schedule is the unconditional fallback
                    if scheduleIdx < numSchedules - 1:
                        s += "if ("
                        for deviceNameIdx in range(0, len(deviceNames)):
                            deviceName = deviceNames[deviceNameIdx]
                            # a schedule applies when the device matches ANY
                            # of its names, so the tests are OR-ed together
                            if deviceNameIdx > 0:
                                s += " || "
                            s += "name == \"%s\"" % deviceName
                        s += ")"
                    s += "{"
                    s += "  return tensileGetSolution%s_%s_%s(" \
                        % ( returnType, scheduleName, problemType)
                    for i in range(0, len(argListSizes)):
                        s += "%s%s" \
                            % (argListSizes[i][1],
                                ", " if i < len(argListSizes)-1 else ");\n")
                    # close the branch once, after the whole argument list
                    s += "}"
            else:  # == 1
                schedule = schedules[0]
                scheduleName = schedule[0]
                s += "  return tensileGetSolution%s_%s_%s(" \
                    % ( returnType, scheduleName, problemType)
                for i in range(0, len(argListSizes)):
                    s += "%s%s" \
                        % (argListSizes[i][1],
                            ", " if i < len(argListSizes)-1 else ");\n")
            s += "\n}\n"

        # implement tensileGetSolutionPointer_ProblemType
        s += "\n// return solution pointer; user calls it\n"
        s += "Map_%s solutionMap_%s%s;\n" % (
            problemType, problemType, "(1024, tensileProblemSizeHasher)"
            if globalParameters["SolutionMapHash"] else "")
        s += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "    %s %s%s" \
                % (argListStream[i][0], argListStream[i][1], \
                ",\n" if i < len(argListStream)-1 else ") {\n")
        # create key
        s += "  ProblemSizeKey key = std::make_tuple( size%s, size%s, size%s%s );\n" \
            % ( \
            globalParameters["IndexChars"][problemType["Index0"]], \
            globalParameters["IndexChars"][problemType["Index1"]], \
            globalParameters["IndexChars"][problemType["IndexUnroll"]], \
            ", stream" if globalParameters["RuntimeLanguage"] == "OCL" else "")
        # check for key in map
        s += "  static std::mutex findKernelMutex;\n"
        s += "  std::lock_guard<std::mutex> findKernelLock(findKernelMutex);\n"
        s += "  Map_%s::iterator iter = solutionMap_%s.find(key);\n" \
            % (problemType, problemType)
        s += "  if (iter != solutionMap_%s.end()) {\n" % problemType
        s += "    return iter->second;\n"
        s += "  } else {\n"
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointerUncached_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "        %s%s" \
                % (argListStream[i][1], "," if i < len(argListStream)-1 else ");")
            s += "\n"
        s += "    solutionMap_%s[key] = ptr;\n" % problemType
        s += "    return ptr;\n"
        s += "  }\n"
        s += "}\n"

        # declare tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        for i in range(0, len(argListData)):
            s += "    %s %s%s" \
                % (argListData[i][0], argListData[i][1], \
                ",\n" if i < len(argListData)-1 else ") {\n")
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        for i in range(0, len(argListStream)):
            s += "        %s%s" \
                % (argListStream[i][1], ", " if i < len(argListStream)-1 else ");")
            s += "\n"
        s += "    if ( ptr ) {\n"
        s += "      return ptr("
        for i in range(0, len(argListData)):
            s += "%s%s" \
                % (argListData[i][1], ", " if i < len(argListData)-1 else ");\n")
        s += "    } else {\n"
        s += "      return tensileStatusFailure; // no solution found\n"
        s += "    }\n"
        s += "}\n"

        # write the per-problemType source file
        if not globalParameters["MergeFiles"]:
            with open(os.path.join(outputPath, "Logic", \
                "%s.cpp" % filePrefix), "w") as logicSourceFile:
                logicSourceFile.write(s)

    # write the merged source file
    if globalParameters["MergeFiles"]:
        with open(os.path.join(outputPath, \
            "Tensile.cpp"), "w") as logicSourceFile:
            logicSourceFile.write(s)

    with open(os.path.join(outputPath, \
        "Tensile.h"), "w") as logicHeaderFile:
        logicHeaderFile.write(h)

    with open(os.path.join(outputPath, \
        "TensileInternal.h"), "w") as internalHeaderFile:
        internalHeaderFile.write(ih)
Exemple #12
0
def Tensile(userArgs):
    """Command-line entry point: parse arguments, load the config, run it.

    Resets globalParameters to defaultGlobalParameters, reads the YAML
    config named on the command line, applies its "GlobalParameters"
    section (if any), lets command-line options override the result, and
    finally executes the steps described by the config.

    Args:
        userArgs: list of command-line argument strings (without the
            program name) handed to argparse.
    """
    # 1st half of splash
    print1("")
    print1(HR)
    print1("#")
    print1("#  Tensile v%s" % (__version__))

    # setup argument parser
    argParser = argparse.ArgumentParser()
    argParser.add_argument("config_file", \
        help="benchmark config.yaml file")
    argParser.add_argument("output_path", \
        help="path where to conduct benchmark")
    argParser.add_argument("--version", action="version", \
        version="%(prog)s {version}".format(version=__version__))
    argParser.add_argument("-d", "--device", dest="device", type=int, \
        help="override which device to benchmark")
    argParser.add_argument("-p", "--platform", dest="platform", type=int, \
        help="override which OpenCL platform to benchmark")
    argParser.add_argument("--runtime-language", dest="RuntimeLanguage", \
        choices=["HIP", "OCL"], help="override which runtime language to use")
    argParser.add_argument("--code-object-version", dest="CodeObjectVersion", \
        choices=["V2", "V3"], help="HSA code-object version")
    argParser.add_argument("-v", "--verbose", action="store_true", \
        help="set PrintLevel=2")
    argParser.add_argument("--debug", dest="debug", action="store_true", \
        help="set PrintLevel=2 and CMakeBuildType=Debug")
    argParser.add_argument("--short-names", dest="shortNames", action="store_true", \
        help="use serial kernel and solution names")
    argParser.add_argument("--no-merge-files", dest="noMergeFiles", action="store_true", \
        help="kernels and solutions written to individual files")
    # argParser.add_argument("--hcc-version", dest="HccVersion", \
    #     help="This can affect what opcodes are emitted by the assembler")

    print1("# Restoring default globalParameters")
    for key in defaultGlobalParameters:
        globalParameters[key] = defaultGlobalParameters[key]

    # parse arguments
    args = argParser.parse_args(userArgs)
    configPath = os.path.realpath(args.config_file)

    # 2nd half of splash
    print1("#  Config: %s" % (configPath))
    print1("#")
    print1(HR)
    print1("")

    # read config
    config = YAMLIO.readConfig(configPath)
    globalParameters["ConfigPath"] = configPath

    # assign global parameters
    if "GlobalParameters" in config:
        assignGlobalParameters(config["GlobalParameters"])
    else:
        assignGlobalParameters({})

    globalParameters["WorkingPath"] = os.path.abspath(args.output_path)
    ensurePath(globalParameters["WorkingPath"])

    # override config with command-line options
    # device/platform are integer indices and 0 is a legitimate choice, so
    # test against None (argparse's "not supplied") instead of truthiness
    if args.device is not None:
        print1("# Command-line override: Device")
        globalParameters["Device"] = args.device
    if args.platform is not None:
        print1("# Command-line override: Platform")
        globalParameters["Platform"] = args.platform
    if args.RuntimeLanguage:
        print1("# Command-line override: RuntimeLanguage")
        globalParameters["RuntimeLanguage"] = args.RuntimeLanguage
    if args.CodeObjectVersion:
        print1("# Command-line override: CodeObjectVersion")
        globalParameters["CodeObjectVersion"] = args.CodeObjectVersion
    if args.verbose:
        print1("# Command-line override: PrintLevel")
        globalParameters["PrintLevel"] = 2
    if args.debug:
        print1("# Command-line override: Debug")
        globalParameters["PrintLevel"] = 2
        globalParameters["CMakeBuildType"] = "Debug"
    if args.shortNames:
        globalParameters["ShortNames"] = True
    if args.noMergeFiles:
        globalParameters["MergeFiles"] = False
    print1("")

    # Execute Steps in the config script
    executeStepsInConfig(config)
Exemple #13
0
def writeLogic(outputPath, logicData, solutionWriter ):
  """Generate the SolutionMapper-based library-logic sources and headers.

  Builds the text of Tensile.h, TensileInternal.h and the logic source
  (Tensile.cpp when globalParameters["MergeFiles"] is set, otherwise one
  Logic/Tensile_<type>.cpp per problem type) and writes them under
  outputPath.

  Args:
    outputPath: directory that receives the generated files.
    logicData: mapping from problem type to a sequence of schedule tuples;
        each tuple is read as (scheduleName, deviceNames, solutions,
        indexOrder, exactLogic, rangeLogic).  Every problem type needs at
        least one schedule: the per-problem functions below reuse the
        values left behind by the last schedule of the loop.
    solutionWriter: provides getArgList and getSolutionName.
  """
  print1("# Writing Library Logic")

  if not globalParameters["MergeFiles"]:
    ensurePath(os.path.join(outputPath, "Logic"))

  # Tensile.h
  h = ""
  h += "#pragma once\n"
  h += "#include \"TensileTypes.h\"\n"
  h += "#include \"SolutionHelper.h\"\n"
  h += "#include \"SolutionMapper.h\"\n"

  # TensileInternal.h
  ih = ""
  ih += "#include \"Tensile.h\"\n"

  # Tensile.cpp
  s = ""
  s += "#include \"Solutions.h\"\n"
  s += "#include \"Tensile.h\"\n"
  s += "#include \"TensileInternal.h\"\n"
  s += "#include \"SolutionMapper.h\"\n"

  ########################################
  # problemType
  for problemType in logicData:

    # function argument list
    argListSizes = solutionWriter.getArgList(problemType, False, False, False, False)
    argListData  = solutionWriter.getArgList(problemType, False, True, True, True)
    argListAll  = solutionWriter.getArgList(problemType, True, True, True, True)

    # tensile initializer
    h += "\nvoid tensileInitialize();\n\n"

    # declare tensile_ProblemType
    h += "\n// enqueue solution\n"
    h += "TensileStatus tensile_%s(\n" % problemType
    for i in range(0, len(argListData)):
      h += "    %s %s%s" \
          % (argListData[i][0], argListData[i][1], \
          ",\n" if i < len(argListData)-1 else ");\n\n")


    numSizes = problemType["TotalIndices"]
    firstStride = 0 if problemType["UseInitialStrides"] else 1
    lastStrideA = len(problemType["IndexAssignmentsA"])
    lastStrideB = len(problemType["IndexAssignmentsB"])
    lastStrideC = problemType["NumIndicesC"]
    h += "typedef ProblemKey<%u> ProblemKey_%s;\n" % (numSizes,problemType)
    h += "typedef ProblemDims<%u,%u,%u,%u,%u> ProblemDims_%s;\n" \
        % (firstStride, lastStrideC, lastStrideA, lastStrideB, numSizes, problemType)
    h += "typedef SolutionMapper<ProblemDims_%s, ProblemKey_%s> SolutionMapper_%s;\n" \
            % (problemType, problemType, problemType)

    # declare tensileGetSolutionPointer_ProblemType
    h += "\n// get solution pointer\n"
    h += "SolutionMapper_%s::SolutionRuntime *\n" % (problemType)
    h += "tensileGetSolutionPointer_%s(\n" % (problemType)
    for i in range(0, len(argListSizes)):
      h += "    %s %s%s" \
          % (argListSizes[i][0], argListSizes[i][1], \
          ",\n" if i < len(argListSizes)-1 else ");\n\n")

    # declare tensileName_
    h += "// get solution name\n"
    h += "const char * tensileGetSolutionName_%s(\n" \
        % (problemType)
    for i in range(0, len(argListSizes)):
      h += "    %s %s%s" \
          % (argListSizes[i][0], argListSizes[i][1], \
          ",\n" if i < len(argListSizes)-1 else ");\n\n")


    # get solution naming for problem type
    solutionsForProblemType = []
    for scheduleTuple in logicData[problemType]:
      solutionsForSchedule = scheduleTuple[2]
      for solution in solutionsForSchedule:
        if solution not in solutionsForProblemType:
          solutionsForProblemType.append(solution)

    # solution names for problem type
    solutionNamesForProblemType = []
    for solution in solutionsForProblemType:
      solutionName = solutionWriter.getSolutionName(solution)
      solutionNamesForProblemType.append(solutionName)

    # reset problemType source: in per-file mode every problem type starts
    # a fresh translation unit that includes only its own solutions
    if not globalParameters["MergeFiles"]:
      filePrefix = "Tensile_%s" % (problemType)
      s = "#include \"TensileTypes.h\"\n"
      # append, so the TensileTypes.h include above is not thrown away
      s += "#include \"Tensile.h\"\n"
      s += "#include \"TensileInternal.h\"\n"
      for solutionName in solutionNamesForProblemType:
        s += "#include \"%s.h\"\n" % solutionName

    ########################################
    # Per-problem constants here:
    # These are common for all schedules and thus do not include schedule name (vega,hip,etc)
    s += "\n"
    s += "/*******************************************************************************\n"
    s += "* Per-Problem Functions for %s\n" % problemType
    s += "*******************************************************************************/\n"

    s += "// Problem type include the index assignments for free, summation, batch:\n"
    s += "static const ProblemType problemType_%s( " % problemType
    s += listToInitializer(problemType["IndicesFree"]) + ", "
    s += listToInitializer(problemType["IndicesSummation"]) + ", "
    s += listToInitializer(problemType["IndicesBatch"])
    s += ");\n"

    s += "\n"
    s += "// Master solution mapper is the entry point for problem->solution mapping\n"
    s += "// There is one master solution mapper per problem type\n"
    s += "// The master solution mapper contains pointers to the solution mappers for each device\n"
    s += "static MasterSolutionMapper<ProblemDims_%s> masterSolutionMapper_%s;\n " % (problemType,problemType)


    ########################################
    # implement per-Schedule functions in source
    s += "\n"
    s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
    for scheduleTuple in logicData[problemType]:

      # get logic parameters for problem type
      scheduleName  = scheduleTuple[0]
      deviceNames   = scheduleTuple[1]
      solutionsForSchedule = scheduleTuple[2]
      indexOrder    = scheduleTuple[3]
      exactLogic    = scheduleTuple[4]
      rangeLogic    = scheduleTuple[5]

      # solution names for schedule
      solutionNamesForSchedule = []
      for solution in solutionsForSchedule:
        solutionName = solutionWriter.getSolutionName(solution)
        solutionNamesForSchedule.append(solutionName)

      s += "\n\n"
      schedProbName = "%s_%s" % (scheduleName, problemType)
      s += writeSolutionAndExactTable(scheduleName, deviceNames, schedProbName, problemType, \
              solutionsForSchedule, solutionNamesForSchedule, exactLogic)


    # Per-problem function here:
    # function tensileGetSolutionPointer_ProblemType
    # The schedule-specific names are dropped so they cannot be used by
    # accident below; indexOrder, exactLogic, rangeLogic and the solution
    # lists still hold the values of the LAST schedule processed above.
    del schedProbName
    del scheduleName
    s += "\n// problem dims -> solution logic\n"
    s += "SolutionMapper_%s::SolutionRuntime *\n" % (problemType)
    s += "tensileGetSolutionPointer_%s(\n" % (problemType)
    for i in range(0, len(argListSizes)):
      s += "    %s %s%s" \
          % (argListSizes[i][0], argListSizes[i][1], \
          ",\n" if i < len(argListSizes)-1 else ") {\n\n")

    exactLogicStr = writeExactLogic(problemType, indexOrder, \
                                    solutionsForSchedule, exactLogic, \
                                    solutionNamesForSchedule, True)
    if rangeLogic is not None:
      # single-argument call form: prints identically on Python 2 and 3
      print("** warning: ignored ranges in logic file, these should have been expanded with ExpandRanges=1 during Tensile phase 3")
    s += "  /* exact mappings */\n"
    s += exactLogicStr
    s += "\n  return nullptr;\n"
    s += "\n}\n"

    # function tensileGetSolutionName_Schedule_ProblemType
    s += "\n// get solution name for problem dims\n"
    s += "const char * tensileGetSolutionName_%s(\n" \
        % (problemType)
    for i in range(0, len(argListSizes)):
      s += "    %s %s%s" \
          % (argListSizes[i][0], argListSizes[i][1], \
          ",\n" if i < len(argListSizes)-1 else ") {\n\n")

    exactLogicStr = writeExactLogic(problemType, indexOrder, \
                                    solutionsForSchedule, exactLogic, \
                                    solutionNamesForSchedule, False)
    s += "  /* exact mappings */\n"
    s += exactLogicStr
    #s += "  return NULL; // none\n"
    s += "\n}\n"

    ########################################
    # implement problem-type functions in source
    s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"


    # declare tensile_ProblemType
    s += "\n// main call to solution; enqueues a kernel\n"
    s += "TensileStatus tensile_%s(\n" % problemType
    for i in range(0, len(argListData)):
      s += "    %s %s%s" \
          % (argListData[i][0], argListData[i][1], \
          ",\n" if i < len(argListData)-1 else ") {\n")
    s += "    auto solution = tensileGetSolutionPointer_%s(\n" % (problemType)
    for i in range(0, len(argListSizes)):
      s += "        %s%s" \
          % (argListSizes[i][1], ", " if i < len(argListSizes)-1 else ");")
      s += "\n"
    s += "    if (solution) {\n"
    s += "      TensileSolutionPointer_%s f = reinterpret_cast<TensileSolutionPointer_%s> (solution->_info->_functionPtr);\n" \
      % (problemType, problemType)
    s += "      auto solutionLock = &solution->_lock;\n"
    s += "      return f("
    for i in range(0, len(argListAll)):
      s += "%s%s" \
          % (argListAll[i][1], ", " if i < len(argListAll)-1 else ");\n")
    s += "    } else {\n"
    #s += "      printf(\"solution not valid, returning fail\\n\");"
    s += "      return tensileStatusFailure; // no solution found\n"
    s += "    }\n"
    s += "}\n"

    # write the per-problemType source file
    if not globalParameters["MergeFiles"]:
      with open(os.path.join(outputPath, "Logic", \
          "%s.cpp" % filePrefix), "w") as logicSourceFile:
        logicSourceFile.write(s)

  # NOTE(review): in per-file mode the Logic/*.cpp files were already
  # written inside the loop, so the initializer text appended here only
  # reaches disk when MergeFiles is set - confirm that is intended.
  s += "\n"
  s += writeTensileInitialize(logicData)

  # write the merged source file
  if globalParameters["MergeFiles"]:
    with open(os.path.join(outputPath, \
        "Tensile.cpp"), "w") as logicSourceFile:
      logicSourceFile.write(s)

  with open(os.path.join(outputPath, \
      "Tensile.h"), "w") as logicHeaderFile:
    logicHeaderFile.write(h)

  with open(os.path.join(outputPath, \
      "TensileInternal.h"), "w") as internalHeaderFile:
    internalHeaderFile.write(ih)
Exemple #14
0
def writeSolutionsAndKernels(outputPath, problemTypes, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
  """Write kernel and solution source/header files below outputPath.

  With globalParameters["MergeFiles"] set, everything goes into Kernels.cpp,
  Kernels.h, Solutions.cpp and Solutions.h directly inside outputPath;
  otherwise files are written into the "Kernels" and "Solutions"
  sub-directories.

  Kernels are cut into chunks of workPerCpu entries which are handed to
  processKernelSourceChunk, either inline (CpuThreads == 0) or in
  multiprocessing.Process children that send their results back over a pipe.
  Every chunk's results are passed to processResults.

  Args:
    outputPath: directory that receives the generated files.
    problemTypes: problem types for which a TensileSolutionPointer_<type>
      typedef is emitted into the solution header.
    solutions: solutions to emit; each is indexed with "ProblemType".
    kernels: kernels handed to processKernelSourceChunk.
    kernelsBetaOnly: beta-only kernels, always written with kernelWriterSource.
    solutionWriter: provides getArgList, getSolutionName,
      getProblemSourceString and getHeaderFileString.
    kernelWriterSource: writer forwarded to processKernelSourceChunk and used
      for the beta-only kernels.
    kernelWriterAssembly: writer forwarded to processKernelSourceChunk.

  Calls printExit if any child process exits with a non-zero code, and again
  at the end when globalParameters["ExitAfterKernelGen"] is set.
  """
  start = time.time()
  print1("# Writing Kernels...")
  if not globalParameters["MergeFiles"]:
    ensurePath(os.path.join(outputPath, "Solutions"))
    ensurePath(os.path.join(outputPath, "Kernels"))

  if globalParameters["ShowProgressBar"]:
    progressBar = ProgressBar(len(kernels))

  ##############################################################################
  # Write Kernels
  ##############################################################################
  # Merged-file handles. They stay None when MergeFiles is off so that the
  # processResults calls below do not trip over an unbound local.
  # NOTE(review): presumably processResults opens per-kernel files itself in
  # that mode - confirm against its definition.
  kernelSourceFile = None
  kernelHeaderFile = None
  if globalParameters["MergeFiles"]:
    kernelSourceFile = open(os.path.join(outputPath, \
        "Kernels.cpp"), "w")
    kernelHeaderFile = open(os.path.join(outputPath, \
        "Kernels.h"), "w")
    kernelSourceFile.write(CHeader)
    kernelHeaderFile.write(CHeader)
    kernelSourceFile.write("#include \"Kernels.h\"\n")
    kernelHeaderFile.write("#pragma once\n")
    if globalParameters["RuntimeLanguage"] == "HIP":
      kernelHeaderFile.write("// Also set env var HCC_ENABLE_PRINTF=1 for printf\n")
      kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
      kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
      kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
      kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
      kernelHeaderFile.write("\n\n")
      # GenDot4 helper emitted verbatim into the merged kernel header
      kernelHeaderFile.write("__device__ inline int GenDot4(int a, int b, int c) { \n")
      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("  typedef union { int32_t i; char4 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
      kernelHeaderFile.write("  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")

      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("      return amd_mixed_dot(va.z, vb.z, c, true); }\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("      return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("\n\n")
    else:
      kernelHeaderFile.write("#include <string>\n")

  kernelsWithBuildErrs = {}

  prepAsm()

  # Number of worker processes: 0 means "run inline in this process".
  if globalParameters["CpuThreads"] == 0:
    cpus = 0
  elif globalParameters["CodeFromFiles"]:
    cpu_count = multiprocessing.cpu_count()
    cpuThreads = globalParameters["CpuThreads"]
    # a negative CpuThreads is a multiplier on the core count
    cpus = cpu_count*abs(cpuThreads) if cpuThreads < 0 \
           else min(cpu_count, cpuThreads)
  else: #! CodeFromFiles is not thread-safe since code merged into same file
    cpus = 1

  # Floor division keeps the chunk size an integer on every interpreter.
  workPerCpu = max(10, (len(kernels)+cpus-1)//cpus) if cpus else 1

  kiStart = 0
  threads = []
  if cpus and globalParameters["ShowProgressBar"]:
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (cpus, workPerCpu))
    processLaunchProgressBar = ProgressBar(len(kernels))
  else:
    print("# Compiling kernels (no multiprocessing, cpus=%u #kernels=%u)" % (cpus, workPerCpu))
    processLaunchProgressBar = None
  while kiStart < len(kernels):
    kiStop = min(len(kernels), kiStart + workPerCpu)
    if cpus:
      parentConn,child  = multiprocessing.Pipe()
      args=(kernels, kernelWriterSource, kernelWriterAssembly, \
            kiStart, kiStop, child)
      t = multiprocessing.Process(target=processKernelSourceChunk, args=args)
      t.start()
      child.close() # close child pipe in the parent process
      threads.append([t,kiStart,kiStop, parentConn])
      if processLaunchProgressBar:
        processLaunchProgressBar.increment(kiStop-kiStart)
      else:
        sys.stderr.write("  # launched process %s for kernels %d..%d\n" %(t, kiStart, kiStop-1))

    else: # non-threaded version
      results = processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly, \
                               kiStart, kiStop, None)
      if globalParameters["ShowProgressBar"]:
        progressBar.increment(kiStop-kiStart)
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

    kiStart += workPerCpu
  sys.stderr.write("# Waiting for kernel compilation processes...\n")

  someError = 0
  if cpus:
    for (t,kiStart,kiStop,parentConn) in threads:
      try:
        results = parentConn.recv()
      except EOFError as pipeErr:
        print("*** warning: process %s returned pipe EOF %s %s" % (t, t, pipeErr))
        # Nothing was received: never fall through with the previous
        # process's results (or with no binding at all).
        results = []

      t.join()
      e = t.exitcode
      if e != 0 :
        print("*** warning: process %s returned %s %s" % (t, t, e))
        someError = 1
        results = []

      if globalParameters["ShowProgressBar"]:
        progressBar.increment(kiStop-kiStart)
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

  if someError:
    print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
    printExit("** kernel compilation failure **")


  # beta-only kernels
  for kernel in kernelsBetaOnly:
    kernelWriter = kernelWriterSource
    kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

    # write kernel.cpp
    if not globalParameters["MergeFiles"]:
      kernelSourceFile = open(os.path.join(outputPath, \
          "Kernels", kernelName+".cpp"), "w")
      kernelSourceFile.write(CHeader)

    (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
    kernelSourceFile.write(src)
    if err:
      # kernelName is a string, so it must be formatted with %s
      print("*** warning: invalid kernel#%s" % kernelName)
    if not globalParameters["MergeFiles"]:
      kernelSourceFile.close()
    # write kernel.h
    if not globalParameters["MergeFiles"]:
      kernelHeaderFile = open(os.path.join(outputPath, \
          "Kernels", kernelName + ".h"), "w")
      kernelHeaderFile.write(CHeader)
    kernelHeaderFile.write( kernelWriter.getHeaderFileStringBetaOnly(kernel))
    if not globalParameters["MergeFiles"]:
      kernelHeaderFile.close()

  # close merged (both the source and the header file)
  if globalParameters["MergeFiles"]:
    kernelSourceFile.close()
    kernelHeaderFile.close()

  stop = time.time()
  print("# Kernel Building elapsed time = %.1f secs" % (stop-start))

  print1("# Writing Solutions")
  if globalParameters["ShowProgressBar"]:
    progressBar = ProgressBar(len(solutions))
  ##############################################################################
  # Write Solutions
  ##############################################################################
  if globalParameters["MergeFiles"]:
    solutionSourceFile = open(os.path.join(outputPath, \
        "Solutions.cpp"), "w")
    solutionHeaderFile = open(os.path.join(outputPath, \
        "Solutions.h"), "w")
    solutionSourceFile.write(CHeader)
    solutionHeaderFile.write(CHeader)
    solutionSourceFile.write("#include \"Solutions.h\"\n")
    solutionSourceFile.write("#include <algorithm>\n")
    solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
    solutionHeaderFile.write("#include \"Kernels.h\"\n")
    solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
    solutionHeaderFile.write("#include \"Tools.h\"\n")
    if globalParameters["CodeFromFiles"]:
      solutionHeaderFile.write("#include <unistd.h>\n")


  # Write a solution pointer typedef for each problemType:
  h = ""
  for problemType in problemTypes:
    argListAll = solutionWriter.getArgList(problemType, True, True, True, True)
    # declare TensileSolutionPointer_ProblemType
    h += "\n// solution pointer\n"
    h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
        % problemType
    for i in range(0, len(argListAll)):
      h += "    %s %s%s" % (argListAll[i][0], argListAll[i][1], ",\n" \
          if i < len(argListAll)-1 else ");\n\n")
    h += "\n"

  # NOTE(review): with MergeFiles off no solution header has been opened yet,
  # so this write raises; decide where the typedefs belong in that mode.
  solutionHeaderFile.write(h)

  for solution in solutions:
    # get solution name
    if not globalParameters["MergeFiles"]:
      solutionFileName = solutionWriter.getSolutionName(solution)

    # write solution.cpp
    if not globalParameters["MergeFiles"]:
      solutionSourceFile = open(os.path.join(outputPath, \
          "Solutions", solutionFileName+".cpp"), "w")
      solutionSourceFile.write(CHeader)
    solutionSourceFile.write( \
        solutionWriter.getProblemSourceString(solution["ProblemType"], solution, kernelsWithBuildErrs))
    if not globalParameters["MergeFiles"]:
      solutionSourceFile.close()

    # write solution.h
    if not globalParameters["MergeFiles"]:
      solutionHeaderFile = open(os.path.join(outputPath, \
          "Solutions", solutionFileName+".h"), "w")
      solutionHeaderFile.write(CHeader)
    solutionHeaderFile.write( \
        solutionWriter.getHeaderFileString(solution))
    if not globalParameters["MergeFiles"]:
      solutionHeaderFile.close()
    if globalParameters["ShowProgressBar"]:
      progressBar.increment()
  # close merged: the per-solution files were already closed inside the loop,
  # only the merged pair is still open here
  if globalParameters["MergeFiles"]:
    solutionSourceFile.close()
    solutionHeaderFile.close()

  if globalParameters["ExitAfterKernelGen"]:
    printExit("** Exiting after kernel generation due to ExitAfterKernelGen=1")
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution source/header files below outputPath.

    With globalParameters["MergeFiles"] set, everything goes into Kernels.cpp,
    Kernels.h, Solutions.cpp and Solutions.h directly inside outputPath;
    otherwise files are written into the "Kernels" and "Solutions"
    sub-directories.

    Kernels are cut into chunks of workPerCpu entries; each chunk is handed to
    processKernelSourceChunk, on its own threading.Thread unless CpuThreads
    is 0, in which case the chunks run inline.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions to emit.
        kernels: kernels handed to processKernelSourceChunk.
        kernelsBetaOnly: beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: writer forwarded to processKernelSourceChunk and
            used for the beta-only kernels.
        kernelWriterAssembly: writer forwarded to processKernelSourceChunk.

    Calls printExit at the end when globalParameters["ExitAfterKernelGen"]
    is set.
    """
    print1("# Writing Kernels")
    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    # Bound even when no bar is shown, because it is always part of the
    # argument tuple handed to processKernelSourceChunk below.
    progressBar = None
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    # Merged-file handles. They stay None when MergeFiles is off so that
    # building the argument tuple does not trip over an unbound local.
    # NOTE(review): presumably processKernelSourceChunk opens per-kernel files
    # itself in that mode - confirm against its definition.
    kernelSourceFile = None
    kernelHeaderFile = None
    if globalParameters["MergeFiles"]:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    # tensor contraction kernels - dispatch as multiple threads:
    kLock = threading.Lock()
    pLock = threading.Lock()

    prepAsm()

    # Number of worker threads: 0 means "run inline in this thread".
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count if globalParameters["CpuThreads"] == -1 \
               else min(cpu_count, globalParameters["CpuThreads"])
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # Floor division keeps the chunk size an integer on every interpreter.
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("info: cpus=%u kernelsPerCpu=%u" % (cpus, workPerCpu))

    kiStart = 0
    threads = []
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)

        # One argument tuple for both paths. The inline call used to omit
        # progressBar and so passed one positional argument fewer than the
        # threaded call to the same function.
        # NOTE(review): the threaded (12-argument) form is taken as the
        # correct one - confirm against processKernelSourceChunk's signature.
        args = (outputPath, kernels, kernelSourceFile, kernelHeaderFile, \
                kernelWriterSource, kernelWriterAssembly, \
                kernelsWithBuildErrs, progressBar, kLock, pLock, kiStart, kiStop)
        if cpus:
            t = threading.Thread(target=processKernelSourceChunk, args=args)
            t.start()
            threads.append(t)
        else:
            processKernelSourceChunk(*args)
        kiStart += workPerCpu

    for t in threads:
        t.join()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not globalParameters["MergeFiles"]:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string, so it must be formatted with %s
            print("*** warning: invalid kernel#%s" % kernelName)
        if not globalParameters["MergeFiles"]:
            kernelSourceFile.close()
        # write kernel.h
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if globalParameters["MergeFiles"]:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if globalParameters["MergeFiles"]:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")
    for solution in solutions:
        # get solution name
        if not globalParameters["MergeFiles"]:
            solutionFileName = solutionWriter.getSolutionName(solution)

        # write solution.cpp
        if not globalParameters["MergeFiles"]:
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
        if not globalParameters["MergeFiles"]:
            solutionSourceFile.close()

        # write solution.h
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()
    # close merged: the per-solution files were already closed inside the
    # loop, only the merged pair is still open here
    if globalParameters["MergeFiles"]:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
Exemple #16
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution source/header files below outputPath.

    With globalParameters["MergeFiles"] set, everything goes into Kernels.cpp,
    Kernels.h, Solutions.cpp and Solutions.h directly inside outputPath;
    otherwise one .cpp/.h pair per kernel and per solution is written into
    the "Kernels" and "Solutions" sub-directories.

    Args:
        outputPath: directory that receives the generated files.
        solutions: solutions to emit.
        kernels: kernels to emit; each is indexed with "KernelLanguage" to
            choose between the source and the assembly writer.
        kernelsBetaOnly: beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: writer used when "KernelLanguage" is "Source" and
            for the beta-only kernels.
        kernelWriterAssembly: writer used for every other kernel language.

    Calls printExit at the end when globalParameters["ExitAfterKernelGen"]
    is set.
    """
    print1("# Writing Kernels")
    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if globalParameters["MergeFiles"]:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for ki, kernel in enumerate(kernels):
        kernelWriter = kernelWriterSource \
            if kernel["KernelLanguage"] == "Source" else kernelWriterAssembly
        # get kernel name
        if not globalParameters["MergeFiles"]:
            kernelName = kernelWriter.getKernelName(kernel)

        # write kernel.cpp
        if not globalParameters["MergeFiles"]:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileString(kernel)
        kernelSourceFile.write(src)
        if err:
            print("*** warning: invalid kernel#%u" % ki)

        if not globalParameters["MergeFiles"]:
            kernelSourceFile.close()

        # write kernel.h
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not globalParameters["MergeFiles"]:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # Report the kernel actually being written. The index of the
            # previous loop is stale here, and unbound when kernels is empty.
            print("*** warning: invalid kernel#%s" % kernelName)
        if not globalParameters["MergeFiles"]:
            kernelSourceFile.close()
        # write kernel.h
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if globalParameters["MergeFiles"]:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if globalParameters["MergeFiles"]:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
    for solution in solutions:
        # get solution name
        if not globalParameters["MergeFiles"]:
            solutionFileName = solutionWriter.getSolutionName(solution)

        # write solution.cpp
        if not globalParameters["MergeFiles"]:
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution))
        if not globalParameters["MergeFiles"]:
            solutionSourceFile.close()

        # write solution.h
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()
    # close merged: the per-solution files were already closed inside the
    # loop, only the merged pair is still open here
    if globalParameters["MergeFiles"]:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")