예제 #1
0
 def addResults( self, hardcodedParameterList, benchmarkPermutations, \
     solutions, results):
     """Store the best-scoring benchmark parameters for each hardcoded entry.

     For every index i of ``results``:
       * ``results[i]`` is iterated as one sequence of scores per benchmarked
         solution; the solution whose maximum score is highest wins (the
         first one wins on ties).
       * the winning solution is ``solutions[i][winnerIdx]``; the values it
         holds for the parameter names of ``benchmarkPermutations[0]`` are
         merged into the ``self.winners`` entry matched by
         ``hardcodedParameterList[i]``, and that entry's score is replaced.

     ``printExit`` is called when the lookup in ``self.winners`` does not
     return exactly one match.
     """
     showProgress = globalParameters["PrintLevel"] >= 1
     if showProgress:
         print1("# Adding Results to Solution Database")
         progressBar = ProgressBar(len(results))

     for hardcodedIdx, scoresPerSolution in enumerate(results):
         hardcodedParameters = hardcodedParameterList[hardcodedIdx]

         # Pick the fastest solution regardless of problem size.  The floor
         # is -9999 rather than -1 because -1 is the score of an invalid
         # result and must still be able to "win".
         bestIdx = -1
         bestScore = -9999
         for candidateIdx, candidateScores in enumerate(scoresPerSolution):
             peakScore = max(candidateScores)
             if peakScore > bestScore:
                 bestScore, bestIdx = peakScore, candidateIdx

         # Keep only the parameters that were actually being benchmarked.
         bestSolution = solutions[hardcodedIdx][bestIdx]
         bestParameters = {
             name: bestSolution[name] for name in benchmarkPermutations[0]
         }

         matches = WinningParameterDict.get(hardcodedParameters,
                                            self.winners)
         if len(matches) != 1:
             printExit("Didn't find exactly 1 match")

         # matches[0][0] is the key of the matched entry in self.winners;
         # the entry itself is a [winningParameters, score] pair.
         winnerEntry = self.winners[matches[0][0]]
         winnerEntry[0].update(bestParameters)
         winnerEntry[1] = bestScore

         if showProgress:
             progressBar.increment()
예제 #2
0
    def wpdUpdate(self, newHardcodedParameterList):
        """Rebuild ``self.winners`` for a new list of hardcoded parameters.

        * If there are no previous winners, every new entry is registered
          with an empty winning-parameter dict and a score of -1.
        * Otherwise each new entry is looked up in the previous winners:
            - exactly one match: its hardcoded parameters are merged into
              the new entry and its winning parameters / score carried over;
            - several matches (a join): the match with the highest score
              above -1 is carried over in the same way;
            - no match: the entry is not carried over at all.

        The dicts inside ``newHardcodedParameterList`` are updated in place.

        Returns the list of ``.parameters`` of every key now stored in
        ``self.winners``.
        """
        previousWinners = self.winners
        self.winners = {}

        if not previousWinners:
            # First call: nothing to carry over, seed with placeholders.
            for candidate in newHardcodedParameterList:
                self.winners[FrozenDictionary(candidate)] = [{}, -1]
        else:
            showProgress = globalParameters["PrintLevel"] >= 1
            if showProgress:
                print1("# Updating Solution Database")
                progressBar = ProgressBar(len(newHardcodedParameterList))

            for candidate in newHardcodedParameterList:
                # Each match is indexed as
                # (frozen hardcoded key, winning parameters, score).
                matches = WinningParameterDict.get(candidate, previousWinners)
                matchCount = len(matches)

                if matchCount == 1:
                    # Plain update: inherit the single previous winner.
                    onlyMatch = matches[0]
                    keptParameters = onlyMatch[1]
                    keptScore = onlyMatch[2]
                    candidate.update(onlyMatch[0].parameters)
                    self.winners[FrozenDictionary(candidate)] = \
                        [ keptParameters, keptScore ]
                elif matchCount > 1:
                    # Join: inherit from the fastest of the previous winners.
                    topScore = -1
                    topHardcoded = {}
                    topWinning = {}
                    for entry in matches:
                        entryScore = entry[2]
                        if entryScore > topScore:
                            topScore = entryScore
                            topWinning = entry[1]
                            topHardcoded = entry[0].parameters
                    candidate.update(topHardcoded)
                    self.winners[FrozenDictionary(candidate)] = \
                        [ topWinning, topScore ]

                if showProgress:
                    progressBar.increment()

        # Hand back the resulting hardcoded parameter list.
        return [frozenKey.parameters for frozenKey in self.winners]
예제 #3
0
def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \
    problemSizeGroupIdx ):
    """Run every benchmark step of one problem type / problem-size group.

    A BenchmarkProcess is built from the two configs; for each of its steps
    this function enumerates the candidate solutions (hardcoded parameters
    x benchmark permutations), writes the benchmark sources, runs the
    benchmark script unless a results CSV already exists (and
    "ForceRedoBenchmarkProblems" is not set), feeds the results into the
    winners database and writes the solutions YAML.

    Parameters:
      problemTypeConfig, problemSizeGroupConfig: passed unchanged to
        BenchmarkProcess.
      problemSizeGroupIdx: integer used as the two-digit suffix of the
        working directory name.

    Returns:
      (resultsFileBaseFinal, benchmarkTestFails) where resultsFileBaseFinal
      is the extension-less results path of the step for which
      ``isFinal()`` was true (None if there was none) and benchmarkTestFails
      counts the benchmark processes that exited with a non-zero code.

    ``printExit`` is called when a step ends up with zero valid solutions.
    """

    benchmarkTestFails = 0

    # convert config to full benchmark process (resolves defaults)
    print1("")
    print1(HR)
    print1("# Converting Config to BenchmarkProcess Object")
    print1(HR)
    print1("")
    benchmarkProcess = BenchmarkProcess( problemTypeConfig, \
        problemSizeGroupConfig )

    problemTypeName = str(benchmarkProcess.problemType)
    problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx)
    pushWorkingPath(problemSizeGroupName)
    ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))

    totalBenchmarkSteps = len(benchmarkProcess)
    resultsFileBaseFinal = None
    winners = WinningParameterDict()
    print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
    print1("")
    print1(HR)
    print1("# Done Creating BenchmarkProcess Object")
    print1(HR)

    ##############################################################################
    # For Each Benchmark Step
    ##############################################################################
    for benchmarkStepIdx in range(0, totalBenchmarkSteps):

        benchmarkStep = benchmarkProcess[benchmarkStepIdx]
        if winners.winners == {}:
            # perf optimization to skip the initial winners creation
            # this helps a little here but really helps below with avoiding the super-expensive
            # removeHardcoded step below - that can use a fast-path to create
            # winners when needed.
            print1(
                "# Empty winners - use fast initialization of hardcodedParameters"
            )
            resultingHardcodedParameterList = benchmarkStep.hardcodedParameters
        else:
            resultingHardcodedParameterList = \
                winners.wpdUpdate( benchmarkStep.hardcodedParameters )

        benchmarkStep.hardcodedParameters = resultingHardcodedParameterList
        numHardcoded = len(benchmarkStep.hardcodedParameters)
        stepName = str(benchmarkStep)
        shortName = benchmarkStep.abbreviation()
        print1("\n")
        print1(HR)
        currentTime = time.time()
        # startTime is not defined in this function; it is expected to be
        # available from the enclosing module.
        elapsedTime = currentTime - startTime
        print1("# BenchmarkStep: %s - %s %.3fs" %
               (problemSizeGroupName, stepName, elapsedTime))
        print1("# NumProblems: %u" %
               benchmarkStep.problemSizes.totalProblemSizes)
        print1("# BenchmarkParameters:")
        for paramName in benchmarkStep.benchmarkParameters:
            paramValues = benchmarkStep.benchmarkParameters[paramName]
            printStr = "#     %s = { %s" % (paramName, paramValues[0])
            for paramValueIdx in range(1, len(paramValues)):
                printStr += ", %s" % str(paramValues[paramValueIdx])
            printStr += " }"
            print1(printStr)

        # Debug dump, deliberately disabled; flip the condition to enable.
        if False:
            # print1(hardcoded parameters and their winners
            print1("# HardcodedParameters | WinningParameters:")
            paramDictIdx = 0
            hardcodedMinNaming = \
                Solution.getMinNaming(benchmarkStep.hardcodedParameters)
            for paramDict in benchmarkStep.hardcodedParameters:
                winningParameters = winners[paramDict]
                print1("#    (%u) %s | %s" % (paramDictIdx, \
                    Solution.getNameMin(paramDict, hardcodedMinNaming), \
                    Solution.getNameFull(winningParameters) ))
                paramDictIdx += 1
        pushWorkingPath(shortName)

        ############################################################################
        # Copy Files to Benchmark Source Directory
        ############################################################################
        # Files are staged in "sourceTmp" and only the ones that differ are
        # copied into "source" further below.
        stepBaseDir = globalParameters["WorkingPath"]
        sourceDir = \
          os.path.join(stepBaseDir, "source" )
        ensurePath(sourceDir)
        pushWorkingPath("sourceTmp")
        filesToCopy = [
            "SolutionMapper.h",
            "Client.cpp",
            "Client.h",
            "CMakeLists.txt",
            "DeviceStats.h",
            "TensorUtils.h",
            "MathTemplates.cpp",
            "MathTemplates.h",
            "TensileTypes.h",
            "tensile_bfloat16.h",
            "KernelHeader.h",
            "ReferenceCPU.h",
            "SolutionHelper.cpp",
            "SolutionHelper.h",
            "Tools.cpp",
            "Tools.h",
        ]

        for f in filesToCopy:
            shutil_copy(os.path.join(globalParameters["SourcePath"], f),
                        globalParameters["WorkingPath"])
        if globalParameters["RuntimeLanguage"] == "OCL":
            shutil_copy(
                os.path.join(globalParameters["SourcePath"],
                             "FindOpenCL.cmake"),
                globalParameters["WorkingPath"])
        else:
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"),
                globalParameters["WorkingPath"])
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"),
                globalParameters["WorkingPath"])

        ############################################################################
        # Enumerate Benchmark Permutations
        ############################################################################
        solutions = []
        totalBenchmarkPermutations = 1
        for benchmarkParamName in benchmarkStep.benchmarkParameters:
            totalBenchmarkPermutations *= len(
                benchmarkStep.benchmarkParameters[benchmarkParamName])
        maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded
        print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)" % \
            (maxPossibleSolutions, numHardcoded, totalBenchmarkPermutations))

        # Decode permutation number i as a mixed-radix number: each
        # parameter contributes one "digit" whose base is its value count.
        benchmarkPermutations = []
        for i in range(0, totalBenchmarkPermutations):
            permutation = {}
            pIdx = i
            for benchmarkParamName in benchmarkStep.benchmarkParameters:
                benchmarkParamValues = deepcopy( \
                    benchmarkStep.benchmarkParameters[benchmarkParamName])
                valueIdx = pIdx % len(benchmarkParamValues)
                permutation[benchmarkParamName] = benchmarkParamValues[
                    valueIdx]
                # Floor division keeps pIdx an integer under true-division
                # semantics as well, so valueIdx stays a valid list index.
                pIdx //= len(benchmarkParamValues)
            benchmarkPermutations.append(permutation)

        ############################################################################
        # Enumerate Solutions = Hardcoded * Benchmark
        ############################################################################
        print1("# Enumerating Solutions")
        if globalParameters["PrintLevel"] >= 1:
            progressBar = ProgressBar(maxPossibleSolutions)
        solutionSet = set()  # avoid duplicates for nlca=-1, 1
        for hardcodedIdx in range(0, numHardcoded):
            solutions.append([])
            hardcodedParamDict = benchmarkStep.hardcodedParameters[
                hardcodedIdx]
            for benchmarkIdx in range(0, len(benchmarkPermutations)):
                benchmarkPermutation = benchmarkPermutations[benchmarkIdx]
                # Later updates override earlier ones: benchmark permutation,
                # then hardcoded parameters, then previously found winners.
                solution = {
                    "ProblemType": deepcopy(benchmarkProcess.problemType.state)
                }
                solution.update(benchmarkPermutation)
                solution.update(hardcodedParamDict)
                if benchmarkStepIdx > 0:
                    winningParameters = winners[hardcodedParamDict]
                    if winningParameters is None:
                        # this is a joined parameter that didn't have a winner, that's okay
                        continue
                    solution.update(winningParameters)

                # append default parameters where necessary
                for initialSolutionParameterName in benchmarkStep.initialSolutionParameters:
                    if initialSolutionParameterName not in solution:
                        solution[initialSolutionParameterName] = \
                            benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
                # TODO check if solution matches problem size for exact tile kernels
                solutionObject = Solution(solution)
                if solutionObject["Valid"]:
                    if solutionObject not in solutionSet:
                        solutionSet.add(solutionObject)
                        solutions[hardcodedIdx].append(solutionObject)
                else:
                    if globalParameters["PrintSolutionRejectionReason"]:
                        print1("rejecting solution %s" % str(solutionObject))
                if globalParameters["PrintLevel"] >= 1:
                    progressBar.increment()

        # remove hardcoded that don't have any valid benchmarks
        removeHardcoded = []
        for hardcodedIdx in range(0, numHardcoded):
            if len(solutions[hardcodedIdx]) == 0:
                hardcodedParamDict = benchmarkStep.hardcodedParameters[
                    hardcodedIdx]
                removeHardcoded.append(hardcodedParamDict)
        removesExist = len(removeHardcoded) > 0
        for hardcodedParam in removeHardcoded:
            benchmarkStep.hardcodedParameters.remove(hardcodedParam)

        if removesExist:
            print1(
                "# Updating winners since enumeration removed unused hardcoded solutions.  removeHardcoded=%u winners=%u"
                % (len(removeHardcoded), len(winners.winners)))
            winners.wpdUpdate(benchmarkStep.hardcodedParameters)
            if globalParameters["PrintLevel"] >= 1:
                print1("")
            numHardcoded = len(benchmarkStep.hardcodedParameters)
            # remove from solution 2D list also
            # (iterate over a copy because the list is modified in the loop)
            for solutionList in shallowcopy(solutions):
                if len(solutionList) == 0:
                    solutions.remove(solutionList)
        elif winners.winners == {}:
            print1("# Populating initial winners (%u solutions)\n" %
                   len(benchmarkStep.hardcodedParameters))
            for hcParm in benchmarkStep.hardcodedParameters:
                winners.winners[FrozenDictionary(hcParm)] = [{}, -1]

        # NOTE(review): len(solutions) is the number of hardcoded groups, not
        # the number of individual solutions - confirm this is the intended
        # figure for "Actual Solutions".
        print1("# Actual Solutions: %u / %u\n" % ( len(solutions), \
            maxPossibleSolutions ))

        # create linear list
        solutionList = []
        for i in range(0, len(solutions)):
            solutionsForHardcoded = solutions[i]
            for j in range(0, len(solutionsForHardcoded)):
                solution = solutionsForHardcoded[j]
                solutionList.append(solution)
        if len(solutionList) == 0:
            msg = "Your parameters resulted in 0 valid solutions."
            if globalParameters["PrintSolutionRejectionReason"]:
                msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected."
            else:
                msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected."
            printExit(msg)
        if globalParameters["PrintLevel"] >= 1:
            for i in range(0, len(solutions)):
                solutionsForHardcoded = solutions[i]
                for j in range(0, len(solutionsForHardcoded)):
                    solution = solutionsForHardcoded[j]
                    print2("#    (%u:%u) %s" % (i, j, \
                        Solution.getNameFull(solution) ))
            print2(HR)

        # write benchmarkFiles
        writeBenchmarkFiles(stepBaseDir, solutionList, benchmarkStep.problemSizes, \
            shortName, filesToCopy)

        print1("# Copying files that differ from sourceTmp -> source")
        sourceTmp = globalParameters["WorkingPath"]
        files = os.listdir(sourceTmp)
        for f in files:
            f0 = os.path.join(sourceTmp, f)
            f1 = os.path.join(sourceDir, f)
            if os.path.isdir(f0):
                # directories are always replaced wholesale
                if os.path.isdir(f1):
                    shutil.rmtree(f1, True)
                shutil.copytree(f0, f1)
            elif not os.path.exists(f1) or not filecmp.cmp(f0, f1):
                # plain files are copied only when missing or different
                shutil.copy(f0, f1)
        shutil.rmtree(sourceTmp, True)

        popWorkingPath()  # source

        ############################################################################
        # Run Benchmark Script
        ############################################################################
        resultsFileBase = os.path.normpath(os.path.join( \
            globalParameters["WorkingPath"], "../Data", shortName))
        if benchmarkStep.isFinal():
            resultsFileBaseFinal = resultsFileBase
        resultsFileName = resultsFileBase + ".csv"
        solutionsFileName = resultsFileBase + ".yaml"
        if not os.path.exists(resultsFileName) or \
            globalParameters["ForceRedoBenchmarkProblems"]:
            pushWorkingPath("build")

            # write runScript
            libraryLogicPath = None
            path = globalParameters["WorkingPath"]
            forBenchmark = True
            runScriptName = writeRunScript(path, libraryLogicPath,
                                           forBenchmark)

            # run runScript
            process = Popen(runScriptName, cwd=globalParameters["WorkingPath"])
            process.communicate()
            if process.returncode:
                # a failing benchmark is counted and reported, not fatal
                benchmarkTestFails += 1
                printWarning(
                    "BenchmarkProblems: Benchmark Process exited with code %u"
                    % process.returncode)
            popWorkingPath()  # build
        else:
            print1("# Already benchmarked; skipping.")

        ############################################################################
        # Winners -> Determined Parameters
        ############################################################################
        results = getResults(resultsFileName, solutions)
        print2("CSV Results: %s" % results)
        winners.addResults(benchmarkStep.hardcodedParameters, \
            benchmarkPermutations, solutions, results)

        ############################################################################
        # Write Solutions YAML
        ############################################################################
        YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes, \
            solutions )

        # End Iteration
        popWorkingPath()  # stepName
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("%s\n# %s\n# %s: End - %.3fs\n%s\n" \
            % (HR, problemSizeGroupName, shortName, elapsedTime, HR))

    popWorkingPath()  # ProblemType
    return (resultsFileBaseFinal, benchmarkTestFails)
예제 #4
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write kernel and solution source/header files below ``outputPath``.

    With globalParameters["MergeFiles"] set, everything goes into
    Kernels.cpp / Kernels.h / Solutions.cpp / Solutions.h directly inside
    ``outputPath``; otherwise one .cpp/.h pair per kernel is written into
    "Kernels/" and one pair per solution into "Solutions/".

    Kernel sources are produced by ``processKernelSourceChunk``, run in
    ``multiprocessing`` worker processes over chunks of ``kernels`` when
    globalParameters["CpuThreads"] is non-zero.  Kernels that report an
    error are collected by name and passed on to
    ``solutionWriter.getSourceFileString``.

    Parameters:
      outputPath: directory receiving the generated files.
      solutions: iterable of solutions handed to ``solutionWriter``.
      kernels: sequence of kernels, processed in index chunks.
      kernelsBetaOnly: iterable of beta-only kernels, written through
        ``kernelWriterSource``.
      solutionWriter, kernelWriterSource, kernelWriterAssembly: writer
        objects; the two kernel writers are forwarded to
        ``processKernelSourceChunk``.

    ``printExit`` is called when a worker process exits with a non-zero
    code, and at the end when globalParameters["ExitAfterKernelGen"] is set.
    """
    start = time.time()
    print1("# Writing Kernels...")
    mergeFiles = globalParameters["MergeFiles"]
    showProgress = globalParameters["ShowProgressBar"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if showProgress:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
            kernelHeaderFile.write("\n\n")
            kernelHeaderFile.write(
                "__device__ inline int GenDot4(int a, int b, int c) { \n")
            kernelHeaderFile.write(
                "  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
            kernelHeaderFile.write(
                "  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
            kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")
            kernelHeaderFile.write(
                "  return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n"
            )
            kernelHeaderFile.write("\n\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    prepAsm()

    # Decide how many worker processes to use (0 = run in this process).
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count*4 if globalParameters["CpuThreads"] == -1 \
               else globalParameters["CpuThreads"]
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # Ceiling division, at least 10 kernels per worker; floor division keeps
    # the chunk size an integer regardless of division semantics.
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (
        cpus, workPerCpu))

    kiStart = 0
    threads = []
    if cpus and showProgress:
        processLaunchProgressBar = ProgressBar(len(kernels))
    else:
        processLaunchProgressBar = None
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        if cpus:
            parentConn, child = multiprocessing.Pipe()
            args=(kernels, kernelWriterSource, kernelWriterAssembly, \
                  kiStart, kiStop, child)
            t = multiprocessing.Process(target=processKernelSourceChunk,
                                        args=args)
            t.start()
            child.close()  # close child pipe in the parent process
            threads.append([t, kiStart, kiStop, parentConn])
            if processLaunchProgressBar:
                processLaunchProgressBar.increment(kiStop - kiStart)
            else:
                sys.stderr.write(
                    "  # launched process %s for kernels %d..%d\n" %
                    (t, kiStart, kiStop - 1))

        else:  # non-threaded version
            # NOTE(review): the return value is ignored here and no entry is
            # added to `threads`, so the write loop below never sees these
            # kernels - confirm processKernelSourceChunk handles this case.
            processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly, \
                                     kiStart, kiStop, None)
        kiStart += workPerCpu
    sys.stderr.write("# Waiting for kernel compilation processes...\n")

    someError = 0
    for (t, kiStart, kiStop, parentConn) in threads:
        try:
            results = parentConn.recv()
        except EOFError as pipeErr:
            print("*** warning: process %s returned pipe EOF %s %s" %
                  (t, t, pipeErr))
            # Nothing was received: never fall through with an undefined
            # name or with the previous worker's results.
            results = []

        t.join()
        e = t.exitcode
        if e != 0:
            print("*** warning: process %s returned %s %s" % (t, t, e))
            someError = 1
            results = []

        if showProgress:
            progressBar.increment(kiStop - kiStart)
        for (err, src, header, kernelName) in results:
            if err:
                kernelsWithBuildErrs[kernelName] = err

            if mergeFiles:
                kernelSourceFile.write(src)
                kernelHeaderFile.write(header)
            else:
                # write kernel.cpp
                with open(os.path.join(outputPath, \
                    "Kernels", kernelName+".cpp"), "w") as perKernelSource:
                    perKernelSource.write(CHeader)
                    perKernelSource.write(src)
                # write kernel.h
                with open(os.path.join(outputPath, \
                    "Kernels", kernelName+".h"), "w") as perKernelHeader:
                    perKernelHeader.write(CHeader)
                    perKernelHeader.write(header)

    if someError:
        print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
        printExit("** kernel compilation failure **")

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        if err:
            # kernelName is used as a string in the file paths below, so it
            # is formatted with %s.
            print("*** warning: invalid kernel#%s" % kernelName)
        header = kernelWriter.getHeaderFileStringBetaOnly(kernel)

        if mergeFiles:
            kernelSourceFile.write(src)
            kernelHeaderFile.write(header)
        else:
            # write kernel.cpp
            with open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w") as perKernelSource:
                perKernelSource.write(CHeader)
                perKernelSource.write(src)
            # write kernel.h
            with open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w") as perKernelHeader:
                perKernelHeader.write(CHeader)
                perKernelHeader.write(header)

    # close merged (both files, so buffered output reaches disk)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    stop = time.time()
    print("# Kernel Building elapsed time = %.1f secs" % (stop - start))

    print1("# Writing Solutions")
    if showProgress:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionSourceFile.write("#include <algorithm>\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")
    for solution in solutions:
        # get solution name (only needed for per-solution files)
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)

        sourceString = solutionWriter.getSourceFileString(
            solution, kernelsWithBuildErrs)
        headerString = solutionWriter.getHeaderFileString(solution)

        if mergeFiles:
            solutionSourceFile.write(sourceString)
            solutionHeaderFile.write(headerString)
        else:
            # write solution.cpp
            with open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w") as perSolutionSource:
                perSolutionSource.write(CHeader)
                perSolutionSource.write(sourceString)
            # write solution.h
            with open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w") as perSolutionHeader:
                perSolutionHeader.write(CHeader)
                perSolutionHeader.write(headerString)
        if showProgress:
            progressBar.increment()
    # close merged; per-solution files are already closed by their `with`
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
예제 #5
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write the solution and kernel source/header files under outputPath.

    When globalParameters["MergeFiles"] is set, all solutions are written to
    Solutions.cpp/Solutions.h and all kernels to Kernels.cpp/Kernels.h
    directly in outputPath. Otherwise every solution and every kernel gets
    its own .cpp/.h pair in the "Solutions" and "Kernels" sub-directories.

    Args:
        outputPath: directory that receives the generated files.
        solutions: sized collection of solutions passed to solutionWriter.
        kernels: sized collection of tensor contraction kernels; each one is
            indexed with "KernelLanguage" to choose the writer.
        kernelsBetaOnly: beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: object providing getSolutionName,
            getSourceFileString and getHeaderFileString.
        kernelWriterSource: writer used when KernelLanguage is "Source".
        kernelWriterAssembly: writer used for every other KernelLanguage.
    """
    print1("# Writing Solutions and Kernels")
    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    # one tick per solution and per tensor contraction kernel
    progressBar = ProgressBar(len(solutions) + len(kernels))

    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath,
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath,
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
    for solution in solutions:
        # write solution.cpp (a fresh file per solution unless merging)
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)
            solutionSourceFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write(
            solutionWriter.getSourceFileString(solution))
        if not mergeFiles:
            solutionSourceFile.close()

        # write solution.h
        if not mergeFiles:
            solutionHeaderFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write(
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()
        progressBar.increment()
    # close merged: per-solution files were already closed inside the loop,
    # so only the merged pair is still open here
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath,
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath,
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for kernel in kernels:
        if kernel["KernelLanguage"] == "Source":
            kernelWriter = kernelWriterSource
        else:
            kernelWriter = kernelWriterAssembly

        # write kernel.cpp
        if not mergeFiles:
            kernelName = kernelWriter.getKernelName(kernel)
            kernelSourceFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        kernelSourceFile.write(kernelWriter.getSourceFileString(kernel))
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()
        progressBar.increment()

    # beta-only kernels (not counted in the progress bar)
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        kernelSourceFile.write(
            kernelWriter.getSourceFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath,
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()
예제 #6
0
def writeSolutionsAndKernels(outputPath, problemTypes, solutions, kernels, kernelsBetaOnly,
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
  """Generate kernel files (optionally in subprocesses), then solution files.

  Kernels are split into chunks that are handed to processKernelSourceChunk,
  either inline (CpuThreads == 0) or in multiprocessing.Process workers whose
  results come back over a pipe and are written with processResults. After
  the kernels and beta-only kernels, a solution-pointer typedef per problem
  type and every solution's source/header text are written.

  When globalParameters["MergeFiles"] is set, output goes to Kernels.cpp/.h
  and Solutions.cpp/.h directly in outputPath; the per-item branches write
  one .cpp/.h pair per item into the "Kernels" and "Solutions"
  sub-directories.

  NOTE(review): with MergeFiles off, kernelSourceFile/kernelHeaderFile are
  not assigned before they are passed to processResults, and
  solutionHeaderFile is not assigned before the typedef text is written, so
  that configuration raises NameError as written - confirm whether
  non-merged output is still meant to be supported here.

  Args:
    outputPath: directory that receives the generated files.
    problemTypes: problem types for which a TensileSolutionPointer typedef
      is emitted.
    solutions: sized collection of solutions; each is indexed with
      "ProblemType".
    kernels: sized collection of kernels handed to processKernelSourceChunk.
    kernelsBetaOnly: beta-only kernels, written with kernelWriterSource.
    solutionWriter: provides getArgList, getSolutionName,
      getProblemSourceString and getHeaderFileString.
    kernelWriterSource: forwarded to processKernelSourceChunk; also writes
      the beta-only kernels.
    kernelWriterAssembly: forwarded to processKernelSourceChunk.
  """
  start = time.time()
  print1("# Writing Kernels...")
  mergeFiles = globalParameters["MergeFiles"]
  if not mergeFiles:
    ensurePath(os.path.join(outputPath, "Solutions"))
    ensurePath(os.path.join(outputPath, "Kernels"))

  if globalParameters["ShowProgressBar"]:
    progressBar = ProgressBar(len(kernels))

  ##############################################################################
  # Write Kernels
  ##############################################################################
  if mergeFiles:
    kernelSourceFile = open(os.path.join(outputPath,
        "Kernels.cpp"), "w")
    kernelHeaderFile = open(os.path.join(outputPath,
        "Kernels.h"), "w")
    kernelSourceFile.write(CHeader)
    kernelHeaderFile.write(CHeader)
    kernelSourceFile.write("#include \"Kernels.h\"\n")
    kernelHeaderFile.write("#pragma once\n")
    if globalParameters["RuntimeLanguage"] == "HIP":
      kernelHeaderFile.write("// Also set env var HCC_ENABLE_PRINTF=1 for printf\n")
      kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
      kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
      kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
      kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
      kernelHeaderFile.write("\n\n")
      # GenDot4 helper emitted into the header; two variants are selected by
      # the preprocessor condition written below
      kernelHeaderFile.write("__device__ inline int GenDot4(int a, int b, int c) { \n")
      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("  typedef union { int32_t i; char4 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
      kernelHeaderFile.write("  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")

      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("      return amd_mixed_dot(va.z, vb.z, c, true); }\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("      return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("\n\n")
    else:
      kernelHeaderFile.write("#include <string>\n")

  # filled by processResults, later handed to the solution writer
  kernelsWithBuildErrs = {}

  prepAsm()

  # cpus == 0 means "run inline"; otherwise it is the number of chunks the
  # kernel list is divided into
  if globalParameters["CpuThreads"] == 0:
    cpus = 0
  elif globalParameters["CodeFromFiles"]:
    cpu_count = multiprocessing.cpu_count()
    cpuThreads = globalParameters["CpuThreads"]
    # a negative CpuThreads is a multiplier on the detected core count
    cpus = cpu_count*abs(cpuThreads) if cpuThreads < 0 \
           else min(cpu_count, cpuThreads)
  else:
    # without CodeFromFiles the code is merged into the same file, which is
    # not thread-safe
    cpus = 1

  # at least 10 kernels per worker; floor division keeps this an int
  workPerCpu = max(10, (len(kernels)+cpus-1)//cpus) if cpus else 1

  kiStart = 0
  threads = []
  if cpus:
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (cpus, workPerCpu))
    processLaunchProgressBar = ProgressBar(len(kernels)) \
        if globalParameters["ShowProgressBar"] else None
  else:
    print("# Compiling kernels (no multiprocessing, cpus=%u #kernels=%u)" % (cpus, len(kernels)))
    processLaunchProgressBar = None
  while kiStart < len(kernels):
    kiStop = min(len(kernels), kiStart + workPerCpu)
    if cpus:
      parentConn, childConn = multiprocessing.Pipe()
      args = (kernels, kernelWriterSource, kernelWriterAssembly,
              kiStart, kiStop, childConn)
      t = multiprocessing.Process(target=processKernelSourceChunk, args=args)
      t.start()
      childConn.close() # close child pipe in the parent process
      threads.append([t, kiStart, kiStop, parentConn])
      if processLaunchProgressBar:
        processLaunchProgressBar.increment(kiStop-kiStart)
      else:
        sys.stderr.write("  # launched process %s for kernels %d..%d\n" %(t, kiStart, kiStop-1))

    else: # non-threaded version
      results = processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly,
                               kiStart, kiStop, None)
      if globalParameters["ShowProgressBar"]:
        progressBar.increment(kiStop-kiStart)
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

    kiStart += workPerCpu
  sys.stderr.write("# Waiting for kernel compilation processes...\n")

  someError = 0
  if cpus:
    for (t, chunkStart, chunkStop, parentConn) in threads:
      try:
        results = parentConn.recv()
      except EOFError as pipeErr:
        print("*** warning: process %s returned pipe EOF %s %s" % (t, t, pipeErr))
        # nothing was received; never reuse the previous chunk's results
        results = []

      t.join()
      e = t.exitcode
      if e != 0:
        print("*** warning: process %s returned %s %s" % (t, t, e))
        someError = 1
        results = []

      if globalParameters["ShowProgressBar"]:
        progressBar.increment(chunkStop-chunkStart)
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

  if someError:
    print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
    printExit("** kernel compilation failure **")


  # beta-only kernels
  for kernel in kernelsBetaOnly:
    kernelWriter = kernelWriterSource
    kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

    # write kernel.cpp
    if not mergeFiles:
      kernelSourceFile = open(os.path.join(outputPath,
          "Kernels", kernelName+".cpp"), "w")
      kernelSourceFile.write(CHeader)

    (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
    kernelSourceFile.write(src)
    if err:
      # kernelName is a string, so it must be formatted with %s
      print("*** warning: invalid kernel#%s" % kernelName)
    if not mergeFiles:
      kernelSourceFile.close()
    # write kernel.h
    if not mergeFiles:
      kernelHeaderFile = open(os.path.join(outputPath,
          "Kernels", kernelName + ".h"), "w")
      kernelHeaderFile.write(CHeader)
    kernelHeaderFile.write(kernelWriter.getHeaderFileStringBetaOnly(kernel))
    if not mergeFiles:
      kernelHeaderFile.close()

  # close merged (both the source and the header file)
  if mergeFiles:
    kernelSourceFile.close()
    kernelHeaderFile.close()

  stop = time.time()
  print("# Kernel Building elapsed time = %.1f secs" % (stop-start))

  print1("# Writing Solutions")
  if globalParameters["ShowProgressBar"]:
    progressBar = ProgressBar(len(solutions))
  ##############################################################################
  # Write Solutions
  ##############################################################################
  if mergeFiles:
    solutionSourceFile = open(os.path.join(outputPath,
        "Solutions.cpp"), "w")
    solutionHeaderFile = open(os.path.join(outputPath,
        "Solutions.h"), "w")
    solutionSourceFile.write(CHeader)
    solutionHeaderFile.write(CHeader)
    solutionSourceFile.write("#include \"Solutions.h\"\n")
    solutionSourceFile.write("#include <algorithm>\n")
    solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
    solutionHeaderFile.write("#include \"Kernels.h\"\n")
    solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
    solutionHeaderFile.write("#include \"Tools.h\"\n")
    if globalParameters["CodeFromFiles"]:
      solutionHeaderFile.write("#include <unistd.h>\n")


  # Write a solution pointer typedef for each problemType:
  h = ""
  for problemType in problemTypes:
    argListAll = solutionWriter.getArgList(problemType, True, True, True, True)
    # declare TensileSolutionPointer_ProblemType
    h += "\n// solution pointer\n"
    h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
        % problemType
    lastArg = len(argListAll)-1
    for i, arg in enumerate(argListAll):
      # every argument but the last ends with a comma; the last one closes
      # the parameter list
      h += "    %s %s%s" % (arg[0], arg[1],
          ",\n" if i < lastArg else ");\n\n")
    h += "\n"

  # NOTE(review): solutionHeaderFile is only assigned above when MergeFiles
  # is set (see the docstring note)
  solutionHeaderFile.write(h)

  for solution in solutions:
    # write solution.cpp (a fresh file per solution unless merging)
    if not mergeFiles:
      solutionFileName = solutionWriter.getSolutionName(solution)
      solutionSourceFile = open(os.path.join(outputPath,
          "Solutions", solutionFileName+".cpp"), "w")
      solutionSourceFile.write(CHeader)
    solutionSourceFile.write(
        solutionWriter.getProblemSourceString(solution["ProblemType"], solution, kernelsWithBuildErrs))
    if not mergeFiles:
      solutionSourceFile.close()

    # write solution.h
    if not mergeFiles:
      solutionHeaderFile = open(os.path.join(outputPath,
          "Solutions", solutionFileName+".h"), "w")
      solutionHeaderFile.write(CHeader)
    solutionHeaderFile.write(
        solutionWriter.getHeaderFileString(solution))
    if not mergeFiles:
      solutionHeaderFile.close()
    if globalParameters["ShowProgressBar"]:
      progressBar.increment()
  # close merged: per-solution files were already closed inside the loop,
  # so only the merged pair is still open here
  if mergeFiles:
    solutionSourceFile.close()
    solutionHeaderFile.close()

  if globalParameters["ExitAfterKernelGen"]:
    printExit("** Exiting after kernel generation due to ExitAfterKernelGen=1")
예제 #7
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Generate kernel files (optionally in threads), then solution files.

    Kernels are split into chunks handed to processKernelSourceChunk, either
    inline (CpuThreads == 0) or in threading.Thread workers that are all
    joined before the beta-only kernels and the solutions are written.

    When globalParameters["MergeFiles"] is set, output goes to
    Kernels.cpp/.h and Solutions.cpp/.h directly in outputPath; the per-item
    branches write one .cpp/.h pair per item into the "Kernels" and
    "Solutions" sub-directories.

    NOTE(review): the threaded call passes progressBar to
    processKernelSourceChunk while the inline call does not, so the two
    calls disagree on the argument count - confirm against the callee's
    signature. Also, progressBar is only assigned when ShowProgressBar is
    set, and kernelSourceFile/kernelHeaderFile only when MergeFiles is set,
    yet all three are passed unconditionally.

    Args:
        outputPath: directory that receives the generated files.
        solutions: sized collection of solutions passed to solutionWriter.
        kernels: sized collection of kernels handed to
            processKernelSourceChunk.
        kernelsBetaOnly: beta-only kernels, written with kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: forwarded to processKernelSourceChunk; also
            writes the beta-only kernels.
        kernelWriterAssembly: forwarded to processKernelSourceChunk.
    """
    print1("# Writing Kernels")
    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath,
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath,
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # handed to processKernelSourceChunk and later to the solution writer
    kernelsWithBuildErrs = {}

    # tensor contraction kernels - dispatch as multiple threads:
    kLock = threading.Lock()
    pLock = threading.Lock()

    prepAsm()

    # cpus == 0 means "run inline"; otherwise it is the number of chunks the
    # kernel list is divided into
    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        # CpuThreads == -1 selects every detected core
        cpus = cpu_count if globalParameters["CpuThreads"] == -1 \
               else min(cpu_count, globalParameters["CpuThreads"])
    else:
        # without CodeFromFiles the code is merged into the same file, which
        # is not thread-safe
        cpus = 1

    # at least 10 kernels per worker; floor division keeps this an int
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("info: cpus=%u kernelsPerCpu=%u" % (cpus, workPerCpu))

    kiStart = 0
    threads = []
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)

        if cpus:
            args = (outputPath, kernels, kernelSourceFile, kernelHeaderFile,
                    kernelWriterSource, kernelWriterAssembly,
                    kernelsWithBuildErrs, progressBar, kLock, pLock, kiStart, kiStop)
            t = threading.Thread(target=processKernelSourceChunk, args=args)
            t.start()
            threads.append(t)
        else:
            # NOTE(review): no progressBar argument here, unlike the
            # threaded call above
            processKernelSourceChunk(outputPath, kernels, kernelSourceFile, kernelHeaderFile,
                                      kernelWriterSource, kernelWriterAssembly,
                                      kernelsWithBuildErrs, kLock, pLock, kiStart, kiStop)
        kiStart += workPerCpu

    # wait for every worker before touching the kernel files again
    for t in threads:
        t.join()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string, so it must be formatted with %s
            print("*** warning: invalid kernel#%s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()
        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath,
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath,
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath,
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")
    for solution in solutions:
        # write solution.cpp (a fresh file per solution unless merging)
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)
            solutionSourceFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write(
            solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
        if not mergeFiles:
            solutionSourceFile.close()

        # write solution.h
        if not mergeFiles:
            solutionHeaderFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write(
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()
    # close merged: per-solution files were already closed inside the loop,
    # so only the merged pair is still open here
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
예제 #8
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly,
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write the kernel files, then the solution files, under outputPath.

    When globalParameters["MergeFiles"] is set, all kernels are written to
    Kernels.cpp/Kernels.h and all solutions to Solutions.cpp/Solutions.h
    directly in outputPath. Otherwise every kernel and every solution gets
    its own .cpp/.h pair in the "Kernels" and "Solutions" sub-directories.
    A warning is printed for each kernel whose writer reports an error.

    Args:
        outputPath: directory that receives the generated files.
        solutions: sized collection of solutions passed to solutionWriter.
        kernels: sized, indexable collection of tensor contraction kernels;
            each one is indexed with "KernelLanguage" to choose the writer.
        kernelsBetaOnly: beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: writer used when KernelLanguage is "Source".
        kernelWriterAssembly: writer used for every other KernelLanguage.
    """
    print1("# Writing Kernels")
    mergeFiles = globalParameters["MergeFiles"]
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath,
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath,
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for ki in range(0, len(kernels)):
        kernel = kernels[ki]
        if kernel["KernelLanguage"] == "Source":
            kernelWriter = kernelWriterSource
        else:
            kernelWriter = kernelWriterAssembly

        # write kernel.cpp
        if not mergeFiles:
            kernelName = kernelWriter.getKernelName(kernel)
            kernelSourceFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileString(kernel)
        kernelSourceFile.write(src)
        if err:
            print("*** warning: invalid kernel#%u" % ki)

        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath,
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # identify the beta-only kernel by name; the index of the
            # previous loop does not refer to this kernel
            print("*** warning: invalid kernel#%s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()
        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath,
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath,
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath,
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
    for solution in solutions:
        # write solution.cpp (a fresh file per solution unless merging)
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)
            solutionSourceFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write(
            solutionWriter.getSourceFileString(solution))
        if not mergeFiles:
            solutionSourceFile.close()

        # write solution.h
        if not mergeFiles:
            solutionHeaderFile = open(os.path.join(outputPath,
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write(
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()
    # close merged: per-solution files were already closed inside the loop,
    # so only the merged pair is still open here
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")