Beispiel #1
0
 def __getitem__(self, hardcodedParameters):
     """Return the winning parameters stored for ``hardcodedParameters``.

     The lookup is delegated to ``WinningParameterDict.get`` against
     ``self.winners``.  Element 1 of the single match is returned, ``None``
     is returned when nothing matches, and ``printExit`` is called when
     more than one entry matches.
     """
     # each match is presumably (hardcodedParametersKey, winningParameters,
     # score) -- TODO confirm against WinningParameterDict.get
     found = WinningParameterDict.get(hardcodedParameters, self.winners)
     numFound = len(found)
     if numFound == 0:
         return None
     if numFound == 1:
         return found[0][1]
     printExit("Didn't find exactly 1 match")
Beispiel #2
0
def writeLibraryLogicForSchedule( filePath, schedulePrefix, architectureName, deviceNames, \
    logicTuple):
    """Write one schedule's library logic to a YAML file.

    The file is ``<filePath>/<schedulePrefix>_<problemType>.yaml`` and holds
    a list with, in order: the minimum required version, the schedule
    prefix, the architecture name, the device names, the problem type
    state, the solution states, the index order, the exact logic (as
    ``[list(key), value]`` pairs) and the range logic.

    Args:
        filePath: directory the file is written to.
        schedulePrefix: schedule name; also the file name prefix.
        architectureName: architecture name stored in the file.
        deviceNames: device names stored in the file.
        logicTuple: ``(problemType, solutions, indexOrder, exactLogic,
            rangeLogic)``.

    The problem type and solution objects are not modified: their ``state``
    mappings are shallow-copied before the ``DataType``/``DestDataType``
    objects are replaced by their ``.value``.  ``printExit`` is called on
    ``IOError``.
    """
    problemType = logicTuple[0]
    solutions = logicTuple[1]
    indexOrder = logicTuple[2]
    exactLogic = logicTuple[3]
    rangeLogic = logicTuple[4]
    filename = os.path.join(filePath, "%s_%s.yaml" \
        % (schedulePrefix, str(problemType)))
    print2("# writeLogic( %s )" % (filename))

    def serializableProblemType(state):
        # Copy first so the caller's state keeps its DataType objects; only
        # the copy that gets dumped carries the plain values.
        flat = dict(state)
        flat["DataType"] = flat["DataType"].value
        flat["DestDataType"] = flat["DestDataType"].value
        return flat

    data = []
    # Tensile version
    data.append({"MinimumRequiredVersion": __version__})
    # schedule name
    data.append(schedulePrefix)  # change from Tensile to vega10
    data.append(architectureName)
    # schedule device names
    data.append(deviceNames)
    # problem type
    data.append(serializableProblemType(problemType.state))
    # solutions
    solutionList = []
    for solution in solutions:
        solutionState = dict(solution.state)
        solutionState["ProblemType"] = \
            serializableProblemType(solutionState["ProblemType"].state)
        solutionList.append(solutionState)
    data.append(solutionList)
    # index order
    data.append(indexOrder)

    # exactLogic: keys are converted to lists for serialization
    data.append([[list(key), exactLogic[key]] for key in exactLogic])

    # rangeLogic
    data.append(rangeLogic)

    # open & write file; the with-block closes the stream even if the dump
    # fails part-way
    try:
        with open(filename, "w") as stream:
            yaml.dump(data, stream)
    except IOError:
        printExit("Cannot open file: %s" % filename)
Beispiel #3
0
 def __init__( self, value ):
   """Set ``self.value`` from an int, a name string or another DataType.

   * int: stored as-is.
   * string: compared case-insensitively against the first six entries of
     every row of ``self.properties``; the index of the matching row
     becomes the value.  Columns are searched in the outer loop, so a
     match in an earlier column wins.
   * DataType: its ``value`` is copied.

   ``printExit`` is called for any other type and for a string that
   matches no property (previously that case returned silently with
   ``self.value`` unset).
   """
   if isinstance(value, int):
     self.value = value
   elif isinstance(value, basestring):
     wanted = value.lower()
     for propertiesIdx in range(0,6):
       for dataTypeIdx in range(0,self.num):
         if wanted == self.properties[dataTypeIdx][propertiesIdx].lower():
           self.value = dataTypeIdx
           return
     # nothing matched: report it instead of leaving the object half-built
     printExit("initializing DataType to %s %s" % (str(type(value)), str(value)) )
   elif isinstance(value, DataType):
     self.value = value.value
   else:
     printExit("initializing DataType to %s %s" % (str(type(value)), str(value)) )
Beispiel #4
0
  def __init__(self, config):
    """Build the problem type state from ``config``.

    Every key of ``defaultProblemType`` is filled in through
    ``assignParameterWithDefault``, the "DataType" entry is wrapped in a
    ``DataType`` object, the operation-specific initializer is run for
    "GEMM" or "TensorContraction", and finally the derived parameters are
    assigned.
    """
    self.state = {}
    for paramName in defaultProblemType:
      assignParameterWithDefault(self.state, paramName, config, defaultProblemType)

    if "DataType" not in config:
      printExit("NO data type specified")
      # only reached if printExit returns
      self["DataType"] = DataType(0)
    else:
      self["DataType"] = DataType(config["DataType"])

    operationType = self["OperationType"]
    if operationType == "GEMM":
      self.initGEMM(config)
    elif operationType == "TensorContraction":
      self.initTensorContraction(config)

    # derived parameters are recomputed from scratch
    self.state["AssignedDerivedParameters"] = False
    ProblemType.assignDerivedParameters(self.state)
Beispiel #5
0
def getResults(resultsFileName, solutions):
    """Read per-solution benchmark values from a results CSV file.

    Args:
        resultsFileName: path of the CSV file.
        solutions: list of lists of solutions; each solution is indexed as
            ``solution["ProblemType"]["TotalIndices"]``.

    Returns:
        ``results[i][j]``: list of floats, one per data row, for
        ``solutions[i][j]``.  Solution columns start at column
        ``TotalIndices + 2`` (0-based) and follow the flattened order of
        ``solutions``.

    The first row is skipped (presumably a header).  A row that is too
    short triggers ``printWarning`` and stops reading.  ``printExit`` is
    called when the file cannot be opened or has fewer than two rows.
    """
    try:
        resultsFile = open(resultsFileName, "r")
    except IOError:
        printExit("Can't open \"%s\" to get results" % resultsFileName)

    # setup data structures
    numSolutions = 0
    # stays 0 when there are no solutions at all, so the offsets below are
    # still defined
    problemSizeIdx = 0
    results = []
    for solutionsForHardcoded in solutions:
        results.append([])
        for solution in solutionsForHardcoded:
            problemSizeIdx = solution["ProblemType"]["TotalIndices"] + 1
            results[-1].append([])
            numSolutions += 1

    startIdx = problemSizeIdx + 1
    rowLength = startIdx + numSolutions

    # read results in gflops; the with-block guarantees the file is closed
    rowIdx = 0
    with resultsFile:
        for row in csv.reader(resultsFile):
            rowIdx += 1
            if rowIdx == 1:
                continue
            if len(row) < rowLength:
                printWarning("CSV File %s row %u doesn't have %u elements; ignoring remainer of file." \
                    % (resultsFileName, rowIdx, rowLength) )
                break
            idx = startIdx
            for resultsForHardcoded in results:
                for gflopsList in resultsForHardcoded:
                    gflopsList.append(float(row[idx]))
                    idx += 1
    if rowIdx < 2:
        printExit("CSV File %s only has %u row(s); prior benchmark must not have run long enough to produce data." \
            % (resultsFileName, rowIdx) )
    return results
Beispiel #6
0
  def __init__(self, problemType, config):
    """Collect the problem sizes described by ``config``.

    ``config`` is an iterable of mappings whose keys are "Range" or
    "Exact".  Range entries become ``ProblemSizeRange`` objects, exact
    entries become tuples (their length must equal
    ``problemType["TotalIndices"]``); any other key calls ``printExit``.

    Afterwards ``self.sizes`` holds the sorted, de-duplicated union of all
    range and exact sizes, and ``self.maxC`` / ``self.maxA`` /
    ``self.maxB`` hold the largest element-count products over those
    sizes.
    """
    self.problemType = problemType
    self.ranges = []
    self.exacts = []
    for entry in config:
      for kind in entry:
        if kind == "Exact":
          exactSize = entry[kind]
          if len(exactSize) == problemType["TotalIndices"]:
            self.exacts.append(tuple(exactSize))
          else:
            printExit("ExactSize %s doesn't match indices of ProblemType %s" \
                % (exactSize, problemType) )
        elif kind == "Range":
          self.ranges.append( ProblemSizeRange(problemType, entry[kind]) )
        else:
          printExit("ProblemSize Type %s not supported"%kind)

    # union of every range's sizes and the exact sizes, sorted
    uniqueSizes = set()
    for sizeRange in self.ranges:
      uniqueSizes.update(sizeRange.problemSizes)
    uniqueSizes.update(self.exacts)
    self.sizes = sorted(uniqueSizes)
    self.totalProblemSizes = len(self.sizes)

    def extent(problemSize, indices):
      # product of the sizes at the given index positions
      total = 1
      for i in indices:
        total *= problemSize[i]
      return total

    # max sizes
    self.maxC = 0
    self.maxA = 0
    self.maxB = 0
    for problemSize in self.sizes:
      sizeC = extent(problemSize, range(0, problemType["NumIndicesC"]))
      sizeA = extent(problemSize, self.problemType["IndexAssignmentsA"])
      sizeB = extent(problemSize, self.problemType["IndexAssignmentsB"])
      self.maxC = max(self.maxC, sizeC)
      self.maxA = max(self.maxA, sizeA)
      self.maxB = max(self.maxB, sizeB)
Beispiel #7
0
def readSolutions(filename):
    """Load problem sizes and solutions from a solutions YAML file.

    Expected layout: entry 0 is a mapping with "MinimumRequiredVersion",
    entry 1 is a mapping with "ProblemSizes", and every following entry is
    one solution state.

    Returns:
        ``(problemSizes, solutions)`` where ``solutions`` is a list of
        ``Solution`` objects and ``problemSizes`` is built from the first
        solution's problem type.

    ``printExit`` is called when the file cannot be opened, has fewer than
    two entries, lacks "ProblemSizes" or contains no solutions; a version
    mismatch only produces a warning.
    """
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    # the with-block closes the stream even if parsing fails
    with stream:
        solutionStates = yaml.load(stream, yaml.SafeLoader)

    # verify (an empty document loads as None)
    numEntries = len(solutionStates) if solutionStates else 0
    if numEntries < 2:
        printExit("len(%s) %u < 2" % (filename, numEntries))
    versionString = solutionStates[0]["MinimumRequiredVersion"]
    if not versionIsCompatible(versionString):
        printWarning("File \"%s\" version=%s does not match current Tensile version=%s" \
            % (filename, versionString, __version__) )

    if "ProblemSizes" not in solutionStates[1]:
        printExit("%s doesn't begin with ProblemSizes" % filename)
    else:
        problemSizesConfig = solutionStates[1]["ProblemSizes"]

    solutions = [Solution(solutionState) for solutionState in solutionStates[2:]]
    if not solutions:
        # the problem type is taken from the first solution, so at least
        # one is required
        printExit("%s doesn't contain any solutions" % filename)
    problemType = solutions[0]["ProblemType"]
    problemSizes = ProblemSizes(problemType, problemSizesConfig)
    return (problemSizes, solutions)
Beispiel #8
0
def writeSolutions( filename, problemSizes, solutions ):
  """Write problem sizes and solution states to a YAML file.

  Args:
    filename: path of the file to (over)write.
    problemSizes: object exposing ``ranges`` and ``exacts``.
    solutions: list of lists of solutions; each exposes a ``state``
      mapping whose "ProblemType" entry in turn has a ``state`` mapping.

  The file starts with the minimum required version and a "ProblemSizes"
  list (written by hand), followed by the YAML dump of all solution
  states.  The solution objects are left untouched: their states are
  shallow-copied before "ProblemType" and "DataType" are replaced by
  plain values.  ``printExit`` is called if the file cannot be opened.
  """
  # convert objects to nested dictionaries (on copies, not in place)
  solutionStates = []
  for hardcoded in solutions:
    for solution in hardcoded:
      solutionState = dict(solution.state)
      problemTypeState = dict(solutionState["ProblemType"].state)
      problemTypeState["DataType"] = problemTypeState["DataType"].value
      solutionState["ProblemType"] = problemTypeState
      solutionStates.append(solutionState)
  # write dictionaries
  try:
    stream = open(filename, "w")
  except IOError:
    printExit("Cannot open file: %s" % filename)
  # the with-block closes the stream even if the dump fails
  with stream:
    stream.write("- MinimumRequiredVersion: %s\n" % __version__ )
    stream.write("- ProblemSizes:\n")
    for sizeRange in problemSizes.ranges:
      stream.write("  - Range: %s\n" % sizeRange)
    for sizeExact in problemSizes.exacts:
      stream.write("  - Exact: %s\n" % list(sizeExact))
    yaml.dump(solutionStates, stream, default_flow_style=False)
Beispiel #9
0
def readLibraryLogicForSchedule(filename):
    """Load one schedule's library logic from a YAML file.

    The file must be a list of at least nine entries: version mapping,
    schedule name, architecture name, device names, problem type state,
    solution states, index order, exact logic and range logic.

    Each solution state gets an "ISA" entry before the ``Solution`` is
    built: for "Assembly" kernels it is the three digits at positions 3-5
    of the architecture name (e.g. the "803" of "gfx803"), otherwise
    ``(0, 0, 0)``.

    Returns:
        ``(scheduleName, deviceNames, problemType, solutions, indexOrder,
        exactLogic, rangeLogic)``.

    ``printExit`` is called when the file cannot be opened, is too short,
    or a solution's problem type differs from the file's problem type; a
    version mismatch only produces a warning.
    """
    print1("# Reading Library Logic: %s" % (filename))
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    # the with-block closes the stream even if parsing fails
    with stream:
        data = yaml.load(stream, yaml.SafeLoader)

    # verify: entries 0..8 are read below (an empty document loads as None)
    numFields = 9
    numEntries = len(data) if data else 0
    if numEntries < numFields:
        printExit("len(%s) %u < %u" % (filename, numEntries, numFields))

    # parse out objects
    versionString = data[0]["MinimumRequiredVersion"]
    scheduleName = data[1]
    architectureName = data[2]
    deviceNames = data[3]
    problemTypeState = data[4]
    solutionStates = data[5]
    indexOrder = data[6]
    exactLogic = data[7]
    rangeLogic = data[8]

    # does version match
    if not versionIsCompatible(versionString):
        printWarning("File \"%s\" version=%s does not match Tensile version=%s" \
            % (filename, versionString, __version__) )

    # unpack problemType
    problemType = ProblemType(problemTypeState)
    # unpack solutions
    solutions = []
    for solutionState in solutionStates:
        if solutionState["KernelLanguage"] == "Assembly":
            isa0 = int(architectureName[3])
            isa1 = int(architectureName[4])
            isa2 = int(architectureName[5])
            solutionState["ISA"] = (isa0, isa1, isa2)
        else:
            solutionState["ISA"] = (0, 0, 0)
        solutionObject = Solution(solutionState)
        if solutionObject["ProblemType"] != problemType:
            printExit("ProblemType of file doesn't match solution: %s != %s" \
                % (problemType, solutionObject["ProblemType"]))
        solutions.append(solutionObject)

    return (scheduleName, deviceNames, problemType, solutions, indexOrder, \
        exactLogic, rangeLogic )
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Generate the kernel and solution source/header files in ``outputPath``.

    Kernel sources come from ``processKernelSourceChunk``, which is run in
    child processes (results are sent back over a pipe) or inline when
    ``globalParameters["CpuThreads"]`` is 0.  With
    ``globalParameters["MergeFiles"]`` set, output goes to ``Kernels.cpp``,
    ``Kernels.h``, ``Solutions.cpp`` and ``Solutions.h``; otherwise one
    ``.cpp``/``.h`` pair is written per kernel and per solution under the
    ``Kernels`` and ``Solutions`` sub-directories.

    Args:
        outputPath: directory receiving the generated files.
        solutions: solutions to write.
        kernels: kernels handed to ``processKernelSourceChunk``.
        kernelsBetaOnly: beta-only kernels, written with
            ``kernelWriterSource``.
        solutionWriter: produces solution names and file contents.
        kernelWriterSource: source kernel writer.
        kernelWriterAssembly: assembly kernel writer.

    ``printExit`` is called when a child process exits with a non-zero
    code, and at the end when ``globalParameters["ExitAfterKernelGen"]`` is
    set.
    """
    start = time.time()
    print1("# Writing Kernels...")
    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if globalParameters["MergeFiles"]:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
            kernelHeaderFile.write("\n\n")
            kernelHeaderFile.write(
                "__device__ inline int GenDot4(int a, int b, int c) { \n")
            kernelHeaderFile.write(
                "  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
            kernelHeaderFile.write(
                "  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
            kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")
            kernelHeaderFile.write(
                "  return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n"
            )
            kernelHeaderFile.write("\n\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # kernelName -> error value for every kernel that failed to build
    kernelsWithBuildErrs = {}

    prepAsm()

    if globalParameters["CpuThreads"] == 0:
        cpus = 0
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count*4 if globalParameters["CpuThreads"] == -1 \
               else globalParameters["CpuThreads"]
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # ceil(len(kernels) / cpus), but never fewer than 10 kernels per process;
    # floor division keeps the chunk size an integer
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (
        cpus, workPerCpu))

    kiStart = 0
    threads = []
    if cpus and globalParameters["ShowProgressBar"]:
        processLaunchProgressBar = ProgressBar(len(kernels))
    else:
        processLaunchProgressBar = None
    while kiStart < len(kernels):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        if cpus:
            parentConn, child = multiprocessing.Pipe()
            args=(kernels, kernelWriterSource, kernelWriterAssembly, \
                  kiStart, kiStop, child)
            t = multiprocessing.Process(target=processKernelSourceChunk,
                                        args=args)
            t.start()
            child.close()  # close child pipe in the parent process
            threads.append([t, kiStart, kiStop, parentConn])
            if processLaunchProgressBar:
                processLaunchProgressBar.increment(kiStop - kiStart)
            else:
                sys.stderr.write(
                    "  # launched process %s for kernels %d..%d\n" %
                    (t, kiStart, kiStop - 1))

        else:  # non-threaded version
            # NOTE(review): the return value is discarded here, so nothing
            # from this path reaches the file-writing loop below -- confirm
            # that processKernelSourceChunk handles this case itself.
            processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly, \
                                     kiStart, kiStop, None)
        kiStart += workPerCpu
    sys.stderr.write("# Waiting for kernel compilation processes...\n")

    someError = 0
    for (t, kiStart, kiStop, parentConn) in threads:
        try:
            results = parentConn.recv()
        except EOFError as pipeErr:
            print("*** warning: process %s returned pipe EOF %s %s" % (t, t, pipeErr))
            # nothing was received: do not reuse the previous chunk's results
            results = []

        t.join()
        e = t.exitcode
        if e != 0:
            print("*** warning: process %s returned %s %s" % (t, t, e))
            someError = 1
            results = []

        if globalParameters["ShowProgressBar"]:
            progressBar.increment(kiStop - kiStart)
        for (err, src, header, kernelName) in results:
            if err:
                kernelsWithBuildErrs[kernelName] = err
                #print "*** warning: invalid kernel#%s"%kernelName

            # write kernel.cpp
            if not globalParameters["MergeFiles"]:
                kernelSourceFile = open(os.path.join(outputPath, \
                    "Kernels", kernelName+".cpp"), "w")
                kernelSourceFile.write(CHeader)

            kernelSourceFile.write(src)

            if not globalParameters["MergeFiles"]:
                kernelSourceFile.close()
                # write kernel.h
                kernelHeaderFile = open(os.path.join(outputPath, \
                    "Kernels", kernelName+".h"), "w")
                kernelHeaderFile.write(CHeader)

            kernelHeaderFile.write(header)

            if not globalParameters["MergeFiles"]:
                kernelHeaderFile.close()

    if someError:
        print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
        printExit("** kernel compilation failure **")

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not globalParameters["MergeFiles"]:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string (it is concatenated into file names above)
            print("*** warning: invalid kernel#%s" % kernelName)
        if not globalParameters["MergeFiles"]:
            kernelSourceFile.close()
        # write kernel.h
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not globalParameters["MergeFiles"]:
            kernelHeaderFile.close()

    # close merged (both the source and the header file)
    if globalParameters["MergeFiles"]:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    stop = time.time()
    print("# Kernel Building elapsed time = %.1f secs" % (stop - start))

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if globalParameters["MergeFiles"]:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionSourceFile.write("#include <algorithm>\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")
    for solution in solutions:
        # get solution name
        if not globalParameters["MergeFiles"]:
            solutionFileName = solutionWriter.getSolutionName(solution)

        # write solution.cpp
        if not globalParameters["MergeFiles"]:
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
        if not globalParameters["MergeFiles"]:
            solutionSourceFile.close()

        # write solution.h
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not globalParameters["MergeFiles"]:
            solutionHeaderFile.close()
        if globalParameters["ShowProgressBar"]:
            progressBar.increment()
    # close merged; the per-solution files were already closed in the loop
    if globalParameters["MergeFiles"]:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
def TensileCreateLibrary():
    """Command-line entry point that generates a library from logic files.

    Positional arguments: ``LogicPath`` (directory scanned for ``*.yaml``
    library logic files), ``OutputPath`` and ``RuntimeLanguage`` (one of
    OCL, HIP, HSA).  The ``--[no-]merge-files``, ``--[no-]short-file-names``
    and ``--[no-]library-print-debug`` switches set the matching global
    parameters.

    Steps: read every logic file, collect the unique solutions and their
    kernels, write solutions and kernels, then write the CMake file and the
    logic.  ``printExit`` is called when ``LogicPath`` does not exist.
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \
        choices=["OCL", "HIP", "HSA"])
    # every boolean option has an enabling and a disabling switch that
    # share one destination
    for switch, dest in (("merge-files", "MergeFiles"), \
            ("short-file-names", "ShortNames"), \
            ("library-print-debug", "LibraryPrintDebug")):
        argParser.add_argument("--" + switch, dest=dest, \
            action="store_true")
        argParser.add_argument("--no-" + switch, dest=dest, \
            action="store_false")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    arguments["CodeFromFiles"] = False
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
        if (os.path.isfile(os.path.join(logicPath, f)) \
        and os.path.splitext(f)[1]==".yaml")]

    # heading only; the files themselves are listed one per line below
    print1("# LibraryLogicFiles:")
    for logicFile in logicFiles:
        print1("#   %s" % logicFile)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule, \
            indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames, \
            solutionsForSchedule, indexOrder, exactLogic, rangeLogic ))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # collect the unique kernels of all solutions
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # create solution writer and kernel writers; serial naming is only used
    # for short names with unmerged files
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly)

    libraryStaticFiles = [
        "SolutionMapper.h", "TensileTypes.h", "KernelHeader.h",
        "SolutionHelper.cpp", "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
Beispiel #12
0
def TensileCreateLibrary():
    """Command-line entry point that generates a library from logic files.

    Positional arguments: ``LogicPath`` (directory scanned for ``*.yaml``
    library logic files), ``OutputPath`` and ``RuntimeLanguage`` (one of
    OCL, HIP, HSA).  The ``--[no-]merge-files``, ``--[no-]short-file-names``
    and ``--[no-]library-print-debug`` switches set the matching global
    parameters.  ``--isa gfxNNN`` may be repeated; the values that are in
    ``globalParameters["SupportedISA"]`` replace that list.

    Steps: read every logic file, collect the unique solutions and their
    kernels, replicate assembly kernels for every supported ISA (HIP only),
    write solutions and kernels, then write the CMake file and the logic.
    ``printExit`` is called when ``LogicPath`` does not exist.
    """
    print1("")
    print1(HR)
    print1("# Tensile Create Library")
    print2(HR)
    print2("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################
    print2("Arguments: %s" % sys.argv)
    argParser = argparse.ArgumentParser()
    argParser.add_argument("LogicPath",
                           help="Path to LibraryLogic.yaml files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")
    argParser.add_argument("RuntimeLanguage", help="Which runtime language?", \
        choices=["OCL", "HIP", "HSA"])
    # every boolean option has an enabling and a disabling switch that
    # share one destination
    for switch, dest in (("merge-files", "MergeFiles"), \
            ("short-file-names", "ShortNames"), \
            ("library-print-debug", "LibraryPrintDebug")):
        argParser.add_argument("--" + switch, dest=dest, \
            action="store_true")
        argParser.add_argument("--no-" + switch, dest=dest, \
            action="store_false")
    argParser.add_argument(
        "--isa",
        dest="isa",
        action="append",
        help="which architectures for assembly kernels to target")
    args = argParser.parse_args()

    logicPath = args.LogicPath
    outputPath = args.OutputPath
    print2("OutputPath: %s" % outputPath)
    ensurePath(outputPath)
    arguments = {}
    arguments["RuntimeLanguage"] = args.RuntimeLanguage
    arguments["MergeFiles"] = args.MergeFiles
    arguments["ShortNames"] = args.ShortNames
    arguments["LibraryPrintDebug"] = args.LibraryPrintDebug
    if args.isa:
        newISA = []
        for isa in args.isa:
            gfxIdx = isa.find("gfx")
            # the three characters after "gfx" must all be digits; anything
            # else (missing prefix, too short, non-digit) gets the warning
            # instead of a ValueError from int()
            digits = isa[gfxIdx + 3:gfxIdx + 6] if gfxIdx >= 0 else ""
            if len(digits) == 3 and digits.isdigit():
                major = int(digits[0])
                minor = int(digits[1])
                step = int(digits[2])
                isaTuple = (major, minor, step)
                if isaTuple in globalParameters[
                        "SupportedISA"] and isaTuple not in newISA:
                    print1("# User-Specified ISA: gfx%u%u%u" %
                           (major, minor, step))
                    newISA.append(isaTuple)
            else:
                printWarning("isa parameter must be formed as: --isa gfx803")
        # NOTE(review): when no requested ISA is supported this list is
        # empty, and the SupportedISA[0] lookup below would then fail for
        # assembly kernels -- confirm the intended handling.
        arguments["SupportedISA"] = newISA
    assignGlobalParameters(arguments)

    if not os.path.exists(logicPath):
        printExit("LogicPath %s doesn't exist" % logicPath)

    logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
        if (os.path.isfile(os.path.join(logicPath, f)) \
        and os.path.splitext(f)[1]==".yaml")]

    # heading only; the files themselves are listed one per line below
    print1("# LibraryLogicFiles:")
    for logicFile in logicFiles:
        print1("#   %s" % logicFile)

    ##############################################################################
    # Parse config files
    ##############################################################################
    solutions = []
    logicData = {}  # keys are problemTypes, values are schedules
    for logicFileName in logicFiles:
        (scheduleName, deviceNames, problemType, solutionsForSchedule, \
            indexOrder, exactLogic, rangeLogic) \
            = YAMLIO.readLibraryLogicForSchedule(logicFileName)
        if problemType not in logicData:
            logicData[problemType] = []
        logicData[problemType].append((scheduleName, deviceNames, \
            solutionsForSchedule, indexOrder, exactLogic, rangeLogic ))
        for solution in solutionsForSchedule:
            if solution not in solutions:
                solutions.append(solution)

    # collect the unique kernels of all solutions
    kernels = []
    kernelsBetaOnly = []
    for solution in solutions:
        solutionKernels = solution.getKernels()
        for kernel in solutionKernels:
            if kernel not in kernels:
                kernels.append(kernel)
        solutionKernelsBetaOnly = solution.getKernelsBetaOnly()
        for kernel in solutionKernelsBetaOnly:
            if kernel not in kernelsBetaOnly:
                kernelsBetaOnly.append(kernel)

    # if any kernels are assembly, append every ISA supported: the original
    # kernel takes the first ISA and one deep copy is added per further ISA
    if globalParameters["RuntimeLanguage"] == "HIP":
        newKernels = []
        for kernel in kernels:
            if kernel["KernelLanguage"] == "Assembly":
                kernel["ISA"] = globalParameters["SupportedISA"][0]
                for i in range(1, len(globalParameters["SupportedISA"])):
                    newKernel = deepcopy(kernel)
                    newKernel["ISA"] = globalParameters["SupportedISA"][i]
                    newKernels.append(newKernel)
            else:
                kernel["ISA"] = (0, 0, 0)
        kernels.extend(newKernels)

    # create solution writer and kernel writers; serial naming is only used
    # for short names with unmerged files
    if globalParameters["ShortNames"] and not globalParameters["MergeFiles"]:
        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
    else:
        solutionSerialNaming = None
        kernelSerialNaming = None
    solutionMinNaming = Solution.getMinNaming(solutions)
    kernelMinNaming = Solution.getMinNaming(kernels)
    solutionWriter = SolutionWriter( \
        solutionMinNaming, solutionSerialNaming, \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterSource = KernelWriterSource( \
        kernelMinNaming, kernelSerialNaming)
    kernelWriterAssembly = KernelWriterAssembly( \
        kernelMinNaming, kernelSerialNaming)

    # write solutions and kernels
    writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
        solutionWriter, kernelWriterSource, kernelWriterAssembly)

    libraryStaticFiles = [
        "TensileTypes.h", "KernelHeader.h", "SolutionHelper.cpp",
        "SolutionHelper.h", "Tools.cpp", "Tools.h"
    ]

    # write cmake
    clientName = "LibraryClient"
    writeCMake(outputPath, solutions, kernels, libraryStaticFiles, clientName)

    # write logic
    writeLogic(outputPath, logicData, solutionWriter)
    print1("# Tensile Library Writer DONE")
    print1(HR)
    print1("")
    def fillInMissingStepsWithDefaults(self, config):
        """Build the benchmark phase lists from ``config`` plus the defaults.

        The phases are read from the keys "BenchmarkCommonParameters",
        "ForkParameters", "BenchmarkForkParameters", "JoinParameters",
        "BenchmarkJoinParameters" and "BenchmarkFinalParameters" of
        ``config``.  Names and values requested in the common, fork,
        benchmark-fork and benchmark-join phases are checked against
        ``validParameters``; an invalid entry is reported through printExit.

        Results are stored on ``self``:
          benchmarkCommonParameters, forkParameters, benchmarkForkParameters,
          joinParameters, benchmarkJoinParameters, benchmarkFinalParameters.

        A default parameter is added to a phase only when no other phase
        already mentions it.  A phase whose key is present in ``config`` with
        the value None receives no defaults at all.  Parameters that end up
        with exactly one value are removed from their phase and recorded in
        self.hardcodedParameters[0], self.singleValueParameters and
        self.initialSolutionParameters.

        NOTE: ``config`` is modified in place: a leading entry containing
        "ProblemSizes" is deleted from the benchmark phase lists, and
        single-valued parameters are popped from the user's dicts.
        """
        banner = \
            "####################################################################"
        print2("")
        print2(banner)
        print1("# Filling in Parameters With Defaults")
        print2(banner)
        print2("")

        def configStep(name, default):
            # The user's list for one phase, or `default` if the key is absent.
            return config[name] if name in config else default

        def takeProblemSizes(stepConfig, current):
            # A leading "ProblemSizes" entry overrides the sizes inherited from
            # the previous phase.  It is removed from the config because the
            # sizes are re-inserted at the head of the merged phase.
            if stepConfig is not None and len(stepConfig) > 0 \
                    and "ProblemSizes" in stepConfig[0]:
                current = stepConfig[0]["ProblemSizes"]
                del stepConfig[0]
            return current

        def mergeWithDefaults(head, defaults, otherSteps, stepConfig):
            # head, then every default that none of `otherSteps` mentions,
            # then the user's own entries.  A phase explicitly set to None
            # keeps only `head`.
            if stepConfig is None:
                return head
            for paramDict in defaults:
                for paramName in paramDict:
                    if not hasParam(paramName, otherSteps) \
                            or paramName == "ProblemSizes":
                        # Copy: step (I-7) pops keys out of these dicts and
                        # must not strip the shared default tables, otherwise
                        # later calls would silently lose defaults.
                        head.append(dict(paramDict))
            head.extend(stepConfig)
            return head

        ############################################################################
        # (I-0) get 6 phases from config
        configBenchmarkCommonParameters = configStep(
            "BenchmarkCommonParameters",
            [{"ProblemSizes": defaultProblemSizes}])
        configForkParameters = configStep("ForkParameters", [])
        configBenchmarkForkParameters = configStep(
            "BenchmarkForkParameters", [])
        configJoinParameters = configStep("JoinParameters", [])
        configBenchmarkJoinParameters = configStep(
            "BenchmarkJoinParameters", [])
        configBenchmarkFinalParameters = configStep(
            "BenchmarkFinalParameters", None)
        if configBenchmarkFinalParameters is None \
                or len(configBenchmarkFinalParameters) == 0:
            # missing, None and empty all fall back to the default final sizes
            configBenchmarkFinalParameters = [{
                "ProblemSizes": defaultBenchmarkFinalProblemSizes
            }]

        ############################################################################
        # Ensure only valid solution parameters were requested
        validParameterNames = validParameters.keys()
        for paramDictList in [configBenchmarkCommonParameters, \
            configForkParameters, configBenchmarkForkParameters, \
            configBenchmarkJoinParameters]:
            if paramDictList is None:
                continue
            for paramDict in paramDictList:
                for paramName in paramDict:
                    if paramName == "ProblemSizes":
                        continue
                    if paramName not in validParameterNames:
                        printExit("Invalid parameter name: %s\nValid parameters are %s." \
                            % (paramName, validParameterNames))
                    for paramValue in paramDict[paramName]:
                        if paramValue not in validParameters[paramName]:
                            printExit("Invalid parameter value: %s = %s\nValid values for %s are %s." \
                                % (paramName, paramValue, paramName, validParameters[paramName]))

        ############################################################################
        # (I-1) benchmark common: current problem sizes, then all default
        # common parameters that don't show up in any config phase, then the
        # user's common parameters
        currentProblemSizes = takeProblemSizes(
            configBenchmarkCommonParameters, defaultProblemSizes)
        self.benchmarkCommonParameters = mergeWithDefaults(
            [{"ProblemSizes": currentProblemSizes}],
            defaultBenchmarkCommonParameters,
            [ configBenchmarkCommonParameters, \
              configForkParameters, configBenchmarkForkParameters, \
              configJoinParameters, configBenchmarkJoinParameters ],
            configBenchmarkCommonParameters)

        ############################################################################
        # (I-2) fork: default fork parameters not mentioned by the merged
        # common phase or any remaining config phase, then the user's forks
        self.forkParameters = mergeWithDefaults(
            [],
            defaultForkParameters,
            [ self.benchmarkCommonParameters, \
              configForkParameters, configBenchmarkForkParameters, \
              configJoinParameters, configBenchmarkJoinParameters ],
            configForkParameters)

        ############################################################################
        # (I-3) benchmark fork: problem sizes carry over from the previous
        # phase unless the user gives new ones
        currentProblemSizes = takeProblemSizes(
            configBenchmarkForkParameters, currentProblemSizes)
        self.benchmarkForkParameters = mergeWithDefaults(
            [{"ProblemSizes": currentProblemSizes}],
            defaultBenchmarkForkParameters,
            [ self.benchmarkCommonParameters, \
              self.forkParameters, configBenchmarkForkParameters, \
              configJoinParameters, configBenchmarkJoinParameters ],
            configBenchmarkForkParameters)

        ############################################################################
        # (I-4) join: entries are parameter names rather than dicts.  The
        # default "MacroTile" join is dropped as soon as the user supplies a
        # "JoinParameters" key of their own.
        self.joinParameters = []
        if configJoinParameters is not None:
            otherSteps = [ self.benchmarkCommonParameters, \
                self.forkParameters, self.benchmarkForkParameters, \
                configJoinParameters, configBenchmarkJoinParameters ]
            for paramName in defaultJoinParameters:
                if not hasParam(paramName, otherSteps) \
                        or paramName == "ProblemSizes":
                    if "JoinParameters" not in config \
                            or paramName != "MacroTile":
                        self.joinParameters.append(paramName)
            self.joinParameters.extend(configJoinParameters)

        ############################################################################
        # (I-5) benchmark join
        currentProblemSizes = takeProblemSizes(
            configBenchmarkJoinParameters, currentProblemSizes)
        self.benchmarkJoinParameters = mergeWithDefaults(
            [{"ProblemSizes": currentProblemSizes}],
            defaultBenchmarkJoinParameters,
            [ self.benchmarkCommonParameters, \
              self.forkParameters, self.benchmarkForkParameters, \
              self.joinParameters, configBenchmarkJoinParameters ],
            configBenchmarkJoinParameters)

        ############################################################################
        # (I-6) benchmark final sizes; no other parameters besides problem sizes
        self.benchmarkFinalParameters = configBenchmarkFinalParameters

        ############################################################################
        # (I-7) any parameter with 1 value is hardcoded instead of benchmarked.
        # Iterate over copies because entries are removed while scanning.
        for stepList in [self.benchmarkCommonParameters, \
            self.forkParameters, self.benchmarkForkParameters, \
            self.benchmarkJoinParameters]:
            for paramDict in copy(stepList):
                for paramName in copy(paramDict):
                    paramValues = paramDict[paramName]
                    if paramValues is None:
                        printExit(
                            "You must specify value for parameters \"%s\"" %
                            paramName)
                    if paramName == "ProblemSizes":
                        continue
                    if len(paramValues) == 0:
                        # an empty list has no value to hardcode; report it
                        # instead of failing on paramValues[0] below
                        printExit(
                            "You must specify value for parameters \"%s\"" %
                            paramName)
                    if len(paramValues) == 1:
                        paramDict.pop(paramName)
                        self.hardcodedParameters[0][paramName] = paramValues[0]
                        self.singleValueParameters[paramName] = [
                            paramValues[0]
                        ]
                        self.initialSolutionParameters[
                            paramName] = paramValues[0]
                        if len(paramDict) == 0:
                            stepList.remove(paramDict)

        ############################################################################
        # (I-8) if fork and join, but no benchmark fork, append dummy benchmarkFork
        if len(self.forkParameters) > 0 and len(self.joinParameters) > 0 \
            and (len(self.benchmarkForkParameters) == 0 \
            or (len(self.benchmarkForkParameters) == 1 \
            and hasParam("ProblemSizes", self.benchmarkForkParameters)) ):
            self.benchmarkForkParameters.append({"BenchmarkFork": [0]})

        ############################################################################
        # (I-9) no dummy benchmarkJoin step is appended when there is a join
        # but no benchmark join; the final benchmark covers that case.

        ############################################################################
        # (I-10) Parameter Lists
        print2("HardcodedParameters:")
        for paramName in self.hardcodedParameters[0]:
            paramValues = self.hardcodedParameters[0][paramName]
            print2("    %s: %s" % (paramName, paramValues))
        # benchmarkCommonParameters
        print2("BenchmarkCommonParameters:")
        for step in self.benchmarkCommonParameters:
            print2("    %s" % step)
        # forkParameters
        print2("ForkParameters:")
        for param in self.forkParameters:
            print2("    %s" % param)
        # benchmarkForkParameters
        print2("BenchmarkForkParameters:")
        for step in self.benchmarkForkParameters:
            print2("    %s" % step)
        # joinParameters
        print2("JoinParameters:")
        for param in self.joinParameters:
            print2("    %s" % param)
        # benchmarkJoinParameters
        print2("BenchmarkJoinParameters:")
        for step in self.benchmarkJoinParameters:
            print2("    %s" % step)
        # benchmarkFinalParameters
        print2("BenchmarkFinalParameters:")
        for step in self.benchmarkFinalParameters:
            print2("    %s" % step)
    def convertParametersToSteps(self):
        """Turn the phase lists stored on ``self`` into benchmark steps.

        Works through the phases in order: benchmark-common, fork,
        benchmark-fork, join, benchmark-join and final.  Benchmark phases are
        handed to self.addStepsForParameters.  The fork phase enumerates every
        combination of fork values and passes them to
        self.forkHardcodedParameters; the join phase does the same for the
        joined parameters via self.joinHardcodedParameters.  One BenchmarkStep
        per entry of self.benchmarkFinalParameters is appended to
        self.benchmarkSteps at the end.

        Permutation indices are decoded as mixed-radix numbers in which the
        first parameter varies fastest.  divmod keeps that arithmetic in
        integers regardless of how the `/` operator is defined.
        """
        banner = \
            "####################################################################"
        print2("")
        print2(banner)
        print1("# Convert Parameters to Steps")
        print2(banner)
        print2("")

        ############################################################################
        # (II-1) benchmark common parameters
        print2("")
        print2(banner)
        print1("# Benchmark Common Parameters")
        self.addStepsForParameters(self.benchmarkCommonParameters)

        ############################################################################
        # (II-2) fork parameters: enumerate every combination of fork values
        print2("")
        print2(banner)
        print1("# Fork Parameters")
        print2(self.forkParameters)
        totalPermutations = 1
        for param in self.forkParameters:
            for name in param:  # only 1
                totalPermutations *= len(param[name])
        forkPermutations = []
        for i in range(0, totalPermutations):
            permutation = {}
            pIdx = i
            for param in self.forkParameters:
                for name in param:
                    values = param[name]
                    pIdx, valueIdx = divmod(pIdx, len(values))
                    permutation[name] = values[valueIdx]
            forkPermutations.append(permutation)
        if len(forkPermutations) > 0:
            self.forkHardcodedParameters(forkPermutations)

        ############################################################################
        # (II-3) benchmark fork parameters
        print2("")
        print2(banner)
        print1("# Benchmark Fork Parameters")
        self.addStepsForParameters(self.benchmarkForkParameters)

        ############################################################################
        # (II-4.1) join parameters
        # answer should go in hard-coded parameters
        # does it remove the prior forks? Yes.
        print2("")
        print2(banner)
        print1("# Join Parameters")
        macroTileJoinSet = set()
        totalPermutations = 1
        if len(self.joinParameters) > 0:
            for joinName in self.joinParameters:
                if hasParam(joinName, self.singleValueParameters):
                    # joining a parameter with only a single value adds no
                    # permutations
                    pass
                elif hasParam(joinName, self.forkParameters):
                    # count permutations
                    for param in self.forkParameters:
                        for name in param:  # only 1
                            if name == joinName:
                                localPermutations = len(param[name])
                                print2(
                                    "JoinParameter %s has %u possibilities" %
                                    (joinName, localPermutations))
                                totalPermutations *= localPermutations

                ##########################################################################
                # (II-4.2) Join MacroTile
                elif joinName == "MacroTile":
                    print2("JoinParam: MacroTile")
                    # get possible WorkGroupEdges from forked
                    print2("currentForkParameters = %s" %
                           str(self.forkParameters))
                    threadTileValues = []
                    workGroupValues = []
                    # todo having MacroTile as join parameter causes trouble if
                    # one parameter is benchmarked rather than forked
                    # however, this may still be the right way to do it

                    # the last phase that mentions a parameter supplies its
                    # candidate values
                    for paramList in [self.benchmarkCommonParameters, \
                        self.forkParameters, self.benchmarkForkParameters, \
                        self.benchmarkJoinParameters, self.singleValueParameters ]:
                        if hasParam("ThreadTile", paramList):
                            threadTileValues = getParamValues(
                                "ThreadTile", paramList)
                        if hasParam("WorkGroup", paramList):
                            workGroupValues = getParamValues(
                                "WorkGroup", paramList)
                    macroTilePermutations = len(workGroupValues) * len(
                        threadTileValues)
                    print2("# Total JoinMacroTile Permutations: %u" %
                           macroTilePermutations)

                    # enumerate permutations; WorkGroup varies fastest
                    for threadTile in threadTileValues:
                        for workGroup in workGroupValues:
                            macroTile0 = workGroup[0] * threadTile[0]
                            macroTile1 = workGroup[1] * threadTile[1]
                            macroTileJoinSet.add((macroTile0, macroTile1))
                    totalPermutations *= len(macroTileJoinSet)
                    print2("JoinMacroTileSet(%u): %s" %
                           (len(macroTileJoinSet), macroTileJoinSet))

                # invalid join parameter
                else:
                    validJoinNames = ["MacroTile"]
                    for validParam in self.forkParameters:
                        for validName in validParam:  # only 1
                            validJoinNames.append(validName)
                    printExit("JoinParameter \"%s\" not in %s" %
                              (joinName, validJoinNames))

            ############################################################################
            # (II-4.4) Enumerate Permutations Other * MacroTile * DepthU
            macroTiles = list(macroTileJoinSet)
            print2("# TotalJoinPermutations = %u" % (totalPermutations))
            joinPermutations = []
            for i in range(0, totalPermutations):
                permutation = {}
                pIdx = i
                for joinName in self.joinParameters:
                    if hasParam(joinName, self.forkParameters):
                        # first fork entry carrying this name supplies values
                        for paramDict in self.forkParameters:
                            if joinName in paramDict:
                                paramValues = paramDict[joinName]
                                pIdx, valueIdx = divmod(
                                    pIdx, len(paramValues))
                                permutation[joinName] = paramValues[valueIdx]
                                break
                    elif joinName == "MacroTile":
                        pIdx, valueIdx = divmod(pIdx, len(macroTiles))
                        permutation["MacroTile0"] = macroTiles[valueIdx][0]
                        permutation["MacroTile1"] = macroTiles[valueIdx][1]
                joinPermutations.append(permutation)
            if len(joinPermutations) > 0:
                self.joinHardcodedParameters(joinPermutations)

        ############################################################################
        # (II-5) benchmark join parameters
        print2("")
        print2(banner)
        print1("# Benchmark Join Parameters")
        self.addStepsForParameters(self.benchmarkJoinParameters)

        ############################################################################
        # (II-6) benchmark final: one step per final problem-size entry, with
        # no benchmark parameters of its own
        print2("")
        print2(banner)
        print1("# Benchmark Final")
        for problemSizesDict in self.benchmarkFinalParameters:
            problemSizes = problemSizesDict["ProblemSizes"]
            self.currentProblemSizes = ProblemSizes(self.problemType,
                                                    problemSizes)
            currentBenchmarkParameters = {}
            benchmarkStep = BenchmarkStep(self.hardcodedParameters,
                                          currentBenchmarkParameters,
                                          self.initialSolutionParameters,
                                          self.currentProblemSizes,
                                          self.benchmarkStepIdx)
            self.benchmarkSteps.append(benchmarkStep)
            self.benchmarkStepIdx += 1
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write the source and header files for all kernels and solutions.

    With globalParameters["MergeFiles"] set, everything goes into
    Kernels.cpp / Kernels.h and Solutions.cpp / Solutions.h directly under
    ``outputPath``.  Otherwise the "Kernels" and "Solutions" sub-directories
    are created and beta-only kernels and solutions get one .cpp/.h pair each.

    Tensor-contraction kernels are handed to processKernelSourceChunk in
    chunks, on worker threads unless globalParameters["CpuThreads"] is 0.
    Beta-only kernels are always written with ``kernelWriterSource``.

    Arguments:
      outputPath           -- directory that receives the generated files
      solutions            -- solutions to write through ``solutionWriter``
      kernels              -- tensor-contraction kernels
      kernelsBetaOnly      -- beta-only kernels
      solutionWriter, kernelWriterSource, kernelWriterAssembly
                           -- writer objects producing the file contents

    Calls printExit at the end when globalParameters["ExitAfterKernelGen"]
    is set.
    """
    mergeFiles = globalParameters["MergeFiles"]

    print1("# Writing Kernels")
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    # Bound unconditionally: these are passed to processKernelSourceChunk
    # below even when the progress bar or file merging is disabled.
    progressBar = None
    kernelSourceFile = None
    kernelHeaderFile = None

    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    kernelsWithBuildErrs = {}

    # tensor contraction kernels - dispatch as multiple threads:
    kLock = threading.Lock()
    pLock = threading.Lock()

    prepAsm()

    if globalParameters["CpuThreads"] == 0:
        cpus = 0  # no worker threads: process chunks on the calling thread
    elif globalParameters["CodeFromFiles"]:
        cpu_count = multiprocessing.cpu_count()
        cpus = cpu_count if globalParameters["CpuThreads"] == -1 \
               else min(cpu_count, globalParameters["CpuThreads"])
    else:  #! CodeFromFiles is not thread-safe since code merged into same file
        cpus = 1

    # ceil(len(kernels) / cpus), but never fewer than 10 kernels per thread;
    # floor division keeps the chunk size an integer
    workPerCpu = max(10, (len(kernels) + cpus - 1) // cpus) if cpus else 1
    print("info: cpus=%u kernelsPerCpu=%u" % (cpus, workPerCpu))

    threads = []
    for kiStart in range(0, len(kernels), workPerCpu):
        kiStop = min(len(kernels), kiStart + workPerCpu)
        # Both dispatch paths use the same argument list.
        # NOTE(review): assumes processKernelSourceChunk takes progressBar as
        # its 8th argument, as the threaded path has always passed it --
        # confirm against its definition.
        args = (outputPath, kernels, kernelSourceFile, kernelHeaderFile, \
                kernelWriterSource, kernelWriterAssembly, \
                kernelsWithBuildErrs, progressBar, kLock, pLock, \
                kiStart, kiStop)
        if cpus:
            t = threading.Thread(target=processKernelSourceChunk, args=args)
            t.start()
            threads.append(t)
        else:
            processKernelSourceChunk(*args)

    for t in threads:
        t.join()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # kernelName is a string, so it must be formatted with %s
            print("*** warning: invalid kernel %s" % kernelName)
        if not mergeFiles:
            kernelSourceFile.close()

        # write kernel.h
        if not mergeFiles:
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged kernel files (source as well as header)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if globalParameters["ShowProgressBar"]:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
        if globalParameters["CodeFromFiles"]:
            solutionHeaderFile.write("#include <unistd.h>\n")

    for solution in solutions:
        if mergeFiles:
            solutionSourceFile.write( \
                solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))
            solutionHeaderFile.write( \
                solutionWriter.getHeaderFileString(solution))
        else:
            solutionFileName = solutionWriter.getSolutionName(solution)

            # write solution.cpp
            with open(os.path.join(outputPath, \
                    "Solutions", solutionFileName+".cpp"), "w") as sourceFile:
                sourceFile.write(CHeader)
                sourceFile.write( \
                    solutionWriter.getSourceFileString(solution, kernelsWithBuildErrs))

            # write solution.h
            with open(os.path.join(outputPath, \
                    "Solutions", solutionFileName+".h"), "w") as headerFile:
                headerFile.write(CHeader)
                headerFile.write( \
                    solutionWriter.getHeaderFileString(solution))

        if globalParameters["ShowProgressBar"]:
            progressBar.increment()

    # close merged solution files; per-solution files are closed above
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
Beispiel #16
0
  def __init__(self, problemType, config):
    """Expand a per-index size description into explicit problem sizes.

    ``config`` holds one entry per index (indices are numbered from 0 up to
    the largest value in problemType["IndexAssignmentsA"]):

      * a list of 1-4 numbers describes a "sized" index and is normalised to
        [first size, stride, stride increment, max size]:
          [s]          -> [s, 1, 0, s]
          [s, m]       -> [s, s, 0, m]
          [s, d, m]    -> [s, d, 0, m]
          [s, d, a, m] -> [s, d, a, m]
        Sizes start at the first size; after each size the stride is added
        to the size and the stride increment to the stride, for as long as
        the size does not exceed the max size.
      * an int describes a "mapped" index that always takes the size of the
        index with that number.

    Missing trailing entries are appended to ``config`` (in place) as 0,
    i.e. mapped to index 0.

    Attributes set: totalIndices, indexMax, indexIsSized, indicesSized,
    indicesMapped, maxNumElements, totalProblemSizes, numProblemSizes,
    problemSizeToIndex, problemIndexToSize and problemSizes (a list with one
    tuple of per-index sizes for every enumerated problem).

    Invalid descriptions are reported through printExit.
    """
    self.totalIndices = 1+max(problemType["IndexAssignmentsA"])
    if len(config) < self.totalIndices:
      config.extend([0] * (self.totalIndices - len(config)))

    self.indexMax = []       # per index: largest size it can take
    self.indexIsSized = []   # per index: True if sized, False if mapped
    self.indicesSized = []   # per sized index: [first, stride, strideInc, max]
    self.indicesMapped = []  # per mapped index: number of the index it copies
    for i in range(0, self.totalIndices):
      dim = deepcopy(config[i])
      if isinstance(dim, list):
        if len(dim) == 1:
          sized = [dim[0], 1, 0, dim[0]]
        elif len(dim) == 2:
          sized = [dim[0], dim[0], 0, dim[1]]
        elif len(dim) == 3:
          sized = [dim[0], dim[1], 0, dim[2]]
        elif len(dim) == 4:
          sized = [dim[0], dim[1], dim[2], dim[3]]
        else:
          printExit("dimension[%u] config (%s) has %u descriptors rather than 1-4."
              % ( i, dim, len(dim) ))
        if sized[0] <= sized[3] and sized[1] <= 0 and sized[2] <= 0:
          # the size would never move past the max, so the counting loop
          # below would spin forever
          printExit("dimension[%u] config (%s) never exceeds its maximum: " \
              "stride and stride increment are both <= 0." % ( i, dim ))
        self.indicesSized.append(sized)
        self.indexIsSized.append(True)
        self.indexMax.append(sized[3])

      elif isinstance(dim, int):
        self.indicesMapped.append(dim)
        self.indexIsSized.append(False)
        # dim is an overall index number (it indexes problemSize in the
        # enumeration below), so look its max up in the per-index list rather
        # than in the sized-only list
        self.indexMax.append(self.indexMax[dim])

      else:
        printExit("dimension[%u] config (%s) must be a list of 1-4 descriptors or an int index."
            % ( i, dim ))

    # max num elements in each tensor: [0] over the first NumIndicesC
    # indices, [1] over IndexAssignmentsA, [2] over IndexAssignmentsB
    self.maxNumElements = [ 1, 1, 1 ]
    for i in range(0, problemType["NumIndicesC"]):
      self.maxNumElements[0] *= self.indexMax[i]
    for i in problemType["IndexAssignmentsA"]:
      self.maxNumElements[1] *= self.indexMax[i]
    for i in problemType["IndexAssignmentsB"]:
      self.maxNumElements[2] *= self.indexMax[i]

    ########################################
    # count problem sizes
    self.totalProblemSizes = 1
    self.numProblemSizes = [] # per index
    self.problemSizeToIndex = []
    self.problemIndexToSize = []
    sizedIdx = 0
    for isSized in self.indexIsSized:
      self.problemSizeToIndex.append({})
      self.problemIndexToSize.append({})
      if isSized:
        firstSize, stride, strideIncrement, maxSize = \
            self.indicesSized[sizedIdx]
        sizedIdx += 1
        numSizes = 0
        currentSize = firstSize
        currentIncrement = stride
        while currentSize <= maxSize:
          currentSize += currentIncrement
          currentIncrement += strideIncrement
          numSizes += 1
      else:
        # a mapped index follows another index and adds no sizes of its own
        numSizes = 1
      self.numProblemSizes.append(numSizes)
      self.totalProblemSizes *= numSizes

    ########################################
    # enumerate problem sizes
    numSized = len(self.indicesSized)
    currentSizes = [sized[0] for sized in self.indicesSized]
    currentIncrements = [sized[1] for sized in self.indicesSized]

    # iterate over all problem sizes; sized index 0 varies fastest
    self.problemSizes = []
    moreProblemSizes = True
    problemSize = [0]*self.totalIndices
    while moreProblemSizes:
      # convert current sized and mapped indices to full sizes
      sizedCursor = 0
      mappedCursor = 0
      for i in range(0, self.totalIndices):
        if self.indexIsSized[i]:
          problemSize[i] = currentSizes[sizedCursor]
          sizedCursor += 1
        else:
          problemSize[i] = problemSize[self.indicesMapped[mappedCursor]]
          mappedCursor += 1
      self.problemSizes.append(tuple(problemSize))

      # increment sizes for next benchmark
      currentSizes[0] += currentIncrements[0]
      currentIncrements[0] += self.indicesSized[0][2]
      for i in range(1, numSized+1):
        # if prior index past max, reset to min and increment next index
        if currentSizes[i-1] > self.indicesSized[i-1][3]:
          # reset prior index
          currentSizes[i-1] = self.indicesSized[i-1][0]
          currentIncrements[i-1] = self.indicesSized[i-1][1]
          # increment next index; running off the last one ends enumeration
          if i >= numSized:
            moreProblemSizes = False
          else:
            currentSizes[i] += currentIncrements[i]
            currentIncrements[i] += self.indicesSized[i][2]
Beispiel #17
0
def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
    """Write the generated kernel and solution source/header files.

    When globalParameters["MergeFiles"] is set, all kernels go into
    Kernels.cpp / Kernels.h and all solutions into Solutions.cpp /
    Solutions.h directly under outputPath.  Otherwise every kernel and
    solution gets its own .cpp/.h pair inside the "Kernels" and "Solutions"
    sub-directories.

    Args:
        outputPath: directory receiving the generated files.
        solutions: iterable of solutions handed to solutionWriter.
        kernels: sequence of kernels; kernel["KernelLanguage"] selects the
            source writer ("Source") or the assembly writer (anything else).
        kernelsBetaOnly: iterable of beta-only kernels, always written with
            kernelWriterSource.
        solutionWriter: provides getSolutionName, getSourceFileString and
            getHeaderFileString.
        kernelWriterSource: kernel writer used for "Source" kernels.
        kernelWriterAssembly: kernel writer used for all other kernels.

    Calls printExit after generation when
    globalParameters["ExitAfterKernelGen"] is set.
    """
    mergeFiles = globalParameters["MergeFiles"]
    showProgressBar = globalParameters["ShowProgressBar"]

    print1("# Writing Kernels")
    if not mergeFiles:
        ensurePath(os.path.join(outputPath, "Solutions"))
        ensurePath(os.path.join(outputPath, "Kernels"))

    if showProgressBar:
        progressBar = ProgressBar(len(kernels))

    ##############################################################################
    # Write Kernels
    ##############################################################################
    if mergeFiles:
        kernelSourceFile = open(os.path.join(outputPath, \
            "Kernels.cpp"), "w")
        kernelHeaderFile = open(os.path.join(outputPath, \
            "Kernels.h"), "w")
        kernelSourceFile.write(CHeader)
        kernelHeaderFile.write(CHeader)
        kernelSourceFile.write("#include \"Kernels.h\"\n")
        kernelHeaderFile.write("#pragma once\n")
        if globalParameters["RuntimeLanguage"] == "HIP":
            kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
            kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
            kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
            kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
        else:
            kernelHeaderFile.write("#include <string>\n")

    # tensor contraction kernels
    for ki, kernel in enumerate(kernels):
        kernelWriter = kernelWriterSource \
            if kernel["KernelLanguage"] == "Source" else kernelWriterAssembly

        # write kernel.cpp (per-kernel file unless merging)
        if not mergeFiles:
            kernelName = kernelWriter.getKernelName(kernel)
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)
        (err, src) = kernelWriter.getSourceFileString(kernel)
        kernelSourceFile.write(src)
        if err:
            print("*** warning: invalid kernel#%u" % ki)

        # write kernel.h (per-kernel file unless merging)
        if not mergeFiles:
            kernelSourceFile.close()
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(kernelWriter.getHeaderFileString(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()
        if showProgressBar:
            progressBar.increment()

    # beta-only kernels
    for kernel in kernelsBetaOnly:
        kernelWriter = kernelWriterSource
        kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

        # write kernel.cpp
        if not mergeFiles:
            kernelSourceFile = open(os.path.join(outputPath, \
                "Kernels", kernelName+".cpp"), "w")
            kernelSourceFile.write(CHeader)

        (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
        kernelSourceFile.write(src)
        if err:
            # identify the kernel by name: the index of the previous loop is
            # unrelated to this kernel (and undefined when kernels is empty)
            print("*** warning: invalid kernel#%s" % kernelName)

        # write kernel.h
        if not mergeFiles:
            kernelSourceFile.close()
            kernelHeaderFile = open(os.path.join(outputPath, \
                "Kernels", kernelName + ".h"), "w")
            kernelHeaderFile.write(CHeader)
        kernelHeaderFile.write(
            kernelWriter.getHeaderFileStringBetaOnly(kernel))
        if not mergeFiles:
            kernelHeaderFile.close()

    # close merged kernel files (both the source and the header)
    if mergeFiles:
        kernelSourceFile.close()
        kernelHeaderFile.close()

    print1("# Writing Solutions")
    if showProgressBar:
        progressBar = ProgressBar(len(solutions))
    ##############################################################################
    # Write Solutions
    ##############################################################################
    if mergeFiles:
        solutionSourceFile = open(os.path.join(outputPath, \
            "Solutions.cpp"), "w")
        solutionHeaderFile = open(os.path.join(outputPath, \
            "Solutions.h"), "w")
        solutionSourceFile.write(CHeader)
        solutionHeaderFile.write(CHeader)
        solutionSourceFile.write("#include \"Solutions.h\"\n")
        solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
        solutionHeaderFile.write("#include \"Kernels.h\"\n")
        solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
        solutionHeaderFile.write("#include \"Tools.h\"\n")
    for solution in solutions:
        # write solution.cpp (per-solution file unless merging)
        if not mergeFiles:
            solutionFileName = solutionWriter.getSolutionName(solution)
            solutionSourceFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".cpp"), "w")
            solutionSourceFile.write(CHeader)
        solutionSourceFile.write( \
            solutionWriter.getSourceFileString(solution))

        # write solution.h (per-solution file unless merging)
        if not mergeFiles:
            solutionSourceFile.close()
            solutionHeaderFile = open(os.path.join(outputPath, \
                "Solutions", solutionFileName+".h"), "w")
            solutionHeaderFile.write(CHeader)
        solutionHeaderFile.write( \
            solutionWriter.getHeaderFileString(solution))
        if not mergeFiles:
            solutionHeaderFile.close()
        if showProgressBar:
            progressBar.increment()

    # close merged solution files; per-solution files were already closed
    # inside the loop
    if mergeFiles:
        solutionSourceFile.close()
        solutionHeaderFile.close()

    if globalParameters["ExitAfterKernelGen"]:
        printExit(
            "** Exiting after kernel generation due to ExitAfterKernelGen=1")
Beispiel #18
0
def writeSolutionsAndKernels(outputPath, problemTypes, solutions, kernels, kernelsBetaOnly, \
    solutionWriter, kernelWriterSource, kernelWriterAssembly):
  """Generate kernel and solution source/header files, optionally in parallel.

  Kernel sources are produced by processKernelSourceChunk, either inline
  (globalParameters["CpuThreads"] == 0) or in child processes that send
  their results back through a pipe; the results are written out by
  processResults.  Beta-only kernels and solutions are written afterwards.

  Args:
    outputPath: directory receiving the generated files.
    problemTypes: iterable of problem types; one solution-pointer typedef
      is emitted into the solution header for each.
    solutions: iterable of solutions; solution["ProblemType"] is passed to
      solutionWriter.getProblemSourceString.
    kernels: sequence of kernels, processed in index chunks.
    kernelsBetaOnly: iterable of beta-only kernels, always written with
      kernelWriterSource.
    solutionWriter: provides getArgList, getSolutionName,
      getProblemSourceString and getHeaderFileString.
    kernelWriterSource: kernel writer forwarded to the chunk workers and
      used for beta-only kernels.
    kernelWriterAssembly: kernel writer forwarded to the chunk workers.

  Calls printExit when a worker process exits with a non-zero code, and
  after generation when globalParameters["ExitAfterKernelGen"] is set.
  """
  start = time.time()
  mergeFiles = globalParameters["MergeFiles"]
  showProgressBar = globalParameters["ShowProgressBar"]

  print1("# Writing Kernels...")
  if not mergeFiles:
    ensurePath(os.path.join(outputPath, "Solutions"))
    ensurePath(os.path.join(outputPath, "Kernels"))

  if showProgressBar:
    progressBar = ProgressBar(len(kernels))

  ##############################################################################
  # Write Kernels
  ##############################################################################
  if mergeFiles:
    kernelSourceFile = open(os.path.join(outputPath, \
        "Kernels.cpp"), "w")
    kernelHeaderFile = open(os.path.join(outputPath, \
        "Kernels.h"), "w")
    kernelSourceFile.write(CHeader)
    kernelHeaderFile.write(CHeader)
    kernelSourceFile.write("#include \"Kernels.h\"\n")
    kernelHeaderFile.write("#pragma once\n")
    if globalParameters["RuntimeLanguage"] == "HIP":
      kernelHeaderFile.write("// Also set env var HCC_ENABLE_PRINTF=1 for printf\n")
      kernelHeaderFile.write("#define HCC_ENABLE_ACCELERATOR_PRINTF\n\n")
      kernelHeaderFile.write("#include <hip/hip_runtime.h>\n")
      kernelHeaderFile.write("#include \"TensileTypes.h\"\n")
      kernelHeaderFile.write("#include \"KernelHeader.h\"\n")
      kernelHeaderFile.write("\n\n")
      kernelHeaderFile.write("__device__ inline int GenDot4(int a, int b, int c) { \n")
      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("  typedef union { int32_t i; char4 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("  typedef struct { int c0:8,c1:8,c2:8,c3:8; } C4I8;\n")
      kernelHeaderFile.write("  typedef union { int32_t i; C4I8 z; } PkInt8x4;\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("  PkInt8x4 va, vb; va.i = a; vb.i = b;\n")

      kernelHeaderFile.write("#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__\n")
      kernelHeaderFile.write("      return amd_mixed_dot(va.z, vb.z, c, true); }\n")
      kernelHeaderFile.write("#else\n")
      kernelHeaderFile.write("      return c + (vb.z.c3*va.z.c3 + vb.z.c2*va.z.c2 + vb.z.c1*va.z.c1 + vb.z.c0*va.z.c0); }\n")
      kernelHeaderFile.write("#endif\n")
      kernelHeaderFile.write("\n\n")
    else:
      kernelHeaderFile.write("#include <string>\n")

  kernelsWithBuildErrs = {}

  prepAsm()

  # number of worker processes: 0 means "run inline in this process"
  if globalParameters["CpuThreads"] == 0:
    cpus = 0
  elif globalParameters["CodeFromFiles"]:
    cpu_count = multiprocessing.cpu_count()
    cpuThreads = globalParameters["CpuThreads"]
    # a negative CpuThreads is a multiplier on the core count
    cpus = cpu_count*abs(cpuThreads) if cpuThreads < 0 \
           else min(cpu_count, cpuThreads)
  else: #! CodeFromFiles is not thread-safe since code merged into same file
    cpus = 1

  # kernels per chunk; floor division keeps this an integer index step
  workPerCpu = max(10, (len(kernels)+cpus-1)//cpus) if cpus else 1

  kiStart = 0
  threads = []
  if cpus:
    print("# Launching kernel compilation processes (cpus=%u kernelsPerCpu=%u)" % (cpus, workPerCpu))
    processLaunchProgressBar = ProgressBar(len(kernels)) if showProgressBar else None
  else:
    print("# Compiling kernels (no multiprocessing, cpus=%u #kernels=%u)" % (cpus, len(kernels)))
    processLaunchProgressBar = None
  while kiStart < len(kernels):
    kiStop = min(len(kernels), kiStart + workPerCpu)
    if cpus:
      parentConn, child = multiprocessing.Pipe()
      args = (kernels, kernelWriterSource, kernelWriterAssembly, \
              kiStart, kiStop, child)
      t = multiprocessing.Process(target=processKernelSourceChunk, args=args)
      t.start()
      child.close() # close child pipe in the parent process
      threads.append([t, kiStart, kiStop, parentConn])
      if processLaunchProgressBar:
        processLaunchProgressBar.increment(kiStop-kiStart)
      else:
        sys.stderr.write("  # launched process %s for kernels %d..%d\n" %(t, kiStart, kiStop-1))

    else: # non-threaded version
      results = processKernelSourceChunk(kernels, kernelWriterSource, kernelWriterAssembly, \
                               kiStart, kiStop, None)
      if showProgressBar:
        progressBar.increment(kiStop-kiStart)
      # NOTE(review): kernelSourceFile/kernelHeaderFile are only bound when
      # MergeFiles is set, so this call fails with an unbound local
      # otherwise - confirm what processResults expects in split-file mode.
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

    kiStart += workPerCpu
  sys.stderr.write("# Waiting for kernel compilation processes...\n")

  someError = 0
  if cpus:
    for (t, kiStart, kiStop, parentConn) in threads:
      try:
        results = parentConn.recv()
      except EOFError as pipeErr:
        print("*** warning: process %s returned pipe EOF %s %s" % (t, t, pipeErr))
        # nothing was received: do not reuse the previous worker's results
        results = []

      t.join()
      e = t.exitcode
      if e != 0:
        print("*** warning: process %s returned %s %s" % (t, t, e))
        someError = 1
        results = []

      if showProgressBar:
        progressBar.increment(kiStop-kiStart)
      # NOTE(review): same unbound-local caveat as the inline path above
      # when MergeFiles is off.
      processResults(results, outputPath, kernelsWithBuildErrs, kernelSourceFile, kernelHeaderFile)

  if someError:
    print("\nKernel compilation failed in one or more subprocesses. May want to set CpuThreads=0 and re-run to make debug easier")
    printExit("** kernel compilation failure **")


  # beta-only kernels
  for kernel in kernelsBetaOnly:
    kernelWriter = kernelWriterSource
    kernelName = kernelWriter.getKernelNameBetaOnly(kernel)

    # write kernel.cpp
    if not mergeFiles:
      kernelSourceFile = open(os.path.join(outputPath, \
          "Kernels", kernelName+".cpp"), "w")
      kernelSourceFile.write(CHeader)

    (err, src) = kernelWriter.getSourceFileStringBetaOnly(kernel)
    kernelSourceFile.write(src)
    if err:
      # kernelName is a string, so it must be formatted with %s
      print("*** warning: invalid kernel#%s" % kernelName)
    if not mergeFiles:
      kernelSourceFile.close()
    # write kernel.h
    if not mergeFiles:
      kernelHeaderFile = open(os.path.join(outputPath, \
          "Kernels", kernelName + ".h"), "w")
      kernelHeaderFile.write(CHeader)
    kernelHeaderFile.write( kernelWriter.getHeaderFileStringBetaOnly(kernel))
    if not mergeFiles:
      kernelHeaderFile.close()

  # close merged kernel files (both the source and the header)
  if mergeFiles:
    kernelSourceFile.close()
    kernelHeaderFile.close()

  stop = time.time()
  print("# Kernel Building elapsed time = %.1f secs" % (stop-start))

  print1("# Writing Solutions")
  if showProgressBar:
    progressBar = ProgressBar(len(solutions))
  ##############################################################################
  # Write Solutions
  ##############################################################################
  if mergeFiles:
    solutionSourceFile = open(os.path.join(outputPath, \
        "Solutions.cpp"), "w")
    solutionHeaderFile = open(os.path.join(outputPath, \
        "Solutions.h"), "w")
    solutionSourceFile.write(CHeader)
    solutionHeaderFile.write(CHeader)
    solutionSourceFile.write("#include \"Solutions.h\"\n")
    solutionSourceFile.write("#include <algorithm>\n")
    solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
    solutionHeaderFile.write("#include \"Kernels.h\"\n")
    solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
    solutionHeaderFile.write("#include \"Tools.h\"\n")
    if globalParameters["CodeFromFiles"]:
      solutionHeaderFile.write("#include <unistd.h>\n")


  # Write a solution pointer typedef for each problemType:
  h = ""
  for problemType in problemTypes:
    argListAll = solutionWriter.getArgList(problemType, True, True, True, True)
    # declare TensileSolutionPointer_ProblemType
    h += "\n// solution pointer\n"
    h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
        % problemType
    for i in range(0, len(argListAll)):
      h += "    %s %s%s" % (argListAll[i][0], argListAll[i][1], ",\n" \
          if i < len(argListAll)-1 else ");\n\n")
    h += "\n"

  # NOTE(review): solutionHeaderFile is only bound here when MergeFiles is
  # set; in split-file mode this write fails - confirm where the typedefs
  # should go in that mode.
  solutionHeaderFile.write(h)

  for solution in solutions:
    # get solution name
    if not mergeFiles:
      solutionFileName = solutionWriter.getSolutionName(solution)

    # write solution.cpp
    if not mergeFiles:
      solutionSourceFile = open(os.path.join(outputPath, \
          "Solutions", solutionFileName+".cpp"), "w")
      solutionSourceFile.write(CHeader)
    solutionSourceFile.write( \
        solutionWriter.getProblemSourceString(solution["ProblemType"], solution, kernelsWithBuildErrs))
    if not mergeFiles:
      solutionSourceFile.close()

    # write solution.h
    if not mergeFiles:
      solutionHeaderFile = open(os.path.join(outputPath, \
          "Solutions", solutionFileName+".h"), "w")
      solutionHeaderFile.write(CHeader)
    solutionHeaderFile.write( \
        solutionWriter.getHeaderFileString(solution))
    if not mergeFiles:
      solutionHeaderFile.close()
    if showProgressBar:
      progressBar.increment()

  # close merged solution files; per-solution files were already closed
  # inside the loop
  if mergeFiles:
    solutionSourceFile.close()
    solutionHeaderFile.close()

  if globalParameters["ExitAfterKernelGen"]:
    printExit("** Exiting after kernel generation due to ExitAfterKernelGen=1")
Beispiel #19
0
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
from Common import print1, print2, printExit, printWarning, versionIsCompatible
from SolutionStructs import Solution, ProblemSizes, ProblemType
from __init__ import __version__

import os
try:
    import yaml
except ImportError:
    printExit(
        "You must install PyYAML to use Tensile (to parse config files). See http://pyyaml.org/wiki/PyYAML for installation instructions."
    )


################################################################################
# Read Benchmark Config from YAML Files
################################################################################
def readConfig(filename):
    """Parse a YAML file and return its contents.

    Args:
        filename: path of the YAML file to read.

    Returns:
        The object produced by yaml.load using yaml.SafeLoader.

    Calls printExit when the file cannot be opened.
    """
    try:
        stream = open(filename, "r")
    except IOError:
        printExit("Cannot open file: %s" % filename)
    # the with-block closes the file even when the YAML parser raises
    with stream:
        config = yaml.load(stream, yaml.SafeLoader)
    return config
Beispiel #20
0
def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \
    problemSizeGroupIdx ):
    """Run every benchmark step for one problem type / problem-size group.

    For each step of the BenchmarkProcess built from the two configs this
    enumerates the candidate solutions (hardcoded parameters x benchmark
    permutations, merged with the winners of earlier steps), writes the
    benchmark sources, runs the generated run script unless results already
    exist, and feeds the results back into the winners dictionary.

    Args:
        problemTypeConfig: problem type configuration passed to
            BenchmarkProcess.
        problemSizeGroupConfig: problem-size group configuration passed to
            BenchmarkProcess.
        problemSizeGroupIdx: index used to name the working directory
            ("<problemType>_<idx>").

    Returns:
        Tuple (resultsFileBaseFinal, benchmarkTestFails): the results file
        base path (no extension) of the final step, or None if no step
        reported isFinal(), and the number of benchmark runs that exited
        with a non-zero return code.

    Elapsed times are measured against the module-level startTime.
    Calls printExit when a step yields no valid solution.
    """

    benchmarkTestFails = 0

    # convert config to full benchmark process (resolves defaults)
    print1("")
    print1(HR)
    print1("# Converting Config to BenchmarkProcess Object")
    print1(HR)
    print1("")
    benchmarkProcess = BenchmarkProcess( problemTypeConfig, \
        problemSizeGroupConfig )

    problemTypeName = str(benchmarkProcess.problemType)
    problemSizeGroupName = "%s_%02u" % (problemTypeName, problemSizeGroupIdx)
    pushWorkingPath(problemSizeGroupName)
    ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))

    totalBenchmarkSteps = len(benchmarkProcess)
    resultsFileBaseFinal = None
    winners = WinningParameterDict()
    print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
    print1("")
    print1(HR)
    print1("# Done Creating BenchmarkProcess Object")
    print1(HR)

    ##############################################################################
    # For Each Benchmark Step
    ##############################################################################
    for benchmarkStepIdx in range(0, totalBenchmarkSteps):

        benchmarkStep = benchmarkProcess[benchmarkStepIdx]
        if winners.winners == {}:
            # perf optimization to skip the initial winners creation
            # this helps a little here but really helps below with avoiding the super-expensive
            # removeHardcoded step below - that can use a fast-path to create
            # winners when needed.
            print1(
                "# Empty winners - use fast initialization of hardcodedParameters"
            )
            resultingHardcodedParameterList = benchmarkStep.hardcodedParameters
        else:
            resultingHardcodedParameterList = \
                winners.wpdUpdate( benchmarkStep.hardcodedParameters )

        benchmarkStep.hardcodedParameters = resultingHardcodedParameterList
        numHardcoded = len(benchmarkStep.hardcodedParameters)
        stepName = str(benchmarkStep)
        shortName = benchmarkStep.abbreviation()
        print1("\n")
        print1(HR)
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("# BenchmarkStep: %s - %s %.3fs" %
               (problemSizeGroupName, stepName, elapsedTime))
        print1("# NumProblems: %u" %
               benchmarkStep.problemSizes.totalProblemSizes)
        print1("# BenchmarkParameters:")
        for paramName in benchmarkStep.benchmarkParameters:
            paramValues = benchmarkStep.benchmarkParameters[paramName]
            printStr = "#     %s = { %s" % (paramName, paramValues[0])
            for paramValueIdx in range(1, len(paramValues)):
                printStr += ", %s" % str(paramValues[paramValueIdx])
            printStr += " }"
            print1(printStr)

        # debug dump, disabled on purpose
        if False:
            # print hardcoded parameters and their winners
            print1("# HardcodedParameters | WinningParameters:")
            paramDictIdx = 0
            hardcodedMinNaming = \
                Solution.getMinNaming(benchmarkStep.hardcodedParameters)
            for paramDict in benchmarkStep.hardcodedParameters:
                winningParameters = winners[paramDict]
                print1("#    (%u) %s | %s" % (paramDictIdx, \
                    Solution.getNameMin(paramDict, hardcodedMinNaming), \
                    Solution.getNameFull(winningParameters) ))
                paramDictIdx += 1
        pushWorkingPath(shortName)

        ############################################################################
        # Copy Files to Benchmark Source Directory
        ############################################################################
        stepBaseDir = globalParameters["WorkingPath"]
        sourceDir = \
          os.path.join(stepBaseDir, "source" )
        ensurePath(sourceDir)
        # files are staged in sourceTmp and only copied to source when they
        # differ (see "Copying files that differ" below)
        pushWorkingPath("sourceTmp")
        filesToCopy = [
            "SolutionMapper.h",
            "Client.cpp",
            "Client.h",
            "CMakeLists.txt",
            "DeviceStats.h",
            "TensorUtils.h",
            "MathTemplates.cpp",
            "MathTemplates.h",
            "TensileTypes.h",
            "tensile_bfloat16.h",
            "KernelHeader.h",
            "ReferenceCPU.h",
            "SolutionHelper.cpp",
            "SolutionHelper.h",
            "Tools.cpp",
            "Tools.h",
        ]

        for f in filesToCopy:
            shutil_copy(os.path.join(globalParameters["SourcePath"], f),
                        globalParameters["WorkingPath"])
        if globalParameters["RuntimeLanguage"] == "OCL":
            shutil_copy(
                os.path.join(globalParameters["SourcePath"],
                             "FindOpenCL.cmake"),
                globalParameters["WorkingPath"])
        else:
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHIP.cmake"),
                globalParameters["WorkingPath"])
            shutil_copy(
                os.path.join(globalParameters["SourcePath"], "FindHCC.cmake"),
                globalParameters["WorkingPath"])

        ############################################################################
        # Enumerate Benchmark Permutations
        ############################################################################
        solutions = []
        totalBenchmarkPermutations = 1
        for benchmarkParamName in benchmarkStep.benchmarkParameters:
            totalBenchmarkPermutations *= len(
                benchmarkStep.benchmarkParameters[benchmarkParamName])
        maxPossibleSolutions = totalBenchmarkPermutations * numHardcoded
        print1("# MaxPossibleSolutions: %u = %u (hardcoded) * %u (benchmark)" % \
            (maxPossibleSolutions, numHardcoded, totalBenchmarkPermutations))

        benchmarkPermutations = []
        for i in range(0, totalBenchmarkPermutations):
            permutation = {}
            # decode i as a mixed-radix number, one digit per parameter
            pIdx = i
            for benchmarkParamName in benchmarkStep.benchmarkParameters:
                benchmarkParamValues = deepcopy( \
                    benchmarkStep.benchmarkParameters[benchmarkParamName])
                valueIdx = pIdx % len(benchmarkParamValues)
                permutation[benchmarkParamName] = benchmarkParamValues[
                    valueIdx]
                # floor division: pIdx must remain an integer list index
                pIdx //= len(benchmarkParamValues)
            benchmarkPermutations.append(permutation)

        ############################################################################
        # Enumerate Solutions = Hardcoded * Benchmark
        ############################################################################
        print1("# Enumerating Solutions")
        if globalParameters["PrintLevel"] >= 1:
            progressBar = ProgressBar(maxPossibleSolutions)
        solutionSet = set()  # avoid duplicates for nlca=-1, 1
        for hardcodedIdx in range(0, numHardcoded):
            solutions.append([])
            hardcodedParamDict = benchmarkStep.hardcodedParameters[
                hardcodedIdx]
            for benchmarkIdx in range(0, len(benchmarkPermutations)):
                benchmarkPermutation = benchmarkPermutations[benchmarkIdx]
                solution = {
                    "ProblemType": deepcopy(benchmarkProcess.problemType.state)
                }
                solution.update(benchmarkPermutation)
                solution.update(hardcodedParamDict)
                if benchmarkStepIdx > 0:
                    winningParameters = winners[hardcodedParamDict]
                    if winningParameters is None:
                        # this is a joined parameter that didn't have a winner, that's okay
                        continue
                    solution.update(winningParameters)

                # append default parameters where necessary
                for initialSolutionParameterName in benchmarkStep.initialSolutionParameters:
                    if initialSolutionParameterName not in solution:
                        solution[initialSolutionParameterName] = \
                            benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
                # TODO check if solution matches problem size for exact tile kernels
                solutionObject = Solution(solution)
                if solutionObject["Valid"]:
                    if solutionObject not in solutionSet:
                        solutionSet.add(solutionObject)
                        solutions[hardcodedIdx].append(solutionObject)
                else:
                    if globalParameters["PrintSolutionRejectionReason"]:
                        print1("rejecting solution %s" % str(solutionObject))
                if globalParameters["PrintLevel"] >= 1:
                    progressBar.increment()

        # remove hardcoded that don't have any valid benchmarks
        removeHardcoded = []
        for hardcodedIdx in range(0, numHardcoded):
            if len(solutions[hardcodedIdx]) == 0:
                hardcodedParamDict = benchmarkStep.hardcodedParameters[
                    hardcodedIdx]
                removeHardcoded.append(hardcodedParamDict)
        removesExist = len(removeHardcoded) > 0
        for hardcodedParam in removeHardcoded:
            benchmarkStep.hardcodedParameters.remove(hardcodedParam)

        if removesExist:
            print1(
                "# Updating winners since enumeration removed unused hardcoded solutions.  removeHardcoded=%u winners=%u"
                % (len(removeHardcoded), len(winners.winners)))
            winners.wpdUpdate(benchmarkStep.hardcodedParameters)
            if globalParameters["PrintLevel"] >= 1:
                print1("")
            numHardcoded = len(benchmarkStep.hardcodedParameters)
            # remove from solution 2D list also
            # (iterate over a copy because the list is modified in the loop)
            for solutionList in shallowcopy(solutions):
                if len(solutionList) == 0:
                    solutions.remove(solutionList)
        elif winners.winners == {}:
            print1("# Populating initial winners (%u solutions)\n" %
                   len(benchmarkStep.hardcodedParameters))
            for hcParm in benchmarkStep.hardcodedParameters:
                winners.winners[FrozenDictionary(hcParm)] = [{}, -1]

        # NOTE(review): len(solutions) counts hardcoded groups, not individual
        # solutions, while maxPossibleSolutions counts solutions - confirm
        # which figure this message is meant to report.
        print1("# Actual Solutions: %u / %u\n" % ( len(solutions), \
            maxPossibleSolutions ))

        # create linear list
        solutionList = []
        for i in range(0, len(solutions)):
            solutionsForHardcoded = solutions[i]
            for j in range(0, len(solutionsForHardcoded)):
                solution = solutionsForHardcoded[j]
                solutionList.append(solution)
        if len(solutionList) == 0:
            msg = "Your parameters resulted in 0 valid solutions."
            if globalParameters["PrintSolutionRejectionReason"]:
                msg += "\nExamine reject and backtrace messages above to see why and where solutions were rejected."
            else:
                msg += "\nYou should re-run with \"PrintSolutionRejectionReason: True\" to see why each parameter combination was rejected."
            printExit(msg)
        if globalParameters["PrintLevel"] >= 1:
            for i in range(0, len(solutions)):
                solutionsForHardcoded = solutions[i]
                for j in range(0, len(solutionsForHardcoded)):
                    solution = solutionsForHardcoded[j]
                    print2("#    (%u:%u) %s" % (i, j, \
                        Solution.getNameFull(solution) ))
            print2(HR)

        # write benchmarkFiles
        writeBenchmarkFiles(stepBaseDir, solutionList, benchmarkStep.problemSizes, \
            shortName, filesToCopy)

        print1("# Copying files that differ from sourceTmp -> source")
        sourceTmp = globalParameters["WorkingPath"]
        files = os.listdir(sourceTmp)
        for f in files:
            f0 = os.path.join(sourceTmp, f)
            f1 = os.path.join(sourceDir, f)
            if os.path.isdir(f0):
                # directories are always replaced wholesale
                if os.path.isdir(f1):
                    shutil.rmtree(f1, True)
                shutil.copytree(f0, f1)
            elif not os.path.exists(f1) or not filecmp.cmp(f0, f1):
                # plain files are copied only when missing or different
                shutil.copy(f0, f1)
        shutil.rmtree(sourceTmp, True)

        popWorkingPath()  # source

        ############################################################################
        # Run Benchmark Script
        ############################################################################
        resultsFileBase = os.path.normpath(os.path.join( \
            globalParameters["WorkingPath"], "../Data", shortName))
        if benchmarkStep.isFinal():
            resultsFileBaseFinal = resultsFileBase
        resultsFileName = resultsFileBase + ".csv"
        solutionsFileName = resultsFileBase + ".yaml"
        if not os.path.exists(resultsFileName) or \
            globalParameters["ForceRedoBenchmarkProblems"]:
            pushWorkingPath("build")

            # write runScript
            libraryLogicPath = None
            path = globalParameters["WorkingPath"]
            forBenchmark = True
            runScriptName = writeRunScript(path, libraryLogicPath,
                                           forBenchmark)

            # run runScript
            process = Popen(runScriptName, cwd=globalParameters["WorkingPath"])
            process.communicate()
            if process.returncode:
                benchmarkTestFails += 1
                printWarning(
                    "BenchmarkProblems: Benchmark Process exited with code %u"
                    % process.returncode)
            popWorkingPath()  # build
        else:
            print1("# Already benchmarked; skipping.")

        ############################################################################
        # Winners -> Determined Parameters
        ############################################################################
        results = getResults(resultsFileName, solutions)
        print2("CSV Results: %s" % results)
        winners.addResults(benchmarkStep.hardcodedParameters, \
            benchmarkPermutations, solutions, results)

        ############################################################################
        # Write Solutions YAML
        ############################################################################
        YAMLIO.writeSolutions(solutionsFileName, benchmarkStep.problemSizes, \
            solutions )

        # End Iteration
        popWorkingPath()  # stepName
        currentTime = time.time()
        elapsedTime = currentTime - startTime
        print1("%s\n# %s\n# %s: End - %.3fs\n%s\n" \
            % (HR, problemSizeGroupName, shortName, elapsedTime, HR))

    popWorkingPath()  # ProblemType
    return (resultsFileBaseFinal, benchmarkTestFails)
Beispiel #21
0
  def assignDerivedParameters(state):
    """Derive index bookkeeping from the A/B index assignments, in place.

    Reads state["IndexAssignmentsA"], state["IndexAssignmentsB"] and
    state["NumIndicesC"], then writes the following keys into `state`:

      TotalIndices        one past the largest index used by A or B
      IndicesFree         C indices (< NumIndicesC) used by exactly one of A/B
      IndicesBatch        C indices (< NumIndicesC) used by both A and B
      IndicesSummation    indices >= NumIndicesC used by both A and B
      NumIndicesFree / NumIndicesBatch / NumIndicesSummation
                          lengths of the three lists above
      IndexUnroll         last entry of IndicesSummation
      IndexUnrollA/B      position of IndexUnroll inside A's / B's assignments
      Index01A/B          first free index listed in A's / B's assignments
      Index0, Index1      the smaller / larger of Index01A and Index01B
      Tensor0/1, TileA/B  0/1 flags recording which operand owns Index0
      TLUA, TLUB          True when the operand's free index is listed before
                          the unroll index in that operand's assignments
      AssignedDerivedParameters
                          False while running, True once finished

    The call is a no-op when state["AssignedDerivedParameters"] is already
    truthy. Invalid configurations are reported through printExit.

    Returns None; all results are stored in `state`.
    """
    if "AssignedDerivedParameters" in state \
        and state["AssignedDerivedParameters"]:
      return
    state["AssignedDerivedParameters"] = False

    assignmentsA = state["IndexAssignmentsA"]
    assignmentsB = state["IndexAssignmentsB"]

    state["TotalIndices"] = max(max(assignmentsA)+1, max(assignmentsB)+1)

    # classify every index as free, batch or summation
    freeIndices = state["IndicesFree"] = []
    batchIndices = state["IndicesBatch"] = []
    summationIndices = state["IndicesSummation"] = []

    # indices of C: shared by A and B -> batch, owned by only one -> free
    for i in range(state["NumIndicesC"]):
      inA = i in assignmentsA
      inB = i in assignmentsB
      if inA and inB:
        batchIndices.append(i)
      elif inA or inB:
        freeIndices.append(i)
      else:
        printExit("invalid index %u" % i)

    # indices beyond C must appear in both A and B -> summation
    for i in range(state["NumIndicesC"], state["TotalIndices"]):
      if i in assignmentsA and i in assignmentsB:
        summationIndices.append(i)
      else:
        printExit("invalid index %u" % i)

    state["NumIndicesFree"] = len(freeIndices)
    state["NumIndicesBatch"] = len(batchIndices)
    state["NumIndicesSummation"] = len(summationIndices)
    if state["NumIndicesFree"] != 2:
      printExit("Tensile can only handle 2 free indices; FreeIndices=%s."%state["IndicesFree"])

    # Without a summation index there is nothing to unroll; report it the same
    # way as the other invalid inputs instead of failing with a bare
    # IndexError on the lookup below.
    if not summationIndices:
      printExit("no summation index found; IndexAssignmentsA=%s IndexAssignmentsB=%s" \
          % (assignmentsA, assignmentsB))

    # by default, unroll index will be the last/inner summation index
    state["IndexUnroll"] = summationIndices[-1]
    # a summation index is in both A and B by construction, so .index() finds it
    unrollIdxA = assignmentsA.index(state["IndexUnroll"])
    unrollIdxB = assignmentsB.index(state["IndexUnroll"])
    state["IndexUnrollA"] = unrollIdxA
    state["IndexUnrollB"] = unrollIdxB

    # assign d0, d1: first free index listed by each operand (-1 if none)
    state["Index01A"] = next((i for i in assignmentsA if i in freeIndices), -1)
    state["Index01B"] = next((i for i in assignmentsB if i in freeIndices), -1)

    # Two free indices may both belong to the same operand; the other operand
    # then keeps the -1 sentinel, which used to surface as a ValueError from
    # .index(-1) further down.
    if state["Index01A"] < 0 or state["Index01B"] < 0:
      printExit("IndexAssignmentsA and IndexAssignmentsB must each contain a free index; FreeIndices=%s." \
          % state["IndicesFree"])

    # whichever has lower stride in C (lower value), is 0, other is 1
    if state["Index01A"] < state["Index01B"]:
      state["Index0"] = state["Index01A"]
      state["Index1"] = state["Index01B"]
      state["Tensor0"] = 0
      state["Tensor1"] = 1
      state["TileA"] = 0
      state["TileB"] = 1
    else:
      state["Index0"] = state["Index01B"]
      state["Index1"] = state["Index01A"]
      state["Tensor0"] = 1
      state["Tensor1"] = 0
      state["TileA"] = 1
      state["TileB"] = 0

    # generalize transpose
    # unrollDimStrideGreaterThanTileDimStrideA = TLUA = !transA = fast
    # !unrollDimStrideLessThanTileDimStrideB   = TLUB =  transB = fast
    strideIdxA = assignmentsA.index(state["Index01A"])
    strideIdxB = assignmentsB.index(state["Index01B"])
    state["TLUA"] = strideIdxA < unrollIdxA
    state["TLUB"] = strideIdxB < unrollIdxB

    state["AssignedDerivedParameters"] = True