def getSourceString(self, solution): kernels = solution.getKernels() kernelNames = [] for kernel in kernels: kernelName = self.kernelWriter.getKernelName(kernel) kernelNames.append(kernelName) s = "" t = "" # includes if not globalParameters["MergeFiles"]: solutionName = self.getSolutionName(solution) s += "#include \"%s.h\"\n" % solutionName #s += "#include \"MathTemplates.h\"\n" s += "\n" # solution function signature s += self.getSolutionSignature(solution) s += " {\n" t += " " s += "%sTensileStatus status;\n" % (t) # hipFunction Struct if solution["KernelLanguage"] == "Assembly": s += "\n" s += "%s/* module function args */\n" % (t) s += "%sstruct {\n" % t t += " " if globalParameters["DebugKernel"]: s += "%sunsigned int *debugBuffer;\n" % t solutionArgs = self.getArgList(solution["ProblemType"], True, False, False) for arg in solutionArgs: if arg[0] == "TensileHalf": s += "%s%s %s[2];\n" % (t, arg[0], arg[1]) else: s += "%s%s %s;\n" % (t, arg[0], arg[1]) if solution["PersistentKernel"]: # pass in the number of groups since not available in WG s += "%sunsigned int numGroupTiles0;\n" % t s += "%sunsigned int numGroupTiles1;\n" % t s += "%sunsigned int pad;\n" % t # FIXME can this be removed? t = t[2:] s += "%s} hipFunctionArgs;\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t #for arg in solutionArgs: # s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1]) # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu # kernels s += "\n%s/* kernels */\n" % (t) s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % ( t, len(kernels)) s += "%sint deviceId;\n" % (t) s += "%shipCtxGetDevice(&deviceId);\n" % (t) s += "%shipDeviceProp_t deviceProperties;\n" % (t) s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t) if solution["KernelLanguage"] == "Source" and globalParameters[ "RuntimeLanguage"] == "OCL": s += "%sconst char *kernelSources[numKernels] = {\n" % (t) t += " " for kernelIdx in range(0, len(kernelNames)): kernelName = kernelNames[kernelIdx] s += "%s%s_src%s\n" % (t, kernelName, \ "," if kernelIdx < len(kernels)-1 else "" ) t = t[2:] s += "%s};\n" % (t) s += "%scl_kernel kernels[numKernels];\n" % (t) s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t) s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernels[i],\n" % (t) s += "%s kernelSources[i],\n" % (t) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) s += "%s}\n" % (t) if solution["GlobalSplitU"] > 1: for beta in solution.getKernelsBetaOnly(): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) s += "%scl_kernel kernel_%s;\n" % (t, kernelName) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernel_%s,\n" % (t, kernelName) s += "%s %s_src,\n" % (t, kernelName) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) elif solution["KernelLanguage"] == "Assembly": localStatic = True kernel = kernels[0] s += "%sint isa = deviceProperties.gcnArch;\n" % (t) s += "%shipFunction_t hipFunction;\n" % (t) kernelName = self.kernelWriter.getKernelName(kernel) s += t if localStatic: s += "%sstatic hipFunction_t *hipFunctions = nullptr;\n" % (t) s += "%sif ( !hipFunctions ) {\n" % ( t ) # not locking here means array might be double allocated and memory leak t += " " s += "%sstatic std::mutex initFunctionsMutex;\n" % (t) s += "%sstd::lock_guard<std::mutex> initFunctionsLock(initFunctionsMutex);\n" % ( t) s += "%sif ( !hipFunctions ) {\n" % ( t ) # not locking here means array might be double allocated and memory leak t += " " s += "%sstatic int numDevices = -1;\n" % (t) s += "%sstatus = hipGetDeviceCount( &numDevices );\n" % (t) s += "%shipFunction_t *tmp = new hipFunction_t[numDevices];\n" % ( t) s += "%sfor ( int i = 0; i < numDevices; i++) {\n" % (t) s += "%s tmp[i] = nullptr;\n" % (t) s += "%s}\n" % (t) s += "%shipFunctions = tmp;\n" % (t) t = t[2:] s += "%s}\n" % (t) t = t[2:] s += "%s}\n" % (t) s += "%sif ( !hipFunctions[deviceId] ) {\n" % (t) t += " " s += "%sstatic std::mutex loadModuleMutex;\n" % (t) s += "%sstd::lock_guard<std::mutex> loadModuleLock(loadModuleMutex);\n" % ( t) s += "%sif (!hipFunctions[deviceId]) {\n" % (t) t += " " s += "%shipModule_t module = nullptr;\n" % (t) s += "%shipModuleLoadData(&module, %s_coba);\n" % (t, kernelName) s += "%shipModuleGetFunction(&hipFunctions[deviceId], module, \"%s\");\n" % ( t, kernelName) t = t[2:] s += "%s}\n" % (t) t = t[2:] s += "%s}\n" % (t) s += "%shipFunction = hipFunctions[deviceId];\n" % (t) else: s += "%stensileGetHipFunctionFromCodeObjectByteArray(\n" % (t) s += "%s &hipFunction,\n" % (t) s += "%s \"%s\",\n" % (t, kernelName) s += "%s %s_coba); // code object byte array\n" % ( t, kernelName) typeName = solution["ProblemType"]["DataType"].toCpp() # index assignments s += "\n%s/* index assignments */\n" % (t) s += "%sconst unsigned int indexD0 = %u;\n" \ % (t, solution["ProblemType"]["Index0"]) s += "%sconst unsigned int indexD1 = %u;\n" \ % (t, solution["ProblemType"]["Index1"]) s += "%sconst unsigned int indexDU = %u;\n" \ % (t, solution["ProblemType"]["IndexUnroll"]) # num enqueues s += "\n%s/* num kernels */\n" % (t) s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t) for i in range(1, len(kernels)): s += ", 1" s += " };\n" # grid size s += "\n%s/* grid sizes */\n" % (t) s += "%sconst unsigned int workDim = 3;\n" % (t) s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \ % (t, solution["ThreadTile0"], solution["ThreadTile1"]) s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \ % (t, solution["SubGroup0"], solution["SubGroup1"]) s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \ % (t, solution["NumThreads"]) s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t) # grid size [2] s += "%sglobalWorkSize[0][2] = 1;\n" % (t) for i in range(0, solution["ProblemType"]["NumIndicesC"]): if i != solution["ProblemType"]["Index0"] and i != solution[ "ProblemType"]["Index1"]: s += "%sglobalWorkSize[0][2] *= size%s;\n" % ( t, self.indexChars[i]) # grid size [0,1] s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ self.indexChars[solution["ProblemType"]["Index0"]]) s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ self.indexChars[solution["ProblemType"]["Index1"]]) s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % ( t) s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % ( t) s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t) s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t) if kernel["EdgeType"] != "None": s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % ( t) s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % ( t) s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % ( t) if kernel["WorkGroupMappingType"] == "Z" and abs( kernel["WorkGroupMapping"]) == 2: s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % ( t) s += "%stotalWorkGroupsPow2--;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t) s += "%stotalWorkGroupsPow2++;\n" % (t) s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t) s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t) if solution["GlobalSplitU"] > 1: s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % ( t, solution["GlobalSplitU"]) if solution["PersistentKernel"]: s += "%sglobalWorkSize[0][0] = deviceProperties.multiProcessorCount * %u;\n" \ % (t, solution["PersistentKernel"]) s += "%sglobalWorkSize[0][1] = 1;\n" % t else: s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % ( t, 0 if kernel["WorkGroupMapping"] > 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "") s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % ( t, 1 if kernel["WorkGroupMapping"] > 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "") # offsets s += "\n%s/* offsets */\n" % (t) s += "%sunsigned int offsets[numKernels][1][3];\n" % (t) for kernelIdx in range(0, len(kernels)): s += "%soffsets[%u][0][0] = offsetC; // tensorC\n" % (t, kernelIdx) s += "%soffsets[%u][0][1] = offsetA; // tensorA\n" % (t, kernelIdx) s += "%soffsets[%u][0][2] = offsetB; // tensorB\n" % (t, kernelIdx) # index sizes s += "\n%s/* index sizes */\n" % (t) s += "%sunsigned int sizes[numKernels][1][%u];\n" \ % (t, solution["ProblemType"]["TotalIndices"]) for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] kernelName = self.kernelWriter.getKernelName(kernel) # free index sizes for i in range(0,solution["ProblemType"]["NumIndicesFree"] \ + solution["ProblemType"]["NumIndicesBatch"] ): s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # summation index sizes for i in range(solution["ProblemType"]["NumIndicesC"], \ solution["ProblemType"]["TotalIndices"] ): lastParam = i == solution["ProblemType"]["TotalIndices"] - 1 s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n" s += "\n" ######################################## # Enqueue Beta-Only Kernel ######################################## if solution["GlobalSplitU"] > 1: kernelNamesBetaOnly = [] numStridesC = solution["ProblemType"]["NumIndicesC"] - \ (0 if solution["ProblemType"]["UseInitialStrides"] else 1) for beta in solution.getKernelsBetaOnly(): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) kernelNamesBetaOnly.append(kernelName) s += "%s// enqueue Beta-Only kernel\n" % (t) # grid sizes s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t) s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t) #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ # self.indexChars[solution["ProblemType"]["Index0"]]) #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ # self.indexChars[solution["ProblemType"]["Index1"]]) s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % ( t) s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % ( t) s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % ( t) s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % ( t) s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % ( t) s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % ( t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % ( t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t) for i in range(0, solution["ProblemType"]["NumIndicesC"]): if i != solution["ProblemType"]["Index0"] and i != solution[ "ProblemType"]["Index1"]: s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % ( t, self.indexChars[i]) if solution["ProblemType"]["UseBeta"]: s += "%sbool betaZero = beta == 0;\n" % (t) if self.language == "OCL": if solution["ProblemType"]["UseBeta"]: s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1]) else: #s += "%sbool betaZero = true;\n" % (t) s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0]) argIdx = 0 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &offsetC ); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 # strides for i in range(0, numStridesC): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.strideList[i]) argIdx += 1 # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[i]) argIdx += 1 # beta if solution["ProblemType"]["UseBeta"]: s += "%sif (!betaZero) {\n" % (t) s += "%s status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % ( t, argIdx, typeName) argIdx += 1 s += "%s}\n" % (t) # enqueue s += "%scl_event kernelEventBetaOnly;\n" % (t) s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernelBetaOnly,\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSizeBetaOnly,\n" % (t) s += "%slocalWorkSizeBetaOnly,\n" % (t) s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) #s += "%soutputEvent );\n" % (t) s += "%s&kernelEventBetaOnly );\n" % (t) t = t[2:] s += "%stensileStatusCheck(status);\n" % (t) if solution["ProblemType"]["UseBeta"]: s += "%sbeta = %s;\n" % ( t, solution["ProblemType"]["DataType"].zeroString( self.language, 1)) #s += "%sreturn tensileStatusSuccess;\n" % (t) s += "%sstatus = clFinish(stream);\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) #s += " float tmp[128*128];\n" #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n" #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n" else: s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[0], stream );\n" % (t) t += " " s += "%stry {\n" % (t) if solution["ProblemType"]["UseBeta"]: s += "%sif (betaZero) {\n" % (t) t += " " s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % ( t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%soffsetC,\n" % (t) # strides for i in range(0, numStridesC): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%ssize%s%s" % ( t, self.indexChars[i], ",\n" if i < solution["ProblemType"]["NumIndicesC"] - 1 else ");\n") if solution["ProblemType"]["UseBeta"]: s += "%s} else {\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % ( t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%soffsetC,\n" % (t) # strides for i in range(0, numStridesC): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%ssize%s,\n" % (t, self.indexChars[i]) s += "%sbeta);\n" % (t) s += "%s}\n" % (t) s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) ######################################## # Enqueue Kernels ######################################## for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] if kernel["KernelLanguage"] == "Source": kernel["ISA"] = ( 0, 0, 0) # HIP source kernels needs dummy ISA version kernelName = self.kernelWriter.getKernelName(kernel) s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName) s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx) if self.language == "OCL": # set kernel args same for all enqueues s += "%s// kernel args same for all enqueues\n" % (t) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % ( t, 0) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % ( t, 1) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % ( t, 2) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % ( t, 3, typeName) s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \ "" if solution["ProblemType"]["UseBeta"] else "//", 4, typeName) argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4 argIdx += 3 # skipping offsets here for stride in self.strideList: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % ( t, argIdx, stride) argIdx += 1 for sizeIdx in range(0, solution["ProblemType"]["TotalIndices"]): if sizeIdx not in [ solution["ProblemType"]["Index0"], solution["ProblemType"]["Index1"], solution["ProblemType"]["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % ( t, kernelIdx) t += " " # debug print kernel dimensions if globalParameters["LibraryPrintDebug"]: s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % ( t, kernelName) # debug print kernel arguments # offsets for i in range(0, 3): s += "%sprintf(\" offset[%u] = %%u\\n\", offsets[kernelIdx][enqueueIdx][%u]);\n" % ( t, i, i) # strides for stride in self.strideList: s += "%sprintf(\" %s = %%u\\n\", %s);\n" % (t, stride, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): s += "%sprintf(\" sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % ( t, i, i) ######################################## # OpenCL Runtime ######################################## if self.language == "OCL": # set kernel args different for all enqueues argIdx = 5 if solution["ProblemType"]["UseBeta"] else 4 # offsets s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][0]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][1]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &offsets[kernelIdx][enqueueIdx][2]); tensileStatusCheck(status);\n" % ( t, argIdx) argIdx += 1 argIdx += len(self.strideList) # sizes for sizeIdx in range(0, solution["ProblemType"]["TotalIndices"]): if sizeIdx in [ solution["ProblemType"]["Index0"], solution["ProblemType"]["Index1"], solution["ProblemType"]["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % ( t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 # enqueue s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernels[kernelIdx],\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSize[kernelIdx],\n" % (t) s += "%slocalWorkSize,\n" % (t) if False: # solution["GlobalSplitU"] > 1: s += "%s1,\n" % (t) s += "%s&kernelEventBetaOnly,\n" % (t) else: s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) s += "%soutputEvent );\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) ######################################## # HIP Runtime ######################################## else: if not globalParameters["PreciseKernelTime"] or solution[ "KernelLanguage"] == "Source": s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % ( t) t = t[2:] s += "%stry {\n" % (t) t += " " # hip kernel if solution["KernelLanguage"] == "Source": s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName) s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % ( t) s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % ( t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataC,\n" % (t) s += "%sdataA,\n" % (t) s += "%sdataB,\n" % (t) s += "%salpha,\n" % (t) s += "%s%sbeta,\n" % (t, \ "" if solution["ProblemType"]["UseBeta"] else "//") s += "%soffsets[kernelIdx][enqueueIdx][0],\n" % (t) s += "%soffsets[kernelIdx][enqueueIdx][1],\n" % (t) s += "%soffsets[kernelIdx][enqueueIdx][2],\n" % (t) # strides for stride in self.strideList: s += "%s%s,\n" % (t, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): lastParam = i == solution["ProblemType"][ "TotalIndices"] - 1 s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \ % (t, i, "" if lastParam else "," ) if solution["PersistentKernel"]: s += "%s,totalWorkGroups%u\n" % ( t, 0 if kernel["WorkGroupMapping"] > 0 else 1) s += "%s,totalWorkGroups%u\n" % ( t, 1 if kernel["WorkGroupMapping"] > 0 else 0) s += "%s);\n" % (t) # assembly kernel else: if globalParameters["DebugKernel"]: s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % ( t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % ( t) s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % ( t) s += "%s printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % ( t) s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % ( t) s += "%shipDevice_t device;\n" % t s += "%shipDeviceGet(&device, 0);\n" % t s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % ( t) s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % ( t) s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % ( t) s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % ( t) # hip assembly function s += "%shipFunctionArgs.dataC = dataC;\n" % (t) s += "%shipFunctionArgs.dataA = dataA;\n" % (t) s += "%shipFunctionArgs.dataB = dataB;\n" % (t) if solution["ProblemType"]["DataType"].isHalf(): s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t) s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t) else: s += "%shipFunctionArgs.alpha = alpha;\n" % (t) if solution["ProblemType"]["UseBeta"]: if solution["ProblemType"]["DataType"].isHalf(): s += "%shipFunctionArgs.beta[0] = beta;\n" % (t) s += "%shipFunctionArgs.beta[1] = beta;\n" % (t) else: s += "%shipFunctionArgs.beta = beta;\n" % (t) s += "%shipFunctionArgs.offsetC = offsets[kernelIdx][enqueueIdx][0];\n" % ( t) s += "%shipFunctionArgs.offsetA = offsets[kernelIdx][enqueueIdx][1];\n" % ( t) s += "%shipFunctionArgs.offsetB = offsets[kernelIdx][enqueueIdx][2];\n" % ( t) # strides for stride in self.strideList: s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride) # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): lastParam = i == solution["ProblemType"][ "TotalIndices"] - 1 s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \ % (t, globalParameters["IndexChars"][i], i ) if solution["PersistentKernel"]: # pass in the number of groups since not available in WG s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % ( t) s += "%shipFunctionArgs.numGroupTiles1 = totalWorkGroups1;\n" % ( t) s += "%shipHccModuleLaunchKernel(\n" % (t) t += " " s += "%shipFunction,\n" % (t) s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % ( t) s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % ( t) s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % ( t) s += "%slocalWorkSize[0],\n" % (t) s += "%slocalWorkSize[1],\n" % (t) s += "%slocalWorkSize[2],\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sNULL,\n" % (t) s += "%s(void**)hipLaunchParams\n" % (t) if globalParameters["PreciseKernelTime"]: s += "%s,inputEvents ? inputEvents[enqueueIdx]:nullptr\n" % ( t) s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % ( t) s += "%s);\n" % (t) t = t[2:] if globalParameters["DebugKernel"]: # copy debug buffer s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % ( t) s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % ( t) s += "%s printf(\"%%04i\", i);\n" % (t) s += "%s char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % ( t) #s += "%s char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t) s += "%s for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % ( t) s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % ( t) s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % ( t) s += "%s }\n" % (t) s += "%s printf(\"\\n\");\n" % (t) s += "%s}\n" % (t) t = t[2:] s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) if not globalParameters["PreciseKernelTime"] or solution[ "KernelLanguage"] == "Source": s += "%sif( outputEvent != NULL )\n" % (t) s += "%s hipEventRecord(outputEvent[enqueueIdx], stream );\n" % ( t) s += " }\n" s += "\n" s += " return tensileStatusSuccess;\n" s += "}\n" s += "\n" s += "/* Solution Parameters\n" s += Solution.getParametersIndented(solution.state, " ") s += "*/\n" s += "\n" return s
def getProblemSourceString(self, problemType, solution, kernelsWithBuildErrs): gsu = solution["GlobalSplitU"] persistent = solution["PersistentKernel"] kernelLanguage = solution["KernelLanguage"] tt0 = solution["ThreadTile0"] tt1 = solution["ThreadTile1"] sg0 = solution["SubGroup0"] sg1 = solution["SubGroup1"] nt = solution["NumThreads"] kernels = solution.getKernels() kernelNames = [] kernelBuildErr = 0 for kernel in kernels: kernelName = self.kernelWriter.getKernelName(kernel) if kernelName in kernelsWithBuildErrs: kernelBuildErr = 1 kernelNames.append( kernelName ) s = "" t = "" # includes problemType = solution["ProblemType"] # shortcut if not globalParameters["MergeFiles"]: solutionName = self.getSolutionName(solution) s += "#include \"%s.h\"\n" % solutionName s += "\n" # problem function signature #argList = self.getArgList(problemType, True, True, True, True) #for i in range(0, len(argList)): # argString = "%s %s" % argList[i] # s += "%s%s%s" % (t, argString, ",\n" if i < len(argList)-1 else ")" ) s += self.getSolutionSignature(solution) s += " {\n" if kernelBuildErr: s += "%s return tensileStatusFailure; // One or more kernels had build failures (%s)\n" % (t, kernelNames) s += "%s}\n" % (t) return s t += " " s += "%sTensileStatus status;\n" % (t) # hipFunction Struct if kernelLanguage == "Assembly": s += "\n" s += "%s/* module function args */\n" % (t) s += "%sstruct {\n" % t t += " " if globalParameters["DebugKernel"]: s += "%sunsigned int *debugBuffer;\n" % t # Tensor sizes in elements, including only packed dims, # and accounting for zero or other strides < size # Place these first in the structure since they are 64-bits # and need to avoid any unneeded padding: s += "%s// Size of Tensor's packed dims, in elements\n" % t s += "%suint64_t tensor2dSizeC;\n" % t s += "%suint64_t tensor2dSizeA;\n" % t s += "%suint64_t tensor2dSizeB;\n" % t solutionArgs = self.getArgList(problemType, False, True, False, False) for arg in solutionArgs: if arg[0] == "TensileHalf": s += "%s%s %s[2];\n" % (t, arg[0], arg[1]) else: s += "%s%s %s;\n" % (t, arg[0], arg[1]) for idxChar in solution["PackedC0Indices"][:-1]: s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar) s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sunsigned magicNumberSize%s;\n" % (t, idxChar) s += "%sunsigned magicShiftSize%s;\n" % (t, idxChar) # number of unroll loop iterations to stagger the start in "U" dim. s += "%sint staggerUIter;\n" % t # persistent s += "%sunsigned int problemNumGroupTiles0;\n" % t s += "%sunsigned int problemNumGroupTiles1;\n" % t s += "%sunsigned int magicNumberProblemNumGroupTiles0;\n" % t s += "%sunsigned int gridNumWorkGroups0;\n" % t s += "%sunsigned int numFullBlocks;\n" % t s += "%sunsigned int wgmRemainder1;\n" % t s += "%sunsigned int magicNumberWgmRemainder1;\n" % t s += "%sunsigned int pad;\n" % t # FIXME can this be removed? t = t[2:] s += "%s} hipFunctionArgs;\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t s += "%ssize_t hipFunctionArgsSize = sizeof(hipFunctionArgs);\n" % t s += "%svoid *hipLaunchParams[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &hipFunctionArgs, HIP_LAUNCH_PARAM_BUFFER_SIZE, &hipFunctionArgsSize, HIP_LAUNCH_PARAM_END};\n" % t #s += "%sprintf(\"size: %%lu\\n\", sizeof(unsigned int));\n" % t #s += "%sprintf(\"hipFunctionArgsSize: %%lu\\n\", sizeof(hipFunctionArgs));\n" % t #for arg in solutionArgs: # s += "%sprintf(\"%s: %%lu\\n\", static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)) - static_cast<char*>(static_cast<void*>(&hipFunctionArgs.%s)));\n" % (t, arg[1], arg[1], solutionArgs[0][1]) # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu if globalParameters["RuntimeLanguage"] == "HIP": s += "%sint deviceId;\n" % (t) s += "%shipGetDevice(&deviceId);\n" % (t) # kernels s += "\n%s/* kernels */\n" % (t) s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels)) if kernelLanguage == "Source" and globalParameters["RuntimeLanguage"] == "OCL": s += "%sconst char *kernelSources[numKernels] = {\n" % (t) t += " " for kernelIdx in range(0, len(kernelNames)): kernelName = kernelNames[kernelIdx] s += "%s%s_src%s\n" % (t, kernelName, \ "," if kernelIdx < len(kernels)-1 else "" ) t = t[2:] s += "%s};\n" % (t) s += "%scl_kernel kernels[numKernels];\n" % (t) s += "%sconst char *buildOptions = \"-cl-std=cl2.0\";\n" % (t) s += "%sfor (unsigned int i = 0; i < numKernels; i++) {\n" % (t) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernels[i],\n" % (t) s += "%s kernelSources[i],\n" % (t) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) s += "%s}\n" % (t) if gsu > 1: for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) s += "%scl_kernel kernel_%s;\n" % (t, kernelName) s += "%s tensileGetCompiledOpenCLKernel(\n" % (t) s += "%s &kernel_%s,\n" % (t, kernelName) s += "%s %s_src,\n" % (t, kernelName) s += "%s stream,\n" % (t) s += "%s buildOptions);\n" % (t) elif kernelLanguage == "Assembly": kernel = kernels[0] s += "%shipFunction_t hipFunction;\n" % (t) # if !CodeFromFiles then pass global _coba that points to code object s += "%sstatus = solutionLock->getFunction(&hipFunction, deviceId, \"%s\", %s);;\n" \ % (t, kernelName, "nullptr" if globalParameters["CodeFromFiles"] else kernelName+"_coba" ) s += "%sif (status) return status;\n" % (t) typeName = problemType["DataType"].toCpp() # num enqueues s += "\n%s/* num kernels */\n" % (t) s += "%sunsigned int numEnqueues[numKernels] = { 1" % (t) for i in range(1, len(kernels)): s += ", 1" s += " };\n" # grid size s += "\n%s/* grid sizes */\n" % (t) s += "%sconst unsigned int workDim = 3;\n" % (t) s += "%sconst unsigned int threadTile[2] = { %u, %u };\n" \ % (t, tt0, tt1) s += "%sconst unsigned int groupSize[2] = { %u, %u };\n" \ % (t, sg0, sg1) s += "%ssize_t localWorkSize[3] = { %3u, 1, 1 };\n" \ % (t, nt) s += "%ssize_t globalWorkSize[numKernels][3];\n" % (t) # grid size [2] s += "%sglobalWorkSize[0][2] = 1;\n" % (t) for i in range(0, problemType["NumIndicesC"]): if i != problemType["Index0"] and i != problemType["Index1"]: s += "%sglobalWorkSize[0][2] *= size%s;\n" % (t, self.indexChars[i]) s += "%sunsigned int sizeOfC0 = " % (t) s += " * ".join(["size" + i for i in solution["PackedC0Indices"]]) s += ";\n" s += "%sunsigned int sizeOfC1 = " % (t) s += " * ".join(["size" + i for i in solution["PackedC1Indices"]]) s += ";\n" for idxChar in solution["PackedC0Indices"][:-1]: s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar) s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \ % (t, idxChar, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sunsigned magicShiftSize%s = 33; // bozo, review\n" % (t, idxChar) s += "%sunsigned magicNumberSize%s = (1L<<magicShiftSize%s) / size%s + 1; // bozo, review\n" \ % (t, idxChar, idxChar, idxChar) s += "%sunsigned int macroTile0 = static_cast<unsigned int>(groupSize[0] * threadTile[0]);\n" % (t) s += "%sunsigned int macroTile1 = static_cast<unsigned int>(groupSize[1] * threadTile[1]);\n" % (t) s += "%sunsigned int totalWorkGroups0 = sizeOfC0 / macroTile0;\n" % (t) s += "%sunsigned int totalWorkGroups1 = sizeOfC1 / macroTile1;\n" % (t) if kernel["EdgeType"] != "None": s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t) s += "%sif (totalWorkGroups0*macroTile0 < sizeOfC0) { totalWorkGroups0++; }\n" % (t) s += "%sif (totalWorkGroups1*macroTile1 < sizeOfC1) { totalWorkGroups1++; }\n" % (t) if kernel["WorkGroupMappingType"] == "Z" and abs(kernel["WorkGroupMapping"]) == 2: s += "%sunsigned int totalWorkGroupsPow2 = totalWorkGroups0 > totalWorkGroups1 ? totalWorkGroups0 : totalWorkGroups1;\n" % (t) s += "%stotalWorkGroupsPow2--;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 1;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 2;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 4;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 8;\n" % (t) s += "%stotalWorkGroupsPow2 |= totalWorkGroupsPow2 >> 16;\n" % (t) s += "%stotalWorkGroupsPow2++;\n" % (t) s += "%stotalWorkGroups0 = totalWorkGroupsPow2;\n" % (t) s += "%stotalWorkGroups1 = totalWorkGroupsPow2;\n" % (t) # persistent: s += "%sunsigned int problemNumGroupTiles0 = totalWorkGroups0;\n" % (t) s += "%sunsigned int problemNumGroupTiles1 = totalWorkGroups1;\n" % (t) s += "%sconst unsigned smallNumMagicShift = 31; // bozo, review\n" % (t) s += "%sunsigned magicNumberProblemNumGroupTiles0 = (1L<<smallNumMagicShift) / problemNumGroupTiles0 + 1; // bozo, review\n" % (t) s += "%sunsigned numFullBlocks = problemNumGroupTiles1 / %u; // divide by WorkGroupMapping\n" \ % (t, abs(kernel["WorkGroupMapping"]) if abs(kernel["WorkGroupMapping"])>0 else 1) s += "%sunsigned wgmRemainder1 = %u ? (problemNumGroupTiles1 %% %u) : 0;\n" % \ (t, abs(kernel["WorkGroupMapping"]), abs(kernel["WorkGroupMapping"])) s += "%sif (wgmRemainder1 == 0) wgmRemainder1 = %u;\n" % (t, abs(kernel["WorkGroupMapping"])) s += "%sunsigned magicNumberWgmRemainder1 = ((1L<<smallNumMagicShift) / wgmRemainder1 + 1);\n" % (t) #s += ' printf ("wgmRemainder1=%u \\n", wgmRemainder1);' if gsu> 1: s += "%stotalWorkGroups1 *= %u; // GlobalSplitU\n" % (t, gsu) if persistent: s += "%shipDeviceProp_t deviceProperties;\n" % (t) # TODO - should cache the device properties - expensive to call on each iteration here: s += "%shipGetDeviceProperties( &deviceProperties, deviceId );\n" % (t) s += "%sunsigned int numGroups = totalWorkGroups0 * totalWorkGroups1;\n" % (t) s += "%sglobalWorkSize[0][0] = (deviceProperties.multiProcessorCount * %u < numGroups) ? (deviceProperties.multiProcessorCount * %u) : numGroups;\n" \ % (t, persistent, persistent) s += "%sglobalWorkSize[0][1] = 1;\n" % t else: s += "%sglobalWorkSize[0][0] = totalWorkGroups%u%s;\n" % (t, 0 if kernel["WorkGroupMapping"] >= 0 else 1, "*localWorkSize[0]" if self.language == "OCL" else "") s += "%sglobalWorkSize[0][1] = totalWorkGroups%u%s;\n" % (t, 1 if kernel["WorkGroupMapping"] >= 0 else 0, "*localWorkSize[1]" if self.language == "OCL" else "") # index sizes s += "\n%s/* index sizes */\n" % (t) s += "%sunsigned int sizes[numKernels][1][%u];\n" \ % (t, problemType["TotalIndices"]) for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] kernelName = self.kernelWriter.getKernelName(kernel) # free index sizes for i in range(0,problemType["NumIndicesFree"] \ + problemType["NumIndicesBatch"] ): s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # summation index sizes for i in range(problemType["NumIndicesC"], \ problemType["TotalIndices"] ): lastParam = i == problemType["TotalIndices"]-1 s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0 #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"] firstStride = 0 if problemType["UseInitialStrides"] else 1 del i numIdx = problemType["NumIndicesC"] printMe = 0 s += "%suint64_t tensor2dSizeC = %s" % \ (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0])) for idx in range(0,numIdx): # Multiply only by packed tensor dims if idx in problemType["IndicesFree"]: printMe = True else: printMe = False if printMe: if idx+1 < numIdx: strideIdx = idx+1 s += " * std::max(size%s, strideC%u%s)" % \ (self.indexChars[idx], idx+1, self.indexChars[strideIdx]) else: s += " * size%s" % (self.indexChars[idx]) s += ";\n" s += "%suint64_t tensor2dSizeA = 1;\n" % t numIdx = len(problemType["IndexAssignmentsA"]) printMe = printedSum = False for i in range(0,numIdx): idx = problemType["IndexAssignmentsA"][i] # Multiply only by first free and first summation if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC0Indices"]]: printMe = True elif idx in problemType["IndicesSummation"] and not printedSum: printMe = printedSum = True else: printMe = False if printMe: s += "%stensor2dSizeA = " % t if i+1 < numIdx: strideIdx = problemType["IndexAssignmentsA"][i+1] s += "std::max(tensor2dSizeA*size%s, (uint64_t)strideA%u%s);\n" \ % (self.indexChars[idx], i+1, self.indexChars[strideIdx]) else: s += " tensor2dSizeA * size%s" % (self.indexChars[idx]) s += ";\n" s += "%suint64_t tensor2dSizeB = 1;\n" % t numIdx = len(problemType["IndexAssignmentsB"]) printMe = printedSum = False for i in range(0,numIdx): idx = problemType["IndexAssignmentsB"][i] # Multiply only by first free and first summation if idx in [ord(x)-ord(globalParameters["IndexChars"][0]) for x in solution["PackedC1Indices"]]: printMe = True elif idx in problemType["IndicesSummation"] and not printedSum: printMe = printedSum = True else: printMe = False if printMe: s += "%stensor2dSizeB = " % t if i+1 < numIdx: strideIdx = problemType["IndexAssignmentsB"][i+1] s += "std::max(tensor2dSizeB*size%s, (uint64_t)strideB%u%s);\n" \ % (self.indexChars[idx], i+1, self.indexChars[strideIdx]) else: s += " tensor2dSizeB * size%s" % (self.indexChars[idx]) s += ";\n" unrollChar = globalParameters["IndexChars"][problemType["IndexUnroll"]] s += " unsigned int staggerUIter = %s; // how many stride-sized clicks to stagger start offset\n" \ % (solution["StaggerU"]) s += " int unrollLoopIters = size%s/%u/%u; // /DepthU/GSU\n" % (unrollChar, solution["DepthU"], gsu) s += " while (staggerUIter>1) {\n" s += " if (unrollLoopIters >= (staggerUIter*%u)) {\n" % (1<<solution["_staggerStrideShift"]) s += " break;}\n" s += " staggerUIter /= 2; // step down to smaller stagger\n" s += " }\n" s += " if (staggerUIter>=1) staggerUIter -= 1;\n" # convert to a mask #s += ' printf ("size%s=%%u StaggerU=%s unrollLoopIters=%%u, staggerUIter=%%d\\n", size%s, unrollLoopIters, staggerUIter);\n' % (unrollChar, solution["StaggerU"], unrollChar) #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n" s += "\n" s += "%sint kernelsLaunched=0;\n" % (t) ######################################## # Enqueue Beta-Only Kernel ######################################## if gsu > 1: kernelNamesBetaOnly = [] numStridesC = problemType["NumIndicesC"] - \ (0 if problemType["UseInitialStrides"] else 1) for beta in Solution.getKernelsBetaOnlyFromProblem(problemType, gsu): kernelName = self.kernelWriter.getKernelNameBetaOnly(beta) kernelNamesBetaOnly.append(kernelName) s += "%s// enqueue Beta-Only kernel\n" % (t) # grid sizes s += "%ssize_t localWorkSizeBetaOnly[3] = { 8, 8, 1};\n" % (t) s += "%ssize_t globalWorkSizeBetaOnly[3];\n" % (t) #s += "%sunsigned int sizeOfC0 = size%s;\n" % (t, \ # self.indexChars[problemType["Index0"]]) #s += "%sunsigned int sizeOfC1 = size%s;\n" % (t, \ # self.indexChars[problemType["Index1"]]) s += "%ssize_t totalWorkGroupsBetaOnly0 = sizeOfC0 / localWorkSizeBetaOnly[0];\n" % (t) s += "%ssize_t totalWorkGroupsBetaOnly1 = sizeOfC1 / localWorkSizeBetaOnly[1];\n" % (t) s += "%s// b/c single kernel, add extra work-group here if edge needed\n" % (t) s += "%sif (totalWorkGroupsBetaOnly0*localWorkSizeBetaOnly[0] < sizeOfC0) { totalWorkGroupsBetaOnly0++; }\n" % (t) s += "%sif (totalWorkGroupsBetaOnly1*localWorkSizeBetaOnly[1] < sizeOfC1) { totalWorkGroupsBetaOnly1++; }\n" % (t) s += "%sglobalWorkSizeBetaOnly[0] = totalWorkGroupsBetaOnly0%s;\n" % (t, "*localWorkSizeBetaOnly[0]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[1] = totalWorkGroupsBetaOnly1%s;\n" % (t, "*localWorkSizeBetaOnly[1]" if self.language == "OCL" else "") s += "%sglobalWorkSizeBetaOnly[2] = 1;\n" % (t) for i in range(0, problemType["NumIndicesC"]): if i != problemType["Index0"] and i != problemType["Index1"]: s += "%sglobalWorkSizeBetaOnly[2] *= size%s;\n" % (t, self.indexChars[i]) if problemType["UseBeta"]: s += "%sbool betaZero = beta == 0;\n" % (t) if self.language == "OCL": if problemType["UseBeta"]: s += "%scl_kernel kernelBetaOnly = betaZero ? kernel_%s : kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0], kernelNamesBetaOnly[1]) else: #s += "%sbool betaZero = true;\n" % (t) s += "%scl_kernel kernelBetaOnly = kernel_%s;\n" \ % (t, kernelNamesBetaOnly[0]) argIdx = 0 s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, argIdx); argIdx+=1 # strides for i in range(0,numStridesC): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.strideList[i]); argIdx+=1 # sizes for i in range(0, problemType["NumIndicesC"]): s += "%sstatus = clSetKernelArg( kernelBetaOnly, %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[i]); argIdx+=1 # beta if problemType["UseBeta"]: s += "%sif (!betaZero) {\n" % (t) s += "%s status = clSetKernelArg( kernelBetaOnly, %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, argIdx, typeName); argIdx+=1 s += "%s}\n" % (t) # enqueue s += "%scl_event kernelEventBetaOnly;\n" % (t) s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernelBetaOnly,\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSizeBetaOnly,\n" % (t) s += "%slocalWorkSizeBetaOnly,\n" % (t) s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) #s += "%soutputEvent );\n" % (t) s += "%s&kernelEventBetaOnly );\n" % (t) t = t[2:] s += "%stensileStatusCheck(status);\n" % (t) if problemType["UseBeta"]: s += "%sbeta = %s;\n" % (t, problemType["DataType"].zeroString(self.language, 1) ) #s += "%sreturn tensileStatusSuccess;\n" % (t) s += "%sstatus = clFinish(stream);\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) #s += " float tmp[128*128];\n" #s += "clEnqueueReadBuffer(stream, dataC, CL_TRUE, 0, 128*128*sizeof(float), tmp, 0, NULL, NULL);\n" #s += "for (unsigned int i = 0; i < 128*128; i++) { printf(\"%f\\n\", tmp[i]); }\n" else: s += "%stry {\n" % (t) # TODO - timing with beta kernels is somewhat pessimistic since it has this separate event only on the GSU path. # Introduces 2-3us of overhead ; may want to disable PreciseKernelTime so non-GSU have same overhead. # Long-term fix would be to launch the beta kernel with the hipHccModule* API and set start-event in that call if problemType["UseBeta"]: s += "%sif (betaZero) {\n" % (t) t += " " s += "%sif( inputEvents != NULL )\n" % (t) s += "%s hipEventRecord(inputEvents[0], stream );\n" % (t) s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[0]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) # strides for i in range(0,numStridesC*2): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, problemType["NumIndicesC"]): s += "%ssize%s%s" % (t, self.indexChars[i], ",\n" if i < problemType["NumIndicesC"]-1 else ");\n") if problemType["UseBeta"]: s += "%s} else {\n" % (t) t = t[:-2] s += "%sif( inputEvents != NULL )\n" % (t) s += "%s hipEventRecord(inputEvents[0], stream );\n" % (t) s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelNamesBetaOnly[1]) s += "%sdim3(globalWorkSizeBetaOnly[0], globalWorkSizeBetaOnly[1], globalWorkSizeBetaOnly[2]),\n" % (t) s += "%sdim3(localWorkSizeBetaOnly[0], localWorkSizeBetaOnly[1], localWorkSizeBetaOnly[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) # strides for i in range(0,numStridesC*2): s += "%s%s,\n" % (t, self.strideList[i]) # sizes for i in range(0, problemType["NumIndicesC"]): s += "%ssize%s,\n" % (t, self.indexChars[i]) s += "%sbeta);\n" % (t) s += "}\n" t = " " s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) ######################################## # Enqueue Kernels ######################################## for kernelIdx in range(0, len(kernels)): kernel = kernels[kernelIdx] if kernel["KernelLanguage"] == "Source": kernel["ISA"] = (0, 0, 0) # HIP source kernels needs dummy ISA version kernelName = self.kernelWriter.getKernelName(kernel) s += "\n%s/* kernel %u: %s */\n" % (t, kernelIdx, kernelName) s += "%sunsigned int kernelIdx = %u;\n" % (t, kernelIdx) if self.language == "OCL": # set kernel args same for all enqueues s += "%s// kernel args same for all enqueues\n" % (t) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataD ); tensileStatusCheck(status);\n" % (t, 0) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataC ); tensileStatusCheck(status);\n" % (t, 1) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataA ); tensileStatusCheck(status);\n" % (t, 2) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(cl_mem), &dataB ); tensileStatusCheck(status);\n" % (t, 3) s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &alpha ); tensileStatusCheck(status);\n" % (t, 4, typeName) s += "%s%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(%s), &beta ); tensileStatusCheck(status);\n" % (t, \ "" if problemType["UseBeta"] else "//", 5, typeName) argIdx = 6 if problemType["UseBeta"] else 5 for stride in self.strideList: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &%s ); tensileStatusCheck(status);\n" % (t, argIdx, stride) argIdx += 1 for sizeIdx in range(0, problemType["TotalIndices"]): if sizeIdx not in [ problemType["Index0"], problemType["Index1"], problemType["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(staggerUIter), &staggerUIter ); tensileStatusCheck(status);\n" % (t, argIdx) argIdx += 1 s += "%sfor (unsigned int enqueueIdx = 0; enqueueIdx < numEnqueues[%u]; enqueueIdx++) {\n" % (t, kernelIdx) t += " " # debug print kernel dimensions if globalParameters["LibraryPrintDebug"]: s += "%sprintf(\"%s: g{ %%u, %%u, %%u } l{ %%u, %%u, %%u}\\n\", static_cast<unsigned int>(globalWorkSize[kernelIdx][0]), static_cast<unsigned int>(globalWorkSize[kernelIdx][1]), static_cast<unsigned int>(globalWorkSize[kernelIdx][2]), static_cast<unsigned int>(localWorkSize[0]), static_cast<unsigned int>(localWorkSize[1]), static_cast<unsigned int>(localWorkSize[2]) );\n" % (t, kernelName) # debug print kernel arguments # strides for stride in self.strideList: s += "%sprintf(\" %s = %%u\\n\", %s);\n" % (t, stride, stride) # sizes for i in range(0, problemType["TotalIndices"]): s += "%sprintf(\" sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i ) s += "%sprintf(\" problemNumGroupTiles0== %%u\\n\", problemNumGroupTiles0 );\n" % (t) s += "%sprintf(\" problemNumGroupTiles1== %%u\\n\", problemNumGroupTiles1 );\n" % (t) s += "%sprintf(\" tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t) s += "%sprintf(\" tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t) s += "%sprintf(\" tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t) for idxChar in solution["PackedC0Indices"][:-1]: s += "%sprintf(\" magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\", magicNumberSize%s, magicShiftSize%s);\n" \ % (t, idxChar, idxChar, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%sprintf(\" magicNumberSize%s== 0x%%x, magicShiftSize%s== %%u)\\n\", magicNumberSize%s, magicShiftSize%s);\n" \ % (t, idxChar, idxChar, idxChar, idxChar) ######################################## # OpenCL Runtime ######################################## if self.language == "OCL": # set kernel args different for all enqueues argIdx = 6 if problemType["UseBeta"] else 5 argIdx += len(self.strideList) # sizes for sizeIdx in range(0, problemType["TotalIndices"]): if sizeIdx in [ problemType["Index0"], problemType["Index1"], problemType["IndexUnroll"] ]: s += "%sstatus = clSetKernelArg( kernels[kernelIdx], %u, sizeof(unsigned int), &size%s ); tensileStatusCheck(status);\n" % (t, argIdx, self.indexChars[sizeIdx]) argIdx += 1 # enqueue s += "%sstatus = clEnqueueNDRangeKernel(\n" % (t) t += " " s += "%sstream,\n" % (t) s += "%skernels[kernelIdx],\n" % (t) s += "%sworkDim,\n" % (t) s += "%sNULL, // globalWorkOffset\n" % (t) s += "%sglobalWorkSize[kernelIdx],\n" % (t) s += "%slocalWorkSize,\n" % (t) if False: # gsu > 1: s += "%s1,\n" % (t) s += "%s&kernelEventBetaOnly,\n" % (t) else: s += "%snumInputEvents,\n" % (t) s += "%sinputEvents,\n" % (t) s += "%soutputEvent );\n" % (t) s += "%stensileStatusCheck(status);\n" % (t) s += "%s}\n" % (t) ######################################## # HIP Runtime ######################################## else: if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source": s += "%sif( inputEvents != NULL )\n" % (t) t += " " s += "%shipEventRecord(inputEvents[enqueueIdx], stream );\n" % (t) t = t[2:] s += "%stry {\n" % (t) t += " " # hip kernel if kernelLanguage == "Source": s += "%skernelsLaunched++;\n" % (t) s += "%shipLaunchKernelGGL(\n" % (t) t += " " s += "%sHIP_KERNEL_NAME(%s),\n" % (t, kernelName) s += "%sdim3(globalWorkSize[kernelIdx][0], globalWorkSize[kernelIdx][1], globalWorkSize[kernelIdx][2]),\n" % (t) s += "%sdim3(localWorkSize[0], localWorkSize[1], localWorkSize[2]),\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sdataD,\n" % (t) s += "%sdataC,\n" % (t) s += "%sdataA,\n" % (t) s += "%sdataB,\n" % (t) s += "%salpha,\n" % (t) s += "%s%sbeta,\n" % (t, \ "" if problemType["UseBeta"] else "//") # strides for stride in self.strideList: s += "%s%s,\n" % (t, stride) # sizes for i in range(0, problemType["TotalIndices"]): lastParam = i == problemType["TotalIndices"]-1 s += "%ssizes[kernelIdx][enqueueIdx][%u]%s\n" \ % (t, i, "" if lastParam else "," ) for idxChar in solution["PackedC0Indices"][:-1]: s += "%s,magicNumberSize%s\n" % (t, idxChar) s += "%s,magicShiftSize%s\n" % (t, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%s,magicNumberSize%s\n" % (t, idxChar) s += "%s,magicShiftSize%s\n" % (t, idxChar) s += "%s,staggerUIter\n" % (t) #persistent: s += "%s,problemNumGroupTiles0\n" % (t) s += "%s,problemNumGroupTiles1\n" % (t) s += "%s,magicNumberProblemNumGroupTiles0\n" % (t) # magic number to use when dividing by problemNumGroupTiles0 s += "%s);\n" % (t) # assembly kernel else: if globalParameters["DebugKernel"]: s += "%sconst unsigned int debugBufferElementsPerThread = 16;\n" % t s += "%sunsigned int debugBufferNumElem = debugBufferElementsPerThread;\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][0]);\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][1]);\n" % (t) s += "%sdebugBufferNumElem *= max(1,globalWorkSize[kernelIdx][2]);\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[0];\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[1];\n" % (t) s += "%sdebugBufferNumElem *= localWorkSize[2];\n" % (t) s += "%s printf(\"debugBufferNumElem: %%04i: \\n\", debugBufferNumElem);\n" % (t) s += "%ssize_t debugBufferSize = debugBufferNumElem * sizeof(unsigned int);\n" % (t) s += "%shipDevice_t device;\n" % t s += "%shipDeviceGet(&device, 0);\n" % t s += "%shipMalloc(&(hipFunctionArgs.debugBuffer), debugBufferSize);\n" % t s += "%sunsigned int *debugBufferHostPtr = new unsigned int[debugBufferNumElem];\n" % (t) s += "%smemset(debugBufferHostPtr,0,debugBufferSize);\n" % (t) s += "%shipMemcpyHtoD(hipFunctionArgs.debugBuffer, debugBufferHostPtr, debugBufferSize);\n" % (t) s += "%smemset(debugBufferHostPtr,1,debugBufferSize);\n" % (t) # hip assembly function s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t) s += "%shipFunctionArgs.dataD = dataD;\n" % (t) s += "%shipFunctionArgs.dataC = dataC;\n" % (t) s += "%shipFunctionArgs.dataA = dataA;\n" % (t) s += "%shipFunctionArgs.dataB = dataB;\n" % (t) if problemType["DataType"].isHalf(): s += "%shipFunctionArgs.alpha[0] = alpha;\n" % (t) s += "%shipFunctionArgs.alpha[1] = alpha;\n" % (t) else: s += "%shipFunctionArgs.alpha = alpha;\n" % (t) if problemType["UseBeta"]: if problemType["DataType"].isHalf(): s += "%shipFunctionArgs.beta[0] = beta;\n" % (t) s += "%shipFunctionArgs.beta[1] = beta;\n" % (t) else: s += "%shipFunctionArgs.beta = beta;\n" % (t) # strides for stride in self.strideList: s += "%shipFunctionArgs.%s = %s;\n" % (t, stride, stride) # sizes for i in range(0, problemType["TotalIndices"]): lastParam = i == problemType["TotalIndices"]-1 s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \ % (t, globalParameters["IndexChars"][i], i ) s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t) s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t) s += "%shipFunctionArgs.staggerUIter = staggerUIter;\n" % (t) # persistent - pass in the number of tiles in problem since not available in WG s += "\n" s += "%shipFunctionArgs.problemNumGroupTiles0 = problemNumGroupTiles0;\n" % (t) s += "%shipFunctionArgs.problemNumGroupTiles1 = problemNumGroupTiles1;\n" % (t) s += "%shipFunctionArgs.magicNumberProblemNumGroupTiles0 = magicNumberProblemNumGroupTiles0;\n" % (t) s += "%shipFunctionArgs.gridNumWorkGroups0 = globalWorkSize[kernelIdx][0];\n" % (t) # s += "%shipFunctionArgs.numFullBlocks = numFullBlocks;\n" % (t) s += "%shipFunctionArgs.wgmRemainder1 = wgmRemainder1;\n" % (t) s += "%shipFunctionArgs.magicNumberWgmRemainder1 = magicNumberWgmRemainder1;\n" % (t) # Magic numbers for packed indices: for idxChar in solution["PackedC0Indices"][:-1]: s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar) s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar) for idxChar in solution["PackedC1Indices"][:-1]: s += "%shipFunctionArgs.magicNumberSize%s = magicNumberSize%s;\n" % (t, idxChar, idxChar) s += "%shipFunctionArgs.magicShiftSize%s = magicShiftSize%s;\n" % (t, idxChar, idxChar) s += "%skernelsLaunched++;\n" % (t) s += "%shipHccModuleLaunchKernel(\n" % (t) t += " " s += "%shipFunction,\n" % (t) s += "%sglobalWorkSize[kernelIdx][0]*localWorkSize[0],\n" % (t) s += "%sglobalWorkSize[kernelIdx][1]*localWorkSize[1],\n" % (t) s += "%sglobalWorkSize[kernelIdx][2]*localWorkSize[2],\n" % (t) s += "%slocalWorkSize[0],\n" % (t) s += "%slocalWorkSize[1],\n" % (t) s += "%slocalWorkSize[2],\n" % (t) s += "%s0, // groupMemBytes\n" % (t) s += "%sstream,\n" % (t) s += "%sNULL,\n" % (t) s += "%s(void**)hipLaunchParams\n" % (t) if globalParameters["PreciseKernelTime"]: s += "%s,(inputEvents && kernelsLaunched==1) ? inputEvents[enqueueIdx]:nullptr\n" %(t) s += "%s,outputEvent ? outputEvent[enqueueIdx]:nullptr\n" % (t) s += "%s);\n" % (t) t = t[2:] if globalParameters["DebugKernel"]: # copy debug buffer s += "%shipMemcpyDtoH(debugBufferHostPtr, hipFunctionArgs.debugBuffer, debugBufferSize);\n" % (t) s += "%sfor(unsigned int i = 0; i < debugBufferNumElem/debugBufferElementsPerThread; i++) {\n" % (t) s += "%s printf(\"%%04i\", i);\n" % (t) s += "%s char u[debugBufferElementsPerThread] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n" % (t) #s += "%s char u[debugBufferElementsPerThread] = {1,1,0,0,1,1,0,0,1,1,1,1,1,1,1,1};\n" % (t) s += "%s for(unsigned int j = 0; j < debugBufferElementsPerThread; j++) {\n" % (t) s += "%s if (u[j]) printf(\",%%4u\", debugBufferHostPtr[i*debugBufferElementsPerThread+j]);\n" % (t) s += "%s else printf(\",%%4.0f\", ((float *)debugBufferHostPtr)[i*debugBufferElementsPerThread+j]);\n" % (t) s += "%s }\n" % (t) s += "%s printf(\"\\n\");\n" % (t) s += "%s}\n" % (t) t = t[2:] s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) if not globalParameters["PreciseKernelTime"] or kernelLanguage == "Source": s += "%sif( outputEvent != NULL )\n" % (t) s += "%s hipEventRecord(outputEvent[enqueueIdx], stream );\n" % (t) s += " }\n" s += "\n" s += " return tensileStatusSuccess;\n" s += "}\n" s += "\n" s += "/* Solution Parameters\n" s += Solution.getParametersIndented(solution.getAttributes(), " ") s += "*/\n" s += "\n" return s