Beispiel #1
0
def ifko_writeNT(ATLdir, ARCH, KF0, fko, rout, pre, l1bla, N, M, lda, wnt):
   """Greedily decide which arrays should get the -W (WNT) flag.

   The kernel is first compiled and timed with the flags in KF0.  Then, for
   every array name in wnt, " -W <array>" is appended to the best flags found
   so far and the kernel is compiled and timed again; the new flag is kept
   only when the measured mflop is strictly better.

   The timing options come from the module-level global opt.  rout is not
   used here.

   Returns [best mflop, best flag string, list of arrays whose -W was kept].
   """
#
#  Time the default case
#
   warrs = []
   fkocmnd.callfko(fko, KF0)
   [t0, m0] = cmnd.time(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s",
                        "gcc", "-x assembler-with-cpp", opt=opt)
   print("WNT none : %.2f" % (m0))
#
#  Try adding -W for one array at a time, on top of the best flags so far
#
   for wa in wnt:
      KFN = KF0 + " -W " + wa
      fkocmnd.callfko(fko, KFN)
      [tN, mN] = cmnd.time(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s",
                           "gcc", "-x assembler-with-cpp", opt=opt)
#
#     was "%2.f" (width 2, precision 0), which dropped the decimals
#
      print("WNT %s : %.2f" % (wa, mN))
      if mN > m0:
         KF0 = KFN
         m0 = mN
         warrs.append(wa)

   return [m0, KF0, warrs]
Beispiel #2
0
      # Time the ATLAS kernel l1atl[j] when CALLATL is set
      # (CCatl[j] / CCFat[j] are presumably its compiler and flags -- TODO confirm)
      if (CALLATL != 0):
         [time,mf] = l1cmnd.time(ATLdir, ARCH, pre, blas, N, l1atl[j], 
                                 CCatl[j], CCFat[j], opt=opt)
         assert(time > 0.0)
         print "ATL %20.20s : time=%f, mflop=%f" % (pre+l1atl[j], time, mf)
         atlT.append(time)
         atlMF.append(mf)

      # Compile the .b source with fko using its standard flags, then time the
      # generated assembly (fkorout.s) when CALLFKO is set
      if (CALLFKO != 0):
         rout = IFKOdir + '/blas/' + pre + blas + '.b'
         outf = ATLdir + '/tune/blas/level1/' + blas.upper() + '/fkorout.s'
         KF0 = fkocmnd.GetStandardFlags(fko, rout, pre)
         KFLAGS = KF0 + ' -o ' + outf + " " + rout
         # remove any previous output so a stale fkorout.s is never timed
         if (os.path.exists(outf)):
            os.remove(outf)
         fkocmnd.callfko(fko, KFLAGS)
         [time,mf] = l1cmnd.time(ATLdir, ARCH, pre, blas, N, 'fkorout.s',
                                 "gcc", "-x assembler-with-cpp", opt=opt)
         assert(time > 0.0)
         print "FKO %20.20s : time=%f, mflop=%f" % (pre+blas+'.b', time, mf)
         print "                           flags =", KF0
         fkoT.append(time)
         fkoMF.append(mf)
      # advance the counters of the enclosing loops (loop headers not in view)
      j += 1
   i += 1

# LaTeX table header and two row templates; the last (ifko) column is left
# blank, and form2 additionally marks the two atlas columns with '*'
print r"OPERATION   & gcc+ref & icc+ref & icc+prof & gcc+atlas&icc+atlas& cblas & fko     &     ifko\\\hline\hline"
form = "%12s& %5.0f & %5.0f & %5.0f & %5.0f & %5.0f & %5.0f & %5.0f &      \\\\\\hline"
form2= "%12s& %5.0f & %5.0f & %5.0f & %5.0f*& %5.0f*& %5.0f & %5.0f &      \\\\\\hline"

#fkoMF = [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1];
Beispiel #3
0
                                     CCFat[j],
                                     opt=opt)
            assert (time > 0.0)
            print "ATL %20.20s : time=%f, mflop=%f" % (pre + l1atl[j], time,
                                                       mf)
            atlT.append(time)
            atlMF.append(mf)

        if (CALLFKO != 0):
            rout = IFKOdir + '/blas/' + pre + blas + '.b'
            outf = ATLdir + '/tune/blas/level1/' + blas.upper() + '/fkorout.s'
            KF0 = fkocmnd.GetStandardFlags(fko, rout, pre)
            KFLAGS = KF0 + ' -o ' + outf + " " + rout
            if (os.path.exists(outf)):
                os.remove(outf)
            fkocmnd.callfko(fko, KFLAGS)
            [time, mf] = l1cmnd.time(ATLdir,
                                     ARCH,
                                     pre,
                                     blas,
                                     N,
                                     'fkorout.s',
                                     "gcc",
                                     "-x assembler-with-cpp",
                                     opt=opt)
            assert (time > 0.0)
            print "FKO %20.20s : time=%f, mflop=%f" % (pre + blas + '.b', time,
                                                       mf)
            print "                           flags =", KF0
            fkoT.append(time)
            fkoMF.append(mf)
Beispiel #4
0
def ifko0(l1bla, pre, N, M=None, lda=None):
   """Run the staged fko flag search for one kernel and report the best flags.

   Stages, in order: standard scalar flags, path transforms (only when the
   kernel has more than one path), vectorization, -W writes (ifko_writeNT),
   prefetch distance (FindPFD), prefetch type (ifko_pftype), unrolling
   (FindUR, skipped when URF is set), over-speculation (FindBET, only when
   isSV is set), scalar expansion (FindRE) and a second prefetch-distance
   pass.  The mflop recorded after each stage is appended to mflist with a
   matching label in testlist.

   Reads the module-level globals opt, optT, URF, SB, skipOpt, forceOpt and
   isSV; isSV is reset to 0 when vectorization is tried but not selected.

   Returns (res, KFLAGS, mf, tst, testlist, mflist), where res is the value
   of fkocmnd.GetOptVals for the final flags and tst the result of cmnd.test.
   """
   (IFKOdir, fko) = fkocmnd.GetFKOinfo()
   (ATLdir, ARCH) = fkocmnd.FindAtlas(IFKOdir)
   rout =  IFKOdir + '/blas/' + pre + l1bla + '.b'
   #outf =  ATLdir + '/tune/blas/level1/' + l1bla.upper() + '/fkorout.s'
   outf =  ATLdir + kernels.GetBlasPath(l1bla) + '/fkorout.s'
#
#  Majedul: calling the new info function; info holds the old 11-entry list,
#  new data: [npath, red2onePath, vecMethod, vpathinfo, arrtypes] at the end
#
   #info = fkocmnd.info(fko, rout)

   newinfo = fkocmnd.NewInfo(fko, rout)
   info = [newinfo[i] for i in range(11) ]
   [npath, red1path, vecm, vpath, arrtypes] = [ newinfo[i] for i in range(11,16)]
   
   ncache = info[0]
   vec = info[5]
   #(fparrs, fpsets, fpuses, fpurs) = fkocmnd.GetFPInfo(info)
   (fparrs, fpsets, fpuses, fpurs) = fkocmnd.GetFPInfo(newinfo)
   nfp = len(fparrs)
#
#  Find out the default flags (includes vector, default prefetch and unroll)
#  NOTE(review): URF is the 5th positional argument here, while the vector
#  call further down passes SB 5th and URF 6th -- confirm against the
#  signature of GetOptStdFlags
#
   #KFLAGS = fkocmnd.GetStandardFlags(fko, rout, pre) 
   KFLAGS = fkocmnd.GetOptStdFlags(fko, rout, pre, 1, URF) 
   print "\n   Default Flag = " + KFLAGS 
   KFLAGS = KFLAGS + " -o " + str(outf) + " " + rout
   mflist = []
   testlist = []
   #print KFLAGS
#
#  Majedul: the default and vectorized cases are not the same anymore; Vspec
#  may be worse than the non-vector case.
#  So, the best of them is chosen as the base for the later optimizations
#

#
#  check best scalar xforms, delete any vector flag
#
   #j = KFLAGS.find("-V")
   #if j != -1 :
      #KFn = KFLAGS[0:j-1] + KFLAGS[j+2:]
#
#  find out best standard scalar flag 
#
   KFn = fkocmnd.GetOptStdFlags(fko, rout, pre, 0, URF) 
   KFn = KFn + " -o " + str(outf) + " " + rout
   #print KFn
#
#  standard flag without vect: this timing is the baseline (mf0 / KF0)
#
   KF0 = KFn
   fkocmnd.callfko(fko, KF0)
   [t0,mf0] = cmnd.time(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s", 
                        "gcc", "-x assembler-with-cpp", opt=opt)
   mflist.append(mf0)
   testlist.append("default") ## this is using std flags
   print "\n   Default Flag = " + KF0 

#
#  Finding the best path reduction option; kept only if faster than baseline
#
   if npath > 1:
      [mfs, KFs] = ifko_PathXform(ATLdir, ARCH, KFn, ncache, fko, rout, pre,
                                 l1bla, N, M, lda,  npath, red1path)
      mflist.append(mfs)
      testlist.append("PathXform")
      if (mfs > mf0) :
         mf0 = mfs
         KF0 = KFs
#
#  Finding the best vector option with/without path reduction
#
   global isSV;
   if SB:
      KFv = fkocmnd.GetOptStdFlags(fko, rout, pre, 1, SB, URF)
   else:
      KFv = fkocmnd.GetOptStdFlags(fko, rout, pre, 1, 0, URF)
   
   print "\n   Standad Flag for Vect = " + KFv
   
   KFv = KFv + " -o " + str(outf) + " " + rout
   if vec:
      if 'v' in skipOpt:
         print '\n   SKIPPING VECTORIZATION'
      else:
         [mfv, KFv] =  ifko_Vec(ATLdir, ARCH, KFv, ncache, fko, rout, pre, 
                                l1bla, N, M, lda, npath, vecm, vpath)
         mflist.append(mfv)
         testlist.append("vect")
         if (mfv > mf0) :
            mf0 = mfv
            KF0 = KFv
#
#        if we have forceOpt, we will keep vec even if it's not better
#
         elif  'sv' in forceOpt or 'vrc' in forceOpt or 'vmmr' in forceOpt:
            print '\n   FORCING VECTORIZATION'
            mf0 = mfv
            KF0 = KFv
         else:  # no vector is selected, skip the SB too  #
            isSV = 0
#
#  choose the better one as the reference for the later optimizations
#
   KFLAGS = KF0
   mf = mf0
   
   print "\n   FLAGS so far =", fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS)

#
#  Previous code, which has been replaced by the code above
#

#
#  Find performance of default case
#
#   j = KFLAGS.find("-V")
#   if j != -1 :
#      KFn = KFLAGS[0:j-1] + KFLAGS[j+2:]
#      fkocmnd.callfko(fko, KFn)
#      [t,mf] = l1cmnd.time(ATLdir, ARCH, pre, l1bla, N, "fkorout.s", 
#                           "gcc", "-x assembler-with-cpp", opt=opt)
#      mflist.append(mf)
#      testlist.append("default")
#      fkocmnd.callfko(fko, KFLAGS)
#      [t,mf] = l1cmnd.time(ATLdir, ARCH, pre, l1bla, N, "fkorout.s", 
#                           "gcc", "-x assembler-with-cpp", opt=opt)
#   else :
#      fkocmnd.callfko(fko, KFLAGS)
#      [t,mf] = l1cmnd.time(ATLdir, ARCH, pre, l1bla, N, "fkorout.s", 
#                           "gcc", "-x assembler-with-cpp", opt=opt)
#      testlist.append("default")
#      mflist.append(mf)
#   mflist.append(mf)
#   testlist.append("vect")
#
#  Eventually, want to try both -V and scalar, but for now, use -V whenever
#  possible

#
#  Find if we want to use cache-through writes on any arrays:
#  the candidates are the arrays whose fpsets entry is positive
#
   if 'wnt' in skipOpt:
      print '\n   SKIPPING WNT'
   else:
      n = len(fpsets)
      i = 0
      wnt = []
      while i < n:
         if fpsets[i] > 0 :
#        and fpuses[i] == 0:
            wnt.append(fparrs[i])
         i += 1
      if len(wnt) > 0:
         [mf,KFLAGS,wnt] = ifko_writeNT(ATLdir, ARCH, KFLAGS, fko, rout, pre,
                                     l1bla, N, M, lda, wnt)
      mflist.append(mf)
      testlist.append("writeNT") 
#
#  Find best PFD for each pfarr; only the mflop of the last array is logged
#
   pfarrs = fparrs
   pfsets = fpsets
   for arr in pfarrs:
      [mf,KFLAGS] = FindPFD(ATLdir, ARCH, KFLAGS, fko, rout, pre,l1bla, N,M,lda,
                            info, arr)
   mflist.append(mf)
   testlist.append("pfdist")
   KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)
#
#  Find best pf type
#
   [mf,KFLAGS] = ifko_pftype(ATLdir, ARCH, KFLAGS, ncache, fko, rout, pre, 
                             l1bla, N, M, lda, info, pfarrs, pfsets)
   mflist.append(mf)
   testlist.append("pftype")
   print "\n   FLAGS so far =", fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS)
   
#
#  Find best unroll (skipped when the unroll factor is forced through URF)
#
   if URF:
      print '\n   SKIPPING UNROLL TUNNING : FORCED TO %d' %URF
   else:
      [mf,KFLAGS] = FindUR(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla, N, M, 
                            lda, info)
      mflist.append(mf)
      testlist.append("unroll")

#
#  Find best bet for over speculation
#  FIXME: find out the -U and pass it to the function
#  FIXME: can't apply Over Spec if there is a memory write inside the loop
#
   
   if isSV:
      if l1bla.find("irk1amax") != -1:
         print '\n   SKIPPING STRONGER BET UNROLLING for IRK1AMAX' 
      elif l1bla.find("irk2amax") != -1:
         print '\n   SKIPPING STRONGER BET UNROLLING for IRK2AMAX' 
      elif l1bla.find("irk3amax") != -1:
         print '\n   SKIPPING STRONGER BET UNROLLING for IRK3AMAX' 
      elif l1bla.find("sin") != -1:
         print '\n   SKIPPING STRONGER BET UNROLLING for SIN' 
      elif l1bla.find("cos") != -1:
         print '\n   SKIPPING STRONGER BET UNROLLING for COS' 
      else:
         [mf,KFLAGS] = FindBET(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla, N, 
                               M, lda)
         mflist.append(mf)
         testlist.append("OverSpec")

#
#  See if we can apply accumulator expansion
#
#   acc = fkocmnd.GetFPAccum(info)
#   nacc = len(acc)
#   if nacc > 0 and nacc < 3:
#      [mf,KFLAGS] = FindAE(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla, N, acc)
#   mflist.append(mf)
#   testlist.append("accexpans")

#
#  Majedul: See if we can apply scalar expansion (accexpan + max/min expansion)
#  FindRE is only tried when there are one or two accumulators
#
   acc = fkocmnd.GetFPAccum(info)
   nacc = len(acc)
   if 're' in skipOpt:
      print '\n   SKIPPING SCALAR EXPANSION'
   elif isSV:
      print '\n   SKIPPING SCALAR EXPANSION: NOT SUPPORTED WITH SV'
   elif l1bla.find("iamax") != -1:
      print '\n   SKIPPING SCALAR EXPANSION FOR IAMAX'
   else:
      if nacc > 0 and nacc < 3:
         [mf,KFLAGS] = FindRE(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla, N, 
                              M, lda, acc)
      mflist.append(mf)
      testlist.append("rdexp")
#
#  Majedul: shifted it here to test
#  (the triple-quoted block below is a disabled copy of the WNT / PFD / pftype
#  stages; it is a bare string expression and is never executed)
#
#
#  Find if we want to use cache-through writes on any arrays
#
   """if 'wnt' in skipOpt:
      print '\n   SKIPPING WNT'
   else:
      n = len(fpsets)
      i = 0
      wnt = []
      while i < n:
         if fpsets[i] > 0 :
#        and fpuses[i] == 0:
            wnt.append(fparrs[i])
         i += 1
      if len(wnt) > 0:
         [mf,KFLAGS,wnt] = ifko_writeNT(ATLdir, ARCH, KFLAGS, fko, rout, pre,
                                     l1bla, N, wnt)
      mflist.append(mf)
      testlist.append("writeNT") 
#
#  Find best PFD for each pfarr
#
   pfarrs = fparrs
   pfsets = fpsets
   for arr in pfarrs:
      [mf,KFLAGS] = FindPFD(ATLdir, ARCH, KFLAGS, fko, rout, pre,l1bla, N, M, 
                            lda, info, arr)
   mflist.append(mf)
   testlist.append("pfdist")
   KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)
#
#  Find best pf type
#
   [mf,KFLAGS] = ifko_pftype(ATLdir, ARCH, KFLAGS, ncache, fko, rout, pre, 
                             l1bla, N, info, pfarrs, pfsets)
   mflist.append(mf)
   testlist.append("pftype")
   print "\n   FLAGS so far =", fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS)
   """
#
#  testing: re-tune the prefetch distance!
#  NOTE:  this re-tuning can be omitted just by enabling the comment
#
   #"""
   #KFLAGS = fkocmnd.SetDefaultPFD(KFLAGS, info)
   KFLAGS = fkocmnd.SetDefaultPFD(KFLAGS, newinfo)
   #print "default PFD: ", KFLAGS
   print "\n   TUNING PFD AGAIN: "
   for arr in pfarrs:
      [mf,KFLAGS] = FindPFD(ATLdir, ARCH, KFLAGS, fko, rout, pre,l1bla, N, M, 
                            lda, info, arr)
   KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)
#
# FIXME: it will create a problem for the calculation of the % of improvement
#
#   if 'pfdist' in testlist:
#      j = testlist.index('pfdist')
#      mflist[j] = mf
#   else:
#      mflist.append(mf)
#      testlist.append("pfdist")
   #KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)

   mflist.append(mf)
   testlist.append("pfd2")
   
   #"""

#
#  Find performance of best case
#  (mf reported below is the value from the last tuning stage; the final
#  flags are not re-timed here, only tested with cmnd.test)
#
#   fkocmnd.callfko(fko, KFLAGS)
#   [t,mf] = l1cmnd.time(ATLdir, ARCH, pre, l1bla, N, "fkorout.s", 
#                        "gcc", "-x assembler-with-cpp", opt=opt)
   print "\n\n   BEST FLAGS FOUND (%.2f) = %s" % (mf,
         fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS))
   res = fkocmnd.GetOptVals(KFLAGS, pfarrs, pfsets, acc)
   tst = cmnd.test(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s",
                     cc="gcc", ccf="-x assembler-with-cpp", opt=optT)
   #tst = l1cmnd.silent_test(ATLdir, ARCH, pre, l1bla, N, "fkorout.s",
   #                     cc="gcc", ccf="-x assembler-with-cpp", opt=optT)

   return(res, KFLAGS, mf, tst, testlist, mflist)
Beispiel #5
0
def FindBET(ATLdir, ARCH, KF0, fko, rout, pre, blas, N, M, lda):
   """Search -B (over-speculation) / -U (unroll) pairs for the best mflop.

   The kernel is timed with KF0 first.  Then every bet i in [2, maxbet) is
   combined with every unroll j in [1, URm/i]; the -B and -U values already
   present in the flags are replaced by the candidate values.  maxbet is the
   unroll factor found in KF0 (1 when there is no -U) clamped to [4, 10];
   URm is 8 for kernels whose name contains "sin" or "cos", 16 otherwise.
   A candidate becomes the new best only when it is more than 0.1% faster.

   The timing options come from the module-level global opt.  rout is not
   used here.

   Returns [mflop, flags]; KF0 and its own timing when no candidate won.
   """
#
#  figure out the UR factor from flag
#
   j = KF0.find('-U')
   if j != -1:
      UR0 = int(KF0[j:].split()[1])
   else:
      UR0 = 1
#
#  set max bet as the UR but not less than 4,
#  but not greater than 10
#
   maxbet = min(max(UR0, 4), 10)
#
#  reduce tries for sine and cosine
#
   if blas.find("sin") != -1 or blas.find("cos") != -1:
      URm = 8
   else:
      URm = 16
#
#  time default case
#
   fkocmnd.callfko(fko, KF0)
   [t0, m0] = cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                        "gcc", "-x assembler-with-cpp", opt=opt)
   KFn = KF0
   bestKF = KF0
   bestmf = m0
   bestUR = 1
   bestSB = 1

   print("   Finding Over Speculation Factor, UR=%d, mflop= %.2f" % (UR0, m0))

   for i in range(2, maxbet):
#
#     replace any previous bet with the candidate one
#     (no trailing \s in the pattern: the flag may end the string)
#
      KFn = '-B %d ' % i + re.sub(r'-B\s\d+', '', KFn)
#
#     limit the unroll so that SB * UR does not exceed URm
#
      URN = int(URm / i)
      for j in range(1, URN + 1):
#
#        replace the previous unroll factor with the candidate one
#
         KFn = '-U %d ' % j + re.sub(r'-U\s\d+', '', KFn)

         fkocmnd.callfko(fko, KFn)
         [t, mf] = cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                             "gcc", "-x assembler-with-cpp", opt=opt)

         print("      SB=%d, UR=%d, mflop= %.2f" % (i, j, mf))

         if mf > bestmf * 1.001:
            bestmf = mf
            bestKF = KFn
            bestSB = i
            bestUR = j

   if bestmf > m0:
      KFn = bestKF
#
#     was "%2.f" (width 2, precision 0), which dropped the decimals
#
      print("   Over Speculation : SB=%d, UR=%d, mf=%.2f, KFn=%s" % (
            bestSB, bestUR, bestmf, KFn))
   else:
      print("   No Over Speculation Selected!\n")
      KFn = KF0
      bestmf = m0

   return [bestmf, KFn]
Beispiel #6
0
def ifko_Vec(ATLdir, ARCH, KF0, ncache, fko, rout, pre, blas, N, M, lda,  
             npath, vm, vpath):
   """Time the applicable vectorization variants and return the fastest one.

   -V is added to KF0 when missing and any -p <n>, -rc and -mmr flags are
   removed.  For a multi-path kernel (npath > 1) the variants enabled in vm
   are tried on top of that: vm[0] -> -mmr, vm[1] -> -rc, vm[2] -> speculative
   vectorization along every path i with vpath[i] set (-p i+1, plus -B SB
   when the global SB is set).  The globals skipOpt and forceOpt can disable
   individual variants.  A single-path kernel is simply timed with -V.

   The global isSV is set to 1 when a speculative variant becomes the best.

   Returns [mflop, flags]; mflop stays 0 when nothing was timed.
   """
   global isSV

   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  make sure the vector flag is present
#
   if '-V' not in KF0:
      KF0 = ' -V' + KF0
#
#  drop every flag tied to -p/rc/mmr (only the first -p <n> instance)
#
   if '-p' in KF0:
      found = re.search(r'-p\s\d+\s',KF0)
      if found:
         KF0 = KF0.replace(found.group(), '')
   if '-rc' in KF0:
      KF0 = KF0.replace('-rc','')
   if '-mmr' in KF0:
      KF0 = KF0.replace('-mmr','')

   print("\n   Finding best vectorization:")

   bestmf = 0
   bestKF = KF0

   if npath <= 1:
#
#     no path to reduce: plain vectorization only
#
      [t, mf] = run(KF0)
      print("      V, mflop = %.2f" % (mf))
      if mf > bestmf:
         bestmf = mf
         bestKF = KF0
      return [bestmf, bestKF]
#
#  vec info = maxmin, redcom, spec
#
   if vm[0]:
      if not ('mmr' in skipOpt or 'vrc' in forceOpt or 'sv' in forceOpt):
         cand = ' -mmr' + KF0
         [t, mf] = run(cand)
         print("      MMR+V, mflop = %.2f" % ( mf))
         if mf > bestmf:
            bestmf = mf
            bestKF = cand
      else:
         print('      SKIPPING MMR+V')

   if vm[1]:
      if not ('rc' in skipOpt or 'vmmr' in forceOpt or 'sv' in forceOpt):
         cand = ' -rc' + KF0
         [t, mf] = run(cand)
         print("      RC+V,  mflop = %.2f" % ( mf))
         if mf > bestmf:
            bestmf = mf
            bestKF = cand
      else:
         print('      SKIPPING RC+V')
#
#  speculative vectorization, one try per vectorizable path
#
   if vm[2]:
      if 'sv' in skipOpt or 'vmmr' in forceOpt or 'vrc' in forceOpt:
         print('      SKIPPING SPECULATIVE VECTORIZATION')
      else:
         for pathno in range(1, npath + 1):
            if not vpath[pathno - 1]:
               continue
            cand = (' -p %d' % pathno) + KF0
            if SB:
               cand = (' -B %d ' % (SB)) + cand
            [t, mf] = run(cand)
            if SB:
               print("      V+SP%d SB=%d, mflop = %.2f" % (pathno, SB, mf))
            else:
               print("      V+SP%d, mflop = %.2f" % (pathno, mf))
            if mf > bestmf:
               bestmf = mf
               bestKF = cand
               isSV = 1  ## SV is superior, so is applied from now

   return [bestmf, bestKF]
Beispiel #7
0
def ifko_pftype(ATLdir, ARCH, KF0, ncache, fko, rout, pre, blas, N, M, lda, 
                info, pfarrs, pfsets):
   """Tune the prefetch instruction type and the cache level per array.

   Starting from the timing of KF0, the following are tried in order and
   each is kept only when it beats the best mflop so far:
     * if any entry of pfsets is true: " -Paw 3" and " -Paw 0" (the better of
       the two is the candidate);
     * if any entry of pfsets is false: " -Par 0";
     * for every array in pfarrs and every level j in [1, info[0]):
       " -P <array> j info[1][j]".

   The timing options come from the module-level global opt.  ncache and
   rout are not used here.

   Returns [best mflop, best flags].
   """
   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  Time the default case
#
   [t0, m0] = run(KF0)
   assert(t0 > 0.0)
#
#  was "%f.2", which printed the full float followed by a literal ".2"
#
   print("   base mf = %.2f" % m0)
#
#  If there are output arrays, varying inst used for writes
#
   if any(pfsets):
#
#     Try using 3DNow's prefetchw for all writes
#
      KF = KF0 + " -Paw 3"
      [t1, m1] = run(KF)
      if (m1 > 0.0):
         print("   prefetchw speedup: %.3f !" % (m1/m0))
#
#     Try temporal prefetch
#
      KF1 = KF0 + " -Paw 0"
      [t2, m2] = run(KF1)
      print("   prefetchw speedup: %.3f !" % (m2/m0))

      if m2 > m1:
         m1 = m2
         KF = KF1
      if m1 > m0:
         m0 = m1
         KF0 = KF
#
#  If we've got read-only arrays, try a varying pref inst for reads
#
   if not all(pfsets):
      KF = KF0 + " -Par 0"
      [t1, m1] = run(KF)
      print("   prefetchr speedup: %.3f !" % (m1/m0))
      if m1 > m0:
         KF0 = KF
         m0 = m1
#
#  For each array, find best cache level to fetch to
#
   NC = info[0]
   LS = info[1]
   for arr in pfarrs:
      for lvl in range(1, NC):
         KF = KF0 + " -P %s %d %d" % (arr, lvl, LS[lvl])
         [t1, m1] = run(KF)
         print("   lvl %d %s speedup: %.3f !" % (lvl, arr, m1/m0))
         if m1 > m0:
            KF0 = KF
            m0 = m1
   return [m0, KF0]
Beispiel #8
0
def ifko_PathXform(ATLdir, ARCH, KF0, ncache, fko, rout, pre, blas, N, M, lda,
                   npath, red1p):
   """Try the scalar path transformations and return the fastest flags.

   Only meaningful for kernels with more than one path; the process exits
   with status 1 when npath == 1.  After timing KF0, each fall-through path
   (-p 1 .. -p npath) is tried, then -mmr when red1p[0] or red1p[1] is set
   and -rc when red1p[2] is set.  The globals skipOpt ('mmr', 'rc') can
   disable the reductions.  Every candidate is built on top of the original
   KF0 and kept only when it beats the best mflop so far.

   The timing options come from the module-level global opt.  ncache and
   rout are not used here.

   Returns [best mflop, best flags].
   """
   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  check the default performance
#
   [t0, m0] = run(KF0)
#
#  must have multiple paths
#
   if npath == 1:
      print('Single path! No Xform can be applied!!')
      sys.exit(1)
#
#  apply fall-thru optimization and see which fall-thru path is better
#
   KFn = KF0
   print("\n   Finding best fall-thru path:")
   for i in range(npath):
      KF1 = ' -p %d' % (i+1) + KF0
      [t, mf] = run(KF1)
      print("      Path = %d, mflop = %.2f" % (i+1, mf))
      if mf > m0:
         m0 = mf
         KFn = KF1
#
#  apply various reduction methods
#  red1p = [maxr, minr, rc] => [mmr, rc]
#
   if red1p[0] or red1p[1] or red1p[2]:
      print("\n   Finding best path reduction:")
   if red1p[0] or red1p[1]:
      if 'mmr' in skipOpt:
         print('      SKIPPING MMR')
      else:
         KF1 = ' -mmr' + KF0
         [t, mf] = run(KF1)
         print("      MMR, mflop = %.2f" % ( mf))
         if mf > m0:
            m0 = mf
            KFn = KF1
   if red1p[2]:
#
#     was "'rc' in skipOpt != -1", a chained comparison left over from a
#     .find() test; the plain membership test is what was meant
#
      if 'rc' in skipOpt:
         print('      SKIPPING RC')
      else:
         KF1 = ' -rc' + KF0
         [t, mf] = run(KF1)
         print("      RC,  mflop = %.2f" % ( mf))
         if mf > m0:
            m0 = mf
            KFn = KF1

   return [m0, KFn]
Beispiel #9
0
def FindAE(ATLdir, ARCH, KF0, fko, rout, pre, blas, N, M, lda, acc, maxlen=6):
   """Search accumulator-expansion (-AE) settings for each name in acc.

   KF0 must contain a "-U <n>" flag (asserted).  That flag is removed and,
   for every accumulator and every expansion length i in [2, maxlen], the
   kernel is timed with "-U u -AE <acc> i" for up to three unroll factors u:
   the current one (when it is >= i) and the current one rounded up and down
   to a multiple of i (when those differ from it and are non-zero).

   When the best setting is more than 0.1% faster than KF0, the winning
   "-U/-AE" flags are appended and returned.  Otherwise the original KF0
   (including its -U flag) and its own timing are returned, so a failed
   search never degrades the flags.

   The timing options come from the module-level global opt.  rout is not
   used here.

   Returns [mflop, flags].
   """
   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  Time the default case
#
   [t0, m0] = run(KF0)
   mf0 = m0
#
#  Find present unrolling factor, and build the flags without it
#  (KF0 itself is kept intact so it can be restored at the end)
#
   j = KF0.find("-U ")
   assert(j != -1)
   words = KF0[j:].split()
   ur = int(words[1])
   if j > 0:
      KFN = KF0[0:j-1]
   else:
      KFN = ""
   for word in words[2:]:
      KFN = KFN + " " + word

   print("   Finding AccumExpan, UR=%d, mflop= %.2f" % (ur, m0))
   mfB = 0.0
   urB = ur
   aeB = 1
   for ac in acc:
      for i in range(2, maxlen + 1):
#
#        unroll factors to try with expansion length i, in timing order
#
         trials = []
         if ur >= i:
            trials.append(ur)
         if i < ur or ur % i:
            up = ((ur + i - 1) // i) * i
            if up != ur:
               trials.append(up)
            down = (ur // i) * i
            if down and down != ur:
               trials.append(down)

         for u in trials:
            KFLAG = KFN + " -U %d -AE %s %d" % (u, ac, i)
            [t, mf] = run(KFLAG)
            print("      '%s' AE=%d, UR=%d, mflop= %.2f" % (ac, i, u, mf))
            if mf > mfB:
               mfB = mf
               urB = u
               aeB = i

      if mfB > mf0*1.001:
         KFN = KFN + " -U %d -AE %s %d" % (urB, ac, aeB)
         mf0 = mfB
#
#  keep the expansion only if it really beats the original flags
#
   if mfB > m0*1.001:
      print("   AE=%d, UR=%d, mfB=%.2f, KFN=%s" % (aeB, urB, mfB, KFN))
   else:
      KFN = KF0         ## restore original flags
      mfB = m0
      print("   KFN = KF0 = %s" % KFN)
   return [mfB, KFN]
Beispiel #10
0
def FindRE(ATLdir, ARCH, KF0, fko, rout, pre, blas, N, M, lda, acc, maxlen=6):
   """Search reduction/scalar-expansion (-RE) settings for each name in acc.

   KF0 must contain a "-U <n>" flag (asserted).  A copy of the flags without
   it is built and, for every name in acc and every expansion length i in
   [2, maxlen], the kernel is timed with "-U u -RE <acc> i" for up to three
   unroll factors u: the current one (when it is >= i) and the current one
   rounded up and down to a multiple of i (when those differ from it and
   are non-zero).

   When the best setting is more than 0.1% faster than KF0, the winning
   "-U/-RE" flags are appended and returned; otherwise the original KF0 and
   its own timing are returned.

   The timing options come from the module-level global opt.  rout is not
   used here.

   Returns [mflop, flags].
   """
   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  Time the default case; KF0 is kept so it can be restored if RE is no
#  better than what we already have
#
   [t0, m0] = run(KF0)
   mf0 = m0
#
#  Find present unrolling factor, and build the flags without it
#
   j = KF0.find("-U ")
   assert(j != -1)
   words = KF0[j:].split()
   ur = int(words[1])
   if j > 0:
      KFN = KF0[0:j-1]
   else:
      KFN = ""
   for word in words[2:]:
      KFN = KFN + " " + word

   print("   Finding ReduceExpan, UR=%d, mflop= %.2f" % (ur, m0))
   mfB = 0.0
   urB = ur
   aeB = 1
   for ac in acc:
      for i in range(2, maxlen + 1):
#
#        unroll factors to try with expansion length i, in timing order
#
         trials = []
         if ur >= i:
            trials.append(ur)
         if i < ur or ur % i:
            up = ((ur + i - 1) // i) * i
            if up != ur:
               trials.append(up)
            down = (ur // i) * i
            if down and down != ur:
               trials.append(down)

         for u in trials:
            KFLAG = KFN + " -U %d -RE %s %d" % (u, ac, i)
            [t, mf] = run(KFLAG)
            print("      '%s' RE=%d, UR=%d, mflop= %.2f" % (ac, i, u, mf))
            if mf > mfB:
               mfB = mf
               urB = u
               aeB = i

      if mfB > mf0*1.001:
         KFN = KFN + " -U %d -RE %s %d" % (urB, ac, aeB)
         mf0 = mfB
#
#  check the ultimate result
#
   if mfB > m0*1.001:
#
#     was "%2.f" (width 2, precision 0), which dropped the decimals
#
      print("   RE=%d, UR=%d, mfB=%.2f, KFN=%s" % (aeB, urB, mfB, KFN))
   else:
      KFN = KF0         ## restore original flags
      mfB = m0
      print("   KFN = KF0 = %s" % KFN)
   return [mfB, KFN]
Beispiel #11
0
def FindUR(ATLdir, ARCH, KF0, fko, rout, pre, blas, N, M, lda, info, UR0=1, URN=64):
   """Sweep the unroll factor (-U) from UR0 up to a kernel-dependent maximum.

   The maximum is URN, lowered to 32 for kernels whose name contains "cos".
   When the global isSV is set it becomes 8 ("sin"), 5 ("cos") or 16 (all
   others), further divided by the global SB when that is set.  The factor
   is incremented by 1 under isSV and doubled otherwise.  A larger factor
   replaces the best one only if it is at least 1% faster.

   Any "-U <n>" already in KF0 is removed before the sweep and the winning
   factor is appended afterwards.  rout and info are not used here.

   Returns [best mflop, flags with the best -U].
   """
   print("   Finding best unroll:")

   is_cos = blas.find("cos") != -1
   is_sin = blas.find("sin") != -1
#
#  default max unroll, URN is 64
#  but for now, we consider 32 for cos
#
   if is_cos:
      URN = 32
#
#  tighter limits when speculation is applied
#
   if isSV:
      if is_sin:
         URN = 8
      elif is_cos:
         URN = 5   # FIXME: bitvec exceeds the datatype limit
      else:
         URN = 16
#
#     if SB is specified, SB*UR can be MaxUnroll
#
      if SB:
         URN = int(URN + SB - 1) / SB
#
#  Get rid of default unrolling so we can add our own
#
   pos = KF0.find("-U ")
   if pos != -1:
      tail = KF0[pos:].split()
      KF0 = KF0[:pos] + "".join(" " + w for w in tail[2:])

   bestmf = 0
   bestUR = 1
   cur = UR0
   while cur <= URN:
      trial = KF0 + " -U %d" % (cur)
      fkocmnd.callfko(fko, trial)
      [t, mf] = cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                          "gcc", "-x assembler-with-cpp", opt=opt)
      print("      UR=%d, mflop=%.2f" % (cur, mf))
#
#     Demand that higher unrollings get at least 1% better
#
      if mf >= bestmf*1.01:
         bestUR = cur
         bestmf = mf
#
#     for SV, we need to increment by 1
#
      cur = cur + 1 if isSV else cur * 2

   KF0 = KF0 + " -U %d" % bestUR
   print("\n   BEST Unroll Factor = %d" % bestUR)
   return [bestmf, KF0]
Beispiel #12
0
def FindPFD(ATLdir, ARCH, KF0, fko, rout, pre, blas, N, M, lda, info, arr,
            pfd0=0, pfdN=2048, pfdinc=0):
   """Find the best prefetch distance for array arr.

   The prefetch level and the initial distance are read from the existing
   "-P <arr> <lvl> <dist>" flag in KF0, or from "-P all <lvl> <dist>" when
   the array has no flag of its own (one of the two must exist; asserted).
   LS = info[1][lvl] is the default for both pfd0 and pfdinc when they are 0.

   Distances pfd0, pfd0+pfdinc, ... <= pfdN are timed; when pfd0 >= LS > 32
   the short distances 32, 40, ... <= LS are timed first.  A distance
   replaces the best one only if it is more than 0.01% faster.  Finally
   "-P <arr> -1 0" (no prefetch) is timed and wins ties.

   The timing options come from the module-level global opt.  rout is not
   used here.

   Returns [mflop, flags]: KF0 plus the no-prefetch flag, or plus the best
   "-P <arr> <lvl> <dist>" when that distance differs from the initial one,
   or KF0 unchanged.
   """
   def run(flags):
      # compile with the given flags and time the generated assembly
      fkocmnd.callfko(fko, flags)
      return cmnd.time(ATLdir, ARCH, pre, blas, N, M, lda, "fkorout.s",
                       "gcc", "-x assembler-with-cpp", opt=opt)
#
#  Figure out the prefetch cache level
#
   st = "-P %s " % (arr)
   j = KF0.find(st)

   if j == -1:
      j = KF0.find("-P all ")
      assert(j != -1)
      j += 7                    # len("-P all ")
   else:
      j += 4 + len(arr)         # len("-P ") + len(arr) + len(" ")
   words = KF0[j:].split()
   pflvl = int(words[0])

   LS = info[1][pflvl]
   if not pfd0: pfd0 = LS
   if not pfdinc: pfdinc = LS
   print("\n   Finding PFD for %s in [%d:%d:%d]" % (arr, pfd0, pfdN, pfdinc))
   ipd = int(words[1])
   mfM = 0.0
   pfdM = 0
#
#  Scope very short PFD
#
   if pfd0 >= LS and LS > 32:
      pfd = 32
      while pfd <= LS:
         [t, mf] = run(KF0 + " -P %s %d %d" % (arr, pflvl, pfd))
         print("      %s : PFD = %d mflop = %.2f" % (arr, pfd, mf))
         if mf > mfM*1.0001:
            mfM = mf
            pfdM = pfd
         pfd += 8

   pfd = pfd0
   while pfd <= pfdN:
      [t, mf] = run(KF0 + " -P %s %d %d" % (arr, pflvl, pfd))
      print("      %s : PFD = %d mflop = %.2f" % (arr, pfd, mf))
#
#     Demand higher PFD is at least 1.0001 faster
#
      if mf > mfM*1.0001:
         mfM = mf
         pfdM = pfd
#
#     step by the requested increment (was LS, which silently ignored a
#     caller-supplied pfdinc; the default pfdinc is still LS)
#
      pfd += pfdinc
   print("\n   BEST prefetch distance = %d (%.2f)" % (pfdM, mfM))
#
#  Try not prefetching array at all
#
   [t, mf] = run(KF0 + " -P %s -1 0" % (arr))
   print("      %s : NO PREFETCH:   mflop = %.2f" % (arr, mf))
   if mf >= mfM:
      KF0 = KF0 + " -P %s -1 0" % (arr)
      mfM = mf
   elif pfdM != ipd:
      KF0 = KF0 + " -P %s %d %d" % (arr, pflvl, pfdM)
   return [mfM, KF0]