Example #1
0
    def visit_lowcreate(self, lc):
        """Lower a ``create`` node into C text with inline assembly.

        Looks up the create record for ``lc.label`` in the current scope,
        maps its arguments onto the calling convention and emits, in order:
        the function-pointer setup, the declaration of the structure used for
        memory-passed arguments, the lowered body, the allocate instruction
        (followed by a jump to ``lc.target_next`` on failure when an
        alternative exists), the family parameter setup, the argument
        hand-over, the create itself and the sync/release sequence.

        Side effects: sets ``lc.callconv`` (and ``lc.mavar`` when some
        arguments are passed through memory) and appends declarations to
        ``self.cur_scope.decls``.

        Returns the new ``Block`` holding the generated code.
        """
        cr = self.cur_scope.creates[lc.label]

        # compute the calling convention
        c = regmagic.mapcall(cr.args, funcname="create", loc=cr.loc)
        lc.callconv = c["nargs"]

        newbl = Block(loc=cr.loc)
        lbl = cr.label

        # Explicit register pinning is only requested on the legacy target.
        legacy = not self.newisa

        # generate the function pointer
        if cr.funtype == cr.FUN_ID:
            if lc.lowfun is not None:
                funvar = lc.lowfun
            else:
                # not yet split
                funvar = Opaque(cr.fun)
        else:
            assert cr.funtype == cr.FUN_VAR

            n = "C$mtF$%s" % lbl
            t = "C$mtF$%s" % lbl

            thetype = CTypeDecl(loc=cr.loc, name=t, ctype=CType(items=Opaque(text="void (*") + CTypeHead() + ")(void)"))
            self.cur_scope.decls += thetype
            funvar = CVarDecl(loc=cr.loc, name=n, ctype=CTypeUse(tdecl=thetype))
            self.cur_scope.decls += funvar

            if lc.lowfun is not None:
                thefun = lc.lowfun
            else:
                # not yet split
                thefun = CVarUse(decl=cr.fun)

            newbl += CVarSet(loc=cr.loc, decl=funvar, rhs=CCast(ctype=CTypeUse(tdecl=thetype), expr=thefun)) + ";"
            funvar = CVarUse(decl=funvar)

        # prepare structure for memory-passed arguments

        if c["gl_mem_offset"] is not None:
            maname = "C$mtM$%s" % lbl
            mat = "C$mtM$%s" % lbl

            thestruct = Opaque("struct {")
            for d in c["memlayout"]:
                thestruct = thestruct + (Opaque(loc=d["loc"]) + d["ctype"] + " " + d["name"] + ";")
            thestruct = thestruct + "}"

            thetype = CTypeDecl(loc=cr.loc, name=mat, ctype=thestruct)

            self.cur_scope.decls += thetype
            mavar = CVarDecl(loc=cr.loc, name=maname, ctype=CTypeUse(tdecl=thetype))
            self.cur_scope.decls += mavar
            mavar = CVarUse(decl=mavar)
            lc.mavar = mavar

        # Now handle body between create..sync

        newbl += lc.body.accept(self)

        # On to the real stuff. First try allocate

        fidvar = cr.cvar_fid

        usefvar = CVarUse(decl=fidvar)

        if cr.extras.has_attr("exclusive"):
            if not self.newisa:
                die("exclusive create not supported on this target", cr)
            allocinsn = "allocatex"
        elif lc.target_next is None:
            if cr.extras.has_attr("nowait"):
                warn("this create may fail and no alternative is available", cr)
                allocinsn = "allocate"
            else:
                allocinsn = "allocates"
        else:
            if cr.extras.has_attr("forcewait"):
                allocinsn = "allocates"
            else:
                allocinsn = "allocate"

        if allocinsn == "allocates" and not self.newisa:
            die("suspending create is not supported on this target", cr)

        start = CVarUse(decl=cr.cvar_start)
        limit = CVarUse(decl=cr.cvar_limit)
        step = CVarUse(decl=cr.cvar_step)
        block = CVarUse(decl=cr.cvar_block)

        strategyuse = CVarUse(cr.cvar_strategy)

        newbl += (
            flatten(cr.loc, '__asm__ __volatile__("%s %%2, %%0\\t! MT: CREATE %s"' ' : "=r"(' % (allocinsn, lbl))
            + usefvar
            + ') : "0"('
            + CVarUse(decl=cr.cvar_place)
            + '), "rP"('
            + strategyuse
            + "));"
        )

        if lc.target_next is not None:
            # The value compared against the family id to detect a failed
            # allocation differs between the two targets.
            if self.newisa:
                failval = 0
            else:
                failval = -1

            newbl += (
                flatten(cr.loc, " if (__builtin_expect(%d == (" % failval)
                + usefvar
                + "), 0)) "
                + CGoto(target=lc.target_next)
            ) + ";"

        newbl += (
            flatten(cr.loc, '__asm__ ("setstart %%0, %%2\\t! MT: CREATE %s"' ' : "=r"(' % lbl)
            + usefvar
            + ') : "0"('
            + usefvar
            + '), "rP"('
            + start
            + ")); "
            '__asm__ ("setstep %%0, %%2\\t! MT: CREATE %s"' % lbl
            + ' : "=r"('
            + usefvar
            + ') : "0"('
            + usefvar
            + '), "rP"('
            + step
            + ")); "
            '__asm__ ("setblock %%0, %%2\\t! MT: CREATE %s"' % lbl
            + ' : "=r"('
            + usefvar
            + ') : "0"('
            + usefvar
            + '), "rP"('
            + block
            + ")); "
        )
        if self.newisa:
            newbl += (
                Opaque('__asm__ ("setlimit %%0, %%2\\t! MT: CREATE %s"' % lbl)
                + ' : "=r"('
                + usefvar
                + ') : "0"('
                + usefvar
                + '), "rP"('
                + limit
                + ")); "
            )
        else:  # not self.newisa:
            # FIXME: this "-1" business is an ugly hack : uT-LEON3 was based on a screwed up
            # simulator source which used inclusive limits.
            newbl += (
                Opaque('__asm__ ("setlimit %%0, %%2\\t! MT: CREATE %s"' % lbl)
                + ' : "=r"('
                + usefvar
                + ') : "0"('
                + usefvar
                + '), "rP"(('
                + limit
                + ")-1)); "
                '__asm__ ("setthread %%0, %%2\\t! MT: CREATE %s"' % lbl
                + ' : "=r"('
                + usefvar
                + ') : "0"('
                + usefvar
                + '), "rP"('
                + funvar
                + ")); "
            )

        # Declare one local C variable per register-passed argument; globals
        # come first, then the optional memory-argument pointer, then shareds.
        argregs = set()
        crc = Scope()
        aregn = 0
        gargs = []
        for g in c["gislots"]:
            name = g["name"]
            r = regmagic.vname_to_legacy("l%d" % aregn)
            var = CVarDecl(
                loc=cr.loc,
                name="C$aR$%s$%s" % (cr.label, name),
                ctype=g["ctype"],
                init=CVarUse(decl=cr.arg_dic[name].cvar),
                reg=(r or None) if legacy else None,
            )
            crc.decls += var
            gargs.append(CVarUse(decl=var))
            aregn += 1
            argregs.add(r)

        if c["gl_mem_offset"] is not None:
            # one extra global var, holding the address of the memory
            # argument structure.
            # It gets its own identifier: reusing the loop variable `name`
            # here failed when there were no global slots and otherwise
            # duplicated the identifier of the last global argument.
            # NOTE(review): the "C$aM$" prefix is assumed not to be used
            # elsewhere in the compiler -- confirm.
            r = regmagic.vname_to_legacy("l%d" % aregn)
            var = CVarDecl(
                loc=cr.loc,
                name="C$aM$%s" % cr.label,
                ctype=mat + "*",
                init=Opaque(text="&") + mavar,
                reg=(r or None) if legacy else None,
            )
            crc.decls += var
            gargs.append(CVarUse(decl=var))
            aregn += 1
            argregs.add(r)

        # `collect` copies the shared values back into the user-visible
        # argument variables once the family has been synchronized.
        collect = Block()
        sargs = []

        for s in c["sislots"]:
            name = s["name"]
            r = regmagic.vname_to_legacy("l%d" % aregn)
            arg_cvar = cr.arg_dic[name].cvar
            var = CVarDecl(
                loc=cr.loc,
                name="C$aR$%s$%s" % (cr.label, name),
                ctype=s["ctype"],
                init=CVarUse(decl=arg_cvar),
                reg=(r or None) if legacy else None,
            )
            crc.decls += var
            sargs.append(CVarUse(decl=var))
            collect += CVarSet(decl=arg_cvar, rhs=CVarUse(decl=var)) + Opaque(";")
            aregn += 1
            argregs.add(r)

        if not self.newisa:
            # build reg arg lists
            # start with fidvar/shareds first, as these need to reference each other
            # and gcc has a limit on the number of back references
            olist = Opaque('"=r"(') + usefvar + ")"
            ilist = Opaque('"0"(') + usefvar + ")"
            roff = 1
            for v in sargs:
                olist += Opaque(', "=r"(') + v + ")"
                ilist += Opaque(', "%d"(' % roff) + v + ")"
                roff += 1
            for v in gargs:
                ilist += Opaque(', "r"(') + v + ")"

            # Sort the register names so the emitted DRAIN list does not
            # depend on set iteration order.
            drain = ",".join(sorted(argregs)).replace("%", "%%")

            crc += (
                flatten(
                    cr.loc,
                    ' __asm__ __volatile__("create %%0, %%0\\t! MT: CREATE %s DRAIN(%s)'
                    '\\n\\tmov %%0, %%0\\t! MT: SYNC %s" : ' % (lbl, drain, lbl),
                )
                + olist
                + " : "
                + ilist
                + ' : "memory");'
            )

            if cr.sync_type != "normal":
                warn("detached create not supported on this target, using normal sync instead", cr)

        else:
            crc += (
                flatten(cr.loc_end, ' __asm__ __volatile__("crei %%2, %%0\\t! MT: CREATE %s"' % lbl)
                + ' : "=r"('
                + usefvar
                + ') : "0"('
                + usefvar
                + "),"
                + '   "r"('
                + funvar
                + ') : "memory");'
            )

            # Send the globals, then the shareds, each numbered from 0.
            for aoff, a in enumerate(gargs):
                crc += (
                    flatten(cr.loc_end, ' __asm__("putg %%2, %%0, %d\\t! MT: set sarg"' ' : "=r"(' % aoff)
                    + usefvar
                    + ') : "0"('
                    + usefvar
                    + '), "r"('
                    + a
                    + "));"
                )
            for aoff, a in enumerate(sargs):
                crc += (
                    flatten(cr.loc_end, ' __asm__("puts %%2, %%0, %d\\t! MT: set shared"' ' : "=r"(' % aoff)
                    + usefvar
                    + ') : "0"('
                    + usefvar
                    + '), "r"('
                    + a
                    + "));"
                )

            if cr.sync_type == "normal":
                crc += (
                    flatten(cr.loc_end, ' __asm__ __volatile__("sync %%0, %%1; ' ' mov %%1, %%1\\t! MT: SYNC %s"' % lbl)
                    + ' : "=r"('
                    + usefvar
                    + '), "=r"('
                    + CVarUse(decl=cr.cvar_exitcode)
                    + ') : "0"('
                    + usefvar
                    + ') : "memory");'
                )

                # Read every shared back from its own slot, using the same
                # numbering as the "puts" loop above.  The index used to stay
                # at 0, so all shareds were read from slot 0.
                for aoff, a in enumerate(sargs):
                    crc += (
                        flatten(cr.loc_end, ' __asm__("gets %%0, %d, %%1; ' ' mov %%1, %%1\\t! MT get shared"' % aoff)
                        + ' : "=r"('
                        + usefvar
                        + '), "=r"('
                        + a
                        + ') : "0"('
                        + usefvar
                        + "));"
                    )

            crc += (
                flatten(cr.loc_end, ' __asm__ __volatile__("release %%0\\t! MT: SYNC %s"' % lbl)
                + ' : : "r"('
                + usefvar
                + "));"
            )

        # Alias the shared arguments back
        crc += collect

        newbl += crc

        return newbl
Example #2
0
    def visit_lowcreate(self, lc):
        """Lower a ``create`` node into C text with inline assembly.

        Looks up the create record for ``lc.label`` in the current scope,
        maps its arguments onto the calling convention and emits, in order:
        the allocate instruction (followed by a jump to ``lc.target_next``
        when the family id is zero and an alternative exists), the
        function-pointer setup, the declaration of the structure used for
        memory-passed arguments, the family parameter setup and ``crei``,
        the lowered body, and finally the sync / read-back / release
        sequence selected by ``cr.sync_type``.

        Side effects: sets ``lc.callconv`` and ``lc.fidvar`` (and
        ``lc.mavar`` when some arguments are passed through memory) and
        appends declarations to ``self.cur_scope.decls``.

        Returns the new ``Block`` holding the generated code.
        """

        #print "IN LOWC (v = %x, d = %x, lc = %x)" % (id(self), id(self.__dict__), id(lc))
        cr = self.cur_scope.creates[lc.label]

        # compute the calling convention
        c = regmagic.mapcall(cr.args, funcname = "create", loc = cr.loc)

        newbl = Block(loc = cr.loc)
        lbl = cr.label

        # generate allocate + test for alternative
        fidvar = cr.cvar_fid
        start = CVarUse(decl = cr.cvar_start)
        limit = CVarUse(decl = cr.cvar_limit)
        step = CVarUse(decl = cr.cvar_step)
        block = CVarUse(decl = cr.cvar_block)

        usefvar = CVarUse(decl = fidvar)

        if cr.extras.has_attr('exclusive'):
            allocinsn = 'allocate/x'
        elif lc.target_next is None:
            if cr.extras.has_attr('nowait'):
                warn("this create may fail and no alternative is available", cr)
                allocinsn = 'allocate'
            else:
                allocinsn = 'allocate/s'
        else:
            if cr.extras.has_attr('forcewait'):
                allocinsn = 'allocate/s'
            else:
                allocinsn = 'allocate'

        strategyuse = CVarUse(cr.cvar_strategy)

        newbl += (flatten(cr.loc,
                          '__asm__ __volatile__("%s %%2, %%1, %%0\\t# MT: CREATE %s"'
                          ' : "=r"(' % (allocinsn, lbl)) +
                  usefvar + ') : "rI"(' + strategyuse + '), "r"(' + CVarUse(decl = cr.cvar_place) + '));')

        if lc.target_next is not None:
            newbl += (flatten(cr.loc, ' if (!__builtin_expect(!!(') +
                      usefvar + '), 1)) ' +
                      CGoto(target = lc.target_next)) + ';'

        # generate the function pointer
        if cr.funtype == cr.FUN_ID:
            if lc.lowfun is not None:
                funvar = lc.lowfun
            else:
                # not yet split
                funvar = Opaque(cr.fun)
        else:
            assert cr.funtype == cr.FUN_VAR

            n = 'C$mtF$%s' % lbl
            t = 'C$mtF$%s' % lbl

            thetype = CTypeDecl(loc = cr.loc,
                                name = t,
                                ctype = CType(items =
                                              Opaque(text = "void (*") +
                                              CTypeHead() +
                                              ')(void)'))
            self.cur_scope.decls += thetype
            funvar = CVarDecl(loc = cr.loc, name = n, ctype = CTypeUse(tdecl = thetype))
            self.cur_scope.decls += funvar

            if lc.lowfun is not None:
                thefun = lc.lowfun
            else:
                # not yet split
                thefun = CVarUse(decl = cr.fun)

            newbl += CVarSet(loc = cr.loc, decl = funvar,
                             rhs = CCast(ctype = CTypeUse(tdecl = thetype),
                                         expr = thefun)) + ';'
            funvar = CVarUse(decl = funvar)

        # prepare memory structure for memory-passed arguments
        ### FIXME: move stuff to cur_scope
        if c['gl_mem_offset'] is not None:
            maname = "C$mtM$%s" % lbl
            mat = 'C$mtM$%s' % lbl

            thestruct = Opaque('struct {')
            for d in c['memlayout']:
                thestruct = thestruct + (Opaque(loc = d['loc']) + d['ctype'] + ' ' + d['name'] + ';')
            thestruct = thestruct + '}'

            thetype = CTypeDecl(loc = cr.loc,
                                name = mat,
                                ctype = thestruct)

            self.cur_scope.decls += thetype
            mavar = CVarDecl(loc = cr.loc, name = maname, ctype = CTypeUse(tdecl = thetype))
            self.cur_scope.decls += mavar
            mavar = CVarUse(decl = mavar)
            lc.mavar = mavar

        # generate create
        newbl += (flatten(cr.loc,
                         '__asm__ ("setstart %%0, %%2\\t# MT: CREATE %s"'
                         ' : "=r"(' % lbl) +
                  usefvar + ') : "0"(' + usefvar + '), "rI"(' + start + ')); ' +
                  '__asm__ ("setlimit %%0, %%2\\t# MT: CREATE %s"' % lbl +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar  + '), "rI"(' + limit + ')); ' +
                  '__asm__ ("setstep %%0, %%2\\t# MT: CREATE %s"' % lbl +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '), "rI"(' + step + ')); ' +
                  '__asm__ ("setblock %%0, %%2\\t# MT: CREATE %s"' % lbl +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '), "rI"(' + block + ')); ' +
                  '__asm__ __volatile__("crei %%0, 0(%%2)\\t# MT: CREATE %s"' % lbl +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '),' +
                  '   "r"(' + funvar + ') : "memory");')

        # Published before visiting the body, which is lowered by the same
        # visitor.
        lc.callconv = c['nargs']
        lc.fidvar = usefvar

        newbl += lc.body.accept(self)

        # done with body, now handle sync

        # first of all, if there were some memory-passed arguments,
        # we need to push the argument register to the child family.
        # A memory barrier is required because the remote thread(s) may
        # access the memory as soon as putg completes.

        if c['gl_mem_offset'] is not None:
            newbl += (flatten(cr.loc_end,
                             ' __asm__ ("wmb; putg %%2, %%0, %d\\t#MT: set offset for memargs"'
                             % c['gl_mem_offset']) +
                      ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '),' +
                      '   "r"(&' + mavar + '));')

        # now, on to the sync.
        if cr.sync_type == 'normal':
            # normal, synchronized create

            # first wait for child family to terminate.
            newbl += (flatten(cr.loc_end,
                             '__asm__ __volatile__("sync %%0, %%1; '
                             ' mov %%1, $31\\t# MT: SYNC %s"' % lbl) +
                      ' : "=r"(' + usefvar + '), "=r"(' +
                      CVarUse(decl = cr.cvar_exitcode) +
                      ') : "0"(' + usefvar + ') : "memory");')

            # then pull shared arguments back.
            # .items() rather than the Python-2-only .iteritems(), so the
            # loop runs on both Python 2 and Python 3.
            for name, arg in c['nargs'].items():
                crarg = cr.arg_dic[name]
                if not crarg.seen_get:
                    # geta() is not used, so no need to retrieve
                    continue
                if arg['mode'] == 'reg' and arg['cat'] == 'sh':
                    # Floating-point shareds use their own instructions and
                    # constraint letter.
                    if arg['species'] == 'f':
                        insn1 = 'fgets'
                        insn2 = 'fmov'
                        rspec = 'f'
                    else:
                        insn1 = 'gets'
                        insn2 = 'mov'
                        rspec = 'r'
                    # Explicit mapping instead of locals(), so renaming a
                    # local cannot silently break the templates below.
                    fields = {'insn1': insn1,
                              'insn2': insn2,
                              'regnr': arg['regnr'],
                              'rspec': rspec}
                    argvar = crarg.cvar
                    # FIXME: perform "mov" after all "get" have been issued!
                    newbl += (flatten(cr.loc_end,
                                     ' __asm__ ('
                                     '"%(insn1)s %%0, %(regnr)d, %%1; '
                                     ' %(insn2)s %%1, %%1'
                                     '\\t# MT: get shared"' % fields) +
                              ' : "=r"(' + usefvar + '), "=%(rspec)s"(' % fields +
                              CVarUse(decl = argvar) + ') : "0"(' + usefvar + '));')

        if cr.sync_type != 'spawn':
            # for normal sync and detach, release resources.
            # spawn will sync+release later.
            newbl += (flatten(cr.loc_end,
                              ' __asm__ __volatile__("release %%0\\t#MT: SYNC %s"' % lbl) +
                      ' : : "r"(' + usefvar + '));')

        return newbl
Example #3
0
    def visit_lowcreate(self, lc):
        """Lower a ``create`` node into C text with inline assembly.

        Looks up the create record for ``lc.label`` in the current scope,
        maps its arguments onto the calling convention and emits, in order:
        ``f_alloc`` for the family, ``r_allocsrb`` / ``f_mapg`` when global
        argument registers are needed, the function-pointer setup,
        ``t_allochtg`` / ``f_maphtg`` for the thread group, the declaration
        of the structure used for memory-passed arguments,
        ``f_set_blocksize`` / ``f_set_gridsize``, the lowered body, the
        ``r_write`` of the memory-argument pointer, ``f_create`` and the
        ``f_fence`` selected by ``cr.sync_type``.  When ``lc.target_next``
        is set, each allocation is followed by a test that jumps there on a
        zero result.

        Side effects: sets ``lc.callconv`` and ``lc.fidvar`` (plus
        ``lc.gblvar`` / ``cr.cvar_rb`` when globals are present and
        ``lc.mavar`` when some arguments are passed through memory) and
        appends declarations to ``self.cur_scope.decls``.

        Returns the new ``Block`` holding the generated code.
        """

        #print "IN LOWC (v = %x, d = %x, lc = %x)" % (id(self), id(self.__dict__), id(lc))
        cr = self.cur_scope.creates[lc.label]

        # compute the calling convention
        c = regmagic.mapcall(cr.args, funcname = "create", loc = cr.loc)

        newbl = Block(loc = cr.loc)
        lbl = cr.label

        # generate allocate + test for alternative
        fidvar = cr.cvar_fid

        has_globals = False
        nr_globals = c['nrargregs']['gli']
        if nr_globals > 0:
            has_globals = True
            cr.cvar_rb = CVarDecl(loc = cr.loc, name = 'C$Fb$%s' % cr.label, ctype = 'long')
            self.cur_scope.decls += cr.cvar_rb
            gblvar = CVarUse(decl = cr.cvar_rb)
            lc.gblvar = gblvar

        limit = CVarUse(decl = cr.cvar_limit)
        step = CVarUse(decl = cr.cvar_step)
        block = CVarUse(decl = cr.cvar_block)

        usefvar = CVarUse(decl = fidvar)

        newbl += (flatten(cr.loc,
                          '__asm__ __volatile__("f_alloc %%0\\t! MT: CREATE %s"'
                          ' : "=r"(' % lbl) +
                  usefvar + '));')

        if lc.target_next is not None:
            newbl += (flatten(cr.loc, ' if (!__builtin_expect(!!(') +
                      usefvar + '), 1)) ' +
                      CGoto(target = lc.target_next)) + ';'

        if has_globals:
            # The r_allocsrb operand depends on how many global registers
            # are needed (thresholds at 4 and 8).
            mask = 0
            if nr_globals > 8:
                mask = 4 # binary 100
            elif nr_globals > 4:
                mask = 6 # binary 110
            else:
                mask = 7 # binary 111
            newbl += (flatten(cr.loc,
                              '__asm__ __volatile__("r_allocsrb %%1, %%0\\t! MT: CREATE %s"'
                              ' : "=r"(' % lbl) +
                              gblvar + ') : "rI"(%d));' % mask)
            if lc.target_next is not None:
                newbl += (flatten(cr.loc, ' if (!__builtin_expect(!!(') +
                          gblvar + '), 1)) {' +
                          '__asm__ __volatile__("f_fence %%0, 31\\t! MT: ABORT1 CREATE %s" : : "r"(' % lbl + usefvar + '));' +
                          CGoto(target = lc.target_next) + ';};')
            # The family id is tied to output operand 0 with the matching
            # constraint "0", as in the other templates of this function.
            # With a plain "r" input the compiler may place it in a
            # different register from the %0 the instruction operates on.
            newbl += (flatten(cr.loc,
                             '__asm__ ("f_mapg %%0, %%2\\t! MT: CREATE %s" : "=r"(' % lbl) +
                      usefvar + ') : "0"(' + usefvar + '), "r"(' + gblvar + '));')

        # generate the function pointer
        if cr.funtype == cr.FUN_ID:
            if lc.lowfun is not None:
                funvar = lc.lowfun
            else:
                # not yet split
                funvar = Opaque(cr.fun)
        else:
            assert cr.funtype == cr.FUN_VAR

            n = 'C$mtF$%s' % lbl
            t = 'C$mtF$%s' % lbl

            thetype = CTypeDecl(loc = cr.loc,
                                name = t,
                                ctype = CType(items =
                                              Opaque(text = "void (*") +
                                              CTypeHead() +
                                              ')(void)'))
            self.cur_scope.decls += thetype
            funvar = CVarDecl(loc = cr.loc, name = n, ctype = CTypeUse(tdecl = thetype))
            self.cur_scope.decls += funvar

            if lc.lowfun is not None:
                thefun = lc.lowfun
            else:
                # not yet split
                thefun = CVarUse(decl = cr.fun)

            newbl += CVarSet(loc = cr.loc, decl = funvar,
                             rhs = CCast(ctype = CTypeUse(tdecl = thetype),
                                         expr = thefun)) + ';'
            funvar = CVarUse(decl = funvar)

        # prepare the thread group
        tgname = "C$htg$%s" % lbl
        tgdecl = CVarDecl(loc = cr.loc, name = tgname, ctype = 'long')
        tgvar = CVarUse(decl = tgdecl)
        self.cur_scope.decls += tgdecl

        # A zero block size is replaced by -1 before the allocation.
        newbl += (flatten(cr.loc, ' if (!__builtin_expect(!!(') +
                  block + '), 1)) ' + block + ' = -1;')

        # The function expression is also pasted into the trailing assembly
        # comment of the template.
        newbl += (flatten(cr.loc,
                          '__asm__ __volatile__ ("t_allochtg %%1, %%0, %%0\\t! MT: CREATE %s FUN %%2 ' % lbl) + funvar +
                  '": "=&r"(' + tgvar + ') : "r"(' + block + '), ' +
                  '"r"(' + funvar + '));')
        if lc.target_next is not None:
            newbl += (flatten(cr.loc, ' if (!__builtin_expect(!!(') +
                      tgvar + '), 1)) {' +
                      '__asm__ __volatile__("f_fence %%0, 31\\t! MT: ABORT2 CREATE %s" : : "r"(' % lbl + usefvar + '));' +
                      CGoto(target = lc.target_next) + ';};')
        # Same matching-constraint fix as for f_mapg above.
        newbl += (flatten(cr.loc,
                          '__asm__ ("f_maphtg %%0, %%2\\t! MT: CREATE %s"' % lbl) +
                  ': "=r"(' + usefvar + ') : "0"(' + usefvar + '), "r"(' + tgvar + '));')

        # prepare memory structure for memory-passed arguments
        ### FIXME: move stuff to cur_scope
        if c['gl_mem_offset'] is not None:
            maname = "C$mtM$%s" % lbl
            mat = 'C$mtM$%s' % lbl

            thestruct = Opaque('struct {')
            for d in c['memlayout']:
                thestruct = thestruct + (Opaque(loc = d['loc']) + d['ctype'] + ' ' + d['name'] + ';')
            thestruct = thestruct + '}'

            thetype = CTypeDecl(loc = cr.loc,
                                name = mat,
                                ctype = thestruct)

            self.cur_scope.decls += thetype
            mavar = CVarDecl(loc = cr.loc, name = maname, ctype = CTypeUse(tdecl = thetype))
            self.cur_scope.decls += mavar
            mavar = CVarUse(decl = mavar)
            lc.mavar = mavar

        # generate create
        newbl += (flatten(cr.loc,
                          '__asm__ ("f_set_blocksize %%0, %%2\\t! MT: CREATE %s"'
                         ' : "=r"(' % lbl) +
                  usefvar + ') : "0"(' + usefvar + '), "rI"(' + limit + ')); ' +
                  '__asm__ ("f_set_gridsize %%0, %%2\\t! MT: CREATE %s"' % lbl +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '), "rI"(' + step + ')); ' )

        # Published before visiting the body, which is lowered by the same
        # visitor.
        lc.callconv = c['nargs']
        lc.fidvar = usefvar

        newbl += lc.body.accept(self)

        # done with body, now handle sync

        # first of all, if there were some memory-passed arguments,
        # we need to push the argument register to the child family.
        # A memory barrier is required because the remote thread(s) may
        # access the memory as soon as r_write completes.

        if c['gl_mem_offset'] is not None:
            # NOTE(review): gblvar only exists when nr_globals > 0; this
            # presumably always holds when arguments are passed through
            # memory -- confirm against regmagic.mapcall.
            # The template is not %-formatted, so operands are written with
            # a single '%'.  The previous doubled '%%' was emitted verbatim,
            # unlike every formatted template in this function.
            newbl += (flatten(cr.loc_end,
                             ' __asm__ ("wmb; r_write %2, %3\\t!MT: set offset for memargs in %0"') +
                      ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '),' +
                      '   "r"(' + gblvar + '+%d)' % c['gl_mem_offset'] +
                      ', "r"(&' + mavar + '));')

        # actually create the family
        newbl += (flatten(cr.loc_end,
                          '__asm__ __volatile__("f_create %%0, %%2, %%0\\t! MT: CREATE %s"' % lbl) +
                  ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '),' +
                  '   "r"(' + funvar + ') : "memory");')

        # now, on to the sync.
        if cr.sync_type == 'normal':
            # then wait for child family to terminate.
            newbl += (flatten(cr.loc_end,
                              '__asm__ __volatile__("f_fence %%0, 31; nop;'
                              ' t_wait\\t! MT: SYNC %s"' % lbl) +
                      ' : "=r"(' + usefvar + ') : "0"(' + usefvar + ') : "memory");')
        elif cr.sync_type == 'detach':
            # automatically release resources upon termination
            # The template string is now closed with '"' (it was left
            # unterminated, producing malformed C) and, since it is not
            # %-formatted, uses a single '%' for the operand.
            newbl += (flatten(cr.loc_end,
                              '__asm__ __volatile__("f_fence %0, 30; nop;"') +
                      ' : "=r"(' + usefvar + ') : "0"(' + usefvar + '));')

        return newbl