Beispiel #1
0
 def mask_to_register(mask):
     """Return a register holding ``mask``, reusing cached registers.

     ``maskcache`` is defined outside this function (presumably an
     ``OrderedDict``; it must support ``move_to_end`` and
     ``popitem(False)``).  It maps immediate mask values to the register
     that was loaded with them.  A cache hit is moved to the end of the
     cache.  On a miss a new ``MaskRegister`` is requested; if that raises
     ``AllocationError``, the register of the entry at the front of the
     cache is taken over instead.  Either way the register is loaded with
     the mask through ``x86.mov`` and recorded in the cache.
     """
     immediate = Mask.as_immediate(mask)
     if immediate not in maskcache:
         try:
             register = MaskRegister(64, immediate)
         except AllocationError:
             # No register could be allocated: evict the entry at the
             # front of the cache and recycle its register.
             register = maskcache.popitem(False)[1]
         x86.mov(register, immediate)
         maskcache[immediate] = register
         return register
     # Cache hit: push the entry to the end so it is evicted last.
     maskcache.move_to_end(immediate)
     return maskcache[immediate]
Beispiel #2
0
def square_701_patience(out_data, in_data, n, callee_saved=0):
    """Emit, through ``x86.*``, the instructions for a 701-bit permutation.

    The 701 source bit positions (padded with ``ZERO`` to 704 entries) and
    the destination ordering ``gen_sequence(n, 701)`` (padded the same way)
    are both split into 64-entry words.  For every (input word, output
    word) pair, the bits they share are grouped patience-sort style into
    "piles" whose source positions are non-decreasing in destination
    order, so each pile can be moved by one pext/pdep pair (or by a plain
    AND plus rotate when the bits are contiguous).  All 64 rotations of the
    output word are tried and the one needing the fewest piles is used; the
    rotation is undone on the extracted bits with ``rol``.

    NOTE(review): the name suggests this is n-fold squaring in a ring of
    degree 701 -- confirm against ``gen_sequence``.

    Args:
        out_data: indexable destination operands, one per output word;
            ``out_data[11]`` is additionally zeroed at the end.
        in_data: indexable source operands, one per input word.
        n: forwarded unchanged to ``gen_sequence``.
        callee_saved: how many times ``x86.push_callee_saved(64)`` is
            called up front (matched by the same number of pops at the end).
    """
    source_bits = list(range(701)) + 3 * [ZERO]
    regs = split_in_size_n(source_bits, 64)

    seq = gen_sequence(n, 701) + 3 * [ZERO]
    seq_r = split_in_size_n(seq, 64)

    # moved[i] records whether out_data[i] has been written yet, so the
    # first contribution is a mov and all later ones are xor-ed in.
    moved = [False] * len(seq_r)

    r = Register(64)
    t1 = Register(64)

    for _ in range(callee_saved):
        x86.push_callee_saved(64)

    maskcache = OrderedDict()

    def mask_to_register(mask):
        """Return a register loaded with ``mask``, reusing cached ones.

        Hits are moved to the end of ``maskcache``; when no new
        ``MaskRegister`` can be allocated, the register of the entry at
        the front of the cache is recycled.
        """
        mask = Mask.as_immediate(mask)
        if mask in maskcache:
            maskcache.move_to_end(mask)
            return maskcache[mask]
        try:
            maskreg = MaskRegister(64, mask)
        except AllocationError:
            _, maskreg = maskcache.popitem(False)
        x86.mov(maskreg, mask)
        maskcache[mask] = maskreg
        return maskreg

    for j, inreg in enumerate(regs):
        x86.mov(r, in_data[j])
        for i, seqreg in enumerate(seq_r):
            # Build the piles for every possible rotation of the output word.
            piledict = {}
            for rotation in range(64):
                ror_seqreg = seqreg[rotation:] + seqreg[:rotation]
                piles = []
                overlap = [bit for bit in ror_seqreg
                           if bit in inreg and bit != ZERO]
                for bit in overlap:
                    for pile in piles:
                        # Piles are created non-empty, so pile[-1] is safe.
                        if pile[-1] <= bit:
                            pile.append(bit)
                            break
                    else:  # doesn't fit on any existing pile: start a new one
                        piles.append([bit])
                piledict[rotation] = piles

            min_pile_key = min(piledict, key=lambda rot: len(piledict[rot]))
            # Prefer no rotation at all when it is just as good.
            if len(piledict[0]) == len(piledict[min_pile_key]):
                min_pile_key = 0
            ror_seqreg = seqreg[min_pile_key:] + seqreg[:min_pile_key]

            for pile in piledict[min_pile_key]:
                emask = [ZERO] * 64  # where the pile's bits sit in the input
                dmask = [ZERO] * 64  # where they go in the rotated output
                for bit in pile:
                    emask[inreg.index(bit)] = ONE
                    dmask[ror_seqreg.index(bit)] = ONE

                # Rotation still owed to this pile to undo ror_seqreg.  Kept
                # per pile: folding it into an earlier rol must not cancel
                # the rotation for the remaining piles of this output word.
                pending_rol = min_pile_key

                # For consecutive bits, we do not even need pext/pdep
                if (Mask.consec(dmask) and Mask.consec(emask) and
                        (Mask.degree(emask) < 32 or Mask.degree(dmask) < 32)):
                    delta = (Mask.degree(dmask) - Mask.degree(emask)) % 64
                    x86.mov(t1, r)
                    if Mask.degree(emask) < 32:
                        x86.iand(t1, Mask.as_immediate(emask))
                        x86.rol(t1, delta + pending_rol)
                        pending_rol = 0  # to avoid two rols
                    else:
                        x86.rol(t1, delta)
                        x86.iand(t1, Mask.as_immediate(dmask))
                else:
                    # if we can extract using AND instead..
                    if Mask.consec(emask, True) and Mask.degree(emask) < 32:
                        x86.mov(t1, r)
                        x86.iand(t1, Mask.as_immediate(emask))
                    else:
                        x86.pext(t1, r, mask_to_register(emask))
                    x86.pdep(t1, t1, mask_to_register(dmask))

                if pending_rol > 0:
                    x86.rol(t1, pending_rol)
                if moved[i]:  # stored per i, as it's not the outer loop
                    x86.xor(out_data[i], t1)
                else:
                    x86.mov(out_data[i], t1)
                    moved[i] = True
    x86.movq(out_data[11], 0)  # to fill up all 768 bits

    for maskreg in maskcache.values():
        maskreg.free()

    for _ in range(callee_saved):
        x86.pop_callee_saved(64)