def mask_to_register(mask):
    """Return a 64-bit register holding *mask*, served from an LRU cache.

    ``maskcache`` (an OrderedDict) maps mask immediates to allocated
    MaskRegisters.  A cache hit is refreshed to the most-recent end and
    returned without emitting any code.  On a miss we allocate a fresh
    MaskRegister; if allocation fails, the least-recently-used cached
    register is evicted and reloaded with the new immediate via mov.
    """
    mask = Mask.as_immediate(mask)
    try:
        reg = maskcache.pop(mask)
    except KeyError:
        pass  # cache miss: fall through to allocate/evict below
    else:
        maskcache[mask] = reg  # reinsert at the most-recent end (LRU refresh)
        return reg
    try:
        reg = MaskRegister(64, mask)
    except AllocationError:
        # No free register available: recycle the least-recently-used one.
        _, reg = maskcache.popitem(last=False)
    x86.mov(reg, mask)
    maskcache[mask] = reg
    return reg
def square_701_patience(out_data, in_data, n, callee_saved=0):
    """Emit x86 code permuting the 701 bitsliced input bits into ``out_data``.

    The target bit order comes from ``gen_sequence(n, 701)``; the source
    order is the identity ``range(701)``.  Both are padded with three ZERO
    slots and split into 64-bit words.  For every (input word, output word)
    pair, the bits they share are grouped into index-order-preserving
    "piles" — a greedy patience-sorting pass run over all 64 rotations of
    the output word — so that each pile can be moved with a single
    rol/and or pext/pdep sequence.

    Args:
        out_data: 64-bit destination operands; indices 0..10 receive the
            permuted bits and index 11 is zeroed as padding (768 bits total).
        in_data: 64-bit source operands, one per input word.
        n: forwarded to gen_sequence; presumably selects which power of the
            permutation to generate — TODO confirm against gen_sequence.
        callee_saved: number of callee-saved registers to push on entry and
            pop on exit around the emitted code.
    """
    # Identity bit order of the input, padded to a multiple of 64 bits.
    x = list(range(701)) + 3 * [ZERO]
    regs = split_in_size_n(x, 64)
    # Desired bit order of the output, padded the same way.
    seq = gen_sequence(n, 701) + 3 * [ZERO]
    seq_r = split_in_size_n(seq, 64)
    # moved[i]: whether out_data[i] has been written yet
    # (first write is a mov, later contributions are xor-accumulated).
    moved = [False] * len(seq_r)
    r = Register(64)   # holds the current input word
    t1 = Register(64)  # scratch for each extracted/deposited pile
    for i in range(callee_saved):
        x86.push_callee_saved(64)
    # LRU cache of registers holding pext/pdep mask constants.
    maskcache = OrderedDict()

    def mask_to_register(mask):
        # Return a register containing `mask`, reusing/evicting via LRU.
        mask = Mask.as_immediate(mask)
        if mask in maskcache:
            maskcache.move_to_end(mask)
            return maskcache[mask]
        try:
            maskreg = MaskRegister(64, mask)
        except AllocationError:
            # No free register: evict the least-recently-used mask.
            _, maskreg = maskcache.popitem(False)
        x86.mov(maskreg, mask)
        maskcache[mask] = maskreg
        return maskreg

    for j, inreg in enumerate(regs):
        x86.mov(r, in_data[j])
        for i, seqreg in enumerate(seq_r):
            # For every rotation of the output word, patience-sort the bits
            # it shares with this input word into order-preserving piles.
            piledict = {}
            for rotation in range(64):
                ror_seqreg = seqreg[rotation:] + seqreg[:rotation]
                piles = []
                overlap = [x for x in ror_seqreg if x in inreg and x != ZERO]
                for x in overlap:
                    for pile in piles:
                        try:
                            if pile[-1] <= x:
                                pile.append(x)
                                break
                        except IndexError:  # pile is empty
                            pass
                    else:  # doesn't fit on any existing pile: start a new pile
                        piles.append([x])
                piledict[rotation] = piles
            # Pick the rotation needing the fewest piles; prefer rotation 0
            # on a tie, since it needs no compensating rol afterwards.
            min_pile_key = min(piledict, key=lambda x: len(piledict.get(x)))
            if len(piledict[0]) == len(piledict[min_pile_key]):
                min_pile_key = 0
            if min_pile_key > 0:
                ror_seqreg = seqreg[min_pile_key:] + seqreg[:min_pile_key]
            else:
                ror_seqreg = seqreg
            for pile in piledict[min_pile_key]:
                # emask: positions of the pile's bits within the input word.
                emask = [ZERO] * 64
                for bit in pile:
                    emask[inreg.index(bit)] = ONE
                # dmask: positions of the same bits in the rotated output word.
                dmask = [ZERO] * 64
                for bit in pile:
                    dmask[ror_seqreg.index(bit)] = ONE
                # For consecutive bits, we do not even need pext/pdep:
                # one rotate plus one AND (with whichever mask fits a 32-bit
                # immediate) moves the whole pile.
                if (Mask.consec(dmask) and Mask.consec(emask) and
                        (Mask.degree(emask) < 32 or Mask.degree(dmask) < 32)):
                    delta = (Mask.degree(dmask) - Mask.degree(emask)) % 64
                    x86.mov(t1, r)
                    if Mask.degree(emask) < 32:
                        x86.iand(t1, Mask.as_immediate(emask))
                        # Fold the final compensating rotation into this rol.
                        x86.rol(t1, delta + min_pile_key)
                        # to avoid two rols
                        # NOTE(review): this reset also suppresses the final
                        # rol for any later piles of this word pair, whose
                        # dmask was built against the rotated word — confirm
                        # this branch is only reachable when that is safe.
                        min_pile_key = 0
                    else:
                        x86.rol(t1, delta)
                        x86.iand(t1, Mask.as_immediate(dmask))
                else:
                    # if we can extract using AND instead..
                    # (assumes Mask.consec(emask, True) means "consecutive
                    # starting at bit 0", so the AND result is already the
                    # compacted low-bit field that pdep expects — TODO confirm)
                    if Mask.consec(emask, True) and Mask.degree(emask) < 32:
                        x86.mov(t1, r)
                        x86.iand(t1, Mask.as_immediate(emask))
                    else:
                        x86.pext(t1, r, mask_to_register(emask))
                    x86.pdep(t1, t1, mask_to_register(dmask))
                # Undo the rotation chosen for this output word.
                if min_pile_key > 0:
                    x86.rol(t1, min_pile_key)
                if moved[i]:  # stored per i, as it's not the outer loop
                    x86.xor(out_data[i], t1)
                else:
                    x86.mov(out_data[i], t1)
                    moved[i] = True
    x86.movq(out_data[11], 0)  # to fill up all 768 bits
    # Release cached mask registers and restore callee-saved registers.
    for mask in maskcache.values():
        mask.free()
    for i in range(callee_saved):
        x86.pop_callee_saved(64)