def square_701_shufbytes(out_data, in_data, n):
    r = Register()
    out = [Register() for _ in range(3)]
    moved = [False] * 3
    t1 = Register()
    t2 = Register()
    t3 = Register()
    t4 = Register()
    t5 = Register()

    seq = gen_sequence(n, 701) + 67 * [ZERO]
    seq_regvalues = split_in_size_n(seq, 256)

    for in_data_fragment in in_data:
        x86.vmovdqa(r, in_data_fragment)
        shift_in = shifted = r
        offset = 0
        for delta in range(8):  # 8 possible rotations may be necessary
            rol_meta = None
            if delta > 0:
                # if we've made the previous rotation persistent
                if shift_in is shifted:
                    shifted = t4 if shifted is t3 else t3
                d_nett = delta - offset
                rol_meta = len(x86.INSTRUCTIONS), str(shifted), str(t1)
                x86.macro_v256rol(shifted, shift_in, d_nett, t1, t2)
                rotated = [b for d in range(d_nett) for b in shifted[d::64]]
            # vpshufb cannot cross over xmm lanes
            for swap_xmms in [False, True]:
                if swap_xmms:
                    swapped = t5
                    x86.vpermq(swapped, shifted, '01001110')
                else:
                    swapped = shifted
                r_bytes = split_in_size_n(swapped, 8)
                while True:  # could be necessary to extract twice from same r
                    bitmask = [[] for _ in range(len(seq_regvalues))]
                    shufmask = [None] * 32
                    for k, seq_value in enumerate(seq_regvalues):
                        s_bytes = split_in_size_n(seq_value, 8)
                        s_xmms = split_in_size_n(s_bytes, 16)
                        r_xmms = split_in_size_n(r_bytes, 16)
                        for i, (s128, r128) in enumerate(zip(s_xmms, r_xmms)):
                            for l, s_byte in enumerate(s128):
                                for m, r_byte in enumerate(r128):
                                    # if this byte is already taken
                                    if (shufmask[i * 16 + l] is not None and
                                            shufmask[i * 16 + l] != m):
                                        continue
                                    bits = [ONE if x == y and x != ZERO else ZERO
                                            for x, y in zip(r_byte, s_byte)]
                                    if ONE not in bits:
                                        continue
                                    shufmask[i * 16 + l] = m
                                    bitmask[k] += bits
                                    break
                                else:
                                    bitmask[k] += [ZERO] * 8
                                    continue
                                for m, (x, y) in enumerate(zip(bits, s_byte)):
                                    if x == ONE:
                                        seq_regvalues[k][i * 128 + l * 8 + m] = None
                                s_bytes = split_in_size_n(seq_regvalues[k], 8)
                    if all(x is None for x in shufmask):
                        break
                    x86.vpshufb(t2, swapped, IndicesMask(shufmask))
                    for k, seq_value in enumerate(seq_regvalues):
                        if ONE not in bitmask[k]:
                            continue
                        if not moved[k]:
                            x86.vpand(out[k], t2, Mask(bitmask[k]))
                            moved[k] = True
                        else:
                            x86.vpand(t1, t2, Mask(bitmask[k]))
                            x86.vpxor(out[k], out[k], t1)
                        # check if we used any of the rotated bits
                        for maskbit, bit in zip(bitmask[k], t2):
                            if delta > 0 and bit in rotated and maskbit is ONE:
                                rol_meta = None
            # TODO this is an ugly hack that should be abstracted
            if rol_meta is not None:
                i, dest, temp = rol_meta
                del x86.INSTRUCTIONS[i]  # delete srlq
                x86.INSTRUCTIONS[i] = x86.INSTRUCTIONS[i].replace(temp, dest)
                del x86.INSTRUCTIONS[i + 1]  # delete permq
                del x86.INSTRUCTIONS[i + 1]  # delete xor
            else:
                # if we're keeping the rotation, make it persistent so that the
                # next rotation is smaller (and thus more likely ignorable)
                shift_in = shifted
                offset = delta

    for m, r in zip(out_data, out):
        x86.vmovdqa(m, r)
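
# --- Illustrative reference model (not used by the generators in this file) ---
# The sketch below only documents the permutation that square_701_shufbytes
# appears to implement. It assumes -- this is not confirmed anywhere in this
# file -- that gen_sequence(n, 701) describes n-fold squaring in
# GF(2)[x]/(x^701 - 1), where squaring is the pure bit permutation that sends
# coefficient i to position 2*i mod 701, so n squarings send it to
# (i << n) % 701. The helper name reference_square_701 is ours.
def reference_square_701(bits, n):
    """Hypothetical pure-Python model: square a 701-coefficient GF(2) polynomial n times."""
    assert len(bits) == 701
    out = [0] * 701
    for i, b in enumerate(bits):
        # n-fold squaring moves coefficient i to position (i * 2^n) mod 701
        out[(i << n) % 701] = b
    return out
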
def square_701_patience(out_data, in_data, n, callee_saved=0):
    x = list(range(701)) + 3 * [ZERO]
    regs = split_in_size_n(x, 64)
    seq = gen_sequence(n, 701) + 3 * [ZERO]
    seq_r = split_in_size_n(seq, 64)
    moved = [False] * len(seq_r)
    r = Register(64)
    t1 = Register(64)

    for i in range(callee_saved):
        x86.push_callee_saved(64)

    maskcache = OrderedDict()

    def mask_to_register(mask):
        mask = Mask.as_immediate(mask)
        if mask in maskcache:
            maskcache.move_to_end(mask)
            return maskcache[mask]
        try:
            maskreg = MaskRegister(64, mask)
        except AllocationError:
            _, maskreg = maskcache.popitem(False)
            x86.mov(maskreg, mask)
        maskcache[mask] = maskreg
        return maskreg

    for j, inreg in enumerate(regs):
        x86.mov(r, in_data[j])
        for i, seqreg in enumerate(seq_r):
            piledict = {}
            for rotation in range(64):
                ror_seqreg = seqreg[rotation:] + seqreg[:rotation]
                piles = []
                overlap = [x for x in ror_seqreg if x in inreg and x != ZERO]
                for x in overlap:
                    for pile in piles:
                        try:
                            if pile[-1] <= x:
                                pile.append(x)
                                break
                        except IndexError:  # pile is empty
                            pass
                    else:
                        # doesn't fit on any existing pile: start a new pile
                        piles.append([x])
                piledict[rotation] = piles
            min_pile_key = min(piledict, key=lambda x: len(piledict.get(x)))
            if len(piledict[0]) == len(piledict[min_pile_key]):
                min_pile_key = 0
            if min_pile_key > 0:
                ror_seqreg = seqreg[min_pile_key:] + seqreg[:min_pile_key]
            else:
                ror_seqreg = seqreg
            for pile in piledict[min_pile_key]:
                emask = [ZERO] * 64
                for bit in pile:
                    emask[inreg.index(bit)] = ONE
                dmask = [ZERO] * 64
                for bit in pile:
                    dmask[ror_seqreg.index(bit)] = ONE
                # For consecutive bits, we do not even need pext/pdep
                if (Mask.consec(dmask) and Mask.consec(emask) and
                        (Mask.degree(emask) < 32 or Mask.degree(dmask) < 32)):
                    delta = (Mask.degree(dmask) - Mask.degree(emask)) % 64
                    x86.mov(t1, r)
                    if Mask.degree(emask) < 32:
                        x86.iand(t1, Mask.as_immediate(emask))
                        x86.rol(t1, delta + min_pile_key)
                        min_pile_key = 0  # to avoid two rols
                    else:
                        x86.rol(t1, delta)
                        x86.iand(t1, Mask.as_immediate(dmask))
                else:
                    # if we can extract using AND instead..
                    if Mask.consec(emask, True) and Mask.degree(emask) < 32:
                        x86.mov(t1, r)
                        x86.iand(t1, Mask.as_immediate(emask))
                    else:
                        x86.pext(t1, r, mask_to_register(emask))
                    x86.pdep(t1, t1, mask_to_register(dmask))
                if min_pile_key > 0:
                    x86.rol(t1, min_pile_key)
                if moved[i]:  # stored per i, as it's not the outer loop
                    x86.xor(out_data[i], t1)
                else:
                    x86.mov(out_data[i], t1)
                    moved[i] = True

    x86.movq(out_data[11], 0)  # to fill up all 768 bits

    for mask in maskcache.values():
        mask.free()
    for i in range(callee_saved):
        x86.pop_callee_saved(64)
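
# --- Illustrative example of the pile / pext / pdep idea (not generated code) ---
# square_701_patience groups bits into patience-sorting-style "piles": within a
# pile the bits appear in the same relative order in the source word and in the
# target word, which is exactly the property pext/pdep preserve, so one
# extract/deposit pair moves a whole pile. The pure-Python stand-ins below are
# our own sketch of the BMI2 semantics; the emask/dmask names mirror the
# generator above, but the example values and the _pext/_pdep/_pile_demo names
# are made up for illustration only.
def _pext(value, mask):
    """Software pext: gather the bits of value selected by mask into the low bits."""
    out, k = 0, 0
    for i in range(64):
        if (mask >> i) & 1:
            out |= ((value >> i) & 1) << k
            k += 1
    return out


def _pdep(value, mask):
    """Software pdep: scatter the low bits of value into the positions set in mask."""
    out, k = 0, 0
    for i in range(64):
        if (mask >> i) & 1:
            out |= ((value >> k) & 1) << i
            k += 1
    return out


def _pile_demo():
    """Move one pile (source bits 3, 5, 9 -> target bits 10, 12, 40) with one pext+pdep."""
    emask = (1 << 3) | (1 << 5) | (1 << 9)     # where the pile's bits sit in the input
    dmask = (1 << 10) | (1 << 12) | (1 << 40)  # where they must end up, in the same order
    word = 0b1000101000                        # bits 3, 5 and 9 set
    assert _pdep(_pext(word, emask), dmask) == dmask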