def mask_to_register(mask):
    mask = Mask.as_immediate(mask)
    if mask in maskcache:
        # Cache hit: mark this mask as most recently used.
        maskcache.move_to_end(mask)
        return maskcache[mask]
    try:
        maskreg = MaskRegister(64, mask)
    except AllocationError:
        # No free register: evict the least recently used cached mask
        # and reuse its register for the new value.
        _, maskreg = maskcache.popitem(False)
        x86.mov(maskreg, mask)
    maskcache[mask] = maskreg
    return maskreg
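# A minimal standalone sketch of the LRU idiom mask_to_register relies
# on, assuming maskcache is a collections.OrderedDict defined elsewhere
# in this module: move_to_end() refreshes an entry on a hit, and
# popitem(last=False) evicts the least recently used entry when no
# fresh register can be allocated.  Not used by the generator;
# _LRU_CAPACITY is a hypothetical stand-in for the register count.
from collections import OrderedDict

_LRU_CAPACITY = 8
_lru_cache = OrderedDict()

def _lru_lookup(key, compute):
    if key in _lru_cache:
        _lru_cache.move_to_end(key)      # mark as most recently used
        return _lru_cache[key]
    if len(_lru_cache) >= _LRU_CAPACITY:
        _lru_cache.popitem(last=False)   # evict least recently used
    _lru_cache[key] = compute(key)
    return _lru_cache[key]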
def square_350_701(dst, src):
    """
    Requires source and destination registers to be disjoint.
    """
    r = src
    r_out = dst

    # Extract bit 0 (the constant term) so it can be handled
    # separately, and clear it in r[0].
    maskreg = MaskRegister()
    lowbitmask = Mask('0' * 255 + '1')
    x86.vmovdqa(maskreg, lowbitmask)
    lowbitreg = Register()
    x86.vpand(lowbitreg, maskreg, r[0])
    x86.vpandn(r[0], maskreg, r[0])

    # Shift the remaining bits left by two across the three 256-bit
    # limbs: vpsrlq/vpermq move the two bits that cross each 64-bit
    # lane boundary, and the pair that wraps around the register
    # boundary (selected by mask0001) is carried into the next limb.
    rest = Register()
    twobits = Register()
    nexttwobits = Register()
    mask0001 = Mask('0001')
    x86.vmovdqa(maskreg, mask0001)
    for i in range(2, -1, -1):
        x86.vpsllq(rest, r[i], 2)
        x86.vpsrlq(twobits, r[i], 62)
        x86.vpermq(twobits, twobits, '10010011')
        x86.vpand(nexttwobits, maskreg, twobits)
        x86.vpandn(twobits, maskreg, twobits)
        x86.vpxor(r[i], rest, twobits)
        if i + 1 < 3:
            x86.vpxor(r[i + 1], r[i + 1], nexttwobits)

    # One mask per bit position within a byte, replicated over all
    # 32 bytes of a 256-bit register.
    mask_bit_in_byte = [
        Mask(32 * ([ZERO] * i + [ONE] + [ZERO] * (7 - i)))
        for i in range(8)
    ]

    # Reverse the bit order within every byte: bit j of each byte is
    # masked out, shifted by 7 - 2*j (vpshlq here presumably denotes a
    # shift-by-immediate that shifts right for negative counts, as the
    # XOP instruction does for register operands), and the eight
    # shifted slices are xored back together.
    bits = Register()
    accum = Register()
    for i in range(2, -1, -1):
        for j in range(8):
            x86.vpand(bits, r[i], mask_bit_in_byte[j])
            if j == 0:
                x86.vpshlq(accum, bits, 7 - 2 * j)
            else:
                x86.vpshlq(bits, bits, 7 - 2 * j)
                if j == 7:
                    x86.vpxor(r[i], accum, bits)
                else:
                    x86.vpxor(accum, accum, bits)

    # Reposition the saved bit 0 and fold it back into r[2], so the
    # byte shuffle below places it at bit 0 of the result.
    x86.vpermq(lowbitreg, lowbitreg, '11001111')
    x86.vpshlq(lowbitreg, lowbitreg, 56)
    x86.vpxor(r[2], lowbitreg, r[2])

    # Reverse the byte order across the three limbs, completing the
    # overall reversal.  vpshufb only shuffles within 128-bit lanes,
    # so each result is fixed up with a vpermq afterwards.
    indices = IndicesMask(
        list(range(15, -1, -1)) + [None] * 8 + list(range(7, -1, -1)))
    x86.vpshufb(r_out[2], r[0], indices)
    x86.vpermq(r_out[2], r_out[2], '10010011')
    t1 = Register()
    for i in range(2):
        indices = IndicesMask([None] * 24 + list(range(15, 7, -1)))
        x86.vpshufb(r_out[1 - i], r[i], indices)
        indices = IndicesMask(
            list(range(15, -1, -1)) + list(range(7, -1, -1)) + [None] * 8)
        x86.vpshufb(t1, r[i + 1], indices)
        x86.vpxor(r_out[1 - i], t1, r_out[1 - i])
        x86.vpermq(r_out[1 - i], r_out[1 - i], '11010010')
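# Plain-Python reference model for the routine above, under the
# assumption (suggested by the name, the special treatment of bit 0,
# and the reversal shuffles) that square_350_701 performs 350
# successive squarings in GF(2)[x]/(x^701 - 1).  Squaring in
# characteristic 2 sends coefficient i to 2i, and reduction mod
# x^701 - 1 folds that to 2i mod 701, so k squarings permute bit i to
# i * 2**k mod 701.  square_k_701_ref is a hypothetical helper for
# checking generated code against, not part of the generator itself.
def square_k_701_ref(f, k):
    m = pow(2, k, 701)   # exponent multiplier applied by k squarings
    out = 0
    for i in range(701):
        if (f >> i) & 1:
            out |= 1 << (i * m % 701)
    return out

# For k = 350, 2**350 == -1 (mod 701), so the permutation fixes bit 0
# and reverses bits 1..700 -- consistent with extracting the low bit
# and byte/bit-reversing the rest above.
assert pow(2, 350, 701) == 700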