NEG(reg_n) LEA(reg_mask, [reg_mask + reg_n * 4 + 16]) ymm_mask = YMMRegister() VMOVUPS(ymm_mask, [reg_mask]) ymm_temp = YMMRegister() VMASKMOVPS(ymm_temp, ymm_mask, [reg_v]) VBLENDVPS(ymm_temp, ymm_temp, ymm_m, ymm_mask) VMAXPS(ymm_m, ymm_m, ymm_temp) ymm_temp = YMMRegister() VPERM2F128(ymm_temp, ymm_m, ymm_m, 0x01) VMAXPS(ymm_m, ymm_m, ymm_temp) VPERMILPS(ymm_temp, ymm_m, _MM_SHUFFLE(1, 0, 3, 2)) VMAXPS(ymm_m, ymm_m, ymm_temp) VPERMILPS(ymm_temp, ymm_m, _MM_SHUFFLE(2, 3, 0, 1)) VMAXPS(ymm_m, ymm_m, ymm_temp) RETURN(ymm_m.as_xmm) arg_n = Argument(size_t, "n") arg_v = Argument(ptr(const_float_), "v") arg_c = Argument(float_, "c") with Function("sum_exp_minus_c__avx2", (arg_n, arg_v, arg_c), float_, target=uarch.default + isa.avx2): reg_n = GeneralPurposeRegister64()
def fft8_within_rows(ymm_real_rows, ymm_imag_rows, transformation="forward"):
    """Emit an 8-point complex FFT computed inside each YMM register row.

    A row is a pair of registers: one holding eight float32 real parts and
    one holding the eight matching imaginary parts. The stages are emitted
    in this order: FFT8 butterfly, FFT8 twiddle multiplication, 2x FFT4
    butterfly, 2x FFT4 twiddle multiplication, 4x FFT2 butterfly, a
    bit-reversal lane permutation and, for the inverse transformation only,
    a multiplication by 0.125.

    Args:
        ymm_real_rows: A YMMRegister, or a list of YMMRegister objects, with
            the real parts of each row.
        ymm_imag_rows: A YMMRegister, or a list of YMMRegister objects, with
            the imaginary parts of each row.
        transformation: Either "forward" or "inverse".

    Returns:
        None. Instructions are emitted as a side effect and the outputs are
        written to the registers that were passed in.
    """
    if isinstance(ymm_real_rows, YMMRegister) and isinstance(ymm_imag_rows, YMMRegister):
        # A single row was passed as bare registers: wrap each in a list.
        return fft8_within_rows([ymm_real_rows], [ymm_imag_rows], transformation)

    for ymm_rows in (ymm_real_rows, ymm_imag_rows):
        assert isinstance(ymm_rows, list) and all(isinstance(ymm_row, YMMRegister) for ymm_row in ymm_rows)
    assert transformation in {"forward", "inverse"}

    is_forward = transformation == "forward"
    # The (real, imag) pairs are walked once per stage; build them up front.
    row_pairs = list(zip(ymm_real_rows, ymm_imag_rows))

    # FFT8: Butterfly
    ymm_half_signs = YMMRegister()
    VMOVAPS(ymm_half_signs, Constant.float32x8(+1.0, +1.0, +1.0, +1.0, -1.0, -1.0, -1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            # Immediate 0x01 exchanges the two 128-bit halves of the register.
            VPERM2F128(ymm_swapped, ymm_part, ymm_part, 0x01)
            VFMADD132PS(ymm_part, ymm_swapped, ymm_half_signs)

    # FFT8: Multiplication by twiddle factors
    ymm_cos = YMMRegister()
    VMOVAPS(
        ymm_cos,
        Constant.float32x8(
            1.0, 1.0, 1.0, 1.0,
            cos_npi_over_4[0], cos_npi_over_4[1], cos_npi_over_4[2], cos_npi_over_4[3]))
    ymm_sin = YMMRegister()
    VMOVAPS(
        ymm_sin,
        Constant.float32x8(
            0.0, 0.0, 0.0, 0.0,
            sin_npi_over_4[0], sin_npi_over_4[1], sin_npi_over_4[2], sin_npi_over_4[3]))
    # The direction only decides which part adds and which subtracts the
    # sine term.
    if is_forward:
        accumulate_real, accumulate_imag = VFMADD231PS, VFNMADD231PS
    else:
        accumulate_real, accumulate_imag = VFNMADD231PS, VFMADD231PS
    for ymm_real, ymm_imag in row_pairs:
        ymm_twiddled_real, ymm_twiddled_imag = YMMRegister(), YMMRegister()
        VMULPS(ymm_twiddled_real, ymm_real, ymm_cos)
        VMULPS(ymm_twiddled_imag, ymm_imag, ymm_cos)
        accumulate_real(ymm_twiddled_real, ymm_imag, ymm_sin)
        accumulate_imag(ymm_twiddled_imag, ymm_real, ymm_sin)
        # NOTE(review): presumably makes the caller's register objects refer
        # to the freshly computed values -- confirm against PeachPy's SWAP.
        SWAP.REGISTERS(ymm_real, ymm_twiddled_real)
        SWAP.REGISTERS(ymm_imag, ymm_twiddled_imag)

    # 2x FFT4: Butterfly
    ymm_quarter_signs = YMMRegister()
    VMOVAPS(ymm_quarter_signs, Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            VPERMILPS(ymm_swapped, ymm_part, _MM_SHUFFLE(1, 0, 3, 2))
            VFMADD132PS(ymm_part, ymm_swapped, ymm_quarter_signs)

    # 2x FFT4: Multiplication by twiddle factors
    for ymm_real, ymm_imag in row_pairs:
        ymm_mixed_real, ymm_mixed_imag = YMMRegister(), YMMRegister()
        # Blend mask 0b10001000 takes elements 3 and 7 from the other part.
        VBLENDPS(ymm_mixed_real, ymm_real, ymm_imag, 0b10001000)
        VBLENDPS(ymm_mixed_imag, ymm_imag, ymm_real, 0b10001000)
        # XOR with -0.0 flips the sign of elements 3 and 7; which part gets
        # negated depends on the direction.
        ymm_negated = ymm_mixed_imag if is_forward else ymm_mixed_real
        VXORPS(
            ymm_negated, ymm_negated,
            Constant.float32x8(+0.0, +0.0, +0.0, -0.0, +0.0, +0.0, +0.0, -0.0))
        SWAP.REGISTERS(ymm_real, ymm_mixed_real)
        SWAP.REGISTERS(ymm_imag, ymm_mixed_imag)

    # 4x FFT2: Butterfly
    ymm_pair_signs = YMMRegister()
    VMOVAPS(ymm_pair_signs, Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            VPERMILPS(ymm_swapped, ymm_part, _MM_SHUFFLE(2, 3, 0, 1))
            VFMADD132PS(ymm_part, ymm_swapped, ymm_pair_signs)

    # Bit reversal
    ymm_lane_order = YMMRegister()
    VMOVAPS(ymm_lane_order, Constant.uint32x8(0, 4, 2, 6, 1, 5, 3, 7))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            VPERMPS(ymm_part, ymm_lane_order, ymm_part)

    # Scale
    if transformation == "inverse":
        ymm_scale = YMMRegister()
        VMOVAPS(ymm_scale, Constant.float32x8(0.125))
        for ymm_pair in row_pairs:
            for ymm_part in ymm_pair:
                VMULPS(ymm_part, ymm_part, ymm_scale)
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit a 16-point complex FFT computed inside each row of YMM registers.

    A row is described by two 2-tuples of YMMRegister objects: one tuple for
    the real parts and one for the imaginary parts, so each part spans two
    registers of eight float32 values. The stages are emitted in this order:
    FFT16 butterfly and twiddle multiplication, 2x FFT8 butterfly and twiddle
    multiplication, 4x FFT4 butterfly, the FFT4 twiddle multiplication fused
    with the 8x FFT2 butterfly, and finally an optional lane permutation.

    Args:
        ymm_real_rows: A 2-tuple of YMMRegister objects (single row), or a
            list of such tuples, holding the real parts.
        ymm_imag_rows: A 2-tuple of YMMRegister objects (single row), or a
            list of such tuples, holding the imaginary parts.
        bit_reversal: When true, emit the final VPERMPS permutation on every
            register; when false, the outputs are left in the order given by
            the last layout comment in the body.

    Returns:
        None. Instructions are emitted as a side effect and the outputs are
        written to the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # A single row was passed as bare tuples: wrap each in a list.
        # bit_reversal must be forwarded explicitly, otherwise the recursive
        # call falls back to the default and ignores the caller's choice.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Only the second register of each part is multiplied by twiddles.
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        # NOTE(review): presumably makes the caller's register objects refer
        # to the freshly computed values -- confirm against PeachPy's SWAP.
        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    # `* 2` repeats the coefficient sequence (it is unpacked into float32x8
    # right after), presumably once per 128-bit half.
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x4 x5  x8  x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor, Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1], _MM_SHUFFLE(2, 0, 2, 0))
        # The odd elements take their second source from the *imaginary*
        # part (and vice versa below): real and imaginary are exchanged for
        # those elements as part of the fused twiddle multiplication.
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0], ymm_new_imag[1], scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])
def fft8_within_rows(ymm_real_rows, ymm_imag_rows, transformation="forward"):
    """Emit an 8-point complex FFT computed inside each YMM register row.

    A row is a pair of registers: one holding eight float32 real parts and
    one holding the eight matching imaginary parts. The stages are emitted
    in this order: FFT8 butterfly, FFT8 twiddle multiplication, 2x FFT4
    butterfly, 2x FFT4 twiddle multiplication, 4x FFT2 butterfly, a
    bit-reversal lane permutation and, for the inverse transformation only,
    a multiplication by 0.125.

    Args:
        ymm_real_rows: A YMMRegister, or a list of YMMRegister objects, with
            the real parts of each row.
        ymm_imag_rows: A YMMRegister, or a list of YMMRegister objects, with
            the imaginary parts of each row.
        transformation: Either "forward" or "inverse".

    Returns:
        None. Instructions are emitted as a side effect and the outputs are
        written to the registers that were passed in.
    """
    if isinstance(ymm_real_rows, YMMRegister) and isinstance(ymm_imag_rows, YMMRegister):
        # A single row was passed as bare registers: wrap each in a list.
        return fft8_within_rows([ymm_real_rows], [ymm_imag_rows], transformation)

    for ymm_rows in (ymm_real_rows, ymm_imag_rows):
        assert isinstance(ymm_rows, list) and all(isinstance(ymm_row, YMMRegister) for ymm_row in ymm_rows)
    assert transformation in {"forward", "inverse"}

    is_forward = transformation == "forward"
    # The (real, imag) pairs are walked once per stage; build them up front.
    row_pairs = list(zip(ymm_real_rows, ymm_imag_rows))

    # FFT8: Butterfly
    ymm_half_signs = YMMRegister()
    VMOVAPS(ymm_half_signs, Constant.float32x8(+1.0, +1.0, +1.0, +1.0, -1.0, -1.0, -1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            # Immediate 0x01 exchanges the two 128-bit halves of the register.
            VPERM2F128(ymm_swapped, ymm_part, ymm_part, 0x01)
            VFMADD132PS(ymm_part, ymm_swapped, ymm_half_signs)

    # FFT8: Multiplication by twiddle factors
    ymm_cos = YMMRegister()
    VMOVAPS(
        ymm_cos,
        Constant.float32x8(
            1.0, 1.0, 1.0, 1.0,
            cos_npi_over_4[0], cos_npi_over_4[1], cos_npi_over_4[2], cos_npi_over_4[3]))
    ymm_sin = YMMRegister()
    VMOVAPS(
        ymm_sin,
        Constant.float32x8(
            0.0, 0.0, 0.0, 0.0,
            sin_npi_over_4[0], sin_npi_over_4[1], sin_npi_over_4[2], sin_npi_over_4[3]))
    # The direction only decides which part adds and which subtracts the
    # sine term.
    if is_forward:
        accumulate_real, accumulate_imag = VFMADD231PS, VFNMADD231PS
    else:
        accumulate_real, accumulate_imag = VFNMADD231PS, VFMADD231PS
    for ymm_real, ymm_imag in row_pairs:
        ymm_twiddled_real, ymm_twiddled_imag = YMMRegister(), YMMRegister()
        VMULPS(ymm_twiddled_real, ymm_real, ymm_cos)
        VMULPS(ymm_twiddled_imag, ymm_imag, ymm_cos)
        accumulate_real(ymm_twiddled_real, ymm_imag, ymm_sin)
        accumulate_imag(ymm_twiddled_imag, ymm_real, ymm_sin)
        # NOTE(review): presumably makes the caller's register objects refer
        # to the freshly computed values -- confirm against PeachPy's SWAP.
        SWAP.REGISTERS(ymm_real, ymm_twiddled_real)
        SWAP.REGISTERS(ymm_imag, ymm_twiddled_imag)

    # 2x FFT4: Butterfly
    ymm_quarter_signs = YMMRegister()
    VMOVAPS(ymm_quarter_signs, Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            VPERMILPS(ymm_swapped, ymm_part, _MM_SHUFFLE(1, 0, 3, 2))
            VFMADD132PS(ymm_part, ymm_swapped, ymm_quarter_signs)

    # 2x FFT4: Multiplication by twiddle factors
    for ymm_real, ymm_imag in row_pairs:
        ymm_mixed_real, ymm_mixed_imag = YMMRegister(), YMMRegister()
        # Blend mask 0b10001000 takes elements 3 and 7 from the other part.
        VBLENDPS(ymm_mixed_real, ymm_real, ymm_imag, 0b10001000)
        VBLENDPS(ymm_mixed_imag, ymm_imag, ymm_real, 0b10001000)
        # XOR with -0.0 flips the sign of elements 3 and 7; which part gets
        # negated depends on the direction.
        ymm_negated = ymm_mixed_imag if is_forward else ymm_mixed_real
        VXORPS(
            ymm_negated, ymm_negated,
            Constant.float32x8(+0.0, +0.0, +0.0, -0.0, +0.0, +0.0, +0.0, -0.0))
        SWAP.REGISTERS(ymm_real, ymm_mixed_real)
        SWAP.REGISTERS(ymm_imag, ymm_mixed_imag)

    # 4x FFT2: Butterfly
    ymm_pair_signs = YMMRegister()
    VMOVAPS(ymm_pair_signs, Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            ymm_swapped = YMMRegister()
            VPERMILPS(ymm_swapped, ymm_part, _MM_SHUFFLE(2, 3, 0, 1))
            VFMADD132PS(ymm_part, ymm_swapped, ymm_pair_signs)

    # Bit reversal
    ymm_lane_order = YMMRegister()
    VMOVAPS(ymm_lane_order, Constant.uint32x8(0, 4, 2, 6, 1, 5, 3, 7))
    for ymm_pair in row_pairs:
        for ymm_part in ymm_pair:
            VPERMPS(ymm_part, ymm_lane_order, ymm_part)

    # Scale
    if transformation == "inverse":
        ymm_scale = YMMRegister()
        VMOVAPS(ymm_scale, Constant.float32x8(0.125))
        for ymm_pair in row_pairs:
            for ymm_part in ymm_pair:
                VMULPS(ymm_part, ymm_part, ymm_scale)
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit a 16-point complex FFT computed inside each row of YMM registers.

    A row is described by two 2-tuples of YMMRegister objects: one tuple for
    the real parts and one for the imaginary parts, so each part spans two
    registers of eight float32 values. The stages are emitted in this order:
    FFT16 butterfly and twiddle multiplication, 2x FFT8 butterfly and twiddle
    multiplication, 4x FFT4 butterfly, the FFT4 twiddle multiplication fused
    with the 8x FFT2 butterfly, and finally an optional lane permutation.

    Args:
        ymm_real_rows: A 2-tuple of YMMRegister objects (single row), or a
            list of such tuples, holding the real parts.
        ymm_imag_rows: A 2-tuple of YMMRegister objects (single row), or a
            list of such tuples, holding the imaginary parts.
        bit_reversal: When true, emit the final VPERMPS permutation on every
            register; when false, the outputs are left in the order given by
            the last layout comment in the body.

    Returns:
        None. Instructions are emitted as a side effect and the outputs are
        written to the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # A single row was passed as bare tuples: wrap each in a list.
        # bit_reversal must be forwarded explicitly, otherwise the recursive
        # call falls back to the default and ignores the caller's choice.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Only the second register of each part is multiplied by twiddles.
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        # NOTE(review): presumably makes the caller's register objects refer
        # to the freshly computed values -- confirm against PeachPy's SWAP.
        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    # `* 2` repeats the coefficient sequence (it is unpacked into float32x8
    # right after), presumably once per 128-bit half.
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x4 x5  x8  x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor, Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1], _MM_SHUFFLE(2, 0, 2, 0))
        # The odd elements take their second source from the *imaginary*
        # part (and vice versa below): real and imaginary are exchanged for
        # those elements as part of the fused twiddle multiplication.
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0], ymm_new_imag[1], scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])
VMASKMOVPS(ymm_row1[0], ymm_src_mask_columns_0_to_8, [reg_src_ptr]) VBLENDVPS(ymm_row1[0], ymm_minus_inf, ymm_row1[0], ymm_src_mask_columns_0_to_8) VMASKMOVPS(ymm_row1[1], ymm_src_mask_columns_8_to_16, [reg_src_ptr + YMMRegister.size]) VBLENDVPS(ymm_row1[1], ymm_minus_inf, ymm_row1[1], ymm_src_mask_columns_8_to_16) # ymm_row[0] = ( x7 x6 x5 x4 x3 x2 x1 x0 ) # ymm_row[1] = ( x15 x14 x13 x12 x11 x10 x9 x8 ) ymm_row = YMMRegister(), YMMRegister() VMAXPS(ymm_row[0], ymm_row0[0], ymm_row1[0]) VMAXPS(ymm_row[1], ymm_row0[1], ymm_row1[1]) # ymm_row[0] = ( x14 x12 x6 x4 x10 x8 x2 x0 ) # ymm_row[1] = ( x15 x13 x7 x5 x11 x9 x3 x1 ) ymm_tmp = YMMRegister() VSHUFPS(ymm_tmp, ymm_row[0], ymm_row[1], _MM_SHUFFLE(2, 0, 2, 0)) VSHUFPS(ymm_row[1], ymm_row[0], ymm_row[1], _MM_SHUFFLE(3, 1, 3, 1)) SWAP.REGISTERS(ymm_row[0], ymm_tmp) # ymm_out = ( y7 y6 y3 y2 y5 y4 y1 y0 ) ymm_out = YMMRegister() VMAXPS(ymm_out, ymm_row[0], ymm_row[1]) VPERMPD(ymm_out, ymm_out, _MM_SHUFFLE(3, 1, 2, 0)) VMASKMOVPS([reg_dst_ptr], ymm_dst_mask_columns_0_to_8, ymm_out) RETURN()
INC(reg_src_row_index) CMP(reg_src_row_index, reg_src_row_count) JAE(load_row1.end) VMASKMOVPS(ymm_row1[0], ymm_src_mask_columns_0_to_8, [reg_src_ptr]) VBLENDVPS(ymm_row1[0], ymm_minus_inf, ymm_row1[0], ymm_src_mask_columns_0_to_8) VMASKMOVPS(ymm_row1[1], ymm_src_mask_columns_8_to_16, [reg_src_ptr + YMMRegister.size]) VBLENDVPS(ymm_row1[1], ymm_minus_inf, ymm_row1[1], ymm_src_mask_columns_8_to_16) # ymm_row[0] = ( x7 x6 x5 x4 x3 x2 x1 x0 ) # ymm_row[1] = ( x15 x14 x13 x12 x11 x10 x9 x8 ) ymm_row = YMMRegister(), YMMRegister() VMAXPS(ymm_row[0], ymm_row0[0], ymm_row1[0]) VMAXPS(ymm_row[1], ymm_row0[1], ymm_row1[1]) # ymm_row[0] = ( x14 x12 x6 x4 x10 x8 x2 x0 ) # ymm_row[1] = ( x15 x13 x7 x5 x11 x9 x3 x1 ) ymm_tmp = YMMRegister() VSHUFPS(ymm_tmp, ymm_row[0], ymm_row[1], _MM_SHUFFLE(2, 0, 2, 0)) VSHUFPS(ymm_row[1], ymm_row[0], ymm_row[1], _MM_SHUFFLE(3, 1, 3, 1)) SWAP.REGISTERS(ymm_row[0], ymm_tmp) # ymm_out = ( y7 y6 y3 y2 y5 y4 y1 y0 ) ymm_out = YMMRegister() VMAXPS(ymm_out, ymm_row[0], ymm_row[1]) VPERMPD(ymm_out, ymm_out, _MM_SHUFFLE(3, 1, 2, 0)) VMASKMOVPS([reg_dst_ptr], ymm_dst_mask_columns_0_to_8, ymm_out) RETURN()