def transpose8x3(xmm_rows):
    """Emit instructions transposing an 8x3 matrix from XMM rows into YMM columns.

    Args:
        xmm_rows: list of exactly eight XMMRegister objects. Per the lane
            diagrams below, row ``i`` is laid out as ( 0.0, gi2, gi1, gi0 ).

    Returns:
        A list of three YMM registers (the first three entries of the working
        list). Per the lane diagrams, these hold columns 0, 1 and 2 of the
        input, one element per input row.
    """
    assert isinstance(xmm_rows, list)
    assert len(xmm_rows) == 8
    assert all(isinstance(reg, XMMRegister) for reg in xmm_rows)

    # xmm_rows[0] = ( 0.0, g02, g01, g00 )
    # xmm_rows[1] = ( 0.0, g12, g11, g10 )
    # xmm_rows[2] = ( 0.0, g22, g21, g20 )
    # xmm_rows[3] = ( 0.0, g32, g31, g30 )
    # xmm_rows[4] = ( 0.0, g42, g41, g40 )
    # xmm_rows[5] = ( 0.0, g52, g51, g50 )
    # xmm_rows[6] = ( 0.0, g62, g61, g60 )
    # xmm_rows[7] = ( 0.0, g72, g71, g70 )

    # Pair row i (low 128 bits) with row i + 4 (high 128 bits).
    ymm_rows = [YMMRegister() for _ in range(4)]
    for idx, ymm_packed in enumerate(ymm_rows):
        VINSERTF128(ymm_packed, xmm_rows[idx].as_ymm, xmm_rows[idx + 4], 1)

    # ymm_rows[0] = ( 0.0, g42, g41, g40, 0.0, g02, g01, g00 )
    # ymm_rows[1] = ( 0.0, g52, g51, g50, 0.0, g12, g11, g10 )
    # ymm_rows[2] = ( 0.0, g62, g61, g60, 0.0, g22, g21, g20 )
    # ymm_rows[3] = ( 0.0, g72, g71, g70, 0.0, g32, g31, g30 )

    # Interleave 32-bit elements of adjacent packed rows into fresh registers,
    # then swap so that ymm_rows refers to the interleaved results.
    ymm_interleaved = [YMMRegister() for _ in range(4)]
    for base in (0, 2):
        VUNPCKLPS(ymm_interleaved[base], ymm_rows[base], ymm_rows[base + 1])
        VUNPCKHPS(ymm_interleaved[base + 1], ymm_rows[base], ymm_rows[base + 1])
    for idx in range(4):
        SWAP.REGISTERS(ymm_rows[idx], ymm_interleaved[idx])

    # ymm_rows[0] = ( g51, g41, g50, g40, g11, g01, g10, g00 )
    # ymm_rows[1] = ( 0.0, 0.0, g52, g42, 0.0, 0.0, g12, g02 )
    # ymm_rows[2] = ( g71, g61, g70, g60, g31, g21, g30, g20 )
    # ymm_rows[3] = ( 0.0, 0.0, g72, g62, 0.0, 0.0, g32, g22 )

    transpose2x2x2x64(ymm_rows[0], ymm_rows[2])
    # ymm_rows[0] = ( g70, g60, g50, g40, g30, g20, g10, g00 )
    # ymm_rows[2] = ( g71, g61, g51, g41, g31, g21, g11, g01 )

    VUNPCKLPD(ymm_rows[1], ymm_rows[1], ymm_rows[3])
    # ymm_rows[1] = ( g72, g62, g52, g42, g32, g22, g12, g02 )

    # Columns 1 and 2 currently sit in slots 2 and 1; exchange them so the
    # first three slots are in column order.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])
    return ymm_rows[:3]
def transpose8x3(xmm_rows):
    """Emit instructions transposing an 8x3 matrix from XMM rows into YMM columns.

    Args:
        xmm_rows: list of exactly eight XMMRegister objects. Per the lane
            diagrams below, row ``i`` is laid out as ( 0.0, gi2, gi1, gi0 ).

    Returns:
        A list of three YMM registers (the first three entries of the working
        list). Per the lane diagrams, these hold columns 0, 1 and 2 of the
        input, one element per input row.
    """
    assert isinstance(xmm_rows, list)
    assert len(xmm_rows) == 8
    assert all(isinstance(reg, XMMRegister) for reg in xmm_rows)

    # xmm_rows[0] = ( 0.0, g02, g01, g00 )
    # xmm_rows[1] = ( 0.0, g12, g11, g10 )
    # xmm_rows[2] = ( 0.0, g22, g21, g20 )
    # xmm_rows[3] = ( 0.0, g32, g31, g30 )
    # xmm_rows[4] = ( 0.0, g42, g41, g40 )
    # xmm_rows[5] = ( 0.0, g52, g51, g50 )
    # xmm_rows[6] = ( 0.0, g62, g61, g60 )
    # xmm_rows[7] = ( 0.0, g72, g71, g70 )

    # Pair row i (low 128 bits) with row i + 4 (high 128 bits).
    ymm_rows = [YMMRegister() for _ in range(4)]
    for idx, ymm_packed in enumerate(ymm_rows):
        VINSERTF128(ymm_packed, xmm_rows[idx].as_ymm, xmm_rows[idx + 4], 1)

    # ymm_rows[0] = ( 0.0, g42, g41, g40, 0.0, g02, g01, g00 )
    # ymm_rows[1] = ( 0.0, g52, g51, g50, 0.0, g12, g11, g10 )
    # ymm_rows[2] = ( 0.0, g62, g61, g60, 0.0, g22, g21, g20 )
    # ymm_rows[3] = ( 0.0, g72, g71, g70, 0.0, g32, g31, g30 )

    # Interleave 32-bit elements of adjacent packed rows into fresh registers,
    # then swap so that ymm_rows refers to the interleaved results.
    ymm_interleaved = [YMMRegister() for _ in range(4)]
    for base in (0, 2):
        VUNPCKLPS(ymm_interleaved[base], ymm_rows[base], ymm_rows[base + 1])
        VUNPCKHPS(ymm_interleaved[base + 1], ymm_rows[base], ymm_rows[base + 1])
    for idx in range(4):
        SWAP.REGISTERS(ymm_rows[idx], ymm_interleaved[idx])

    # ymm_rows[0] = ( g51, g41, g50, g40, g11, g01, g10, g00 )
    # ymm_rows[1] = ( 0.0, 0.0, g52, g42, 0.0, 0.0, g12, g02 )
    # ymm_rows[2] = ( g71, g61, g70, g60, g31, g21, g30, g20 )
    # ymm_rows[3] = ( 0.0, 0.0, g72, g62, 0.0, 0.0, g32, g22 )

    transpose2x2x2x64(ymm_rows[0], ymm_rows[2])
    # ymm_rows[0] = ( g70, g60, g50, g40, g30, g20, g10, g00 )
    # ymm_rows[2] = ( g71, g61, g51, g41, g31, g21, g11, g01 )

    VUNPCKLPD(ymm_rows[1], ymm_rows[1], ymm_rows[3])
    # ymm_rows[1] = ( g72, g62, g52, g42, g32, g22, g12, g02 )

    # Columns 1 and 2 currently sit in slots 2 and 1; exchange them so the
    # first three slots are in column order.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])
    return ymm_rows[:3]
def transpose6x8(ymm_rows):
    """Emit instructions transposing a 6x8 matrix held in six YMM registers.

    Two freshly zeroed YMM registers are appended to ``ymm_rows`` so the
    remaining stages can operate on eight registers.

    Args:
        ymm_rows: list of exactly six YMMRegister objects, one matrix row each.
            NOTE: the list is extended in place to eight entries.

    Returns:
        The same list object, now with eight registers. Presumably these hold
        the transposed matrix, with the lanes fed by the two zero rows equal
        to 0.0 -- the final layout is not spelled out in this function.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 6
    assert all(isinstance(reg, YMMRegister) for reg in ymm_rows)

    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_idx in range(0, 6, 2):
        ymm_even, ymm_odd = ymm_rows[even_idx], ymm_rows[even_idx + 1]
        ymm_low_pairs = YMMRegister()
        VUNPCKLPS(ymm_low_pairs, ymm_even, ymm_odd)
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low_pairs)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )

    # Pad with two all-zero rows (both registers are created before either is
    # cleared, matching the emission order callers already get).
    ymm_padding = [YMMRegister(), YMMRegister()]
    for ymm_pad in ymm_padding:
        VXORPS(ymm_pad, ymm_pad, ymm_pad)
    ymm_rows.extend(ymm_padding)

    # ymm_rows[6] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )
    # ymm_rows[7] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )

    # Stage 2: exchange 64-bit pairs between rows two apart.
    for first, second in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[first], ymm_rows[second])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( 0.0, 0.0, g54, g44, 0.0, 0.0, g50, g40 )
    # ymm_rows[5] = ( 0.0, 0.0, g56, g46, 0.0, 0.0, g52, g42 )
    # ymm_rows[6] = ( 0.0, 0.0, g55, g45, 0.0, 0.0, g51, g41 )
    # ymm_rows[7] = ( 0.0, 0.0, g57, g47, 0.0, 0.0, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for idx in range(4):
        transpose2x2x128(ymm_rows[idx], ymm_rows[idx + 4])

    # Slots 1/2 and 5/6 are out of order after the stages above; swap them.
    for first, second in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[first], ymm_rows[second])

    return ymm_rows
def transpose6x8(ymm_rows):
    """Emit instructions transposing a 6x8 matrix held in six YMM registers.

    Two freshly zeroed YMM registers are appended to ``ymm_rows`` so the
    remaining stages can operate on eight registers.

    Args:
        ymm_rows: list of exactly six YMMRegister objects, one matrix row each.
            NOTE: the list is extended in place to eight entries.

    Returns:
        The same list object, now with eight registers. Presumably these hold
        the transposed matrix, with the lanes fed by the two zero rows equal
        to 0.0 -- the final layout is not spelled out in this function.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 6
    assert all(isinstance(reg, YMMRegister) for reg in ymm_rows)

    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_idx in range(0, 6, 2):
        ymm_even, ymm_odd = ymm_rows[even_idx], ymm_rows[even_idx + 1]
        ymm_low_pairs = YMMRegister()
        VUNPCKLPS(ymm_low_pairs, ymm_even, ymm_odd)
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low_pairs)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )

    # Pad with two all-zero rows (both registers are created before either is
    # cleared, matching the emission order callers already get).
    ymm_padding = [YMMRegister(), YMMRegister()]
    for ymm_pad in ymm_padding:
        VXORPS(ymm_pad, ymm_pad, ymm_pad)
    ymm_rows.extend(ymm_padding)

    # ymm_rows[6] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )
    # ymm_rows[7] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )

    # Stage 2: exchange 64-bit pairs between rows two apart.
    for first, second in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[first], ymm_rows[second])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( 0.0, 0.0, g54, g44, 0.0, 0.0, g50, g40 )
    # ymm_rows[5] = ( 0.0, 0.0, g56, g46, 0.0, 0.0, g52, g42 )
    # ymm_rows[6] = ( 0.0, 0.0, g55, g45, 0.0, 0.0, g51, g41 )
    # ymm_rows[7] = ( 0.0, 0.0, g57, g47, 0.0, 0.0, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for idx in range(4):
        transpose2x2x128(ymm_rows[idx], ymm_rows[idx + 4])

    # Slots 1/2 and 5/6 are out of order after the stages above; swap them.
    for first, second in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[first], ymm_rows[second])

    return ymm_rows
def transpose8x8(ymm_rows):
    """Emit instructions transposing an 8x8 matrix held in eight YMM registers.

    The work is done in three stages (32-bit interleave, 64-bit exchange,
    128-bit exchange) followed by two register swaps. The result is left in
    the registers referenced by ``ymm_rows``; presumably entry ``j`` then
    holds column ``j`` of the input -- the final layout is not spelled out
    in this function.

    Args:
        ymm_rows: list of exactly eight YMMRegister objects, one matrix row
            each.

    Returns:
        None.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 8
    assert all(isinstance(reg, YMMRegister) for reg in ymm_rows)

    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )
    # ymm_rows[6] = ( g67, g66, g65, g64, g63, g62, g61, g60 )
    # ymm_rows[7] = ( g77, g76, g75, g74, g73, g72, g71, g70 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_idx in range(0, 8, 2):
        ymm_even, ymm_odd = ymm_rows[even_idx], ymm_rows[even_idx + 1]
        ymm_low_pairs = YMMRegister()
        VUNPCKLPS(ymm_low_pairs, ymm_even, ymm_odd)
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low_pairs)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g74, g64, g71, g61, g70, g60 )
    # ymm_rows[7] = ( g77, g67, g76, g66, g73, g63, g72, g62 )

    # Stage 2: exchange 64-bit pairs between rows two apart.
    for first, second in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[first], ymm_rows[second])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( g74, g64, g54, g44, g70, g60, g50, g40 )
    # ymm_rows[5] = ( g76, g66, g56, g46, g72, g62, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g55, g45, g71, g61, g51, g41 )
    # ymm_rows[7] = ( g77, g67, g57, g47, g73, g63, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for idx in range(4):
        transpose2x2x128(ymm_rows[idx], ymm_rows[idx + 4])

    # Slots 1/2 and 5/6 are out of order after the stages above; swap them.
    for first, second in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[first], ymm_rows[second])
def transpose8x8(ymm_rows):
    """Emit instructions transposing an 8x8 matrix held in eight YMM registers.

    The work is done in three stages (32-bit interleave, 64-bit exchange,
    128-bit exchange) followed by two register swaps. The result is left in
    the registers referenced by ``ymm_rows``; presumably entry ``j`` then
    holds column ``j`` of the input -- the final layout is not spelled out
    in this function.

    Args:
        ymm_rows: list of exactly eight YMMRegister objects, one matrix row
            each.

    Returns:
        None.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 8
    assert all(isinstance(reg, YMMRegister) for reg in ymm_rows)

    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )
    # ymm_rows[6] = ( g67, g66, g65, g64, g63, g62, g61, g60 )
    # ymm_rows[7] = ( g77, g76, g75, g74, g73, g72, g71, g70 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_idx in range(0, 8, 2):
        ymm_even, ymm_odd = ymm_rows[even_idx], ymm_rows[even_idx + 1]
        ymm_low_pairs = YMMRegister()
        VUNPCKLPS(ymm_low_pairs, ymm_even, ymm_odd)
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low_pairs)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g74, g64, g71, g61, g70, g60 )
    # ymm_rows[7] = ( g77, g67, g76, g66, g73, g63, g72, g62 )

    # Stage 2: exchange 64-bit pairs between rows two apart.
    for first, second in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[first], ymm_rows[second])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( g74, g64, g54, g44, g70, g60, g50, g40 )
    # ymm_rows[5] = ( g76, g66, g56, g46, g72, g62, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g55, g45, g71, g61, g51, g41 )
    # ymm_rows[7] = ( g77, g67, g57, g47, g73, g63, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for idx in range(4):
        transpose2x2x128(ymm_rows[idx], ymm_rows[idx + 4])

    # Slots 1/2 and 5/6 are out of order after the stages above; swap them.
    for first, second in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[first], ymm_rows[second])
def ifft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit an inverse 16-point FFT computed within each row of registers.

    Each row is a pair of YMM registers for the real part and a pair for the
    imaginary part (indexed ``[0]`` and ``[1]``). The stages run FFT2 -> FFT4
    -> FFT8 -> FFT16. A factor of ``0.0625`` (1/16) is folded into the last
    stage's twiddle constants and passed to ``butterfly`` as ``scale_a``.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a convenience for one row.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must pair up with ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register before the first stage.

    Returns:
        None; the result is left in the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Single-row convenience form. ``bit_reversal`` must be forwarded,
        # otherwise an explicit ``bit_reversal=False`` is silently ignored.
        return ifft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    if bit_reversal:
        # Bit reversal
        # w[0] = x0 x8 x4 x12 x2 x10 x6 x14
        # w[1] = x1 x9 x5 x13 x3 x11 x7 x15
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 2, 4, 6, 1, 3, 5, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])

    # 8x FFT2: Butterfly
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

        # The high unpack deliberately crosses real and imaginary inputs;
        # the sign part of that twiddle is applied in the FFT4 stage below.
        ymm_new_real = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_real[0], ymm_real[0], ymm_real[1])
        VUNPCKHPS(ymm_new_real[1], ymm_real[0], ymm_imag[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1])
        VUNPCKHPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1])

        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])

    # w[0] = x0 x1 x4 x5 x8 x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15

    # 4x FFT4: Butterfly and multiplication by twiddle factors
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1], scale_b=ymm_fft4_twiddle_factor)
        butterfly(ymm_imag[0], ymm_imag[1])

    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x8 x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply: new_real = re*cos - im*sin, new_imag = im*cos + re*sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x4 x5 x6 x7
    # w[1] = x8 x9 x10 x11 x12 x13 x14 x15

    # FFT16: Multiplication by twiddle factors and scale
    scale_factor = 0.0625
    ymm_fft16_cos_scale_twiddle_factor, ymm_fft16_sin_scale_twiddle_factor = \
        YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_scale_twiddle_factor,
            Constant.float32x8(*[cos * scale_factor for cos in cos_npi_over_8]))
    VMOVAPS(ymm_fft16_sin_scale_twiddle_factor,
            Constant.float32x8(*[sin * scale_factor for sin in sin_npi_over_8]))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Same complex multiply as above, with the scale folded into cos/sin.
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_scale_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_scale_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_scale_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_scale_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # FFT16: Butterfly and scale
    ymm_scale_factor = YMMRegister()
    VMOVAPS(ymm_scale_factor, Constant.float32x8(scale_factor))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1], scale_a=ymm_scale_factor)
        butterfly(ymm_imag[0], ymm_imag[1], scale_a=ymm_scale_factor)
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit a forward 16-point FFT computed within each row of registers.

    Each row is a pair of YMM registers for the real part and a pair for the
    imaginary part (indexed ``[0]`` and ``[1]``). The stages run FFT16 -> FFT8
    -> FFT4 -> FFT2, optionally followed by a permutation of the outputs.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a convenience for one row.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must pair up with ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register after the last stage.

    Returns:
        None; the result is left in the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Single-row convenience form. ``bit_reversal`` must be forwarded,
        # otherwise an explicit ``bit_reversal=False`` is silently ignored.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply: new_real = re*cos + im*sin, new_imag = im*cos - re*sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x8 x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x4 x5 x8 x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # The odd-element shuffle deliberately crosses real and imaginary
        # inputs; the sign part of that twiddle is applied via scale_b.
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0], ymm_new_imag[1], scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])

    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])
def ifft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit an inverse 16-point FFT computed within each row of registers.

    Each row is a pair of YMM registers for the real part and a pair for the
    imaginary part (indexed ``[0]`` and ``[1]``). The stages run FFT2 -> FFT4
    -> FFT8 -> FFT16. A factor of ``0.0625`` (1/16) is folded into the last
    stage's twiddle constants and passed to ``butterfly`` as ``scale_a``.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a convenience for one row.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must pair up with ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register before the first stage.

    Returns:
        None; the result is left in the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Single-row convenience form. ``bit_reversal`` must be forwarded,
        # otherwise an explicit ``bit_reversal=False`` is silently ignored.
        return ifft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    if bit_reversal:
        # Bit reversal
        # w[0] = x0 x8 x4 x12 x2 x10 x6 x14
        # w[1] = x1 x9 x5 x13 x3 x11 x7 x15
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 2, 4, 6, 1, 3, 5, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])

    # 8x FFT2: Butterfly
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

        # The high unpack deliberately crosses real and imaginary inputs;
        # the sign part of that twiddle is applied in the FFT4 stage below.
        ymm_new_real = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_real[0], ymm_real[0], ymm_real[1])
        VUNPCKHPS(ymm_new_real[1], ymm_real[0], ymm_imag[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1])
        VUNPCKHPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1])

        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])

    # w[0] = x0 x1 x4 x5 x8 x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15

    # 4x FFT4: Butterfly and multiplication by twiddle factors
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1], scale_b=ymm_fft4_twiddle_factor)
        butterfly(ymm_imag[0], ymm_imag[1])

    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x8 x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply: new_real = re*cos - im*sin, new_imag = im*cos + re*sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x4 x5 x6 x7
    # w[1] = x8 x9 x10 x11 x12 x13 x14 x15

    # FFT16: Multiplication by twiddle factors and scale
    scale_factor = 0.0625
    ymm_fft16_cos_scale_twiddle_factor, ymm_fft16_sin_scale_twiddle_factor = \
        YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_scale_twiddle_factor,
            Constant.float32x8(*[cos * scale_factor for cos in cos_npi_over_8]))
    VMOVAPS(ymm_fft16_sin_scale_twiddle_factor,
            Constant.float32x8(*[sin * scale_factor for sin in sin_npi_over_8]))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Same complex multiply as above, with the scale folded into cos/sin.
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_scale_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_scale_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_scale_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_scale_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # FFT16: Butterfly and scale
    ymm_scale_factor = YMMRegister()
    VMOVAPS(ymm_scale_factor, Constant.float32x8(scale_factor))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1], scale_a=ymm_scale_factor)
        butterfly(ymm_imag[0], ymm_imag[1], scale_a=ymm_scale_factor)
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit a forward 16-point FFT computed within each row of registers.

    Each row is a pair of YMM registers for the real part and a pair for the
    imaginary part (indexed ``[0]`` and ``[1]``). The stages run FFT16 -> FFT8
    -> FFT4 -> FFT2, optionally followed by a permutation of the outputs.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a convenience for one row.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must pair up with ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register after the last stage.

    Returns:
        None; the result is left in the registers that were passed in.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Single-row convenience form. ``bit_reversal`` must be forwarded,
        # otherwise an explicit ``bit_reversal=False`` is silently ignored.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply: new_real = re*cos + im*sin, new_imag = im*cos - re*sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x2 x3 x8 x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])

    # w[0] = x0 x1 x4 x5 x8 x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # The odd-element shuffle deliberately crosses real and imaginary
        # inputs; the sign part of that twiddle is applied via scale_b.
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0], ymm_new_imag[1], scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])

    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])