예제 #1
0
def transpose8x3(xmm_rows):
    """Emit instructions that transpose an 8x3 tile into three YMM registers.

    Lane diagrams in the comments are written highest lane first.

    Args:
        xmm_rows: list of exactly 8 XMMRegister objects. Per the diagrams
            below, row ``i`` is expected to hold ``( 0.0, gi2, gi1, gi0 )``.

    Returns:
        A new 3-element list of YMMRegister objects; per the diagrams,
        element ``j`` ends up as ``( g7j, g6j, g5j, g4j, g3j, g2j, g1j, g0j )``.
    """
    assert isinstance(xmm_rows, list)
    assert len(xmm_rows) == 8
    assert all(isinstance(row, XMMRegister) for row in xmm_rows)
    # xmm_rows[i] = ( 0.0, gi2, gi1, gi0 )    for i = 0 .. 7

    # Stack row i (low 128 bits) under row i + 4 (high 128 bits).
    ymm_rows = [YMMRegister() for _ in range(4)]
    for ymm_row, xmm_low, xmm_high in zip(ymm_rows, xmm_rows[:4], xmm_rows[4:]):
        VINSERTF128(ymm_row, xmm_low.as_ymm, xmm_high, 1)

    # ymm_rows[0] = ( 0.0, g42, g41, g40, 0.0, g02, g01, g00 )
    # ymm_rows[1] = ( 0.0, g52, g51, g50, 0.0, g12, g11, g10 )
    # ymm_rows[2] = ( 0.0, g62, g61, g60, 0.0, g22, g21, g20 )
    # ymm_rows[3] = ( 0.0, g72, g71, g70, 0.0, g32, g31, g30 )

    # Interleave 32-bit elements of neighbouring rows (0 with 1, 2 with 3).
    ymm_unpacked = [YMMRegister() for _ in range(4)]
    for first in (0, 2):
        second = first + 1
        VUNPCKLPS(ymm_unpacked[first], ymm_rows[first], ymm_rows[second])
        VUNPCKHPS(ymm_unpacked[second], ymm_rows[first], ymm_rows[second])
    # After the swaps ymm_rows[i] names the interleaved value (see diagrams).
    for index in range(4):
        SWAP.REGISTERS(ymm_rows[index], ymm_unpacked[index])

    # ymm_rows[0] = ( g51, g41, g50, g40, g11, g01, g10, g00 )
    # ymm_rows[1] = ( 0.0, 0.0, g52, g42, 0.0, 0.0, g12, g02 )
    # ymm_rows[2] = ( g71, g61, g70, g60, g31, g21, g30, g20 )
    # ymm_rows[3] = ( 0.0, 0.0, g72, g62, 0.0, 0.0, g32, g22 )

    # ymm_rows[0] = ( g70, g60, g50, g40, g30, g20, g10, g00 )
    # ymm_rows[2] = ( g71, g61, g51, g41, g31, g21, g11, g01 )
    transpose2x2x2x64(ymm_rows[0], ymm_rows[2])
    # ymm_rows[1] = ( g72, g62, g52, g42, g32, g22, g12, g02 )
    VUNPCKLPD(ymm_rows[1], ymm_rows[1], ymm_rows[3])
    # Put the three result columns into index order 0, 1, 2.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])

    return ymm_rows[:3]
예제 #2
0
def transpose8x3(xmm_rows):
    """Emit instructions that turn eight 3-element XMM rows into three YMM columns.

    Lane diagrams in the comments list the highest lane first.

    Args:
        xmm_rows: list of exactly 8 XMMRegister objects; the diagrams below
            assume row ``i`` is ``( 0.0, gi2, gi1, gi0 )``.

    Returns:
        A fresh list with 3 YMMRegister objects. According to the diagrams,
        entry ``j`` carries ``g0j`` .. ``g7j`` from the lowest to the highest lane.
    """
    is_valid = (
        isinstance(xmm_rows, list)
        and len(xmm_rows) == 8
        and all(isinstance(reg, XMMRegister) for reg in xmm_rows)
    )
    assert is_valid
    # xmm_rows[i] = ( 0.0, gi2, gi1, gi0 )    for i = 0 .. 7

    ymm_rows = [YMMRegister() for _ in range(4)]

    # Row i fills the low half, row i + 4 is inserted into the high half.
    for i, ymm_row in enumerate(ymm_rows):
        VINSERTF128(ymm_row, xmm_rows[i].as_ymm, xmm_rows[i + 4], 1)

    # ymm_rows[0] = ( 0.0, g42, g41, g40, 0.0, g02, g01, g00 )
    # ymm_rows[1] = ( 0.0, g52, g51, g50, 0.0, g12, g11, g10 )
    # ymm_rows[2] = ( 0.0, g62, g61, g60, 0.0, g22, g21, g20 )
    # ymm_rows[3] = ( 0.0, g72, g71, g70, 0.0, g32, g31, g30 )

    ymm_mixed = [YMMRegister() for _ in range(4)]
    # Low/high 32-bit interleave of rows (0, 1) and then rows (2, 3).
    for low_index, high_index in ((0, 1), (2, 3)):
        row_a = ymm_rows[low_index]
        row_b = ymm_rows[high_index]
        VUNPCKLPS(ymm_mixed[low_index], row_a, row_b)
        VUNPCKHPS(ymm_mixed[high_index], row_a, row_b)
    # Make ymm_rows[i] refer to the interleaved results from here on.
    for ymm_old, ymm_new in zip(ymm_rows, ymm_mixed):
        SWAP.REGISTERS(ymm_old, ymm_new)

    # ymm_rows[0] = ( g51, g41, g50, g40, g11, g01, g10, g00 )
    # ymm_rows[1] = ( 0.0, 0.0, g52, g42, 0.0, 0.0, g12, g02 )
    # ymm_rows[2] = ( g71, g61, g70, g60, g31, g21, g30, g20 )
    # ymm_rows[3] = ( 0.0, 0.0, g72, g62, 0.0, 0.0, g32, g22 )

    # ymm_rows[0] = ( g70, g60, g50, g40, g30, g20, g10, g00 )
    # ymm_rows[2] = ( g71, g61, g51, g41, g31, g21, g11, g01 )
    transpose2x2x2x64(ymm_rows[0], ymm_rows[2])
    # ymm_rows[1] = ( g72, g62, g52, g42, g32, g22, g12, g02 )
    VUNPCKLPD(ymm_rows[1], ymm_rows[1], ymm_rows[3])
    # Reorder so that indices 0, 1, 2 correspond to columns 0, 1, 2.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])

    result = ymm_rows[0:3]
    return result
예제 #3
0
def transpose6x8(ymm_rows):
    """Emit instructions that transpose a 6x8 tile, padding it to 8 rows.

    Two zeroed registers are appended to ``ymm_rows`` so that the same
    three-stage 8x8 shuffle network can be used. Lane diagrams in the
    comments list the highest lane first.

    Args:
        ymm_rows: list of exactly 6 YMMRegister objects. The list is extended
            IN PLACE with two additional registers.

    Returns:
        The same list object, now holding 8 YMMRegister entries.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 6
    assert all(isinstance(row, YMMRegister) for row in ymm_rows)
    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_index in range(0, len(ymm_rows) - 1, 2):
        ymm_even = ymm_rows[even_index]
        ymm_odd = ymm_rows[even_index + 1]
        ymm_low = YMMRegister()
        VUNPCKLPS(ymm_low, ymm_even, ymm_odd)
        # The odd row can be overwritten directly: it is not read afterwards.
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )

    # Pad with two all-zero rows (XOR of a register with itself).
    ymm_padding = [YMMRegister(), YMMRegister()]
    for ymm_pad in ymm_padding:
        VXORPS(ymm_pad, ymm_pad, ymm_pad)
    ymm_rows.extend(ymm_padding)

    # ymm_rows[6] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )
    # ymm_rows[7] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )

    # Stage 2: exchange 64-bit halves between rows two apart.
    for upper, lower in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[upper], ymm_rows[lower])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( 0.0, 0.0, g54, g44, 0.0, 0.0, g50, g40 )
    # ymm_rows[5] = ( 0.0, 0.0, g56, g46, 0.0, 0.0, g52, g42 )
    # ymm_rows[6] = ( 0.0, 0.0, g55, g45, 0.0, 0.0, g51, g41 )
    # ymm_rows[7] = ( 0.0, 0.0, g57, g47, 0.0, 0.0, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for index in range(4):
        transpose2x2x128(ymm_rows[index], ymm_rows[index + 4])

    # Undo the row interleaving left over from stage 2.
    for left, right in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[left], ymm_rows[right])

    return ymm_rows
예제 #4
0
def transpose6x8(ymm_rows):
    """Emit a 6x8 transpose by zero-padding the tile to 8 rows.

    Lane diagrams in the comments are written highest lane first.

    Args:
        ymm_rows: list of exactly 6 YMMRegister objects; two zeroed registers
            are appended to this very list (the caller's list is modified).

    Returns:
        ``ymm_rows`` itself, which has 8 YMMRegister entries on return.
    """
    is_valid = (
        isinstance(ymm_rows, list)
        and len(ymm_rows) == 6
        and all(isinstance(reg, YMMRegister) for reg in ymm_rows)
    )
    assert is_valid
    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )

    # 32-bit interleave within every (even, odd) pair of rows.
    row_pairs = list(zip(ymm_rows[::2], ymm_rows[1::2]))
    for ymm_first, ymm_second in row_pairs:
        ymm_scratch = YMMRegister()
        VUNPCKLPS(ymm_scratch, ymm_first, ymm_second)
        VUNPCKHPS(ymm_second, ymm_first, ymm_second)
        # The low interleave becomes the new even row.
        SWAP.REGISTERS(ymm_first, ymm_scratch)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )

    # Rows 6 and 7 do not exist in the input: synthesize them as zeros.
    ymm_zero_a = YMMRegister()
    ymm_zero_b = YMMRegister()
    VXORPS(ymm_zero_a, ymm_zero_a, ymm_zero_a)
    VXORPS(ymm_zero_b, ymm_zero_b, ymm_zero_b)
    ymm_rows.extend([ymm_zero_a, ymm_zero_b])

    # ymm_rows[6] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )
    # ymm_rows[7] = ( 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 )

    # 64-bit exchange between rows i and i + 2 inside each group of four.
    for group_start in (0, 4):
        for offset in (0, 1):
            top = group_start + offset
            transpose2x2x2x64(ymm_rows[top], ymm_rows[top + 2])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( 0.0, 0.0, g54, g44, 0.0, 0.0, g50, g40 )
    # ymm_rows[5] = ( 0.0, 0.0, g56, g46, 0.0, 0.0, g52, g42 )
    # ymm_rows[6] = ( 0.0, 0.0, g55, g45, 0.0, 0.0, g51, g41 )
    # ymm_rows[7] = ( 0.0, 0.0, g57, g47, 0.0, 0.0, g53, g43 )

    # 128-bit exchange between rows i and i + 4.
    for ymm_top, ymm_bottom in zip(ymm_rows[:4], ymm_rows[4:8]):
        transpose2x2x128(ymm_top, ymm_bottom)

    # Restore natural row order inside each group of four.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])
    SWAP.REGISTERS(ymm_rows[5], ymm_rows[6])

    return ymm_rows
예제 #5
0
def transpose8x8(ymm_rows):
    """Emit instructions that transpose an 8x8 tile held in eight YMM registers.

    The transpose is done in three stages (32-bit interleave, 64-bit exchange,
    128-bit exchange). Lane diagrams in the comments list the highest lane
    first.

    Args:
        ymm_rows: list of exactly 8 YMMRegister objects. The result is left in
            these same list entries; nothing is returned.
    """
    assert isinstance(ymm_rows, list)
    assert len(ymm_rows) == 8
    assert all(isinstance(row, YMMRegister) for row in ymm_rows)
    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )
    # ymm_rows[6] = ( g67, g66, g65, g64, g63, g62, g61, g60 )
    # ymm_rows[7] = ( g77, g76, g75, g74, g73, g72, g71, g70 )

    # Stage 1: interleave 32-bit elements of each (even, odd) row pair.
    for even_index in range(0, len(ymm_rows) - 1, 2):
        ymm_even = ymm_rows[even_index]
        ymm_odd = ymm_rows[even_index + 1]
        ymm_low = YMMRegister()
        VUNPCKLPS(ymm_low, ymm_even, ymm_odd)
        # The odd row is overwritten in place; it is not read again.
        VUNPCKHPS(ymm_odd, ymm_even, ymm_odd)
        SWAP.REGISTERS(ymm_even, ymm_low)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g74, g64, g71, g61, g70, g60 )
    # ymm_rows[7] = ( g77, g67, g76, g66, g73, g63, g72, g62 )

    # Stage 2: exchange 64-bit halves between rows two apart.
    for upper, lower in ((0, 2), (1, 3), (4, 6), (5, 7)):
        transpose2x2x2x64(ymm_rows[upper], ymm_rows[lower])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( g74, g64, g54, g44, g70, g60, g50, g40 )
    # ymm_rows[5] = ( g76, g66, g56, g46, g72, g62, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g55, g45, g71, g61, g51, g41 )
    # ymm_rows[7] = ( g77, g67, g57, g47, g73, g63, g53, g43 )

    # Stage 3: exchange 128-bit halves between rows four apart.
    for index in range(4):
        transpose2x2x128(ymm_rows[index], ymm_rows[index + 4])

    # Undo the row interleaving left over from stage 2.
    for left, right in ((1, 2), (5, 6)):
        SWAP.REGISTERS(ymm_rows[left], ymm_rows[right])
예제 #6
0
def transpose8x8(ymm_rows):
    """Emit an in-register 8x8 transpose over eight YMM registers.

    Lane diagrams in the comments are written highest lane first.

    Args:
        ymm_rows: list of exactly 8 YMMRegister objects; the transposed tile
            is left in the entries of this list. The function returns None.
    """
    is_valid = (
        isinstance(ymm_rows, list)
        and len(ymm_rows) == 8
        and all(isinstance(reg, YMMRegister) for reg in ymm_rows)
    )
    assert is_valid
    # ymm_rows[0] = ( g07, g06, g05, g04, g03, g02, g01, g00 )
    # ymm_rows[1] = ( g17, g16, g15, g14, g13, g12, g11, g10 )
    # ymm_rows[2] = ( g27, g26, g25, g24, g23, g22, g21, g20 )
    # ymm_rows[3] = ( g37, g36, g35, g34, g33, g32, g31, g30 )
    # ymm_rows[4] = ( g47, g46, g45, g44, g43, g42, g41, g40 )
    # ymm_rows[5] = ( g57, g56, g55, g54, g53, g52, g51, g50 )
    # ymm_rows[6] = ( g67, g66, g65, g64, g63, g62, g61, g60 )
    # ymm_rows[7] = ( g77, g76, g75, g74, g73, g72, g71, g70 )

    # 32-bit interleave within every (even, odd) pair of rows.
    row_pairs = list(zip(ymm_rows[::2], ymm_rows[1::2]))
    for ymm_first, ymm_second in row_pairs:
        ymm_scratch = YMMRegister()
        VUNPCKLPS(ymm_scratch, ymm_first, ymm_second)
        VUNPCKHPS(ymm_second, ymm_first, ymm_second)
        # The low interleave becomes the new even row.
        SWAP.REGISTERS(ymm_first, ymm_scratch)

    # ymm_rows[0] = ( g15, g05, g14, g04, g11, g01, g10, g00 )
    # ymm_rows[1] = ( g17, g07, g16, g06, g13, g03, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g34, g24, g31, g21, g30, g20 )
    # ymm_rows[3] = ( g37, g27, g36, g26, g33, g23, g32, g22 )
    # ymm_rows[4] = ( g55, g45, g54, g44, g51, g41, g50, g40 )
    # ymm_rows[5] = ( g57, g47, g56, g46, g53, g43, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g74, g64, g71, g61, g70, g60 )
    # ymm_rows[7] = ( g77, g67, g76, g66, g73, g63, g72, g62 )

    # 64-bit exchange between rows i and i + 2 inside each group of four.
    for group_start in (0, 4):
        for offset in (0, 1):
            top = group_start + offset
            transpose2x2x2x64(ymm_rows[top], ymm_rows[top + 2])

    # ymm_rows[0] = ( g34, g24, g14, g04, g30, g20, g10, g00 )
    # ymm_rows[1] = ( g36, g26, g16, g06, g32, g22, g12, g02 )
    # ymm_rows[2] = ( g35, g25, g15, g05, g31, g21, g11, g01 )
    # ymm_rows[3] = ( g37, g27, g17, g07, g33, g23, g13, g03 )
    # ymm_rows[4] = ( g74, g64, g54, g44, g70, g60, g50, g40 )
    # ymm_rows[5] = ( g76, g66, g56, g46, g72, g62, g52, g42 )
    # ymm_rows[6] = ( g75, g65, g55, g45, g71, g61, g51, g41 )
    # ymm_rows[7] = ( g77, g67, g57, g47, g73, g63, g53, g43 )

    # 128-bit exchange between rows i and i + 4.
    for ymm_top, ymm_bottom in zip(ymm_rows[:4], ymm_rows[4:8]):
        transpose2x2x128(ymm_top, ymm_bottom)

    # Restore natural row order inside each group of four.
    SWAP.REGISTERS(ymm_rows[1], ymm_rows[2])
    SWAP.REGISTERS(ymm_rows[5], ymm_rows[6])
예제 #7
0
def ifft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit the stages of a 16-point inverse FFT within each register row.

    Every row is a pair of YMM registers for the real part and a pair for the
    imaginary part; only elements ``[0]`` and ``[1]`` of each tuple are used.
    The stages run FFT2 -> FFT4 -> FFT8 -> FFT16, and the last two stages also
    apply the constant ``scale_factor`` (0.0625). Results are left in the
    registers passed in; the function returns None.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a shorthand for a one-row list.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must be given in the same form as
            ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register before the first butterfly stage.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Forward bit_reversal explicitly: without it the single-row shorthand
        # would silently fall back to the default and ignore the caller's flag.
        return ifft16_within_rows([ymm_real_rows], [ymm_imag_rows],
                                  bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(
            isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(
            isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    if bit_reversal:
        # Bit reversal
        # w[0] = x0 x8 x4 x12 x2 x10 x6 x14
        # w[1] = x1 x9 x5 x13 x3 x11 x7 x15
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask,
                Constant.uint32x8(0, 2, 4, 6, 1, 3, 5, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])

    # 8x FFT2: Butterfly
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

        # NOTE(review): the high-half unpacks below pair real[0] with imag[1]
        # and imag[0] with real[1]. This looks intentional (presumably it folds
        # the FFT4 twiddle multiplication by +/-i into the shuffle, together
        # with the sign vector used in the next stage) -- confirm before
        # "fixing" it.
        ymm_new_real = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_real[0], ymm_real[0], ymm_real[1])
        VUNPCKHPS(ymm_new_real[1], ymm_real[0], ymm_imag[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1])
        VUNPCKHPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1])

        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
    # w[0] = x0 x1 x4 x5 x8  x9  x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15

    # 4x FFT4: Butterfly and multiplication by twiddle factors
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Only the real butterfly takes the alternating-sign scale vector.
        butterfly(ymm_real[0], ymm_real[1], scale_b=ymm_fft4_twiddle_factor)
        butterfly(ymm_imag[0], ymm_imag[1])
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(
    ), YMMRegister()
    # The per-element tables are repeated twice to fill all 8 lanes.
    VMOVAPS(ymm_fft8_cos_twiddle_factor,
            Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor,
            Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply of w[1] by (cos + i*sin):
        #   new_real = real * cos - imag * sin
        #   new_imag = imag * cos + real * sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1  x2  x3  x4  x5  x6  x7
    # w[1] = x8 x9 x10 x11 x12 x13 x14 x15

    # FFT16: Multiplication by twiddle factors and scale
    scale_factor = 0.0625
    ymm_fft16_cos_scale_twiddle_factor, ymm_fft16_sin_scale_twiddle_factor = YMMRegister(
    ), YMMRegister()
    # The scale is pre-multiplied into the twiddle tables, so w[1] is scaled
    # by the same multiply that applies the twiddle.
    VMOVAPS(
        ymm_fft16_cos_scale_twiddle_factor,
        Constant.float32x8(*[cos * scale_factor for cos in cos_npi_over_8]))
    VMOVAPS(
        ymm_fft16_sin_scale_twiddle_factor,
        Constant.float32x8(*[sin * scale_factor for sin in sin_npi_over_8]))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_scale_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_scale_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1],
                     ymm_fft16_sin_scale_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1],
                    ymm_fft16_sin_scale_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # FFT16: Butterfly and scale
    ymm_scale_factor = YMMRegister()
    VMOVAPS(ymm_scale_factor, Constant.float32x8(scale_factor))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # w[1] already carries the scale; scale_a supplies it for w[0].
        butterfly(ymm_real[0], ymm_real[1], scale_a=ymm_scale_factor)
        butterfly(ymm_imag[0], ymm_imag[1], scale_a=ymm_scale_factor)
예제 #8
0
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit the stages of a 16-point forward FFT within each register row.

    Every row is a pair of YMM registers for the real part and a pair for the
    imaginary part; only elements ``[0]`` and ``[1]`` of each tuple are used.
    The stages run FFT16 -> FFT8 -> FFT4 -> FFT2. Results are left in the
    registers passed in; the function returns None.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a shorthand for a one-row list.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must be given in the same form as
            ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register after the last butterfly stage.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Forward bit_reversal explicitly: without it the single-row shorthand
        # would silently fall back to the default and ignore the caller's flag.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows],
                                 bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(
        isinstance(ymm_real, tuple) and all(
            isinstance(ymm, YMMRegister) for ymm in ymm_real)
        for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(
        isinstance(ymm_imag, tuple) and all(
            isinstance(ymm, YMMRegister) for ymm in ymm_imag)
        for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(
    ), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply of w[1] by (cos - i*sin):
        #   new_real = real * cos + imag * sin
        #   new_imag = imag * cos - real * sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(
    ), YMMRegister()
    # The per-element tables are repeated twice to fill all 8 lanes.
    VMOVAPS(ymm_fft8_cos_twiddle_factor,
            Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor,
            Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x4 x5  x8  x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor,
            Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # NOTE(review): the odd-element shuffles below pair real[0] with
        # imag[1] and imag[0] with real[1]. This looks intentional (presumably
        # it folds the FFT4 twiddle multiplication by +/-i into the shuffle,
        # together with the sign vector passed as scale_b) -- confirm before
        # "fixing" it.
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1],
                _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1],
                _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1],
                _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1],
                _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0],
                  ymm_new_imag[1],
                  scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])

    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask,
                Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])
예제 #9
0
def ifft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit the stages of a 16-point inverse FFT within each register row.

    Every row is a pair of YMM registers for the real part and a pair for the
    imaginary part; only elements ``[0]`` and ``[1]`` of each tuple are used.
    The stages run FFT2 -> FFT4 -> FFT8 -> FFT16, and the last two stages also
    apply the constant ``scale_factor`` (0.0625). Results are left in the
    registers passed in; the function returns None.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a shorthand for a one-row list.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must be given in the same form as
            ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register before the first butterfly stage.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Forward bit_reversal explicitly: without it the single-row shorthand
        # would silently fall back to the default and ignore the caller's flag.
        return ifft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real) for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag) for ymm_imag in ymm_imag_rows)

    if bit_reversal:
        # Bit reversal
        # w[0] = x0 x8 x4 x12 x2 x10 x6 x14
        # w[1] = x1 x9 x5 x13 x3 x11 x7 x15
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 2, 4, 6, 1, 3, 5, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])

    # 8x FFT2: Butterfly
    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

        # NOTE(review): the high-half unpacks below pair real[0] with imag[1]
        # and imag[0] with real[1]. This looks intentional (presumably it folds
        # the FFT4 twiddle multiplication by +/-i into the shuffle, together
        # with the sign vector used in the next stage) -- confirm before
        # "fixing" it.
        ymm_new_real = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_real[0], ymm_real[0], ymm_real[1])
        VUNPCKHPS(ymm_new_real[1], ymm_real[0], ymm_imag[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VUNPCKLPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1])
        VUNPCKHPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1])

        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])
        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
    # w[0] = x0 x1 x4 x5 x8  x9  x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15

    # 4x FFT4: Butterfly and multiplication by twiddle factors
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor, Constant.float32x8(+1.0, -1.0, +1.0, -1.0, +1.0, -1.0, +1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Only the real butterfly takes the alternating-sign scale vector.
        butterfly(ymm_real[0], ymm_real[1], scale_b=ymm_fft4_twiddle_factor)
        butterfly(ymm_imag[0], ymm_imag[1])
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    # The per-element tables are repeated twice to fill all 8 lanes.
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply of w[1] by (cos + i*sin):
        #   new_real = real * cos - imag * sin
        #   new_imag = imag * cos + real * sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1  x2  x3  x4  x5  x6  x7
    # w[1] = x8 x9 x10 x11 x12 x13 x14 x15

    # FFT16: Multiplication by twiddle factors and scale
    scale_factor = 0.0625
    ymm_fft16_cos_scale_twiddle_factor, ymm_fft16_sin_scale_twiddle_factor = YMMRegister(), YMMRegister()
    # The scale is pre-multiplied into the twiddle tables, so w[1] is scaled
    # by the same multiply that applies the twiddle.
    VMOVAPS(ymm_fft16_cos_scale_twiddle_factor, Constant.float32x8(*[cos * scale_factor for cos in cos_npi_over_8]))
    VMOVAPS(ymm_fft16_sin_scale_twiddle_factor, Constant.float32x8(*[sin * scale_factor for sin in sin_npi_over_8]))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_scale_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_scale_twiddle_factor)

        VFNMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_scale_twiddle_factor)
        VFMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_scale_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # FFT16: Butterfly and scale
    ymm_scale_factor = YMMRegister()
    VMOVAPS(ymm_scale_factor, Constant.float32x8(scale_factor))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # w[1] already carries the scale; scale_a supplies it for w[0].
        butterfly(ymm_real[0], ymm_real[1], scale_a=ymm_scale_factor)
        butterfly(ymm_imag[0], ymm_imag[1], scale_a=ymm_scale_factor)
예제 #10
0
def fft16_within_rows(ymm_real_rows, ymm_imag_rows, bit_reversal=True):
    """Emit the stages of a 16-point forward FFT within each register row.

    Every row is a pair of YMM registers for the real part and a pair for the
    imaginary part; only elements ``[0]`` and ``[1]`` of each tuple are used.
    The stages run FFT16 -> FFT8 -> FFT4 -> FFT2. Results are left in the
    registers passed in; the function returns None.

    Args:
        ymm_real_rows: list of tuples of YMMRegister (real parts), or a single
            such tuple as a shorthand for a one-row list.
        ymm_imag_rows: list of tuples of YMMRegister (imaginary parts), or a
            single such tuple; must be given in the same form as
            ``ymm_real_rows``.
        bit_reversal: when true, a VPERMPS permutation is applied to every
            register after the last butterfly stage.
    """
    if isinstance(ymm_real_rows, tuple) and isinstance(ymm_imag_rows, tuple):
        # Forward bit_reversal explicitly: without it the single-row shorthand
        # would silently fall back to the default and ignore the caller's flag.
        return fft16_within_rows([ymm_real_rows], [ymm_imag_rows], bit_reversal=bit_reversal)

    assert isinstance(ymm_real_rows, list) and all(isinstance(ymm_real, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_real) for ymm_real in ymm_real_rows)
    assert isinstance(ymm_imag_rows, list) and all(isinstance(ymm_imag, tuple) and all(isinstance(ymm, YMMRegister) for ymm in ymm_imag) for ymm_imag in ymm_imag_rows)

    # FFT16: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # FFT16: Multiplication by twiddle factors
    ymm_fft16_cos_twiddle_factor, ymm_fft16_sin_twiddle_factor = YMMRegister(), YMMRegister()
    VMOVAPS(ymm_fft16_cos_twiddle_factor, Constant.float32x8(*cos_npi_over_8))
    VMOVAPS(ymm_fft16_sin_twiddle_factor, Constant.float32x8(*sin_npi_over_8))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # Complex multiply of w[1] by (cos - i*sin):
        #   new_real = real * cos + imag * sin
        #   new_imag = imag * cos - real * sin
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft16_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft16_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft16_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft16_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 2x FFT8: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x128(ymm_real[0], ymm_real[1])
        transpose2x2x128(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x2 x3  x8  x9 x10 x11
    # w[1] = x4 x5 x6 x7 x12 x13 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 2x FFT8: Multiplication by twiddle factors
    ymm_fft8_cos_twiddle_factor, ymm_fft8_sin_twiddle_factor = YMMRegister(), YMMRegister()
    # The per-element tables are repeated twice to fill all 8 lanes.
    VMOVAPS(ymm_fft8_cos_twiddle_factor, Constant.float32x8(*(cos_npi_over_4 * 2)))
    VMOVAPS(ymm_fft8_sin_twiddle_factor, Constant.float32x8(*(sin_npi_over_4 * 2)))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        ymm_new_real1, ymm_new_imag1 = YMMRegister(), YMMRegister()
        VMULPS(ymm_new_real1, ymm_real[1], ymm_fft8_cos_twiddle_factor)
        VMULPS(ymm_new_imag1, ymm_imag[1], ymm_fft8_cos_twiddle_factor)

        VFMADD231PS(ymm_new_real1, ymm_imag[1], ymm_fft8_sin_twiddle_factor)
        VFNMADD231PS(ymm_new_imag1, ymm_real[1], ymm_fft8_sin_twiddle_factor)

        SWAP.REGISTERS(ymm_real[1], ymm_new_real1)
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag1)

    # 4x FFT4: Butterfly
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        transpose2x2x2x64(ymm_real[0], ymm_real[1])
        transpose2x2x2x64(ymm_imag[0], ymm_imag[1])
    # w[0] = x0 x1 x4 x5  x8  x9 x12 x13
    # w[1] = x2 x3 x6 x7 x10 x11 x14 x15
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        butterfly(ymm_real[0], ymm_real[1])
        butterfly(ymm_imag[0], ymm_imag[1])

    # 4x FFT4: Multiplication by twiddle factors and 8x FFT2: Butterfly
    ymm_fft4_twiddle_factor = YMMRegister()
    VMOVAPS(ymm_fft4_twiddle_factor, Constant.float32x8(+1.0, +1.0, -1.0, -1.0, +1.0, +1.0, -1.0, -1.0))
    for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
        # NOTE(review): the odd-element shuffles below pair real[0] with
        # imag[1] and imag[0] with real[1]. This looks intentional (presumably
        # it folds the FFT4 twiddle multiplication by +/-i into the shuffle,
        # together with the sign vector passed as scale_b) -- confirm before
        # "fixing" it.
        ymm_new_real = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_real[0], ymm_real[0], ymm_real[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_real[1], ymm_real[0], ymm_imag[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_real[0], ymm_new_real[1])

        ymm_new_imag = YMMRegister(), YMMRegister()
        VSHUFPS(ymm_new_imag[0], ymm_imag[0], ymm_imag[1], _MM_SHUFFLE(2, 0, 2, 0))
        VSHUFPS(ymm_new_imag[1], ymm_imag[0], ymm_real[1], _MM_SHUFFLE(3, 1, 3, 1))
        butterfly(ymm_new_imag[0], ymm_new_imag[1], scale_b=ymm_fft4_twiddle_factor)

        SWAP.REGISTERS(ymm_real[0], ymm_new_real[0])
        SWAP.REGISTERS(ymm_real[1], ymm_new_real[1])
        SWAP.REGISTERS(ymm_imag[0], ymm_new_imag[0])
        SWAP.REGISTERS(ymm_imag[1], ymm_new_imag[1])

    # w[0] = x0 x4 x2 x6 x8 x12 x10 x14
    # w[1] = x1 x5 x3 x7 x9 x13 x11 x15

    if bit_reversal:
        # Bit reversal
        ymm_bit_reversal_mask = YMMRegister()
        VMOVDQA(ymm_bit_reversal_mask, Constant.uint32x8(0, 4, 1, 5, 2, 6, 3, 7))
        for ymm_real, ymm_imag in zip(ymm_real_rows, ymm_imag_rows):
            for i in range(2):
                VPERMPS(ymm_real[i], ymm_bit_reversal_mask, ymm_real[i])
                VPERMPS(ymm_imag[i], ymm_bit_reversal_mask, ymm_imag[i])