Beispiel #1
0
        # Kernel-body fragment: load an 8x8 block with padding, run the 8-point
        # FFT helpers over it, and write the resulting real/imaginary register
        # pairs out through reg_f.
        # NOTE(review): reg_t, reg_inct, reg_f, reg_incf, post_operation and the
        # arg_* objects are defined outside this excerpt.

        # Each of the four block-geometry arguments is loaded into its own
        # 32-bit general-purpose register.
        reg_row_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_cnt, arg_row_count)

        reg_col_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_cnt, arg_column_count)

        reg_row_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_off, arg_row_offset)

        reg_col_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_off, arg_column_offset)

        # Eight YMM registers constructed with explicit indices 0..7
        # (presumably pinning them to physical ymm0-ymm7 -- TODO confirm).
        ymm_data = [YMMRegister(i) for i in range(8)]
        # Even-indexed registers are used as the real parts, odd-indexed ones
        # as the imaginary parts (4 registers each).
        ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2]

        # Fill the eight registers from the input; reg_t / reg_inct are
        # presumably the source pointer and its row stride -- TODO confirm.
        block8x8.load_with_padding(ymm_data, reg_t, reg_inct, reg_row_off, reg_row_cnt, reg_col_off, reg_col_cnt)

        # Transform sequence: FFT across rows on all eight registers, FFT within
        # rows on the real/imag split, then a post-processing step applied only
        # to the first real/imag pair.
        fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data)
        fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag)
        fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(ymm_real[0], ymm_imag[0])

        # Select the store instruction by post_operation: VMOVAPS for "store",
        # VMOVNTPS (the non-temporal form) for "stream". Any other value raises
        # KeyError at code-generation time.
        VSTOREPS = {"store": VMOVAPS, "stream": VMOVNTPS}[post_operation]
        for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
            # Real part goes to [reg_f], imaginary part one YMM width after it.
            VSTOREPS([reg_f], ymm_re)
            VSTOREPS([reg_f + YMMRegister.size], ymm_im)
            # Advance the output pointer between pairs, but not after the last
            # one (identity check against the final real register).
            if ymm_re is not ymm_real[-1]:
                ADD(reg_f, reg_incf)

        RETURN()

Beispiel #2
0
        # Kernel-body fragment: load an 8x8 block with padding, run the 8-point
        # FFT helpers over it, and write the resulting real/imaginary register
        # pairs out through reg_f.
        # NOTE(review): reg_t, reg_inct, reg_f, reg_incf, post_operation and the
        # arg_* objects are defined outside this excerpt.

        # Each of the four block-geometry arguments is loaded into its own
        # 32-bit general-purpose register.
        reg_row_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_cnt, arg_row_count)

        reg_col_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_cnt, arg_column_count)

        reg_row_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_off, arg_row_offset)

        reg_col_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_off, arg_column_offset)

        # Eight YMM registers constructed with explicit indices 0..7
        # (presumably pinning them to physical ymm0-ymm7 -- TODO confirm).
        ymm_data = [YMMRegister(i) for i in range(8)]
        # Even-indexed registers are used as the real parts, odd-indexed ones
        # as the imaginary parts (4 registers each).
        ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2]

        # Fill the eight registers from the input; reg_t / reg_inct are
        # presumably the source pointer and its row stride -- TODO confirm.
        block8x8.load_with_padding(ymm_data, reg_t, reg_inct, reg_row_off,
                                   reg_row_cnt, reg_col_off, reg_col_cnt)

        # Transform sequence: FFT across rows on all eight registers, FFT within
        # rows on the real/imag split, then a post-processing step applied only
        # to the first real/imag pair.
        fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data)
        fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag)
        fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(
            ymm_real[0], ymm_imag[0])

        # Select the store instruction by post_operation: VMOVAPS for "store",
        # VMOVNTPS (the non-temporal form) for "stream". Any other value raises
        # KeyError at code-generation time.
        VSTOREPS = {"store": VMOVAPS, "stream": VMOVNTPS}[post_operation]
        for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
            # Real part goes to [reg_f], imaginary part one YMM width after it.
            VSTOREPS([reg_f], ymm_re)
            VSTOREPS([reg_f + YMMRegister.size], ymm_im)
            # Advance the output pointer between pairs, but not after the last
            # one (identity check against the final real register).
            if ymm_re is not ymm_real[-1]:
                ADD(reg_f, reg_incf)

        RETURN()
Beispiel #3
0
        # Kernel-body fragment: load an 8x8 block with padding, apply the
        # winograd.o6x6k3x3 input transform, transpose, apply the transform
        # again, and write the resulting rows out through reg_wd.
        # NOTE(review): reg_d, reg_stride_d, reg_wd, reg_stride_wd,
        # post_operation and the arg_* objects are defined outside this excerpt.

        # Each of the four block-geometry arguments is loaded into its own
        # 32-bit general-purpose register.
        reg_row_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_cnt, arg_row_count)

        reg_col_cnt = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_cnt, arg_column_count)

        reg_row_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_off, arg_row_offset)

        reg_col_off = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_col_off, arg_column_offset)

        # Eight YMM registers created without explicit indices.
        ymm_data = [YMMRegister() for _ in range(8)]

        # Fill the eight registers from the input; reg_d / reg_stride_d are
        # presumably the source pointer and its row stride -- TODO confirm.
        block8x8.load_with_padding(ymm_data, reg_d, reg_stride_d, reg_row_off, reg_row_cnt, reg_col_off, reg_col_cnt)

        # input_transform returns the registers to use from here on, so
        # ymm_data is rebound to its result; transpose8x8 is called only for
        # its effect on the registers passed in. The transform is applied once
        # before and once after the transpose.
        ymm_data = winograd.o6x6k3x3.input_transform(ymm_data)
        winograd.o6x6k3x3.transpose8x8(ymm_data)
        ymm_data = winograd.o6x6k3x3.input_transform(ymm_data)

        # Select the store instruction by post_operation: VMOVAPS for "store",
        # VMOVNTPS (the non-temporal form) for "stream". Any other value raises
        # KeyError at code-generation time.
        VSTOREPS = {"store": VMOVAPS, "stream": VMOVNTPS}[post_operation]
        for ymm_row in ymm_data:
            VSTOREPS([reg_wd], ymm_row)
            # Advance the output pointer between rows, but not after the last
            # one (identity check against the final element of ymm_data).
            if ymm_row is not ymm_data[-1]:
                ADD(reg_wd, reg_stride_wd)

        RETURN()


for reverse_kernel in [False, True]: