Beispiel #1
0
        # Tail of a kernel-generation body; the enclosing definition and the
        # setup of reg_m, reg_m_stride, reg_s and xmm_bias are above this excerpt.
        # Load the remaining arguments into freshly allocated registers.
        reg_s_stride = GeneralPurposeRegister64()
        LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

        reg_row_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_count, arg_row_count)

        reg_column_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_column_count, arg_column_count)

        # Read eight YMM registers from [reg_m], stepping reg_m by reg_m_stride
        # between consecutive reads.
        ymm_m = [YMMRegister() for _ in range(8)]
        for ymm in ymm_m:
            if with_bias and ymm is ymm_m[1]:
                # Only the second register receives the bias: the memory operand
                # is added to xmm_bias.as_ymm instead of being plainly loaded.
                # NOTE(review): why index 1 specifically is not visible here;
                # presumably a property of the Winograd-domain layout - confirm.
                VADDPS(ymm, xmm_bias.as_ymm, [reg_m])
            else:
                VMOVAPS(ymm, [reg_m])

            # The pointer is not advanced after the last read.
            if ymm is not ymm_m[-1]:
                ADD(reg_m, reg_m_stride)

        # Output transform, transpose, then output transform again.
        # NOTE(review): presumably the two passes implement a separable 2-D
        # transform (one pass per dimension) - confirm against winograd.o6x6k3x3.
        ymm_t = winograd.o6x6k3x3.output_transform(ymm_m)

        ymm_tt = winograd.o6x6k3x3.transpose6x8(ymm_t)

        ymm_s = winograd.o6x6k3x3.output_transform(ymm_tt)

        # Pass the transformed registers to block8x8.store_packed with the
        # destination pointer, its stride and the row/column counts. The two
        # None arguments leave the next two positional parameters unset, and
        # with_relu is forwarded last.
        # NOTE(review): presumably None/None means "no row/column start offset"
        # - confirm against the store_packed signature.
        block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count,
                              reg_column_count, None, None, with_relu)

        RETURN()
Beispiel #2
0
            # Last statement of a conditional branch whose header is above this
            # excerpt: no column-start register is used on this path.
            reg_column_start = None

        # Eight explicitly numbered YMM registers; even-indexed entries are used
        # as the real parts and odd-indexed entries as the imaginary parts.
        ymm_data = [YMMRegister(i) for i in range(8)]
        ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2]

        if with_bias:
            # VMOVSS from memory fills only the lowest float32 lane (the other
            # lanes are zeroed), so the FMA below affects lane 0 only.
            ymm_bias = YMMRegister()
            VMOVSS(ymm_bias.as_xmm, [reg_bias])

        # Load each (real, imaginary) pair from [reg_f] and the YMM-sized slot
        # right after it, stepping reg_f by reg_f_stride between pairs.
        for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
            VMOVAPS(ymm_re, [reg_f])
            VMOVAPS(ymm_im, [reg_f + YMMRegister.size])
            if with_bias and ymm_re is ymm_real[0]:
                # ymm_re += ymm_bias * 64.0, applied to the first real register only.
                # NOTE(review): 64.0 is presumably 8 * 8, pre-compensating a
                # scale factor applied by the inverse transform - confirm.
                VFMADD231PS(ymm_re, ymm_bias, Constant.float32x8(64.0))

            # The pointer is not advanced after the last pair.
            if ymm_im is not ymm_imag[-1]:
                ADD(reg_f, reg_f_stride)

        # Inverse transform in three steps: a preprocessing step on the first
        # real/imaginary pair, an inverse 8-point FFT within rows, then an
        # inverse FFT across rows over all eight registers.
        # NOTE(review): step descriptions are taken from the helper names only;
        # confirm details against the fft module.
        fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(
            ymm_real[0], ymm_imag[0])
        fft.complex_soa.fft8_within_rows(ymm_real,
                                         ymm_imag,
                                         transformation="inverse")
        fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data)

        # Pass the result registers to block8x8.store_packed with the destination
        # pointer/stride, the row count, the column end, and the optional
        # row/column start registers (None when no offsets are in use);
        # with_relu is forwarded last.
        block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count,
                              reg_column_end, reg_row_start, reg_column_start,
                              with_relu)

        RETURN()
Beispiel #3
0
        # Tail of a kernel-generation body; the enclosing definition and the
        # allocation of reg_m, reg_m_stride, reg_s and xmm_bias are above this
        # excerpt. Finish loading the arguments into registers.
        LOAD.ARGUMENT(reg_m_stride, arg_m_stride)

        reg_s_stride = GeneralPurposeRegister64()
        LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

        reg_row_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_count, arg_row_count)

        reg_column_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_column_count, arg_column_count)

        # Read eight YMM registers from [reg_m], stepping reg_m by reg_m_stride
        # between consecutive reads.
        ymm_m = [YMMRegister() for _ in range(8)]
        for ymm in ymm_m:
            if with_bias and ymm is ymm_m[1]:
                # Only the second register receives the bias: the memory operand
                # is added to xmm_bias.as_ymm instead of being plainly loaded.
                # NOTE(review): why index 1 specifically is not visible here;
                # presumably a property of the Winograd-domain layout - confirm.
                VADDPS(ymm, xmm_bias.as_ymm, [reg_m])
            else:
                VMOVAPS(ymm, [reg_m])

            # The pointer is not advanced after the last read.
            if ymm is not ymm_m[-1]:
                ADD(reg_m, reg_m_stride)

        # Output transform, transpose, then output transform again.
        # NOTE(review): presumably the two passes implement a separable 2-D
        # transform (one pass per dimension) - confirm against winograd.o6x6k3x3.
        ymm_t = winograd.o6x6k3x3.output_transform(ymm_m)

        ymm_tt = winograd.o6x6k3x3.transpose6x8(ymm_t)

        ymm_s = winograd.o6x6k3x3.output_transform(ymm_tt)

        # Pass the transformed registers to block8x8.store_packed with the
        # destination pointer, its stride and the row/column counts. No offset
        # or ReLU arguments are passed in this variant.
        block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count)

        RETURN()
Beispiel #4
0
            # End of the "offsets in use" branch (its condition is above this
            # excerpt): load the column offset and shift the column end by it.
            reg_column_start = GeneralPurposeRegister32()
            LOAD.ARGUMENT(reg_column_start, arg_column_offset)
            ADD(reg_column_end, reg_column_start)
        else:
            # No offsets: store_packed below receives None for both start registers.
            reg_row_start = None
            reg_column_start = None

        # Eight explicitly numbered YMM registers; even-indexed entries are used
        # as the real parts and odd-indexed entries as the imaginary parts.
        ymm_data = [YMMRegister(i) for i in range(8)]
        ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2]

        if with_bias:
            # VMOVSS from memory fills only the lowest float32 lane (the other
            # lanes are zeroed), so the FMA below affects lane 0 only.
            ymm_bias = YMMRegister()
            VMOVSS(ymm_bias.as_xmm, [reg_bias])

        # Load each (real, imaginary) pair from [reg_f] and the YMM-sized slot
        # right after it, stepping reg_f by reg_f_stride between pairs.
        for ymm_re, ymm_im in zip(ymm_real, ymm_imag):
            VMOVAPS(ymm_re, [reg_f])
            VMOVAPS(ymm_im, [reg_f + YMMRegister.size])
            if with_bias and ymm_re is ymm_real[0]:
                # ymm_re += ymm_bias * 64.0, applied to the first real register only.
                # NOTE(review): 64.0 is presumably 8 * 8, pre-compensating a
                # scale factor applied by the inverse transform - confirm.
                VFMADD231PS(ymm_re, ymm_bias, Constant.float32x8(64.0))

            # The pointer is not advanced after the last pair.
            if ymm_im is not ymm_imag[-1]:
                ADD(reg_f, reg_f_stride)

        # Inverse transform in three steps: a preprocessing step on the first
        # real/imaginary pair, an inverse 8-point FFT within rows, then an
        # inverse FFT across rows over all eight registers.
        # NOTE(review): step descriptions are taken from the helper names only;
        # confirm details against the fft module.
        fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(ymm_real[0], ymm_imag[0])
        fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse")
        fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data)

        # Pass the result registers to block8x8.store_packed with the destination
        # pointer/stride, the row count, the (possibly shifted) column end, and
        # the optional row/column start registers; with_relu is forwarded last.
        block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start, with_relu)

        RETURN()
        # NOTE(review): this follows a RETURN() with no visible function header,
        # so it is presumably the tail of a separate kernel whose beginning is
        # missing from this excerpt - confirm. reg_m, reg_m_stride, reg_s and
        # xmm_bias are set up outside the visible code.
        LOAD.ARGUMENT(reg_m_stride, arg_m_stride)

        reg_s_stride = GeneralPurposeRegister64()
        LOAD.ARGUMENT(reg_s_stride, arg_s_stride)

        reg_row_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_row_count, arg_row_count)

        reg_column_count = GeneralPurposeRegister32()
        LOAD.ARGUMENT(reg_column_count, arg_column_count)

        # Read eight YMM registers from [reg_m], stepping reg_m by reg_m_stride
        # between consecutive reads.
        ymm_m = [YMMRegister() for _ in range(8)]
        for ymm in ymm_m:
            if with_bias and ymm is ymm_m[1]:
                # Only the second register receives the bias: the memory operand
                # is added to xmm_bias.as_ymm instead of being plainly loaded.
                # NOTE(review): why index 1 specifically is not visible here;
                # presumably a property of the Winograd-domain layout - confirm.
                VADDPS(ymm, xmm_bias.as_ymm, [reg_m])
            else:
                VMOVAPS(ymm, [reg_m])

            # The pointer is not advanced after the last read.
            if ymm is not ymm_m[-1]:
                ADD(reg_m, reg_m_stride)

        # Output transform, transpose, then output transform again.
        # NOTE(review): presumably the two passes implement a separable 2-D
        # transform (one pass per dimension) - confirm against winograd.o6x6k3x3.
        ymm_t = winograd.o6x6k3x3.output_transform(ymm_m)

        ymm_tt = winograd.o6x6k3x3.transpose6x8(ymm_t)

        ymm_s = winograd.o6x6k3x3.output_transform(ymm_tt)

        # Pass the transformed registers to block8x8.store_packed with the
        # destination pointer, its stride and the row/column counts. The two
        # None arguments leave the next two positional parameters unset, and
        # with_relu is forwarded last.
        # NOTE(review): presumably None/None means "no row/column start offset"
        # - confirm against the store_packed signature.
        block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count, None, None, with_relu)

        RETURN()