Ejemplo n.º 1
0
def _conv_rgba_bgra_asm():
    """Assemble the conversion routine and load it into a new runtime.

    The assembly source comes from ``_conv_rgba_bgra_asm64`` when
    ``platform.architecture()`` reports ``'64bit'`` and from
    ``_conv_rgba_bgra_asm32`` otherwise; the assembler's ``ia32`` flag is
    set to match.

    Returns:
        A ``(runtime, ds)`` tuple: the new ``Runtime`` and whatever
        ``runtime.load("convert", ...)`` returned for the machine code.
    """
    is_64bit = platform.architecture()[0] == '64bit'
    if is_64bit:
        source = _conv_rgba_bgra_asm64()
    else:
        source = _conv_rgba_bgra_asm32()
    machine_code = Tdasm().assemble(source, ia32=not is_64bit)

    rt = Runtime()
    return rt, rt.load("convert", machine_code)
Ejemplo n.º 2
0
    def test_pow_ps(self):
        """Check the four-lane ``pow_ps`` program against ``math.pow``.

        Each of the 1000 rounds writes four random bases to ``v1`` and four
        random exponents to ``v2`` (all in [0, 3)), runs the program and
        compares every lane read back from ``v1`` to 1 decimal place.
        """
        machine_code = Tdasm().assemble(POW_CODE_PS)
        rt = Runtime()
        load_math_func("fast_pow_ps", rt)
        data = rt.load("pow_ps", machine_code)

        for _ in range(1000):
            # All four bases are drawn before the four exponents.
            bases = tuple(random.random() * 3 for _lane in range(4))
            exponents = tuple(random.random() * 3 for _lane in range(4))
            data["v1"] = bases
            data["v2"] = exponents
            rt.run("pow_ps")
            lanes = data["v1"]
            expected = [math.pow(b, e) for b, e in zip(bases, exponents)]

            for lane, want in enumerate(expected):
                self.assertAlmostEqual(lanes[lane], want, 1)
Ejemplo n.º 3
0
    def prepare(self, runtimes):
        """Load this shader and what it depends on into ``runtimes``.

        In order: load the color functions, call the optional
        ``self.loader`` hook, prepare the nested shaders, load every entry of
        ``self._functions`` and finally load this shader's machine code into
        each runtime in which ``self._name`` is not yet a global.

        Machine code is assembled at most once per ``self._name`` and kept in
        ``self._mc_cache``. ``self._ds`` is replaced only when at least one
        runtime was actually loaded.
        """
        self._load_color_funcs(runtimes)

        if self.loader:
            self.loader(runtimes)

        for shader in self._shaders:
            shader.prepare(runtimes)

        self._runtimes = runtimes
        assembler = Tdasm()
        module_name = 'shader' + str(id(self))

        for fun_name, fun_label, avx, bit in self._functions:
            load_asm_function(fun_name, fun_label, runtimes, avx, bit)

        sections = []
        for rt in runtimes:
            if rt.global_exists(self._name):
                continue  # this runtime already has the shader
            if self._name in self._mc_cache:
                machine_code = self._mc_cache[self._name]
            else:
                machine_code = assembler.assemble(self._code, self._func)
                self._mc_cache[self._name] = machine_code
            sections.append(rt.load(module_name, machine_code))
        if sections:
            self._ds = sections
Ejemplo n.º 4
0
 def _create_struct(self, shape):
     """Assemble ``shape``'s struct definition and return the struct.

     The text from ``shape.asm_struct()`` is placed in the ``#DATA`` section
     of an otherwise empty program; the struct named by
     ``shape.asm_struct_name()`` is then fetched from the machine code.
     """
     struct_decl = shape.asm_struct()
     source = " #DATA " + struct_decl + """
     #CODE
     #END
     """
     machine_code = Tdasm().assemble(source)
     struct_name = shape.asm_struct_name()
     return machine_code.get_struct(struct_name)
Ejemplo n.º 5
0
 def _create_struct(self, struct_def, name):
     """Assemble ``struct_def`` in a data-only program and return ``name``.

     Args:
         struct_def: assembler text placed directly after the ``#DATA`` line.
         name: name of the struct to fetch from the assembled machine code.
     """
     source = " #DATA \n" + struct_def + """
     #CODE
     #END
     """
     machine_code = Tdasm().assemble(source)
     return machine_code.get_struct(name)
Ejemplo n.º 6
0
    def test_sincos_ps(self):
        """Check the four-lane ``sincos_ps`` program against ``math``.

        Each of the 1000 rounds writes four random angles in [0, 2000) to
        ``v1``, runs the program, then compares ``v1`` with ``math.sin`` and
        ``v2`` with ``math.cos`` lane by lane to 3 decimal places.
        """
        machine_code = Tdasm().assemble(SINCOS_CODE_PS)
        rt = Runtime()
        load_math_func("fast_sincos_ps", rt)
        data = rt.load("sincos_ps", machine_code)

        for _ in range(1000):
            angles = tuple(random.random() * 2000 for _lane in range(4))
            data["v1"] = angles
            rt.run("sincos_ps")
            sines = data["v1"]
            cosines = data["v2"]
            want_sin = [math.sin(angle) for angle in angles]
            want_cos = [math.cos(angle) for angle in angles]

            # All sine lanes are checked before any cosine lane.
            for lane in range(4):
                self.assertAlmostEqual(sines[lane], want_sin[lane], 3)
            for lane in range(4):
                self.assertAlmostEqual(cosines[lane], want_cos[lane], 3)
Ejemplo n.º 7
0
 def __init__(self):
     """Assemble the three routines and load them into one runtime.

     ``MEMCPY``, ``BLTRGBA`` and ``BLTFLOATRGBA`` are loaded as ``"memcpy"``,
     ``"bltrgba"`` and ``"bltfloatrgba"``; the values returned by the loads
     are kept in ``self.ds``, ``self.ds2`` and ``self.ds3``.
     """
     assembler = Tdasm()
     memcpy_mc = assembler.assemble(MEMCPY)
     self.r = Runtime()
     self.ds = self.r.load("memcpy", memcpy_mc)
     blt_rgba_mc = assembler.assemble(BLTRGBA)
     self.ds2 = self.r.load("bltrgba", blt_rgba_mc)
     blt_float_mc = assembler.assemble(BLTFLOATRGBA)
     self.ds3 = self.r.load("bltfloatrgba", blt_float_mc)
Ejemplo n.º 8
0
 def create_assembler(self):
     """Return a new ``Tdasm`` with the standard macros registered."""
     assembler = Tdasm()
     # Registration order is kept as a table of (macro name, handler) pairs.
     macros = (
         ('eq128', arithmetic128),
         ('eq32', arithmetic32),
         ('broadcast', broadcast),
         ('if', macro_if),
         ('dot', dot_product),
         ('normalization', normalization),
         ('cross', cross_product),
     )
     for macro_name, handler in macros:
         assembler.register_macro(macro_name, handler)
     return assembler
Ejemplo n.º 9
0
def create_float_image(runtime):
    """Create a 150x150 ``ImageFloatRGBA`` and run the ``ASM`` program once.

    The image's pixel-setting routine is loaded into ``runtime`` under the
    label ``"set_pixel"`` before ``ASM`` is assembled, loaded as ``"write"``
    and executed.

    Returns:
        The image object.
    """
    image = renmas.gui.ImageFloatRGBA(150, 150)
    image.set_pixel_asm(runtime, "set_pixel")

    machine_code = Tdasm().assemble(ASM)
    runtime.load("write", machine_code)
    runtime.run("write")
    return image
Ejemplo n.º 10
0
def regular_sampler():
    """Set up a 2x2 ``RegularSampler`` with the ``ASM_CODE`` test program.

    The sampler's ``get_sample`` routine is loaded into a new runtime, the
    sampler is given a 2x2 tile (after ``tile.split(1)``), and ``ASM_CODE``
    is loaded under the name ``"test"``.

    Returns:
        A ``(sampler, runtime, 'test')`` tuple.
    """
    rt = Runtime()
    sampler = renmas2.samplers.RegularSampler(2, 2, pixel=1.0)
    sampler.get_sample_asm([rt], 'get_sample')

    tile = renmas2.core.Tile(0, 0, 2, 2)
    tile.split(1)
    sampler.set_tile(tile)

    rt.load("test", Tdasm().assemble(ASM_CODE))
    return (sampler, rt, 'test')
Ejemplo n.º 11
0
 def prepare(self, runtimes):
     """Prepare nested shaders, then load this shader into every runtime.

     ``self._ds`` is reset and receives one entry per runtime, in the same
     order as ``runtimes``; the same machine code object is loaded into each.
     """
     for shader in self._shaders:
         shader.prepare(runtimes)
     self._ds = []
     machine_code = Tdasm().assemble(self._code, self._func)
     module_name = 'shader' + str(id(self))
     self._runtimes = runtimes
     # TODO check if the shader already exists in the runtime
     # TODO if the shader is a function, load it as a function
     for rt in runtimes:
         self._ds.append(rt.load(module_name, machine_code))
Ejemplo n.º 12
0
 def __init__(self, width, height, pitch, address):
     """Load the ``ASM_STR`` routine and initialise its data fields.

     Args:
         width: stored on the instance and written to the ``width`` field.
         height: stored on the instance and written to the ``height`` field.
         pitch: written to the ``pitch`` field.
         address: stored as ``self.addr`` and written to the ``address`` field.
     """
     self.addr = address
     self.width = width
     self.height = height
     machine_code = Tdasm().assemble(ASM_STR)
     self.r = Runtime()
     self.ds = self.r.load("set_pixel", machine_code)
     # Default color first. NOTE(review): the original comment called
     # 0xFF00FF00 "red"; confirm against the channel layout ASM_STR expects.
     initial_fields = (
         ("color", 0xFF00FF00),
         ("address", address),
         ("width", width),
         ("height", height),
         ("pitch", pitch),
     )
     for field, value in initial_fields:
         self.ds[field] = value
Ejemplo n.º 13
0
 def compile(self, shaders=None):
     """Parse, generate and assemble this shader's code.

     Stores the generated assembly text in ``self._asm_code``, the return
     type reported by the generator in ``self._ret_type`` and the assembled
     machine code in ``self._mc``.

     Args:
         shaders: shaders handed to the code generator. Defaults to a new
             empty list on every call.
     """
     if shaders is None:
         # A literal ``[]`` default would be one list shared by every call,
         # so anything the generator stored in it would leak between calls.
         shaders = []
     statements = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type = cgen.generate_code(statements,
                                             args=self._args,
                                             is_func=self._is_func,
                                             name=self._name,
                                             func_args=self._func_args,
                                             shaders=shaders)
     self._asm_code = asm_code
     self._ret_type = ret_type
     self._mc = Tdasm().assemble(self._asm_code, self._is_func)
Ejemplo n.º 14
0
    def test_log(self):
        """Check the scalar ``log`` program against ``math.log``.

        Runs 1000 rounds with ``x`` drawn from [0, 1) and compares the value
        read back from ``x`` to 3 decimal places.
        """
        machine_code = Tdasm().assemble(LOG_CODE)
        rt = Runtime()
        load_math_func("fast_log_ss", rt)
        data = rt.load("log", machine_code)

        for _ in range(1000):
            value = random.random()
            data["x"] = value
            rt.run("log")
            got = data["x"]
            # NOTE(review): random.random() may return exactly 0.0, for which
            # math.log raises ValueError.
            want = math.log(value)
            self.assertAlmostEqual(got, want, 3)
Ejemplo n.º 15
0
    def test_exp(self):
        """Check the scalar ``exp`` program against ``math.exp``.

        Runs 1000 rounds with ``x`` drawn from [0, 4) and compares the value
        read back from ``x`` to 2 decimal places.
        """
        machine_code = Tdasm().assemble(EXP_CODE)
        rt = Runtime()
        load_math_func("fast_exp_ss", rt)
        data = rt.load("exp", machine_code)

        for _ in range(1000):
            value = random.random() * 4
            data["x"] = value
            rt.run("exp")
            got = data["x"]
            want = math.exp(value)
            self.assertAlmostEqual(got, want, 2)
Ejemplo n.º 16
0
    def test_atan(self):
        """Check the scalar ``atan`` program against ``math.atan``.

        Runs 1000 rounds with ``x`` drawn from [0, 2000) and compares the
        value read back from ``x`` to 3 decimal places.
        """
        machine_code = Tdasm().assemble(ATAN_CODE)
        rt = Runtime()
        load_math_func("fast_atan_ss", rt)
        data = rt.load("atan", machine_code)

        for _ in range(1000):
            value = random.random() * 2000
            data["x"] = value
            rt.run("atan")
            got = data["x"]
            want = math.atan(value)
            self.assertAlmostEqual(got, want, 3)
Ejemplo n.º 17
0
 def _create_assembler(self):
     """Return a new ``Tdasm`` with all macros of this object registered.

     Besides the stateless macros, two stateful helpers are created and kept
     on the instance: ``self._macro_call`` (backs the ``call`` macro) and
     ``self._macro_spectrum`` (backs the ``spectrum`` macro).
     """
     assembler = Tdasm()
     self._macro_call = macro_call = MacroCall()
     assembler.register_macro('call', macro_call.macro_call)

     stateless_macros = (
         ('eq128', arithmetic128),
         ('eq32', arithmetic32),
         ('broadcast', broadcast),
         ('if', macro_if),
         ('dot', dot_product),
         ('normalization', normalization),
         ('cross', cross_product),
     )
     for macro_name, handler in stateless_macros:
         assembler.register_macro(macro_name, handler)

     self._macro_spectrum = MacroSpectrum(self)
     spectrum_handler = self._macro_spectrum.macro_spectrum
     assembler.register_macro('spectrum', spectrum_handler)
     return assembler
Ejemplo n.º 18
0
    def set_pixel_asm(self, runtime, label):
        """Build and load the routine that stores one pixel into the buffer.

        Per the generated assembly: on entry ``eax`` holds x, ``ebx`` holds
        y and ``xmm0`` holds the value. The routine computes
        ``y * pitch + x * 16``, adds it to ``ptr_buffer`` and stores
        ``xmm0`` there with an aligned 16-byte move.

        Args:
            runtime: runtime that receives the machine code.
            label: name of the global label placed at the routine's entry.

        The value returned by ``runtime.load`` is kept in ``self.ds`` and its
        ``ptr_buffer`` and ``pitch`` fields are filled from this image.
        """
        is_64bit = platform.architecture()[0] == "64bit"

        # Everything that touches the buffer pointer depends on pointer size.
        if is_64bit:
            ptr_reg = "rcx"
            ptr_decl = "uint64 ptr_buffer"
            load_ptr = "mov rcx, qword [ptr_buffer]"
            add_offset = "add rcx, rax"
        else:
            ptr_reg = "ecx"
            ptr_decl = "uint32 ptr_buffer"
            load_ptr = "mov ecx, dword [ptr_buffer]"
            add_offset = "add ecx, eax"

        store_op = "vmovaps" if util.AVX else "movaps"
        store_line = store_op + " oword [" + ptr_reg + "], xmm0"

        asm_code = """
        #DATA
        """
        asm_code += ptr_decl + """
        uint32 pitch
        #CODE
        ; eax = x , ebx = y, value = xmm0
        """
        asm_code += "global " + label + ": \n"
        asm_code += """
        imul ebx, dword [pitch]
        imul eax , eax, 16
        """
        asm_code += load_ptr + """
        add eax, ebx
        """
        asm_code += add_offset + "\n"
        asm_code += store_line + """
        ret
        """

        machine_code = Tdasm().assemble(asm_code, True)
        module_name = "ImageFloatRGBA" + str(hash(self))
        self.ds = runtime.load(module_name, machine_code)
        self.ds["ptr_buffer"] = self.pixels.ptr()
        self.ds["pitch"] = self.pitch
Ejemplo n.º 19
0
    def test_pow(self):
        """Check the scalar ``pow`` program against ``math.pow``.

        Runs 1000 rounds with base ``x`` and exponent ``y`` both drawn from
        [0, 3) and compares the value read back from ``x`` to 1 decimal
        place.
        """
        machine_code = Tdasm().assemble(POW_CODE)
        rt = Runtime()
        load_math_func("fast_pow_ss", rt)
        data = rt.load("pow", machine_code)

        for _ in range(1000):
            # The base is drawn before the exponent.
            base = random.random() * 3
            exponent = random.random() * 3
            data["x"] = base
            data["y"] = exponent
            rt.run("pow")
            got = data["x"]
            want = math.pow(base, exponent)
            self.assertAlmostEqual(got, want, 1)
Ejemplo n.º 20
0
def create_assembler():
    """Return a new ``Tdasm`` with the module's macros registered."""
    assembler = Tdasm()
    # (macro name, handler) pairs, in registration order.
    macros = (
        ('mov', mov),
        ('lea', lea),
        ('eq128', arithmetic128),
        ('eq32', arithmetic32),
        ('broadcast', broadcast),
        ('if', macro_if),
        ('dot', dot_product),
        ('normalization', normalization),
        ('cross', cross_product),
        ('generate_one', generate_one),
        ('push', push),
        ('pop', pop),
        ('sqrtss', sqrtss),
    )
    for macro_name, handler in macros:
        assembler.register_macro(macro_name, handler)
    return assembler
Ejemplo n.º 21
0
    def test_sincos(self):
        """Check the scalar ``sincos`` program against ``math``.

        Runs 1000 rounds with an angle drawn from [0, 2000); ``x`` is
        compared with ``math.sin`` and ``y`` with ``math.cos`` to 3 decimal
        places.
        """
        machine_code = Tdasm().assemble(SINCOS_CODE)
        rt = Runtime()
        load_math_func("fast_sincos_ss", rt)
        data = rt.load("sincos", machine_code)

        for _ in range(1000):
            angle = random.random() * 2000
            data["x"] = angle
            rt.run("sincos")
            got_sin = data["x"]
            got_cos = data["y"]

            want_sin = math.sin(angle)
            want_cos = math.cos(angle)
            self.assertAlmostEqual(got_sin, want_sin, 3)
            self.assertAlmostEqual(got_cos, want_cos, 3)
Ejemplo n.º 22
0
 def compile(self, shaders=None, color_mgr=None):
     """Parse, generate and assemble this shader's code.

     Stores the generated assembly text in ``self._asm_code``, the return
     type in ``self._ret_type``, the third value returned by the generator
     in ``self._ext_functions`` and the machine code in ``self._mc``.

     Args:
         shaders: shaders handed to the code generator. Defaults to a new
             empty list on every call.
         color_mgr: passed through to the code generator unchanged.
     """
     if shaders is None:
         # A literal ``[]`` default would be one list shared by every call,
         # so anything the generator stored in it would leak between calls.
         shaders = []
     statements = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type, ext_functions = cgen.generate_code(
         statements,
         args=self._args,
         is_func=self._is_func,
         name=self._name,
         func_args=self._func_args,
         shaders=shaders,
         color_mgr=color_mgr)
     self._asm_code = asm_code
     self._ret_type = ret_type
     self._ext_functions = ext_functions
     # Target 32-bit encoding whenever the generator did not emit 64-bit code.
     self._mc = Tdasm().assemble(self._asm_code,
                                 naked=self._is_func,
                                 ia32=not cgen.BIT64)
Ejemplo n.º 23
0
def random_sampler():
    """Drive a 1x1, one-sample-per-pixel ``RandomSampler`` via ``ASM_CODE``.

    Sets up the sampler and a new runtime, loads ``ASM_CODE`` as ``"test"``
    and calls ``get_sample`` once per expected sample plus one more time.
    Returns nothing.
    """
    rt = Runtime()
    width = height = spp = 1
    sampler = renmas2.samplers.RandomSampler(width, height, spp=spp,
                                             pixel=1.0)
    sampler.get_sample_asm([rt], 'get_sample')

    tile = renmas2.core.Tile(0, 0, width, height)
    tile.split(1)
    sampler.set_tile(tile)

    rt.load("test", Tdasm().assemble(ASM_CODE))

    for _ in range(width * height * spp):
        get_sample(sampler, rt, "test")

    # One request beyond the computed sample count; presumably this
    # exercises the sampler's end-of-tile behaviour (TODO confirm).
    get_sample(sampler, rt, "test")
Ejemplo n.º 24
0
def get_asm():
    """Return the shared module-level assembler, creating it on first use.

    On the first call a ``Tdasm`` is stored in the global ``assembler`` and
    the ``renmas.macros`` handlers are registered on it; later calls return
    that same object.
    """
    from renmas.macros import eq32, eq128, eq32_32, eq32_128, eq128_128, eq128_32
    from renmas.macros import dot_product, macro_if, broadcast
    global assembler
    if assembler is not None:
        return assembler

    assembler = Tdasm()
    macros = (
        ("eq128", eq128),
        ("eq32", eq32),
        ("eq128_32", eq128_32),
        ("eq32_128", eq32_128),
        ("eq128_128", eq128_128),
        ("eq32_32", eq32_32),
        ("dot", dot_product),
        ("if", macro_if),
        ("broadcast", broadcast),
    )
    for macro_name, handler in macros:
        assembler.register_macro(macro_name, handler)
    return assembler
Ejemplo n.º 25
0
    def test_log_ps(self):
        """Check the four-lane ``log_ps`` program against ``math.log``.

        Each of the 1000 rounds writes four values drawn from [0, 1) to
        ``v1``, runs the program and compares every lane read back from
        ``v1`` to 3 decimal places.
        """
        machine_code = Tdasm().assemble(LOG_CODE_PS)
        rt = Runtime()
        load_math_func("fast_log_ps", rt)
        data = rt.load("log_ps", machine_code)

        for _ in range(1000):
            values = tuple(random.random() for _lane in range(4))
            data["v1"] = values
            rt.run("log_ps")
            lanes = data["v1"]
            # NOTE(review): random.random() may return exactly 0.0, for which
            # math.log raises ValueError.
            expected = [math.log(value) for value in values]

            for lane, want in enumerate(expected):
                self.assertAlmostEqual(lanes[lane], want, 3)
Ejemplo n.º 26
0
            code += lst_inst2[l] + "\n"
        for l in range(len(lst_inst2), len(lst_inst1)):
            code += lst_inst1[l] + "\n"

    return code

def arth128_32(tokens):
    """Return ``arth_mix(tokens, 128, 32)``.

    The two numbers are presumably operand sizes in bits -- confirm against
    ``arth_mix``.
    """
    return arth_mix(tokens, 128, 32)

def arth32_128(tokens):
    """Return ``arth_mix(tokens, 32, 128)``.

    The two numbers are presumably operand sizes in bits -- confirm against
    ``arth_mix``.
    """
    return arth_mix(tokens, 32, 128)

def arth128_128(tokens):
    """Return ``arth_mix(tokens, 128, 128)``.

    The two numbers are presumably operand sizes in bits -- confirm against
    ``arth_mix``.
    """
    return arth_mix(tokens, 128, 128)

def arth32_32(tokens):
    """Return ``arth_mix(tokens, 32, 32)``.

    The two numbers are presumably operand sizes in bits -- confirm against
    ``arth_mix``.
    """
    return arth_mix(tokens, 32, 32)

if __name__ == "__main__":
    # Manual smoke run: register the two macros, assemble ASM_CODE, execute
    # it once and print the ``rez`` field of the loaded data section.
    asm = Tdasm()
    asm.register_macro("arth128", arth128)
    asm.register_macro("arth32", arth32)
    mc = asm.assemble(ASM_CODE)

    run = Runtime()
    ds = run.load("test", mc)
    run.run("test")

    print(ds["rez"])

Ejemplo n.º 27
0
            #END
        """
    else:
        code = """
            #DATA
            uint32 sa, da, n

            #CODE
            mov ecx, dword [n]
            mov esi, dword [sa] 
            mov edi, dword [da]
            rep movs byte [edi], byte [esi]

            #END
        """
    return code 

# Import-time setup: assemble the copy routine once and load it into a
# module-private runtime; ``memcpy`` below reuses these objects on each call.
_mc = Tdasm().assemble(_memcpy_code())
_runtime = Runtime()
_data_section = _runtime.load("memcpy", _mc)

def memcpy(da, sa, n):
    """Copy ``n`` bytes from source address ``sa`` to destination ``da``.

    The three values are written to the ``da``, ``sa`` and ``n`` fields of
    the module's data section, after which the loaded ``memcpy`` routine is
    run.
    """
    for field, value in (("da", da), ("sa", sa), ("n", n)):
        _data_section[field] = value
    _runtime.run("memcpy")
Ejemplo n.º 28
0
def cos_ps():
    """Assemble the packed routine exported as ``fast_cos_ps``.

    Two encodings of the same instruction sequence are kept: a legacy SSE
    one and a VEX (``v``-prefixed) one; the latter is assembled when
    ``proc.AVX`` is true. The routine reads its argument from ``xmm0``,
    leaves the result in ``xmm0`` and overwrites ``xmm1``-``xmm7``.

    Returns:
        The object returned by ``Tdasm.assemble`` (called with ``True`` as
        its second positional argument).
    """

    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    asm_code = data + """

    #CODE
    global fast_cos_ps:
    andps	xmm0, oword [_ps_am_inv_sign_mask]
    addps	xmm0, oword [_ps_am_pi_o_2]
    mulps	xmm0, oword [_ps_am_2_o_pi]

    pxor	xmm3, xmm3
    movdqa	xmm5, oword [_epi32_1]
    movaps	xmm4, oword [_ps_am_1]
    cvttps2dq	xmm2, xmm0
    pand	xmm5, xmm2
    pcmpeqd	xmm5, xmm3
    cvtdq2ps	xmm6, xmm2
    pand	xmm2, oword [_epi32_2]
    pslld	xmm2, 30 

    subps	xmm0, xmm6
    minps	xmm0, xmm4
    subps	xmm4, xmm0
    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    orps	xmm0, xmm5

    movaps	xmm1, xmm0
    mulps	xmm0, xmm0
    orps	xmm1, xmm2
    movaps	xmm7, xmm0
    mulps	xmm0, oword [_ps_sincos_p3]
    addps	xmm0, oword [_ps_sincos_p2]
    mulps	xmm0, xmm7
    addps	xmm0, oword [_ps_sincos_p1]
    mulps	xmm0, xmm7
    addps	xmm0, oword [_ps_sincos_p0]
    mulps	xmm0, xmm1
    ret
    """

    avx_code = data + """

    #CODE
    global fast_cos_ps:
    vandps	xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vaddps	xmm0, xmm0, oword [_ps_am_pi_o_2]
    vmulps	xmm0, xmm0, oword [_ps_am_2_o_pi]

    vpxor	xmm3, xmm3, xmm3
    vmovdqa	xmm5, oword [_epi32_1]
    vmovaps	xmm4, oword [_ps_am_1]
    vcvttps2dq	xmm2, xmm0
    vpand	xmm5, xmm5, xmm2
    vpcmpeqd	xmm5, xmm5, xmm3
    vcvtdq2ps	xmm6, xmm2
    vpand	xmm2, xmm2, oword [_epi32_2]
    vpslld	xmm2, xmm2, 30 

    vsubps	xmm0, xmm0, xmm6
    vminps	xmm0, xmm0, xmm4
    vsubps	xmm4, xmm4, xmm0
    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vorps	xmm0, xmm0, xmm5

    vmovaps	xmm1, xmm0
    vmulps	xmm0, xmm0, xmm0
    vorps	xmm1, xmm1, xmm2
    vmovaps	xmm7, xmm0
    vmulps	xmm0, xmm0, oword [_ps_sincos_p3]
    vaddps	xmm0, xmm0, oword [_ps_sincos_p2]
    vmulps	xmm0, xmm0, xmm7
    vaddps	xmm0, xmm0, oword [_ps_sincos_p1]
    vmulps	xmm0, xmm0, xmm7
    vaddps	xmm0, xmm0, oword [_ps_sincos_p0]
    vmulps	xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    # Pick the encoding that matches the running processor.
    source = avx_code if proc.AVX else asm_code
    return assembler.assemble(source, True)
Ejemplo n.º 29
0
def asin_ps():
    """Assemble the packed routine exported as ``fast_asin_ps``.

    The routine first forms ``x * rsqrt((1 + x) * (1 - x))`` and then runs
    the section marked ``;atan`` on that value. Two encodings are kept: a
    legacy SSE one and a VEX (``v``-prefixed) one; the latter is assembled
    when ``proc.AVX`` is true. The argument is read from ``xmm0``, the
    result is left in ``xmm0`` and ``xmm1``-``xmm7`` are overwritten.

    Returns:
        The object returned by ``Tdasm.assemble`` (called with ``True`` as
        its second positional argument).
    """
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679

    """
    asm_code = data + """

    #CODE
    global fast_asin_ps:
    movaps xmm1, oword [_ps_am_1]
    movaps xmm2, xmm1
    addps xmm1, xmm0
    subps xmm2, xmm0
    mulps xmm1, xmm2
    rsqrtps xmm1, xmm1
    mulps xmm0, xmm1

    ;atan
    movaps	xmm5, oword [_ps_am_1]
	movaps	xmm6, oword [_ps_am_m1]
	rcpps	xmm4, xmm0

	cmpps	xmm5, xmm0, 1
	cmpps	xmm6, xmm0, 6
	movaps	xmm1, oword [_ps_atan_s0]
	orps	xmm5, xmm6

	andps	xmm4, xmm5
	movaps	xmm2, oword [_ps_atan_t0]
	movaps	xmm7, xmm5
	andnps	xmm5, xmm0
	movaps	xmm3, oword [_ps_atan_s1]
	orps	xmm4, xmm5
	movaps	xmm0, xmm4

	movaps	xmm6, oword [_ps_atan_t1]
	mulps	xmm4, xmm4

	addps	xmm1, xmm4
	movaps	xmm5, oword [_ps_atan_s2]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm2
	movaps	xmm2, oword [_ps_atan_t2]
	addps	xmm3, xmm4
	addps	xmm1, xmm3

	movaps	xmm3, oword [_ps_atan_s3]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm6
	movaps	xmm6, oword [_ps_atan_t3]
	addps	xmm5, xmm4
	addps	xmm1, xmm5

	movaps	xmm5, oword [_ps_am_sign_mask]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm2
	addps	xmm3, xmm4
	movaps	xmm4, oword [_ps_am_pi_o_2]
	mulps	xmm6, xmm0
	addps	xmm1, xmm3

	andps	xmm0, xmm5
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm6

	orps	xmm0, xmm4
	subps	xmm0, xmm1

	andps	xmm0, xmm7
	andnps	xmm7, xmm1
	orps	xmm0, xmm7
	ret

    """

    avx_code = data + """

    #CODE
    global fast_asin_ps:
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm2, xmm1
    vaddps xmm1, xmm1, xmm0
    vsubps xmm2, xmm2, xmm0
    vmulps xmm1, xmm1, xmm2
    vrsqrtps xmm1, xmm1
    vmulps xmm0, xmm0, xmm1

    ;atan
    vmovaps	xmm5, oword [_ps_am_1]
	vmovaps	xmm6, oword [_ps_am_m1]
	vrcpps	xmm4, xmm0

	vcmpps	xmm5, xmm5, xmm0, 1
	vcmpps	xmm6, xmm6, xmm0, 6
	vmovaps	xmm1, oword [_ps_atan_s0]
	vorps	xmm5, xmm5, xmm6

	vandps	xmm4, xmm4, xmm5
	vmovaps	xmm2, oword [_ps_atan_t0]
	vmovaps	xmm7, xmm5
	vandnps	xmm5, xmm5, xmm0
	vmovaps	xmm3, oword [_ps_atan_s1]
	vorps	xmm4, xmm4, xmm5
	vmovaps	xmm0, xmm4

	vmovaps	xmm6, oword [_ps_atan_t1]
	vmulps	xmm4, xmm4, xmm4

	vaddps	xmm1, xmm1, xmm4
	vmovaps	xmm5, oword [_ps_atan_s2]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm2
	vmovaps	xmm2, oword [_ps_atan_t2]
	vaddps	xmm3, xmm3, xmm4
	vaddps	xmm1, xmm1, xmm3

	vmovaps	xmm3, oword [_ps_atan_s3]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm6
	vmovaps	xmm6, oword [_ps_atan_t3]
	vaddps	xmm5, xmm5, xmm4
	vaddps	xmm1, xmm1, xmm5

	vmovaps	xmm5, oword [_ps_am_sign_mask]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm2
	vaddps	xmm3, xmm3, xmm4
	vmovaps	xmm4, oword [_ps_am_pi_o_2]
	vmulps	xmm6, xmm6, xmm0
	vaddps	xmm1, xmm1, xmm3

	vandps	xmm0, xmm0, xmm5
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm6

	vorps	xmm0, xmm0, xmm4
	vsubps	xmm0, xmm0, xmm1

	vandps	xmm0, xmm0, xmm7
	vandnps	xmm7, xmm7, xmm1
	vorps	xmm0, xmm0, xmm7
	ret

    """

    assembler = Tdasm()
    # Pick the encoding that matches the running processor.
    source = avx_code if proc.AVX else asm_code
    return assembler.assemble(source, True)
Ejemplo n.º 30
0
def sin_ss():
    """Assemble the scalar routine exported as ``fast_sin_ss``.

    Constants are declared as four-element arrays, but the routine loads
    only their first element (``dword`` loads). Two encodings are kept: a
    legacy SSE one and a VEX (``v``-prefixed) one; the latter is assembled
    when ``proc.AVX`` is true. The argument is read from ``xmm0``, the
    result is left in ``xmm0`` and ``xmm1``-``xmm7`` are overwritten.

    Returns:
        The object returned by ``Tdasm.assemble`` (called with ``True`` as
        its second positional argument).
    """
    data = """
    #DATA

    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896

    """

    asm_code = data + """

    #CODE
    global fast_sin_ss:
	movaps	xmm7, xmm0
	movss	xmm1, dword [_ps_am_inv_sign_mask]
	movss	xmm2, dword [_ps_am_sign_mask]
	movss	xmm3, dword [_ps_am_2_o_pi]
	andps	xmm0, xmm1
	andps	xmm7, xmm2
	mulss	xmm0, xmm3

	pxor	xmm3, xmm3
	movd	xmm5, dword [_epi32_1]
	movss	xmm4, dword [_ps_am_1]
	cvttps2dq	xmm2, xmm0
	pand	xmm5, xmm2
	movd	xmm1, dword [_epi32_2]
	pcmpeqd	xmm5, xmm3
	cvtdq2ps	xmm6, xmm2
	pand	xmm2, xmm1
	pslld	xmm2, 30

	subss	xmm0, xmm6
	movss	xmm3, dword [_ps_sincos_p3]
	minss	xmm0, xmm4
	subss	xmm4, xmm0
	andps	xmm0, xmm5
	andnps	xmm5, xmm4
	orps	xmm0, xmm5

	movaps	xmm1, xmm0
	movss	xmm4, dword [_ps_sincos_p2]
	mulss	xmm0, xmm0
	xorps	xmm2, xmm7
	movss	xmm5, dword [_ps_sincos_p1]
	orps	xmm1, xmm2
	movaps	xmm7, xmm0
	mulss	xmm0, xmm3
	movss	xmm6, dword [_ps_sincos_p0]
	addss	xmm0, xmm4
	mulss	xmm0, xmm7
	addss	xmm0, xmm5
	mulss	xmm0, xmm7
	addss	xmm0, xmm6
	mulss	xmm0, xmm1
    ret
    """

    avx_code = data + """

    #CODE
    global fast_sin_ss:
    vmovaps	xmm7, xmm0 
	vmovss	xmm1, dword [_ps_am_inv_sign_mask]
	vmovss	xmm2, dword [_ps_am_sign_mask]
	vmovss	xmm3, dword [_ps_am_2_o_pi]

	vandps	xmm0, xmm0, xmm1
	vandps	xmm7, xmm7, xmm2 
	vmulss	xmm0, xmm0, xmm3

	vpxor	xmm3, xmm3, xmm3 
	vmovd	xmm5, dword [_epi32_1]
	vmovss	xmm4, dword [_ps_am_1]
	vcvttps2dq	xmm2, xmm0
	vpand	xmm5, xmm5, xmm2
	vmovd	xmm1, dword [_epi32_2]
	vpcmpeqd	xmm5, xmm5, xmm3
	vcvtdq2ps	xmm6, xmm2
	vpand	xmm2, xmm2, xmm1
	vpslld	xmm2, xmm2, 30

	vsubss	xmm0, xmm0, xmm6
	vmovss	xmm3, dword [_ps_sincos_p3]
	vminss	xmm0, xmm0, xmm4
	vsubss	xmm4, xmm4, xmm0
	vandps	xmm0, xmm0, xmm5
	vandnps	xmm5, xmm5, xmm4
	vorps	xmm0, xmm0, xmm5

	vmovaps	xmm1, xmm0
	vmovss	xmm4, dword [_ps_sincos_p2]
	vmulss	xmm0, xmm0, xmm0
	vxorps	xmm2, xmm2, xmm7
	vmovss	xmm5, dword [_ps_sincos_p1]
	vorps	xmm1, xmm1, xmm2
	vmovaps	xmm7, xmm0
	vmulss	xmm0, xmm0, xmm3
	vmovss	xmm6, dword [_ps_sincos_p0]
	vaddss	xmm0, xmm0, xmm4
	vmulss	xmm0, xmm0, xmm7
	vaddss	xmm0, xmm0, xmm5
	vmulss	xmm0, xmm0, xmm7
	vaddss	xmm0, xmm0, xmm6
	vmulss	xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    # Pick the encoding that matches the running processor.
    source = avx_code if proc.AVX else asm_code
    return assembler.assemble(source, True)