def _conv_rgba_bgra_asm():
    """Assemble and load the conversion routine matching the host word size.

    The 64-bit source comes from ``_conv_rgba_bgra_asm64`` and is assembled
    with ``ia32=False``; otherwise the source from ``_conv_rgba_bgra_asm32``
    is assembled with ``ia32=True``.

    Returns:
        A ``(runtime, ds)`` tuple: the new ``Runtime`` and whatever
        ``runtime.load("convert", ...)`` returned for the assembled code.
    """
    is_64bit = platform.architecture()[0] == '64bit'
    if is_64bit:
        source = _conv_rgba_bgra_asm64()
    else:
        source = _conv_rgba_bgra_asm32()
    machine_code = Tdasm().assemble(source, ia32=not is_64bit)

    runtime = Runtime()
    return runtime, runtime.load("convert", machine_code)
def test_pow_ps(self):
    """Check the packed ``fast_pow_ps`` routine against ``math.pow``.

    Runs 1000 rounds; each round draws four bases and four exponents in
    [0, 3) and compares every lane to one decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_pow_ps", runtime)
    data = runtime.load("pow_ps", machine_code)

    for _round in range(1000):
        # Bases are drawn before exponents, matching the original draw order.
        bases = tuple([random.random() * 3 for _lane in range(4)])
        exponents = tuple([random.random() * 3 for _lane in range(4)])
        data["v1"] = bases
        data["v2"] = exponents
        runtime.run("pow_ps")

        computed = data["v1"]  # the routine's result is read back from v1
        expected = [math.pow(b, e) for b, e in zip(bases, exponents)]
        for lane, reference in enumerate(expected):
            self.assertAlmostEqual(computed[lane], reference, 1)
def prepare(self, runtimes):
    """Load this shader and everything it depends on into ``runtimes``.

    Color functions, the optional ``loader`` callback, nested shaders and
    the registered assembly functions are loaded first. The shader's own
    code is then loaded into every runtime for which
    ``global_exists(self._name)`` is false, reusing the machine code cached
    in ``self._mc_cache`` when present. ``self._ds`` is replaced only when
    at least one runtime was loaded.
    """
    self._load_color_funcs(runtimes)
    if self.loader:
        self.loader(runtimes)
    for shader in self._shaders:
        shader.prepare(runtimes)
    self._runtimes = runtimes

    assembler = Tdasm()
    load_name = 'shader' + str(id(self))
    for fun_name, fun_label, avx, bit in self._functions:
        load_asm_function(fun_name, fun_label, runtimes, avx, bit)

    sections = []
    for runtime in runtimes:
        if runtime.global_exists(self._name):
            continue
        if self._name in self._mc_cache:
            machine_code = self._mc_cache[self._name]
        else:
            # First use: assemble once and remember the result under the shader name.
            machine_code = assembler.assemble(self._code, self._func)
            self._mc_cache[self._name] = machine_code
        sections.append(runtime.load(load_name, machine_code))

    if sections:
        self._ds = sections
def _create_struct(self, shape):
    """Assemble a data-only program declaring ``shape``'s struct and return it.

    The struct text comes from ``shape.asm_struct()``; the assembled result
    is queried with ``shape.asm_struct_name()``.
    """
    source = " #DATA " + shape.asm_struct() + """
        #CODE
        #END
        """
    machine_code = Tdasm().assemble(source)
    return machine_code.get_struct(shape.asm_struct_name())
def _create_struct(self, struct_def, name):
    """Assemble a data-only program around ``struct_def`` and return struct ``name``.

    ``struct_def`` is placed in the ``#DATA`` section as-is; the code
    section is left empty.
    """
    source = " #DATA \n" + struct_def + """
        #CODE
        #END
        """
    machine_code = Tdasm().assemble(source)
    return machine_code.get_struct(name)
def test_sincos_ps(self):
    """Check the packed ``fast_sincos_ps`` routine against ``math.sin``/``math.cos``.

    Runs 1000 rounds of four random angles in [0, 2000); sines are read
    back from ``v1`` and cosines from ``v2``, each compared to three
    decimal places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_sincos_ps", runtime)
    data = runtime.load("sincos_ps", machine_code)

    for _round in range(1000):
        angles = tuple([random.random() * 2000 for _lane in range(4)])
        data["v1"] = angles
        runtime.run("sincos_ps")

        sines = data["v1"]
        cosines = data["v2"]
        expected_sines = [math.sin(angle) for angle in angles]
        expected_cosines = [math.cos(angle) for angle in angles]

        # All sine lanes are asserted before any cosine lane.
        for lane, reference in enumerate(expected_sines):
            self.assertAlmostEqual(sines[lane], reference, 3)
        for lane, reference in enumerate(expected_cosines):
            self.assertAlmostEqual(cosines[lane], reference, 3)
def __init__(self):
    """Assemble the MEMCPY, BLTRGBA and BLTFLOATRGBA sources into one runtime.

    The runtime is stored in ``self.r``; the values returned by the three
    loads are stored in ``self.ds``, ``self.ds2`` and ``self.ds3``.
    """
    assembler = Tdasm()

    memcpy_code = assembler.assemble(MEMCPY)
    self.r = Runtime()
    self.ds = self.r.load("memcpy", memcpy_code)

    blt_rgba_code = assembler.assemble(BLTRGBA)
    self.ds2 = self.r.load("bltrgba", blt_rgba_code)

    blt_float_code = assembler.assemble(BLTFLOATRGBA)
    self.ds3 = self.r.load("bltfloatrgba", blt_float_code)
def create_assembler(self):
    """Return a new ``Tdasm`` with this module's macros registered."""
    macro_table = (
        ('eq128', arithmetic128),
        ('eq32', arithmetic32),
        ('broadcast', broadcast),
        ('if', macro_if),
        ('dot', dot_product),
        ('normalization', normalization),
        ('cross', cross_product),
    )
    assembler = Tdasm()
    for macro_name, handler in macro_table:
        assembler.register_macro(macro_name, handler)
    return assembler
def create_float_image(runtime):
    """Create a 150x150 ``ImageFloatRGBA`` and run the ``ASM`` program against it.

    The image's pixel setter is exported into ``runtime`` under the label
    "set_pixel" before ``ASM`` is assembled, loaded as "write" and run.

    Returns:
        The image object.
    """
    image = renmas.gui.ImageFloatRGBA(150, 150)
    image.set_pixel_asm(runtime, "set_pixel")

    machine_code = Tdasm().assemble(ASM)
    runtime.load("write", machine_code)
    runtime.run("write")
    return image
def regular_sampler():
    """Build a 2x2 ``RegularSampler`` fixture with ``ASM_CODE`` loaded as "test".

    Returns:
        A ``(sampler, runtime, 'test')`` tuple.
    """
    runtime = Runtime()

    sampler = renmas2.samplers.RegularSampler(2, 2, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, 2, 2)
    tile.split(1)
    sampler.set_tile(tile)

    runtime.load("test", Tdasm().assemble(ASM_CODE))
    return (sampler, runtime, 'test')
def prepare(self, runtimes):
    """Prepare nested shaders, then assemble this shader and load it everywhere.

    ``self._ds`` is reset and receives one entry per runtime, in the order
    of ``runtimes``; the runtimes themselves are kept in ``self._runtimes``.
    """
    for shader in self._shaders:
        shader.prepare(runtimes)

    self._ds = []
    machine_code = Tdasm().assemble(self._code, self._func)
    # Debug aid kept from the original: machine_code.print_machine_code()
    load_name = 'shader' + str(id(self))
    self._runtimes = runtimes

    for runtime in runtimes:
        # TODO: check whether the shader already exists in the runtime
        # TODO: if the shader is a function, load it as a function
        self._ds.append(runtime.load(load_name, machine_code))
def __init__(self, width, height, pitch, address):
    """Assemble ``ASM_STR`` as "set_pixel" and seed its data section.

    Args:
        width: Stored on the instance and written to the "width" field.
        height: Stored on the instance and written to the "height" field.
        pitch: Written to the "pitch" field.
        address: Stored as ``self.addr`` and written to the "address" field.
    """
    self.addr = address
    self.width = width
    self.height = height

    machine_code = Tdasm().assemble(ASM_STR)
    self.r = Runtime()
    self.ds = self.r.load("set_pixel", machine_code)

    # Default color. NOTE(review): the original comment called this red, but
    # read as 0xAARRGGBB the value is opaque green - confirm the channel order.
    initial_fields = (
        ("color", 0xFF00FF00),
        ("address", address),
        ("width", width),
        ("height", height),
        ("pitch", pitch),
    )
    for field, value in initial_fields:
        self.ds[field] = value
def compile(self, shaders=None):
    """Parse the shader source, generate assembly and assemble it.

    Stores the generated assembly text in ``self._asm_code``, the return
    type in ``self._ret_type`` and the machine code in ``self._mc``.

    Args:
        shaders: Shaders handed to the code generator. ``None`` (the
            default) means an empty list. A fresh list is created per call
            so the default is never shared between calls, which the old
            ``shaders=[]`` default allowed.
    """
    if shaders is None:
        shaders = []

    statements = parse(self._code)
    cgen = CodeGenerator()
    asm_code, ret_type = cgen.generate_code(statements, args=self._args,
                                            is_func=self._is_func,
                                            name=self._name,
                                            func_args=self._func_args,
                                            shaders=shaders)
    self._asm_code = asm_code
    self._ret_type = ret_type

    # Separate name for the assembler; the original reused ``asm`` for both
    # the generated text and the Tdasm instance.
    assembler = Tdasm()
    self._mc = assembler.assemble(self._asm_code, self._is_func)
def test_log(self):
    """Check the scalar ``fast_log_ss`` routine against ``math.log``.

    Uses 1000 random inputs from ``random.random()`` and compares to three
    decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE)
    runtime = Runtime()
    load_math_func("fast_log_ss", runtime)
    data = runtime.load("log", machine_code)

    for _round in range(1000):
        value = random.random()
        data["x"] = value
        runtime.run("log")
        computed = data["x"]  # the routine's result is read back from x
        self.assertAlmostEqual(computed, math.log(value), 3)
def test_exp(self):
    """Check the scalar ``fast_exp_ss`` routine against ``math.exp``.

    Uses 1000 random inputs in [0, 4) and compares to two decimal places.
    """
    machine_code = Tdasm().assemble(EXP_CODE)
    runtime = Runtime()
    load_math_func("fast_exp_ss", runtime)
    data = runtime.load("exp", machine_code)

    for _round in range(1000):
        value = random.random() * 4
        data["x"] = value
        runtime.run("exp")
        computed = data["x"]  # the routine's result is read back from x
        self.assertAlmostEqual(computed, math.exp(value), 2)
def test_atan(self):
    """Check the scalar ``fast_atan_ss`` routine against ``math.atan``.

    Uses 1000 random inputs in [0, 2000) and compares to three decimal
    places.
    """
    machine_code = Tdasm().assemble(ATAN_CODE)
    runtime = Runtime()
    load_math_func("fast_atan_ss", runtime)
    data = runtime.load("atan", machine_code)

    for _round in range(1000):
        value = random.random() * 2000
        data["x"] = value
        runtime.run("atan")
        computed = data["x"]  # the routine's result is read back from x
        self.assertAlmostEqual(computed, math.atan(value), 3)
def _create_assembler(self):
    """Return a new ``Tdasm`` with the call, arithmetic and spectrum macros.

    The ``MacroCall`` and ``MacroSpectrum`` helpers created here are kept
    on the instance as ``self._macro_call`` and ``self._macro_spectrum``.
    """
    assembler = Tdasm()

    self._macro_call = MacroCall()
    assembler.register_macro('call', self._macro_call.macro_call)

    plain_macros = (
        ('eq128', arithmetic128),
        ('eq32', arithmetic32),
        ('broadcast', broadcast),
        ('if', macro_if),
        ('dot', dot_product),
        ('normalization', normalization),
        ('cross', cross_product),
    )
    for macro_name, handler in plain_macros:
        assembler.register_macro(macro_name, handler)

    # The spectrum helper is built after the plain macros, as in the original.
    self._macro_spectrum = MacroSpectrum(self)
    assembler.register_macro('spectrum', self._macro_spectrum.macro_spectrum)
    return assembler
def set_pixel_asm(self, runtime, label):
    """Assemble a pixel-store routine for this image and load it into ``runtime``.

    The generated routine is exported under ``label``. Per its own header
    comment it takes x in eax, y in ebx and the value in xmm0. It computes
    ``y * pitch + x * 16`` and stores xmm0 at ``ptr_buffer`` plus that offset
    (16 bytes per pixel - presumably four 32-bit floats; confirm against
    the pixel buffer layout).

    Args:
        runtime: Runtime that receives the assembled routine.
        label: Global label under which the routine is exported.
    """
    # Pick the pointer-sized register name for the final store.
    bits = platform.architecture()[0]
    if bits == "64bit":
        ecx = "rcx"
    else:
        ecx = "ecx"
    # The aligned 16-byte store uses the AVX encoding when util.AVX is set.
    if util.AVX:
        line = "vmovaps oword [" + ecx + "], xmm0"
    else:
        line = "movaps oword [" + ecx + "], xmm0"

    # Pointer declaration, pointer load and final address add per word size.
    bits = platform.architecture()[0]
    if bits == "64bit":
        l1 = "uint64 ptr_buffer"
        l2 = "mov rcx, qword [ptr_buffer]"
        l3 = "add rcx, rax"
    else:
        l1 = "uint32 ptr_buffer"
        l2 = "mov ecx, dword [ptr_buffer]"
        l3 = "add ecx, eax"

    asm_code = """
    #DATA
    """
    asm_code += l1 + """
    uint32 pitch
    #CODE
    ; eax = x , ebx = y, value = xmm0
    """
    asm_code += "global " + label + ": \n"
    asm_code += """
    imul ebx, dword [pitch]
    imul eax , eax, 16
    """
    asm_code += l2 + """
    add eax, ebx
    """
    asm_code += l3 + "\n"
    asm_code += line + """
    ret
    """

    asm = Tdasm()
    # Second positional argument is True here; other callers in this file
    # pass an "is function"/``naked`` flag in that position - confirm.
    mc = asm.assemble(asm_code, True)
    # The load name is derived from hash(self).
    name = "ImageFloatRGBA" + str(hash(self))
    self.ds = runtime.load(name, mc)
    self.ds["ptr_buffer"] = self.pixels.ptr()
    self.ds["pitch"] = self.pitch
def test_pow(self):
    """Check the scalar ``fast_pow_ss`` routine against ``math.pow``.

    Uses 1000 random (base, exponent) pairs, each in [0, 3), and compares
    to one decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE)
    runtime = Runtime()
    load_math_func("fast_pow_ss", runtime)
    data = runtime.load("pow", machine_code)

    for _round in range(1000):
        base = random.random() * 3
        exponent = random.random() * 3
        data["x"] = base
        data["y"] = exponent
        runtime.run("pow")
        computed = data["x"]  # the routine's result is read back from x
        self.assertAlmostEqual(computed, math.pow(base, exponent), 1)
def create_assembler():
    """Return a new ``Tdasm`` with the full macro set registered."""
    macro_table = (
        ('mov', mov),
        ('lea', lea),
        ('eq128', arithmetic128),
        ('eq32', arithmetic32),
        ('broadcast', broadcast),
        ('if', macro_if),
        ('dot', dot_product),
        ('normalization', normalization),
        ('cross', cross_product),
        ('generate_one', generate_one),
        ('push', push),
        ('pop', pop),
        ('sqrtss', sqrtss),
    )
    assembler = Tdasm()
    for macro_name, handler in macro_table:
        assembler.register_macro(macro_name, handler)
    return assembler
def test_sincos(self):
    """Check the scalar ``fast_sincos_ss`` routine against ``math.sin``/``math.cos``.

    Uses 1000 random angles in [0, 2000); the sine is read back from ``x``
    and the cosine from ``y``, each compared to three decimal places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE)
    runtime = Runtime()
    load_math_func("fast_sincos_ss", runtime)
    data = runtime.load("sincos", machine_code)

    for _round in range(1000):
        angle = random.random() * 2000
        data["x"] = angle
        runtime.run("sincos")

        computed_sin = data["x"]
        computed_cos = data["y"]
        expected_sin = math.sin(angle)
        expected_cos = math.cos(angle)
        self.assertAlmostEqual(computed_sin, expected_sin, 3)
        self.assertAlmostEqual(computed_cos, expected_cos, 3)
def compile(self, shaders=None, color_mgr=None):
    """Parse the shader source, generate assembly and assemble it.

    Stores the generated assembly text in ``self._asm_code``, the return
    type in ``self._ret_type``, the third value returned by the code
    generator in ``self._ext_functions`` and the machine code in
    ``self._mc``.

    Args:
        shaders: Shaders handed to the code generator. ``None`` (the
            default) means an empty list. A fresh list is created per call
            so the default is never shared between calls, which the old
            ``shaders=[]`` default allowed.
        color_mgr: Passed through to the code generator unchanged.
    """
    if shaders is None:
        shaders = []

    statements = parse(self._code)
    cgen = CodeGenerator()
    asm_code, ret_type, ext_functions = cgen.generate_code(
        statements, args=self._args, is_func=self._is_func,
        name=self._name, func_args=self._func_args,
        shaders=shaders, color_mgr=color_mgr)
    self._asm_code = asm_code
    self._ret_type = ret_type
    self._ext_functions = ext_functions

    # Separate name for the assembler; the original reused ``asm`` for both
    # the generated text and the Tdasm instance.
    assembler = Tdasm()
    # 32-bit encoding is selected whenever the generator is not in 64-bit mode.
    self._mc = assembler.assemble(self._asm_code, naked=self._is_func,
                                  ia32=not cgen.BIT64)
def random_sampler():
    """Exercise a 1x1, one-sample-per-pixel ``RandomSampler`` through ``ASM_CODE``.

    The sampler's assembly is exported as 'get_sample', ``ASM_CODE`` is
    loaded as "test", and ``get_sample`` is then called against that
    program.
    """
    width = height = spp = 1

    runtime = Runtime()
    sampler = renmas2.samplers.RandomSampler(width, height, spp=spp, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, width, height)
    tile.split(1)
    sampler.set_tile(tile)

    runtime.load("test", Tdasm().assemble(ASM_CODE))

    total_samples = width * height * spp
    for _index in range(total_samples):
        get_sample(sampler, runtime, "test")
    # NOTE(review): the collapsed original does not show whether this call sat
    # inside the loop; with total_samples == 1 both layouts make two calls.
    get_sample(sampler, runtime, "test")
def get_asm():
    """Return the module-wide ``Tdasm`` instance, creating it on first use.

    On the first call the assembler is stored in the global ``assembler``
    and the renmas arithmetic, dot, if and broadcast macros are registered
    on it; later calls return the same object.
    """
    from renmas.macros import eq32, eq128, eq32_32, eq32_128, eq128_128, eq128_32
    from renmas.macros import dot_product, macro_if, broadcast

    global assembler
    if assembler is not None:
        return assembler

    assembler = Tdasm()
    macro_table = (
        ("eq128", eq128),
        ("eq32", eq32),
        ("eq128_32", eq128_32),
        ("eq32_128", eq32_128),
        ("eq128_128", eq128_128),
        ("eq32_32", eq32_32),
        ("dot", dot_product),
        ("if", macro_if),
        ("broadcast", broadcast),
    )
    for macro_name, handler in macro_table:
        assembler.register_macro(macro_name, handler)
    return assembler
def test_log_ps(self):
    """Check the packed ``fast_log_ps`` routine against ``math.log``.

    Runs 1000 rounds of four values from ``random.random()`` and compares
    every lane to three decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_log_ps", runtime)
    data = runtime.load("log_ps", machine_code)

    for _round in range(1000):
        values = tuple([random.random() for _lane in range(4)])
        data["v1"] = values
        runtime.run("log_ps")

        computed = data["v1"]  # the routine's result is read back from v1
        expected = [math.log(value) for value in values]
        for lane, reference in enumerate(expected):
            self.assertAlmostEqual(computed[lane], reference, 3)
code += lst_inst2[l] + "\n" for l in range(len(lst_inst2), len(lst_inst1)): code += lst_inst1[l] + "\n" return code def arth128_32(tokens): return arth_mix(tokens, 128, 32) def arth32_128(tokens): return arth_mix(tokens, 32, 128) def arth128_128(tokens): return arth_mix(tokens, 128, 128) def arth32_32(tokens): return arth_mix(tokens, 32, 32) if __name__ == "__main__": asm = Tdasm() asm.register_macro("arth128", arth128) asm.register_macro("arth32", arth32) mc = asm.assemble(ASM_CODE) run = Runtime() ds = run.load("test", mc) run.run("test") print(ds["rez"])
#END """ else: code = """ #DATA uint32 sa, da, n #CODE mov ecx, dword [n] mov esi, dword [sa] mov edi, dword [da] rep movs byte [edi], byte [esi] #END """ return code _mc = Tdasm().assemble(_memcpy_code()) _runtime = Runtime() _data_section = _runtime.load("memcpy", _mc) def memcpy(da, sa, n): """ Copy n bytes form source address(sa) to destination address(da). """ _data_section["da"] = da _data_section["sa"] = sa _data_section["n"] = n _runtime.run("memcpy")
def cos_ps():
    """Assemble the packed routine exported as ``fast_cos_ps``.

    Two instruction streams share one data section: an SSE version and a
    three-operand AVX version; ``proc.AVX`` selects which one is assembled.
    The routine takes its input in xmm0, leaves its result in xmm0 and
    overwrites xmm1-xmm7 along the way.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants used by both variants, each replicated across four lanes.
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # SSE variant.
    asm_code = data + """
    #CODE
    global fast_cos_ps:
    andps xmm0, oword [_ps_am_inv_sign_mask]
    addps xmm0, oword [_ps_am_pi_o_2]
    mulps xmm0, oword [_ps_am_2_o_pi]
    pxor xmm3, xmm3
    movdqa xmm5, oword [_epi32_1]
    movaps xmm4, oword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, oword [_epi32_2]
    pslld xmm2, 30
    subps xmm0, xmm6
    minps xmm0, xmm4
    subps xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    mulps xmm0, xmm0
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulps xmm0, oword [_ps_sincos_p3]
    addps xmm0, oword [_ps_sincos_p2]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p1]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p0]
    mulps xmm0, xmm1
    ret
    """
    # AVX variant: same instruction sequence in VEX three-operand form.
    avx_code = data + """
    #CODE
    global fast_cos_ps:
    vandps xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vaddps xmm0, xmm0, oword [_ps_am_pi_o_2]
    vmulps xmm0, xmm0, oword [_ps_am_2_o_pi]
    vpxor xmm3, xmm3, xmm3
    vmovdqa xmm5, oword [_epi32_1]
    vmovaps xmm4, oword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, oword [_epi32_2]
    vpslld xmm2, xmm2, 30
    vsubps xmm0, xmm0, xmm6
    vminps xmm0, xmm0, xmm4
    vsubps xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmulps xmm0, xmm0, xmm0
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulps xmm0, xmm0, oword [_ps_sincos_p3]
    vaddps xmm0, xmm0, oword [_ps_sincos_p2]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p1]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p0]
    vmulps xmm0, xmm0, xmm1
    ret
    """
    asm = Tdasm()
    # NOTE(review): the second positional argument True looks like the
    # "assemble as a callable function" flag - confirm against Tdasm.assemble.
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
def asin_ps():
    """Assemble the packed routine exported as ``fast_asin_ps``.

    Two instruction streams share one data section: an SSE version and a
    three-operand AVX version; ``proc.AVX`` selects which one is assembled.
    The routine first scales the input by ``rsqrt((1 + x) * (1 - x))`` and
    then runs the section the source marks ``;atan``. Input and result are
    both in xmm0; xmm1-xmm7 are overwritten.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants used by both variants, each replicated across four lanes.
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    """
    # SSE variant.
    asm_code = data + """
    #CODE
    global fast_asin_ps:
    movaps xmm1, oword [_ps_am_1]
    movaps xmm2, xmm1
    addps xmm1, xmm0
    subps xmm2, xmm0
    mulps xmm1, xmm2
    rsqrtps xmm1, xmm1
    mulps xmm0, xmm1
    ;atan
    movaps xmm5, oword [_ps_am_1]
    movaps xmm6, oword [_ps_am_m1]
    rcpps xmm4, xmm0
    cmpps xmm5, xmm0, 1
    cmpps xmm6, xmm0, 6
    movaps xmm1, oword [_ps_atan_s0]
    orps xmm5, xmm6
    andps xmm4, xmm5
    movaps xmm2, oword [_ps_atan_t0]
    movaps xmm7, xmm5
    andnps xmm5, xmm0
    movaps xmm3, oword [_ps_atan_s1]
    orps xmm4, xmm5
    movaps xmm0, xmm4
    movaps xmm6, oword [_ps_atan_t1]
    mulps xmm4, xmm4
    addps xmm1, xmm4
    movaps xmm5, oword [_ps_atan_s2]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    movaps xmm2, oword [_ps_atan_t2]
    addps xmm3, xmm4
    addps xmm1, xmm3
    movaps xmm3, oword [_ps_atan_s3]
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    movaps xmm6, oword [_ps_atan_t3]
    addps xmm5, xmm4
    addps xmm1, xmm5
    movaps xmm5, oword [_ps_am_sign_mask]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    addps xmm3, xmm4
    movaps xmm4, oword [_ps_am_pi_o_2]
    mulps xmm6, xmm0
    addps xmm1, xmm3
    andps xmm0, xmm5
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    orps xmm0, xmm4
    subps xmm0, xmm1
    andps xmm0, xmm7
    andnps xmm7, xmm1
    orps xmm0, xmm7
    ret
    """
    # AVX variant: same instruction sequence in VEX three-operand form.
    avx_code = data + """
    #CODE
    global fast_asin_ps:
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm2, xmm1
    vaddps xmm1, xmm1, xmm0
    vsubps xmm2, xmm2, xmm0
    vmulps xmm1, xmm1, xmm2
    vrsqrtps xmm1, xmm1
    vmulps xmm0, xmm0, xmm1
    ;atan
    vmovaps xmm5, oword [_ps_am_1]
    vmovaps xmm6, oword [_ps_am_m1]
    vrcpps xmm4, xmm0
    vcmpps xmm5, xmm5, xmm0, 1
    vcmpps xmm6, xmm6, xmm0, 6
    vmovaps xmm1, oword [_ps_atan_s0]
    vorps xmm5, xmm5, xmm6
    vandps xmm4, xmm4, xmm5
    vmovaps xmm2, oword [_ps_atan_t0]
    vmovaps xmm7, xmm5
    vandnps xmm5, xmm5, xmm0
    vmovaps xmm3, oword [_ps_atan_s1]
    vorps xmm4, xmm4, xmm5
    vmovaps xmm0, xmm4
    vmovaps xmm6, oword [_ps_atan_t1]
    vmulps xmm4, xmm4, xmm4
    vaddps xmm1, xmm1, xmm4
    vmovaps xmm5, oword [_ps_atan_s2]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vmovaps xmm2, oword [_ps_atan_t2]
    vaddps xmm3, xmm3, xmm4
    vaddps xmm1, xmm1, xmm3
    vmovaps xmm3, oword [_ps_atan_s3]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vmovaps xmm6, oword [_ps_atan_t3]
    vaddps xmm5, xmm5, xmm4
    vaddps xmm1, xmm1, xmm5
    vmovaps xmm5, oword [_ps_am_sign_mask]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vaddps xmm3, xmm3, xmm4
    vmovaps xmm4, oword [_ps_am_pi_o_2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm1, xmm1, xmm3
    vandps xmm0, xmm0, xmm5
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vorps xmm0, xmm0, xmm4
    vsubps xmm0, xmm0, xmm1
    vandps xmm0, xmm0, xmm7
    vandnps xmm7, xmm7, xmm1
    vorps xmm0, xmm0, xmm7
    ret
    """
    asm = Tdasm()
    # NOTE(review): the second positional argument True looks like the
    # "assemble as a callable function" flag - confirm against Tdasm.assemble.
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
def sin_ss():
    """Assemble the scalar routine exported as ``fast_sin_ss``.

    Two instruction streams share one data section: an SSE version and a
    three-operand AVX version; ``proc.AVX`` selects which one is assembled.
    The routine takes its input in xmm0, leaves its result in xmm0 and
    overwrites xmm1-xmm7. Constants are declared four-wide but loaded with
    dword-sized moves, so only the first element of each is read.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants used by both variants.
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # SSE variant.
    asm_code = data + """
    #CODE
    global fast_sin_ss:
    movaps xmm7, xmm0
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_sign_mask]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    andps xmm7, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, xmm1
    pslld xmm2, 30
    subss xmm0, xmm6
    movss xmm3, dword [_ps_sincos_p3]
    minss xmm0, xmm4
    subss xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    movss xmm4, dword [_ps_sincos_p2]
    mulss xmm0, xmm0
    xorps xmm2, xmm7
    movss xmm5, dword [_ps_sincos_p1]
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulss xmm0, xmm3
    movss xmm6, dword [_ps_sincos_p0]
    addss xmm0, xmm4
    mulss xmm0, xmm7
    addss xmm0, xmm5
    mulss xmm0, xmm7
    addss xmm0, xmm6
    mulss xmm0, xmm1
    ret
    """
    # AVX variant: same instruction sequence in VEX three-operand form.
    avx_code = data + """
    #CODE
    global fast_sin_ss:
    vmovaps xmm7, xmm0
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vandps xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, xmm1
    vpslld xmm2, xmm2, 30
    vsubss xmm0, xmm0, xmm6
    vmovss xmm3, dword [_ps_sincos_p3]
    vminss xmm0, xmm0, xmm4
    vsubss xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmovss xmm4, dword [_ps_sincos_p2]
    vmulss xmm0, xmm0, xmm0
    vxorps xmm2, xmm2, xmm7
    vmovss xmm5, dword [_ps_sincos_p1]
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulss xmm0, xmm0, xmm3
    vmovss xmm6, dword [_ps_sincos_p0]
    vaddss xmm0, xmm0, xmm4
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm5
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm1
    ret
    """
    asm = Tdasm()
    # NOTE(review): the second positional argument True looks like the
    # "assemble as a callable function" flag - confirm against Tdasm.assemble.
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc