def __init__(self):
    """Assemble the three blit listings and load them into one runtime.

    Each ``Runtime.load`` result is kept on the instance (``ds``, ``ds2``,
    ``ds3``) next to the runtime itself (``r``).
    """
    assembler = Tdasm()
    memcpy_code = assembler.assemble(MEMCPY)
    self.r = Runtime()
    self.ds = self.r.load("memcpy", memcpy_code)
    self.ds2 = self.r.load("bltrgba", assembler.assemble(BLTRGBA))
    self.ds3 = self.r.load("bltfloatrgba", assembler.assemble(BLTFLOATRGBA))
def prepare(self, runtimes):
    """Load this shader and everything it depends on into ``runtimes``.

    Colour functions, the optional ``loader`` hook, nested shaders and the
    registered assembly functions are loaded first.  The shader itself is
    then loaded into every runtime that does not already expose
    ``self._name`` as a global; the assembled code is shared through
    ``self._mc_cache`` so it is assembled at most once per name.
    ``self._ds`` is replaced only when at least one runtime was loaded.
    """
    self._load_color_funcs(runtimes)
    if self.loader:
        self.loader(runtimes)
    for shader in self._shaders:
        shader.prepare(runtimes)
    self._runtimes = runtimes

    assembler = Tdasm()
    module_name = 'shader' + str(id(self))
    for fun_name, fun_label, avx, bit in self._functions:
        load_asm_function(fun_name, fun_label, runtimes, avx, bit)

    loaded = []
    for runtime in runtimes:
        if runtime.global_exists(self._name):
            continue
        # Assemble lazily and remember the result under the shader name.
        if self._name not in self._mc_cache:
            self._mc_cache[self._name] = assembler.assemble(
                self._code, self._func)
        loaded.append(runtime.load(module_name, self._mc_cache[self._name]))
    if loaded:
        self._ds = loaded
def test_sincos_ps(self):
    """Check the four-lane ``fast_sincos_ps`` against ``math.sin``/``math.cos``.

    1000 random vectors with components in [0, 2000) are written to ``v1``;
    after the run ``v1`` is read back as the sines and ``v2`` as the
    cosines, each compared to 3 decimal places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_sincos_ps", runtime)
    data = runtime.load("sincos_ps", machine_code)

    for _ in range(1000):
        angles = tuple(random.random() * 2000 for _ in range(4))
        data["v1"] = angles
        runtime.run("sincos_ps")
        asm_sin = data["v1"]
        asm_cos = data["v2"]
        expected_sin = [math.sin(angle) for angle in angles]
        expected_cos = [math.cos(angle) for angle in angles]
        # All sine lanes are asserted before any cosine lane.
        for lane in range(4):
            self.assertAlmostEqual(asm_sin[lane], expected_sin[lane], 3)
        for lane in range(4):
            self.assertAlmostEqual(asm_cos[lane], expected_cos[lane], 3)
def test_pow_ps(self):
    """Check the four-lane ``fast_pow_ps`` against ``math.pow``.

    Bases go in ``v1`` and exponents in ``v2`` (all in [0, 3)); the result
    is read back from ``v1`` and compared to 1 decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_pow_ps", runtime)
    data = runtime.load("pow_ps", machine_code)

    for _ in range(1000):
        # Bases are drawn before exponents, matching the original draw order.
        bases = tuple(random.random() * 3 for _ in range(4))
        exponents = tuple(random.random() * 3 for _ in range(4))
        data["v1"] = bases
        data["v2"] = exponents
        runtime.run("pow_ps")
        result = data["v1"]
        expected = [math.pow(b, e) for b, e in zip(bases, exponents)]
        for lane in range(4):
            self.assertAlmostEqual(result[lane], expected[lane], 1)
def create_float_image(runtime):
    """Create a 150x150 float RGBA image and run the ``ASM`` listing on it.

    The image's pixel writer is exported into ``runtime`` under the label
    ``set_pixel`` before the ``write`` module is loaded and executed.
    Returns the image.
    """
    image = renmas.gui.ImageFloatRGBA(150, 150)
    image.set_pixel_asm(runtime, "set_pixel")
    machine_code = Tdasm().assemble(ASM)
    runtime.load("write", machine_code)
    runtime.run("write")
    return image
def compile(self, shaders=None):
    """Parse the shader source, generate assembly and assemble it.

    Args:
        shaders: Shaders made available to the code generator.  Defaults
            to a fresh empty list on every call (previously a single
            shared ``[]`` default object was reused across calls).

    Side effects:
        Sets ``self._asm_code`` (generated assembly text),
        ``self._ret_type`` and ``self._mc`` (result of ``Tdasm.assemble``).
    """
    if shaders is None:
        shaders = []
    statements = parse(self._code)
    generator = CodeGenerator()
    asm_source, ret_type = generator.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
    )
    self._asm_code = asm_source
    self._ret_type = ret_type
    # Separate name for the assembler; the original reused ``asm`` for both
    # the generated text and the Tdasm instance.
    assembler = Tdasm()
    self._mc = assembler.assemble(self._asm_code, self._is_func)
def regular_sampler():
    """Build a 2x2 regular sampler wired to a fresh runtime.

    The sampler exports its routine as ``get_sample``, is given a single
    2x2 tile (split with argument 1), and ``ASM_CODE`` is loaded as module
    ``test``.  Returns ``(sampler, runtime, 'test')``.
    """
    runtime = Runtime()
    sampler = renmas2.samplers.RegularSampler(2, 2, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, 2, 2)
    tile.split(1)
    sampler.set_tile(tile)

    runtime.load("test", Tdasm().assemble(ASM_CODE))
    return (sampler, runtime, 'test')
def __init__(self, width, height, pitch, address):
    """Load the ``set_pixel`` routine and point it at a pixel buffer.

    ``width``, ``height`` and ``address`` are stored on the instance;
    all four geometry values plus a default colour are written into the
    loaded module's data section (``self.ds``).
    """
    self.addr = address
    self.width = width
    self.height = height

    machine_code = Tdasm().assemble(ASM_STR)
    self.r = Runtime()
    self.ds = self.r.load("set_pixel", machine_code)

    # Default colour.  The original note called this red.
    # NOTE(review): confirm against the channel order ASM_STR expects.
    self.ds["color"] = 0xFF00FF00
    for key, value in (("address", address), ("width", width),
                       ("height", height), ("pitch", pitch)):
        self.ds[key] = value
def prepare(self, runtimes):
    """Prepare nested shaders, then load this shader into every runtime.

    ``self._ds`` is reset and refilled with one ``Runtime.load`` result
    per runtime, in order.
    """
    for shader in self._shaders:
        shader.prepare(runtimes)

    self._ds = []
    machine_code = Tdasm().assemble(self._code, self._func)
    # machine_code.print_machine_code()
    module_name = 'shader' + str(id(self))
    self._runtimes = runtimes

    for runtime in runtimes:
        # TODO check if the shader already exists in the runtime
        # TODO if the shader is a function, load it as a function
        data_section = runtime.load(module_name, machine_code)
        self._ds.append(data_section)
def compile(self, shaders=None):
    """Parse the shader source, generate assembly and assemble it.

    Args:
        shaders: Shaders made available to the code generator.  Defaults
            to a fresh empty list on every call (previously a single
            shared ``[]`` default object was reused across calls).

    Side effects:
        Sets ``self._asm_code`` (generated assembly text),
        ``self._ret_type`` and ``self._mc`` (result of ``Tdasm.assemble``).
    """
    if shaders is None:
        shaders = []
    statements = parse(self._code)
    generator = CodeGenerator()
    asm_source, ret_type = generator.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
    )
    self._asm_code = asm_source
    self._ret_type = ret_type
    # Separate name for the assembler; the original reused ``asm`` for both
    # the generated text and the Tdasm instance.
    assembler = Tdasm()
    self._mc = assembler.assemble(self._asm_code, self._is_func)
def test_atan(self):
    """Check scalar ``fast_atan_ss`` against ``math.atan``.

    1000 random inputs in [0, 2000) are written to ``x``; the result is
    read back from ``x`` and compared to 3 decimal places.
    """
    machine_code = Tdasm().assemble(ATAN_CODE)
    runtime = Runtime()
    load_math_func("fast_atan_ss", runtime)
    data = runtime.load("atan", machine_code)

    for _ in range(1000):
        value = random.random() * 2000
        data["x"] = value
        runtime.run("atan")
        self.assertAlmostEqual(data["x"], math.atan(value), 3)
def test_log(self):
    """Check scalar ``fast_log_ss`` against ``math.log``.

    1000 random inputs in [0, 1) are written to ``x``; the result is read
    back from ``x`` and compared to 3 decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE)
    runtime = Runtime()
    load_math_func("fast_log_ss", runtime)
    data = runtime.load("log", machine_code)

    for _ in range(1000):
        # NOTE(review): random.random() can return exactly 0.0, for which
        # math.log raises ValueError; vanishingly rare but possible.
        value = random.random()
        data["x"] = value
        runtime.run("log")
        measured = data["x"]
        self.assertAlmostEqual(measured, math.log(value), 3)
def test_exp(self):
    """Check scalar ``fast_exp_ss`` against ``math.exp``.

    1000 random inputs in [0, 4) are written to ``x``; the result is read
    back from ``x`` and compared to 2 decimal places.
    """
    machine_code = Tdasm().assemble(EXP_CODE)
    runtime = Runtime()
    load_math_func("fast_exp_ss", runtime)
    data = runtime.load("exp", machine_code)

    for _ in range(1000):
        value = random.random() * 4
        data["x"] = value
        runtime.run("exp")
        measured = data["x"]
        self.assertAlmostEqual(measured, math.exp(value), 2)
class Structures:
    """Hands out textual struct declarations for assembly data sections.

    Fixed declarations come from the module-level ``structures`` mapping;
    ``spectrum`` (and ``hitpoint``, which embeds it) is generated from the
    renderer's current spectral settings.
    """

    def __init__(self, renderer):
        self.tdasm = Tdasm()
        self.renderer = renderer
        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _spectrum_decl(self):
        """Return the ``spectrum`` struct text for the current renderer."""
        if self.renderer.spectral_rendering:
            count = str(self.renderer.nspectrum_samples)
            values = "float values[" + count + "] \n"
        else:
            values = "float values[4] \n"
        return self._line1 + values + self._line3

    def get_struct(self, name):
        """Return the declaration text for ``name``, or None if unknown."""
        if name in structures:
            return structures[name]
        if name == "spectrum":
            return self._spectrum_decl()
        if name == "hitpoint":
            # hitpoint needs the spectrum declaration in front of it.
            return self._spectrum_decl() + HITPOINT
        return None

    def get_compiled_struct(self, name):
        """Assemble the declaration for ``name`` and return its struct.

        Only names present in ``structures`` are handled; anything else
        (including the generated ``spectrum``/``hitpoint`` unless they are
        also keys of ``structures``) yields None.
        """
        if name not in structures:
            return None
        asm_code = """
            #DATA
        """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
        """
        machine_code = self.tdasm.assemble(asm_code)
        return machine_code.get_struct(name)

    def structs(self, names):
        """Concatenate the declarations for ``names`` in order.

        Raises:
            ValueError: if any name has no declaration.
        """
        code = ""
        for name in names:
            declaration = self.get_struct(name)
            if declaration is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            code = code + declaration
        return code
def compile(self, shaders=None, color_mgr=None):
    """Parse the shader source, generate assembly and assemble it.

    Args:
        shaders: Shaders made available to the code generator.  Defaults
            to a fresh empty list on every call (previously a single
            shared ``[]`` default object was reused across calls).
        color_mgr: Passed through unchanged to the code generator.

    Side effects:
        Sets ``self._asm_code``, ``self._ret_type``,
        ``self._ext_functions`` (third value returned by the generator)
        and ``self._mc`` (result of ``Tdasm.assemble``).
    """
    if shaders is None:
        shaders = []
    statements = parse(self._code)
    cgen = CodeGenerator()
    asm_source, ret_type, ext_functions = cgen.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
        color_mgr=color_mgr,
    )
    self._asm_code = asm_source
    self._ret_type = ret_type
    self._ext_functions = ext_functions
    # Target 32-bit encoding exactly when the generator is not in 64-bit mode.
    assembler = Tdasm()
    self._mc = assembler.assemble(
        self._asm_code, naked=self._is_func, ia32=not cgen.BIT64)
def set_pixel_asm(self, runtime, label):
    """Export a pixel-store routine for this image under ``label``.

    The generated routine takes x in eax, y in ebx and the value in xmm0
    (per the comment embedded in the listing) and stores 16 bytes at
    ``ptr_buffer + y * pitch + x * 16``.  Pointer width follows
    ``platform.architecture()`` and the store mnemonic follows
    ``util.AVX``.  The loaded data section is kept in ``self.ds`` and
    seeded with the buffer pointer and pitch.
    """
    if platform.architecture()[0] == "64bit":
        dest_reg = "rcx"
        ptr_decl = "uint64 ptr_buffer"
        load_ptr = "mov rcx, qword [ptr_buffer]"
        add_offset = "add rcx, rax"
    else:
        dest_reg = "ecx"
        ptr_decl = "uint32 ptr_buffer"
        load_ptr = "mov ecx, dword [ptr_buffer]"
        add_offset = "add ecx, eax"

    mnemonic = "vmovaps" if util.AVX else "movaps"
    store = mnemonic + " oword [" + dest_reg + "], xmm0"

    # Pieces are joined in exactly the order the listing is emitted.
    pieces = [
        """
        #DATA
        """,
        ptr_decl,
        """
        uint32 pitch
        #CODE
        ; eax = x , ebx = y, value = xmm0
        """,
        "global " + label + ": \n",
        """
        imul ebx, dword [pitch]
        imul eax , eax, 16
        """,
        load_ptr,
        """
        add eax, ebx
        """,
        add_offset + "\n",
        store,
        """
        ret
        """,
    ]
    asm_code = "".join(pieces)

    machine_code = Tdasm().assemble(asm_code, True)
    module_name = "ImageFloatRGBA" + str(hash(self))
    self.ds = runtime.load(module_name, machine_code)
    self.ds["ptr_buffer"] = self.pixels.ptr()
    self.ds["pitch"] = self.pitch
def test_pow(self):
    """Check scalar ``fast_pow_ss`` against ``math.pow``.

    Base goes in ``x`` and exponent in ``y`` (both in [0, 3)); the result
    is read back from ``x`` and compared to 1 decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE)
    runtime = Runtime()
    load_math_func("fast_pow_ss", runtime)
    data = runtime.load("pow", machine_code)

    for _ in range(1000):
        base = random.random() * 3
        exponent = random.random() * 3
        data["x"] = base
        data["y"] = exponent
        runtime.run("pow")
        measured = data["x"]
        self.assertAlmostEqual(measured, math.pow(base, exponent), 1)
class Structures:
    """Hands out textual struct declarations for assembly data sections.

    Fixed declarations come from the module-level ``structures`` mapping;
    ``spectrum`` (and ``hitpoint``, which embeds it) is generated from the
    renderer's current spectral settings.
    """

    def __init__(self, renderer):
        self.tdasm = Tdasm()
        self.renderer = renderer
        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _spectrum_decl(self):
        """Return the ``spectrum`` struct text for the current renderer."""
        if self.renderer.spectral_rendering:
            count = str(self.renderer.nspectrum_samples)
            values = "float values[" + count + "] \n"
        else:
            values = "float values[4] \n"
        return self._line1 + values + self._line3

    def get_struct(self, name):
        """Return the declaration text for ``name``, or None if unknown."""
        if name in structures:
            return structures[name]
        if name == "spectrum":
            return self._spectrum_decl()
        if name == "hitpoint":
            # hitpoint needs the spectrum declaration in front of it.
            return self._spectrum_decl() + HITPOINT
        return None

    def get_compiled_struct(self, name):
        """Assemble the declaration for ``name`` and return its struct.

        Only names present in ``structures`` are handled; anything else
        (including the generated ``spectrum``/``hitpoint`` unless they are
        also keys of ``structures``) yields None.
        """
        if name not in structures:
            return None
        asm_code = """
            #DATA
        """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
        """
        machine_code = self.tdasm.assemble(asm_code)
        return machine_code.get_struct(name)

    def structs(self, names):
        """Concatenate the declarations for ``names`` in order.

        Raises:
            ValueError: if any name has no declaration.
        """
        code = ""
        for name in names:
            declaration = self.get_struct(name)
            if declaration is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            code = code + declaration
        return code
def test_sincos(self):
    """Check scalar ``fast_sincos_ss`` against ``math.sin``/``math.cos``.

    1000 random inputs in [0, 2000) are written to ``x``; the sine is read
    back from ``x`` and the cosine from ``y``, each compared to 3 decimal
    places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE)
    runtime = Runtime()
    load_math_func("fast_sincos_ss", runtime)
    data = runtime.load("sincos", machine_code)

    for _ in range(1000):
        angle = random.random() * 2000
        data["x"] = angle
        runtime.run("sincos")
        asm_sin = data["x"]
        asm_cos = data["y"]
        self.assertAlmostEqual(asm_sin, math.sin(angle), 3)
        self.assertAlmostEqual(asm_cos, math.cos(angle), 3)
def random_sampler():
    """Exercise a 1x1, one-sample-per-pixel random sampler.

    The sampler exports ``get_sample`` into a fresh runtime, receives a
    single tile, and ``ASM_CODE`` is loaded as module ``test``.
    ``get_sample`` is then called once per expected sample plus one more
    time.
    """
    runtime = Runtime()
    width = height = spp = 1

    sampler = renmas2.samplers.RandomSampler(
        width, height, spp=spp, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, width, height)
    tile.split(1)
    sampler.set_tile(tile)

    runtime.load("test", Tdasm().assemble(ASM_CODE))

    total_samples = width * height * spp
    for _ in range(total_samples):
        get_sample(sampler, runtime, "test")
    # One extra request; presumably probes the sampler once the tile is
    # exhausted -- TODO confirm intent.
    get_sample(sampler, runtime, "test")
def test_log_ps(self):
    """Check the four-lane ``fast_log_ps`` against ``math.log``.

    1000 random vectors with components in [0, 1) are written to ``v1``;
    the result is read back from ``v1`` and compared to 3 decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_log_ps", runtime)
    data = runtime.load("log_ps", machine_code)

    for _ in range(1000):
        values = tuple(random.random() for _ in range(4))
        data["v1"] = values
        runtime.run("log_ps")
        result = data["v1"]
        expected = [math.log(value) for value in values]
        for lane in range(4):
            self.assertAlmostEqual(result[lane], expected[lane], 3)
def cos_ss():
    """Assemble the scalar ``fast_cos_ss`` routine.

    Two listings share one constant table: a legacy-SSE one and a
    VEX-encoded one, chosen by ``proc.AVX``.  Both read their operand from
    xmm0 and leave the result in xmm0.  Returns the ``Tdasm.assemble``
    result (second argument ``True``, as in the original).
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    sse_source = constants + """
    #CODE
    global fast_cos_ss:
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_pi_o_2]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    addss xmm0, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, xmm1
    pslld xmm2, 30
    subss xmm0, xmm6
    movss xmm3, dword [_ps_sincos_p3]
    minss xmm0, xmm4
    subss xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    movss xmm4, dword [_ps_sincos_p2]
    mulss xmm0, xmm0
    movss xmm5, dword [_ps_sincos_p1]
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulss xmm0, xmm3
    movss xmm6, dword [_ps_sincos_p0]
    addss xmm0, xmm4
    mulss xmm0, xmm7
    addss xmm0, xmm5
    mulss xmm0, xmm7
    addss xmm0, xmm6
    mulss xmm0, xmm1
    ret
    """
    avx_source = constants + """
    #CODE
    global fast_cos_ss:
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_pi_o_2]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vaddss xmm0, xmm0, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, xmm1
    vpslld xmm2, xmm2, 30
    vsubss xmm0, xmm0, xmm6
    vmovss xmm3, dword [_ps_sincos_p3]
    vminss xmm0, xmm0, xmm4
    vsubss xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmovss xmm4, dword [_ps_sincos_p2]
    vmulss xmm0, xmm0, xmm0
    vmovss xmm5, dword [_ps_sincos_p1]
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulss xmm0, xmm0, xmm3
    vmovss xmm6, dword [_ps_sincos_p0]
    vaddss xmm0, xmm0, xmm4
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm5
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm1
    ret
    """
    assembler = Tdasm()
    chosen = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen, True)
add dword [y], 1 jmp _bltrgba _endblt: ret """ return code def _blt_floatrgba_code(bgra=True): bits = platform.architecture()[0] if bits == '64bit': return _blt_floatrgba_code64(bgra) else: return _blt_floatrgba_code32(bgra) _asm = Tdasm() _mc = _asm.assemble(_blt_floatrgba_code()) _runtime = Runtime() _data_section = _runtime.load("blt_prgba_to_bgra", _mc) _mc2 = _asm.assemble(_blt_floatrgba_code(bgra=False)) _data_section2 = _runtime.load("blt_prgba_to_rgba", _mc2) # blt float rgba to byte bgra def blt_prgba_to_bgra(src, dest): assert isinstance(src, ImagePRGBA) assert isinstance(dest, ImageBGRA) sa, spitch = src.address_info() da, dpitch = dest.address_info() dx = dy = sx = sy = 0
def acos_ss():
    """Assemble the scalar ``fast_acos_ss`` routine.

    The listing evaluates acos(x) as 2 * atan(sqrt((1 - x) / (1 + x))):
    the prologue forms the square root with ``rcpss``/``rsqrtss`` and the
    rest is an atan approximation with a "big" (|arg| > 1) and a "small"
    branch.  The operand is read from xmm0 and the result left in xmm0.

    Fix: the original doubled the atan result only on the ``l_small``
    path, so inputs that take the big path (x < 0) came back as half the
    arc cosine.  Both the SSE and the AVX listing now double on both paths.

    Returns the ``Tdasm.assemble`` result for the listing selected by
    ``proc.AVX`` (second argument ``True``, as in the original).
    """
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    """
    asm_code = data + """
    #CODE
    global fast_acos_ss:
    movss xmm1, dword [_ps_am_1]
    movss xmm2, xmm1
    subss xmm1, xmm0
    addss xmm2, xmm0
    rcpss xmm1, xmm1
    mulss xmm2, xmm1
    rsqrtss xmm0, xmm2
    ;atan
    movss xmm1, dword [_ps_am_sign_mask]
    rcpss xmm4, xmm0
    orps xmm1, xmm0
    movss xmm6, xmm4
    comiss xmm1, dword [_ps_am_m1]
    movss xmm3, dword [_ps_atan_t0]
    jnc l_small ; 'c' is 'lt' for comiss
    ;l_big:
    mulss xmm6, xmm6
    movss xmm5, dword [_ps_atan_s0]
    addss xmm5, xmm6
    movss xmm7, dword [_ps_atan_s1]
    rcpss xmm5, xmm5
    mulss xmm5, xmm3
    movss xmm3, dword [_ps_atan_t1]
    addss xmm7, xmm6
    addss xmm5, xmm7
    movss xmm7, dword [_ps_atan_s2]
    rcpss xmm5, xmm5
    mulss xmm5, xmm3
    movss xmm3, dword [_ps_atan_t2]
    addss xmm7, xmm6
    addss xmm5, xmm7
    movss xmm7, dword [_ps_atan_s3]
    rcpss xmm5, xmm5
    mulss xmm5, xmm3
    movss xmm3, dword [_ps_atan_t3]
    addss xmm7, xmm6
    movss xmm2, dword [_ps_am_sign_mask]
    mulss xmm4, xmm3
    addss xmm5, xmm7
    movss xmm7, dword [_ps_am_pi_o_2]
    rcpss xmm5, xmm5
    mulss xmm5, xmm4
    andps xmm0, xmm2
    orps xmm0, xmm7
    subss xmm0, xmm5
    addss xmm0, xmm0 ;acos = 2*atan, this line is not part of atan
    ret
    l_small:
    movaps xmm2, xmm0
    mulss xmm2, xmm2
    movss xmm1, dword [_ps_atan_s0]
    addss xmm1, xmm2
    movss xmm7, dword [_ps_atan_s1]
    rcpss xmm1, xmm1
    mulss xmm1, xmm3
    movss xmm3, dword [_ps_atan_t1]
    addss xmm7, xmm2
    addss xmm1, xmm7
    movss xmm7, dword [_ps_atan_s2]
    rcpss xmm1, xmm1
    mulss xmm1, xmm3
    movss xmm3, dword [_ps_atan_t2]
    addss xmm7, xmm2
    addss xmm1, xmm7
    movss xmm7, dword [_ps_atan_s3]
    rcpss xmm1, xmm1
    mulss xmm1, xmm3
    movss xmm3, dword [_ps_atan_t3]
    addss xmm7, xmm2
    mulss xmm0, xmm3
    addss xmm1, xmm7
    rcpss xmm1, xmm1
    mulss xmm0, xmm1
    addss xmm0, xmm0 ;this line is not part of atan
    ret
    """
    avx_code = data + """
    #CODE
    global fast_acos_ss:
    vmovss xmm1, dword [_ps_am_1]
    vmovss xmm2, xmm2, xmm1
    vsubss xmm1, xmm1, xmm0
    vaddss xmm2, xmm2, xmm0
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm2, xmm2, xmm1
    vrsqrtss xmm0, xmm0, xmm2
    ;atan
    vmovss xmm1, dword [_ps_am_sign_mask]
    vrcpss xmm4, xmm4, xmm0
    vorps xmm1, xmm1, xmm0
    vmovss xmm6, xmm6, xmm4
    vcomiss xmm1, dword [_ps_am_m1]
    vmovss xmm3, dword [_ps_atan_t0]
    jnc l_small ; 'c' is 'lt' for comiss
    ;l_big:
    vmulss xmm6, xmm6, xmm6
    vmovss xmm5, dword [_ps_atan_s0]
    vaddss xmm5, xmm5, xmm6
    vmovss xmm7, dword [_ps_atan_s1]
    vrcpss xmm5, xmm5, xmm5
    vmulss xmm5, xmm5, xmm3
    vmovss xmm3, dword [_ps_atan_t1]
    vaddss xmm7, xmm7, xmm6
    vaddss xmm5, xmm5, xmm7
    vmovss xmm7, dword [_ps_atan_s2]
    vrcpss xmm5, xmm5, xmm5
    vmulss xmm5, xmm5, xmm3
    vmovss xmm3, dword [_ps_atan_t2]
    vaddss xmm7, xmm7, xmm6
    vaddss xmm5, xmm5, xmm7
    vmovss xmm7, dword [_ps_atan_s3]
    vrcpss xmm5, xmm5, xmm5
    vmulss xmm5, xmm5, xmm3
    vmovss xmm3, dword [_ps_atan_t3]
    vaddss xmm7, xmm7, xmm6
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmulss xmm4, xmm4, xmm3
    vaddss xmm5, xmm5, xmm7
    vmovss xmm7, dword [_ps_am_pi_o_2]
    vrcpss xmm5, xmm5, xmm5
    vmulss xmm5, xmm5, xmm4
    vandps xmm0, xmm0, xmm2
    vorps xmm0, xmm0, xmm7
    vsubss xmm0, xmm0, xmm5
    vaddss xmm0, xmm0, xmm0 ;acos = 2*atan, this line is not part of atan
    ret
    l_small:
    vmovaps xmm2, xmm0
    vmulss xmm2, xmm2, xmm2
    vmovss xmm1, dword [_ps_atan_s0]
    vaddss xmm1, xmm1, xmm2
    vmovss xmm7, dword [_ps_atan_s1]
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm1, xmm1, xmm3
    vmovss xmm3, dword [_ps_atan_t1]
    vaddss xmm7, xmm7, xmm2
    vaddss xmm1, xmm1, xmm7
    vmovss xmm7, dword [_ps_atan_s2]
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm1, xmm1, xmm3
    vmovss xmm3, dword [_ps_atan_t2]
    vaddss xmm7, xmm7, xmm2
    vaddss xmm1, xmm1, xmm7
    vmovss xmm7, dword [_ps_atan_s3]
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm1, xmm1, xmm3
    vmovss xmm3, dword [_ps_atan_t3]
    vaddss xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vaddss xmm1, xmm1, xmm7
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm0, xmm0, xmm1
    vaddss xmm0, xmm0, xmm0 ;this line is not part of atan
    ret
    """
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
def sincos_ss():
    """Assemble the scalar ``fast_sincos_ss`` routine.

    Two listings share one constant table: a legacy-SSE one and a
    VEX-encoded one, chosen by ``proc.AVX``.  Both read their operand from
    xmm0 and finish by writing xmm0 and xmm6 (the commented-out stores at
    the end of each listing name those two registers as the outputs).
    Returns the ``Tdasm.assemble`` result (second argument ``True``, as in
    the original).
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    sse_source = constants + """
    #CODE
    global fast_sincos_ss:
    movaps xmm7, xmm0
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_sign_mask]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    andps xmm7, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    movd xmm3, dword [_epi32_1]
    cvtdq2ps xmm6, xmm2
    paddd xmm3, xmm2
    pand xmm2, xmm1
    pand xmm3, xmm1
    subss xmm0, xmm6
    pslld xmm2, 30
    minss xmm0, xmm4
    ;mov eax, [esp + 4 + 16]
    ;mov edx, [esp + 4 + 16 + 4]
    subss xmm4, xmm0
    pslld xmm3, 30
    movaps xmm6, xmm4
    xorps xmm2, xmm7
    movaps xmm7, xmm5
    andps xmm6, xmm7
    andnps xmm7, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    movss xmm4, dword [_ps_sincos_p3]
    orps xmm6, xmm7
    orps xmm0, xmm5
    movss xmm5, dword [_ps_sincos_p2]
    movaps xmm1, xmm0
    movaps xmm7, xmm6
    mulss xmm0, xmm0
    mulss xmm6, xmm6
    orps xmm1, xmm2
    orps xmm7, xmm3
    movaps xmm2, xmm0
    movaps xmm3, xmm6
    mulss xmm0, xmm4
    mulss xmm6, xmm4
    movss xmm4, dword [_ps_sincos_p1]
    addss xmm0, xmm5
    addss xmm6, xmm5
    movss xmm5, dword [_ps_sincos_p0]
    mulss xmm0, xmm2
    mulss xmm6, xmm3
    addss xmm0, xmm4
    addss xmm6, xmm4
    mulss xmm0, xmm2
    mulss xmm6, xmm3
    addss xmm0, xmm5
    addss xmm6, xmm5
    mulss xmm0, xmm1
    mulss xmm6, xmm7
    ;use full stores since caller might reload with full loads
    ;movaps [eax], xmm0
    ;movaps [edx], xmm6
    ret
    """
    avx_source = constants + """
    #CODE
    global fast_sincos_ss:
    vmovaps xmm7, xmm0
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vandps xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vmovd xmm3, dword [_epi32_1]
    vcvtdq2ps xmm6, xmm2
    vpaddd xmm3, xmm3, xmm2
    vpand xmm2, xmm2, xmm1
    vpand xmm3, xmm3, xmm1
    vsubss xmm0, xmm0, xmm6
    vpslld xmm2, xmm2, 30
    vminss xmm0, xmm0, xmm4
    ;mov eax, [esp + 4 + 16]
    ;mov edx, [esp + 4 + 16 + 4]
    vsubss xmm4, xmm4, xmm0
    vpslld xmm3, xmm3, 30
    vmovaps xmm6, xmm4
    vxorps xmm2, xmm2, xmm7
    vmovaps xmm7, xmm5
    vandps xmm6, xmm6, xmm7
    vandnps xmm7, xmm7, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vmovss xmm4, dword [_ps_sincos_p3]
    vorps xmm6, xmm6, xmm7
    vorps xmm0, xmm0, xmm5
    vmovss xmm5, dword [_ps_sincos_p2]
    vmovaps xmm1, xmm0
    vmovaps xmm7, xmm6
    vmulss xmm0, xmm0, xmm0
    vmulss xmm6, xmm6, xmm6
    vorps xmm1, xmm1, xmm2
    vorps xmm7, xmm7, xmm3
    vmovaps xmm2, xmm0
    vmovaps xmm3, xmm6
    vmulss xmm0, xmm0, xmm4
    vmulss xmm6, xmm6, xmm4
    vmovss xmm4, dword [_ps_sincos_p1]
    vaddss xmm0, xmm0, xmm5
    vaddss xmm6, xmm6, xmm5
    vmovss xmm5, dword [_ps_sincos_p0]
    vmulss xmm0, xmm0, xmm2
    vmulss xmm6, xmm6, xmm3
    vaddss xmm0, xmm0, xmm4
    vaddss xmm6, xmm6, xmm4
    vmulss xmm0, xmm0, xmm2
    vmulss xmm6, xmm6, xmm3
    vaddss xmm0, xmm0, xmm5
    vaddss xmm6, xmm6, xmm5
    vmulss xmm0, xmm0, xmm1
    vmulss xmm6, xmm6, xmm7
    ;use full stores since caller might reload with full loads
    ;movaps [eax], xmm0
    ;movaps [edx], xmm6
    ret
    """
    assembler = Tdasm()
    chosen = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen, True)
def cos_ps():
    """Assemble the packed ``fast_cos_ps`` routine.

    Two listings share one constant table: a legacy-SSE one and a
    VEX-encoded one, chosen by ``proc.AVX``.  Both operate on xmm0 in
    place with full-width (``oword``) memory operands.  Returns the
    ``Tdasm.assemble`` result (second argument ``True``, as in the
    original).
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    sse_source = constants + """
    #CODE
    global fast_cos_ps:
    andps xmm0, oword [_ps_am_inv_sign_mask]
    addps xmm0, oword [_ps_am_pi_o_2]
    mulps xmm0, oword [_ps_am_2_o_pi]
    pxor xmm3, xmm3
    movdqa xmm5, oword [_epi32_1]
    movaps xmm4, oword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, oword [_epi32_2]
    pslld xmm2, 30
    subps xmm0, xmm6
    minps xmm0, xmm4
    subps xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    mulps xmm0, xmm0
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulps xmm0, oword [_ps_sincos_p3]
    addps xmm0, oword [_ps_sincos_p2]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p1]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p0]
    mulps xmm0, xmm1
    ret
    """
    avx_source = constants + """
    #CODE
    global fast_cos_ps:
    vandps xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vaddps xmm0, xmm0, oword [_ps_am_pi_o_2]
    vmulps xmm0, xmm0, oword [_ps_am_2_o_pi]
    vpxor xmm3, xmm3, xmm3
    vmovdqa xmm5, oword [_epi32_1]
    vmovaps xmm4, oword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, oword [_epi32_2]
    vpslld xmm2, xmm2, 30
    vsubps xmm0, xmm0, xmm6
    vminps xmm0, xmm0, xmm4
    vsubps xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmulps xmm0, xmm0, xmm0
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulps xmm0, xmm0, oword [_ps_sincos_p3]
    vaddps xmm0, xmm0, oword [_ps_sincos_p2]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p1]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p0]
    vmulps xmm0, xmm0, xmm1
    ret
    """
    assembler = Tdasm()
    chosen = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen, True)
def random_float(cls, runtime, label):
    """Load a four-lane random-float routine into ``runtime`` as ``label``.

    The listing (SSE or VEX form, chosen by ``util.AVX``) is appended to
    the class-level ``cls.data`` section, assembled and loaded under a
    unique module name.  ``cur_seed`` in the loaded data section is then
    initialised with four random integers in [0, 4000000000].  Nothing is
    returned.
    """
    sse_source = cls.data + """
    #CODE
    """ + " global " + label + ": " + """
    movdqa xmm0, oword [gadd]
    movdqa xmm1, oword [mult]
    movdqa xmm2, oword [mask]
    pshufd xmm4, oword [cur_seed], 10110001b
    movdqa xmm5, oword [cur_seed]
    pmuludq xmm5, xmm1
    pshufd xmm1, xmm1, 10110001b
    pmuludq xmm4, xmm1
    pand xmm5, xmm2
    pand xmm4, xmm2
    pshufd xmm4, xmm4, 10110001b
    por xmm5, xmm4
    paddd xmm5, xmm0
    movdqa oword [cur_seed], xmm5
    ;convert to float
    pand xmm5, oword [_random_sign_mask]
    cvtdq2ps xmm0, xmm5
    mulps xmm0, oword [_random_flt]
    ret
    """
    avx_source = cls.data + """
    #CODE
    """ + " global " + label + ": " + """
    vmovdqa xmm0, oword [gadd]
    vmovdqa xmm1, oword [mult]
    vmovdqa xmm2, oword [mask]
    vpshufd xmm4, oword [cur_seed], 10110001b
    vmovdqa xmm5, oword [cur_seed]
    vpmuludq xmm5, xmm5, xmm1
    vpshufd xmm1, xmm1, 10110001b
    vpmuludq xmm4, xmm4, xmm1
    vpand xmm5, xmm5, xmm2
    vpand xmm4, xmm4, xmm2
    vpshufd xmm4, xmm4, 10110001b
    vpor xmm5, xmm5, xmm4
    vpaddd xmm5, xmm5, xmm0
    vmovdqa oword [cur_seed], xmm5
    ;convert to float
    vpand xmm5, xmm5, oword [_random_sign_mask]
    vcvtdq2ps xmm0, xmm5
    vmulps xmm0, xmm0, oword [_random_flt]
    ret
    """
    assembler = Tdasm()
    chosen = avx_source if util.AVX else sse_source
    machine_code = assembler.assemble(chosen, True)

    module_name = "randomfloat" + str(util.unique())
    data_section = runtime.load(module_name, machine_code)
    # One independent seed per lane, drawn in lane order.
    data_section['cur_seed'] = tuple(
        random.randint(0, 4000000000) for _ in range(4))
def log_ps():
    """Assemble the packed ``fast_log_ps`` routine.

    Two listings share one constant table: a legacy-SSE one and a
    VEX-encoded one, chosen by ``proc.AVX``.  Both operate on xmm0 in
    place; the first instruction clamps the input to the smallest
    normalised positive float (per the comment in the listing).  Returns
    the ``Tdasm.assemble`` result (second argument ``True``, as in the
    original).
    """
    constants = """
    #DATA
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F
    float _ps_log_p0[4] = -0.789580278884, -0.789580278884, -0.789580278884, -0.789580278884
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.38666456995, 16.38666456995, 16.38666456995, 16.38666456995
    float _ps_log_q1[4] = 312.0937663722, 312.0937663722, 312.0937663722, 312.0937663722
    float _ps_log_p2[4] = -64.14099529587, -64.14099529587, -64.14099529587, -64.14099529587
    float _ps_log_q2[4] = -769.69194355046, -769.69194355046, -769.69194355046, -769.69194355046
    float _ps_log_c0[4] = 0.6931471805599, 0.6931471805599, 0.6931471805599, 0.6931471805599
    """
    sse_source = constants + """
    #CODE
    global fast_log_ps:
    maxps xmm0, oword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    movaps xmm1, oword [_ps_am_1]
    movaps xmm3, xmm0
    andps xmm0, oword [_ps_am_inv_mant_mask]
    orps xmm0, xmm1
    movaps xmm4, xmm0
    subps xmm0, xmm1
    addps xmm4, xmm1
    psrld xmm3, 23
    rcpps xmm4, xmm4
    mulps xmm0, xmm4
    psubd xmm3, oword [_epi32_0x7f]
    addps xmm0, xmm0
    movaps xmm2, xmm0
    mulps xmm0, xmm0
    movaps xmm4, oword [_ps_log_p0]
    movaps xmm6, oword [_ps_log_q0]
    mulps xmm4, xmm0
    movaps xmm5, oword [_ps_log_p1]
    mulps xmm6, xmm0
    movaps xmm7, oword [_ps_log_q1]
    addps xmm4, xmm5
    addps xmm6, xmm7
    movaps xmm5, oword [_ps_log_p2]
    mulps xmm4, xmm0
    movaps xmm7, oword [_ps_log_q2]
    mulps xmm6, xmm0
    addps xmm4, xmm5
    movaps xmm5, oword [_ps_log_c0]
    addps xmm6, xmm7
    cvtdq2ps xmm1, xmm3
    mulps xmm0, xmm4
    rcpps xmm6, xmm6
    mulps xmm0, xmm6
    mulps xmm0, xmm2
    mulps xmm1, xmm5
    addps xmm0, xmm2
    addps xmm0, xmm1
    ret
    """
    avx_source = constants + """
    #CODE
    global fast_log_ps:
    vmaxps xmm0, xmm0, oword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm3, xmm0
    vandps xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps xmm0, xmm0, xmm1
    vmovaps xmm4, xmm0
    vsubps xmm0, xmm0, xmm1
    vaddps xmm4, xmm4, xmm1
    vpsrld xmm3, xmm3, 23
    vrcpps xmm4, xmm4
    vmulps xmm0, xmm0, xmm4
    vpsubd xmm3, xmm3, oword [_epi32_0x7f]
    vaddps xmm0, xmm0, xmm0
    vmovaps xmm2, xmm0
    vmulps xmm0, xmm0, xmm0
    vmovaps xmm4, oword [_ps_log_p0]
    vmovaps xmm6, oword [_ps_log_q0]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm5, oword [_ps_log_p1]
    vmulps xmm6, xmm6, xmm0
    vmovaps xmm7, oword [_ps_log_q1]
    vaddps xmm4, xmm4, xmm5
    vaddps xmm6, xmm6, xmm7
    vmovaps xmm5, oword [_ps_log_p2]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm7, oword [_ps_log_q2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm4, xmm4, xmm5
    vmovaps xmm5, oword [_ps_log_c0]
    vaddps xmm6, xmm6, xmm7
    vcvtdq2ps xmm1, xmm3
    vmulps xmm0, xmm0, xmm4
    vrcpps xmm6, xmm6
    vmulps xmm0, xmm0, xmm6
    vmulps xmm0, xmm0, xmm2
    vmulps xmm1, xmm1, xmm5
    vaddps xmm0, xmm0, xmm2
    vaddps xmm0, xmm0, xmm1
    ret
    """
    assembler = Tdasm()
    chosen = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen, True)
def tan_ps():
    """Assemble the packed-single ``fast_tan_ps`` routine.

    A shared ``#DATA`` section is combined with either a legacy SSE listing
    or a VEX-encoded (AVX) listing; the AVX listing is assembled when
    ``proc.AVX`` is truthy, the SSE one otherwise.

    Register usage visible in the listing: the four input floats are read
    from xmm0 and the result is left in xmm0; xmm1-xmm7 and eax are
    overwritten.  ``movmskps``/``cmp eax, 0xf`` selects between the normal
    exit and the ``l_pole`` path, which blends ``_ps_tan_poleval`` into the
    affected lanes.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_4_o_pi[4] = 1.273239544735, 1.273239544735, 1.273239544735, 1.273239544735
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_pi_o_4[4] = 0.78539816339, 0.78539816339, 0.78539816339, 0.78539816339
    int32 _epi32_1[4] = 1, 1, 1, 1
    int32 _epi32_7[4] = 7, 7, 7, 7
    int32 _epi32_2[4] = 2, 2, 2, 2
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_tan_p0[4] = -17956525.197648, -17956525.197648, -17956525.197648, -17956525.197648
    float _ps_tan_q0[4] = -53869575.592945, -53869575.592945, -53869575.592945, -53869575.592945
    float _ps_tan_p1[4] = 1153516.64838587, 1153516.64838587, 1153516.64838587, 1153516.64838587
    float _ps_tan_q1[4] = 25008380.18233579, 25008380.18233579, 25008380.18233579, 25008380.18233579
    float _ps_tan_p2[4] = -13093.693918138, -13093.693918138, -13093.693918138, -13093.693918138
    float _ps_tan_q2[4] = -1320892.3444021, -1320892.3444021, -1320892.3444021, -1320892.3444021
    float _ps_tan_q3[4] = 13681.296347069, 13681.296347069, 13681.296347069, 13681.296347069
    float _ps_tan_poleval[4] = 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global fast_tan_ps:
    movaps xmm7, xmm0
    andps xmm0, oword [_ps_am_inv_sign_mask]
    andps xmm7, oword [_ps_am_sign_mask]
    movaps xmm1, xmm0
    mulps xmm0, oword [_ps_am_4_o_pi]
    cvttps2dq xmm0, xmm0
    movdqa xmm4, oword [_epi32_1]
    movdqa xmm5, oword [_epi32_7]
    pand xmm4, xmm0
    pand xmm5, xmm0
    movaps xmm3, oword [_ps_am_1]
    paddd xmm0, xmm4
    paddd xmm5, xmm4
    cvtdq2ps xmm0, xmm0
    mulps xmm0, oword [_ps_am_pi_o_4]
    xorps xmm6, xmm6
    subps xmm1, xmm0
    movaps xmm2, oword [_ps_tan_p2]
    minps xmm1, xmm3
    movaps xmm3, oword [_ps_tan_q3]
    movaps xmm0, xmm1
    mulps xmm1, xmm1
    mulps xmm2, xmm1
    addps xmm3, xmm1
    addps xmm2, oword [_ps_tan_p1]
    mulps xmm3, xmm1
    mulps xmm2, xmm1
    addps xmm3, oword [_ps_tan_q2]
    addps xmm2, oword [_ps_tan_p0]
    mulps xmm3, xmm1
    mulps xmm2, xmm1
    addps xmm3, oword [_ps_tan_q1]
    xorps xmm0, xmm7
    mulps xmm3, xmm1
    pand xmm5, oword [_epi32_2]
    addps xmm3, oword [_ps_tan_q0]
    mulps xmm2, xmm0
    cmpps xmm6, xmm1, 4
    rcpps xmm4, xmm3
    pxor xmm7, xmm7
    mulps xmm3, xmm4
    pcmpeqd xmm5, xmm7
    mulps xmm3, xmm4
    addps xmm4, xmm4
    orps xmm6, xmm5
    subps xmm4, xmm3
    mulps xmm2, xmm4
    movaps xmm1, oword [_ps_am_sign_mask]
    movmskps eax, xmm6
    addps xmm2, xmm0
    rcpps xmm4, xmm2
    cmp eax, 0xf
    movaps xmm0, xmm2
    mulps xmm2, xmm4
    mulps xmm2, xmm4
    addps xmm4, xmm4
    subps xmm4, xmm2
    jne l_pole
    xorps xmm4, xmm1
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    ret
    l_pole:
    movaps xmm7, xmm1
    movaps xmm3, oword [_ps_tan_poleval]
    andps xmm1, xmm0
    orps xmm3, xmm1
    andps xmm4, xmm6
    andnps xmm6, xmm3
    orps xmm4, xmm6
    xorps xmm4, xmm7
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = constants + """
    #CODE
    global fast_tan_ps:
    vmovaps xmm7, xmm0
    vandps xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vandps xmm7, xmm7, oword [_ps_am_sign_mask]
    vmovaps xmm1, xmm0
    vmulps xmm0, xmm0, oword [_ps_am_4_o_pi]
    vcvttps2dq xmm0, xmm0
    vmovdqa xmm4, oword [_epi32_1]
    vmovdqa xmm5, oword [_epi32_7]
    vpand xmm4, xmm4, xmm0
    vpand xmm5, xmm5, xmm0
    vmovaps xmm3, oword [_ps_am_1]
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm5, xmm5, xmm4
    vcvtdq2ps xmm0, xmm0
    vmulps xmm0, xmm0, oword [_ps_am_pi_o_4]
    vxorps xmm6, xmm6, xmm6
    vsubps xmm1, xmm1, xmm0
    vmovaps xmm2, oword [_ps_tan_p2]
    vminps xmm1, xmm1, xmm3
    vmovaps xmm3, oword [_ps_tan_q3]
    vmovaps xmm0, xmm1
    vmulps xmm1, xmm1, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, xmm1
    vaddps xmm2, xmm2, oword [_ps_tan_p1]
    vmulps xmm3, xmm3, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, oword [_ps_tan_q2]
    vaddps xmm2, xmm2, oword [_ps_tan_p0]
    vmulps xmm3, xmm3, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, oword [_ps_tan_q1]
    vxorps xmm0, xmm0, xmm7
    vmulps xmm3, xmm3, xmm1
    vpand xmm5, xmm5, oword [_epi32_2]
    vaddps xmm3, xmm3, oword [_ps_tan_q0]
    vmulps xmm2, xmm2, xmm0
    vcmpps xmm6, xmm6, xmm1, 4
    vrcpps xmm4, xmm3
    vpxor xmm7, xmm7, xmm7
    vmulps xmm3, xmm3, xmm4
    vpcmpeqd xmm5, xmm5, xmm7
    vmulps xmm3, xmm3, xmm4
    vaddps xmm4, xmm4, xmm4
    vorps xmm6, xmm6, xmm5
    vsubps xmm4, xmm4, xmm3
    vmulps xmm2, xmm2, xmm4
    vmovaps xmm1, oword [_ps_am_sign_mask]
    vmovmskps eax, xmm6
    vaddps xmm2, xmm2, xmm0
    vrcpps xmm4, xmm2
    cmp eax, 0xf
    vmovaps xmm0, xmm2
    vmulps xmm2, xmm2, xmm4
    vmulps xmm2, xmm2, xmm4
    vaddps xmm4, xmm4, xmm4
    vsubps xmm4, xmm4, xmm2
    jne l_pole
    vxorps xmm4, xmm4, xmm1
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    ret
    l_pole:
    vmovaps xmm7, xmm1
    vmovaps xmm3, oword [_ps_tan_poleval]
    vandps xmm1, xmm1, xmm0
    vorps xmm3, xmm3, xmm1
    vandps xmm4, xmm4, xmm6
    vandnps xmm6, xmm6, xmm3
    vorps xmm4, xmm4, xmm6
    vxorps xmm4, xmm4, xmm7
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
_endblt: ret """ return code def _blt_rgba_to_prgba_code(): bits = platform.architecture()[0] if bits == '64bit': return _blt_floatrgba_code64() else: return _blt_floatrgba_code32() _asm = Tdasm() _mc = _asm.assemble(_blt_rgba_to_prgba_code()) _runtime = Runtime() _data_section = _runtime.load("blt_rgba_to_prgba", _mc) def blt_rgba_to_prgba(src, dest): assert isinstance(src, ImageRGBA) assert isinstance(dest, ImagePRGBA) #TODO blt only part of image sa, spitch = src.address_info() da, dpitch = dest.address_info() dx = dy = sx = sy = 0 sw, sh = src.size()
def pow_ps():
    """Assemble the packed-single ``fast_pow_ps`` routine.

    The listing reads its operands from xmm0 and xmm1 and leaves the result
    in xmm0.  Going by the constant names (``_ps_log2_c0``, ``_ps_exp2_*``)
    it evaluates a log2 approximation of xmm0, multiplies it by xmm1 and
    feeds the product to an exp2 approximation, i.e. pow(xmm0, xmm1) per
    lane.  xmm2-xmm7 and rax/eax are overwritten; xmm1 is only read.

    The mask produced by the initial compare is spilled to the 16-byte slot
    just below the 16-byte-aligned stack pointer (``[acc - 16]``) and
    reloaded near the end; the stack pointer itself is not adjusted.  The
    register names used for that address depend on
    ``platform.architecture()`` (rax/rsp for '64bit', eax/esp otherwise).

    NOTE(review): the 64-bit path emits ``and rax, 0xFFFFFFF0``; confirm
    that Tdasm encodes this immediate sign-extended, otherwise the upper
    32 bits of the address would be cleared.

    The VEX-encoded listing is assembled when ``proc.AVX`` is truthy, the
    legacy SSE listing otherwise.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    # Choose the accumulator / stack-pointer names once instead of
    # branching at every place the listing touches the stack.
    if platform.architecture()[0] == '64bit':
        acc, stack_ptr = "rax", "rsp"
    else:
        acc, stack_ptr = "eax", "esp"
    copy_sp = "mov " + acc + ", " + stack_ptr + "\n"
    align_down = "and " + acc + ", 0xFFFFFFF0\n"
    spill_slot = "oword [" + acc + " - 16]"

    constants = """
    #DATA
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F
    float _ps_log_p0[4] = -0.7895802788, -0.7895802788, -0.7895802788, -0.7895802788
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.3866645699, 16.3866645699, 16.3866645699, 16.3866645699
    float _ps_log_q1[4] = 312.093766372, 312.093766372, 312.093766372, 312.093766372
    float _ps_log_p2[4] = -64.14099529, -64.14099529, -64.14099529, -64.14099529
    float _ps_log_q2[4] = -769.691943550, -769.691943550, -769.691943550, -769.691943550
    float _ps_log2_c0[4] = 1.442695040, 1.442695040, 1.442695040, 1.442695040
    float _ps_exp2_hi[4] = 127.4999961, 127.4999961, 127.4999961, 127.4999961
    float _ps_exp2_lo[4] = -127.4999961, -127.4999961, -127.4999961, -127.4999961
    float _ps_am_0p5[4] = 0.5, 0.5, 0.5, 0.5
    float _ps_exp2_p0[4] = 0.0230933477, 0.0230933477, 0.0230933477, 0.0230933477
    float _ps_exp2_q0[4] = 233.18421172, 233.18421172, 233.18421172, 233.18421172
    float _ps_exp2_p1[4] = 20.202065669, 20.202065669, 20.202065669, 20.202065669
    float _ps_exp2_q1[4] = 4368.211668, 4368.211668, 4368.211668, 4368.211668
    float _ps_exp2_p2[4] = 1513.90680, 1513.90680, 1513.90680, 1513.90680
    """

    # Legacy SSE listing; the word-size dependent lines are spliced in
    # between the fixed fragments.
    sse_source = "".join((
        constants,
        """
    #CODE
    global fast_pow_ps:
    xorps xmm5, xmm5
    cmpps xmm5, xmm0, 1
    """,
        copy_sp,
        """
    maxps xmm0, oword [ _ps_am_min_norm_pos] ;// cut off denormalized stuff
    movaps xmm7, oword [_ps_am_1]
    movaps xmm3, xmm0
    """,
        align_down,
        """
    andps xmm0, oword [_ps_am_inv_mant_mask]
    orps xmm0, xmm7
    """,
        "movaps " + spill_slot + ", xmm5\n",
        """
    movaps xmm4, xmm0
    subps xmm0, xmm7
    addps xmm4, xmm7
    psrld xmm3, 23
    rcpps xmm4, xmm4
    mulps xmm0, xmm4
    psubd xmm3, oword [_epi32_0x7f]
    addps xmm0, xmm0
    movaps xmm2, xmm0
    mulps xmm0, xmm0
    movaps xmm4, oword [_ps_log_p0]
    movaps xmm6, oword [_ps_log_q0]
    mulps xmm4, xmm0
    movaps xmm5, oword [_ps_log_p1]
    mulps xmm6, xmm0
    movaps xmm7, oword [_ps_log_q1]
    addps xmm4, xmm5
    addps xmm6, xmm7
    movaps xmm5, oword [_ps_log_p2]
    mulps xmm4, xmm0
    movaps xmm7, oword [_ps_log_q2]
    mulps xmm6, xmm0
    addps xmm4, xmm5
    movaps xmm5, oword [_ps_log2_c0]
    addps xmm6, xmm7
    cvtdq2ps xmm7, xmm3
    mulps xmm0, xmm4
    rcpps xmm6, xmm6
    mulps xmm0, xmm6
    movaps xmm4, oword [_ps_exp2_hi]
    mulps xmm0, xmm2
    movaps xmm6, oword [_ps_exp2_lo]
    mulps xmm2, xmm5
    mulps xmm0, xmm5
    addps xmm2, xmm7
    movaps xmm3, oword [_ps_am_0p5]
    addps xmm0, xmm2
    xorps xmm2, xmm2
    mulps xmm0, xmm1
    minps xmm0, xmm4
    movaps xmm4, oword [_ps_exp2_p0]
    maxps xmm0, xmm6
    movaps xmm6, oword [_ps_exp2_q0]
    addps xmm3, xmm0
    cmpps xmm2, xmm3, 5
    pand xmm2, oword [_epi32_1]
    cvttps2dq xmm3, xmm3
    psubd xmm3, xmm2
    movaps xmm5, oword [_ps_exp2_p1]
    cvtdq2ps xmm2, xmm3
    movaps xmm7, oword [_ps_exp2_q1]
    subps xmm0, xmm2
    movaps xmm2, xmm0
    mulps xmm0, xmm0
    paddd xmm3, oword [_epi32_0x7f]
    mulps xmm4, xmm0
    mulps xmm6, xmm0
    addps xmm4, xmm5
    addps xmm6, xmm7
    mulps xmm4, xmm0
    """,
        "movaps xmm5, " + spill_slot + "\n",
        """
    pslld xmm3, 23
    addps xmm4, oword [_ps_exp2_p2]
    mulps xmm2, xmm4
    movaps xmm0, oword [_ps_am_1]
    subps xmm6, xmm2
    andps xmm3, xmm5
    rcpps xmm6, xmm6
    mulps xmm2, xmm6
    addps xmm2, xmm2
    addps xmm0, xmm2
    mulps xmm0, xmm3
    ret
    """,
    ))

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = "".join((
        constants,
        """
    #CODE
    global fast_pow_ps:
    vxorps xmm5, xmm5, xmm5
    vcmpps xmm5, xmm5, xmm0, 1
    """,
        copy_sp,
        """
    vmaxps xmm0, xmm0, oword [ _ps_am_min_norm_pos] ;// cut off denormalized stuff
    vmovaps xmm7, oword [_ps_am_1]
    vmovaps xmm3, xmm0
    """,
        align_down,
        """
    vandps xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps xmm0, xmm0, xmm7
    """,
        "vmovaps " + spill_slot + ", xmm5\n",
        """
    vmovaps xmm4, xmm0
    vsubps xmm0, xmm0, xmm7
    vaddps xmm4, xmm4, xmm7
    vpsrld xmm3, xmm3, 23
    vrcpps xmm4, xmm4
    vmulps xmm0, xmm0, xmm4
    vpsubd xmm3, xmm3, oword [_epi32_0x7f]
    vaddps xmm0, xmm0, xmm0
    vmovaps xmm2, xmm0
    vmulps xmm0, xmm0, xmm0
    vmovaps xmm4, oword [_ps_log_p0]
    vmovaps xmm6, oword [_ps_log_q0]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm5, oword [_ps_log_p1]
    vmulps xmm6, xmm6, xmm0
    vmovaps xmm7, oword [_ps_log_q1]
    vaddps xmm4, xmm4, xmm5
    vaddps xmm6, xmm6, xmm7
    vmovaps xmm5, oword [_ps_log_p2]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm7, oword [_ps_log_q2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm4, xmm4, xmm5
    vmovaps xmm5, oword [_ps_log2_c0]
    vaddps xmm6, xmm6, xmm7
    vcvtdq2ps xmm7, xmm3
    vmulps xmm0, xmm0, xmm4
    vrcpps xmm6, xmm6
    vmulps xmm0, xmm0, xmm6
    vmovaps xmm4, oword [_ps_exp2_hi]
    vmulps xmm0, xmm0, xmm2
    vmovaps xmm6, oword [_ps_exp2_lo]
    vmulps xmm2, xmm2, xmm5
    vmulps xmm0, xmm0, xmm5
    vaddps xmm2, xmm2, xmm7
    vmovaps xmm3, oword [_ps_am_0p5]
    vaddps xmm0, xmm0, xmm2
    vxorps xmm2, xmm2, xmm2
    vmulps xmm0, xmm0, xmm1
    vminps xmm0, xmm0, xmm4
    vmovaps xmm4, oword [_ps_exp2_p0]
    vmaxps xmm0, xmm0, xmm6
    vmovaps xmm6, oword [_ps_exp2_q0]
    vaddps xmm3, xmm3, xmm0
    vcmpps xmm2, xmm2, xmm3, 5
    vpand xmm2, xmm2, oword [_epi32_1]
    vcvttps2dq xmm3, xmm3
    vpsubd xmm3, xmm3, xmm2
    vmovaps xmm5, oword [_ps_exp2_p1]
    vcvtdq2ps xmm2, xmm3
    vmovaps xmm7, oword [_ps_exp2_q1]
    vsubps xmm0, xmm0, xmm2
    vmovaps xmm2, xmm0
    vmulps xmm0, xmm0, xmm0
    vpaddd xmm3, xmm3, oword [_epi32_0x7f]
    vmulps xmm4, xmm4, xmm0
    vmulps xmm6, xmm6, xmm0
    vaddps xmm4, xmm4, xmm5
    vaddps xmm6, xmm6, xmm7
    vmulps xmm4, xmm4, xmm0
    """,
        "vmovaps xmm5, " + spill_slot + "\n",
        """
    vpslld xmm3, xmm3, 23
    vaddps xmm4, xmm4, oword [_ps_exp2_p2]
    vmulps xmm2, xmm2, xmm4
    vmovaps xmm0, oword [_ps_am_1]
    vsubps xmm6, xmm6, xmm2
    vandps xmm3, xmm3, xmm5
    vrcpps xmm6, xmm6
    vmulps xmm2, xmm2, xmm6
    vaddps xmm2, xmm2, xmm2
    vaddps xmm0, xmm0, xmm2
    vmulps xmm0, xmm0, xmm3
    ret
    """,
    ))

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
def random_float():
    """Assemble the four-lane pseudo-random float generator ``random``.

    Each call of the assembled routine advances the four 32-bit seeds held
    in ``cur_seed`` by multiplying them lane-wise with ``mult`` and adding
    ``gadd``, stores the new seeds back to ``cur_seed``, then clears the
    sign bit, converts the integers to float and scales them by
    ``_random_flt`` (about 2**-31).  The four floats are left in xmm0;
    xmm1, xmm2, xmm4 and xmm5 are overwritten.

    ``cur_seed`` is declared without an initial value, so it is presumably
    seeded by the caller through the loaded data section -- confirm.
    ``masklo`` is declared but not referenced by either listing; it is kept
    so the data layout stays unchanged.

    The VEX-encoded listing is assembled when ``proc.AVX`` is truthy, the
    legacy SSE listing otherwise.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 cur_seed[4]
    uint32 mult[4] = 214013, 17405, 214013, 69069
    uint32 gadd[4] = 2531011, 10395331, 13737667, 1
    uint32 mask[4] = 0xFFFFFFFF, 0, 0xFFFFFFFF, 0
    uint32 masklo[4] = 0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF
    uint32 _random_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _random_flt[4] = 0.000000000465661287524, 0.000000000465661287524, 0.000000000465661287524, 0.000000000465661287524
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global random:
    movdqa xmm0, oword [gadd]
    movdqa xmm1, oword [mult]
    movdqa xmm2, oword [mask]
    pshufd xmm4, oword [cur_seed], 10110001b
    movdqa xmm5, oword [cur_seed]
    pmuludq xmm5, xmm1
    pshufd xmm1, xmm1, 10110001b
    pmuludq xmm4, xmm1
    pand xmm5, xmm2
    pand xmm4, xmm2
    pshufd xmm4, xmm4, 10110001b
    por xmm5, xmm4
    paddd xmm5, xmm0
    movdqa oword [cur_seed], xmm5
    ;convert to float
    pand xmm5, oword [_random_sign_mask]
    cvtdq2ps xmm0, xmm5
    mulps xmm0, oword [_random_flt]
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = constants + """
    #CODE
    global random:
    vmovdqa xmm0, oword [gadd]
    vmovdqa xmm1, oword [mult]
    vmovdqa xmm2, oword [mask]
    vpshufd xmm4, oword [cur_seed], 10110001b
    vmovdqa xmm5, oword [cur_seed]
    vpmuludq xmm5, xmm5, xmm1
    vpshufd xmm1, xmm1, 10110001b
    vpmuludq xmm4, xmm4, xmm1
    vpand xmm5, xmm5, xmm2
    vpand xmm4, xmm4, xmm2
    vpshufd xmm4, xmm4, 10110001b
    vpor xmm5, xmm5, xmm4
    vpaddd xmm5, xmm5, xmm0
    vmovdqa oword [cur_seed], xmm5
    ;convert to float
    vpand xmm5, xmm5, oword [_random_sign_mask]
    vcvtdq2ps xmm0, xmm5
    vmulps xmm0, xmm0, oword [_random_flt]
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
def log_ss():
    """Assemble the scalar-single ``fast_log_ss`` routine.

    A shared ``#DATA`` section is combined with either a legacy SSE listing
    or a VEX-encoded (AVX) listing; the AVX listing is assembled when
    ``proc.AVX`` is truthy, the SSE one otherwise.

    Register usage visible in the listing: the input is the low lane of
    xmm0 and the result is left in the low lane of xmm0; xmm1, xmm2,
    xmm4-xmm7 and edx are overwritten.  The exponent is extracted through
    edx (``shr edx, 23`` / ``sub edx, 0x7f``) and scaled by ``_ps_log_c0``
    (0.6931..., i.e. ln 2), so the routine presumably yields the natural
    logarithm -- confirm against the callers.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF
    float _ps_log_p0[4] = -0.789580278884, -0.789580278884, -0.789580278884, -0.789580278884
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.38666456995, 16.38666456995, 16.38666456995, 16.38666456995
    float _ps_log_q1[4] = 312.0937663722, 312.0937663722, 312.0937663722, 312.0937663722
    float _ps_log_p2[4] = -64.14099529587, -64.14099529587, -64.14099529587, -64.14099529587
    float _ps_log_q2[4] = -769.69194355046, -769.69194355046, -769.69194355046, -769.69194355046
    float _ps_log_c0[4] = 0.6931471805599, 0.6931471805599, 0.6931471805599, 0.6931471805599
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global fast_log_ss:
    maxss xmm0, dword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    movss xmm1, dword [_ps_am_1]
    movd edx, xmm0
    andps xmm0, oword [_ps_am_inv_mant_mask]
    orps xmm0, xmm1
    movaps xmm4, xmm0
    subss xmm0, xmm1
    addss xmm4, xmm1
    shr edx, 23
    rcpss xmm4, xmm4
    mulss xmm0, xmm4
    addss xmm0, xmm0
    movaps xmm2, xmm0
    mulss xmm0, xmm0
    sub edx, 0x7f
    movss xmm4, dword [_ps_log_p0]
    movss xmm6, dword [_ps_log_q0]
    mulss xmm4, xmm0
    movss xmm5, dword [_ps_log_p1]
    mulss xmm6, xmm0
    movss xmm7, dword [_ps_log_q1]
    addss xmm4, xmm5
    addss xmm6, xmm7
    movss xmm5, dword [_ps_log_p2]
    mulss xmm4, xmm0
    movss xmm7, dword [_ps_log_q2]
    mulss xmm6, xmm0
    addss xmm4, xmm5
    movss xmm5, dword [_ps_log_c0]
    addss xmm6, xmm7
    cvtsi2ss xmm1, edx
    mulss xmm0, xmm4
    rcpss xmm6, xmm6
    mulss xmm0, xmm6
    mulss xmm0, xmm2
    mulss xmm1, xmm5
    addss xmm0, xmm2
    addss xmm0, xmm1
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.  The copy of
    # xmm0 into edx uses vmovd rather than the legacy-encoded movd so that
    # every SIMD instruction on this path is VEX-encoded; mixing the two
    # encodings can incur SSE/AVX transition penalties on some CPUs.
    avx_source = constants + """
    #CODE
    global fast_log_ss:
    vmaxss xmm0, xmm0, dword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    vmovss xmm1, dword [_ps_am_1]
    vmovd edx, xmm0
    vandps xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps xmm0, xmm0, xmm1
    vmovaps xmm4, xmm0
    vsubss xmm0, xmm0, xmm1
    vaddss xmm4, xmm4, xmm1
    shr edx, 23
    vrcpss xmm4, xmm4, xmm4
    vmulss xmm0, xmm0, xmm4
    vaddss xmm0, xmm0, xmm0
    vmovaps xmm2, xmm0
    vmulss xmm0, xmm0, xmm0
    sub edx, 0x7f
    vmovss xmm4, dword [_ps_log_p0]
    vmovss xmm6, dword [_ps_log_q0]
    vmulss xmm4, xmm4, xmm0
    vmovss xmm5, dword [_ps_log_p1]
    vmulss xmm6, xmm6, xmm0
    vmovss xmm7, dword [_ps_log_q1]
    vaddss xmm4, xmm4, xmm5
    vaddss xmm6, xmm6, xmm7
    vmovss xmm5, dword [_ps_log_p2]
    vmulss xmm4, xmm4, xmm0
    vmovss xmm7, dword [_ps_log_q2]
    vmulss xmm6, xmm6, xmm0
    vaddss xmm4, xmm4, xmm5
    vmovss xmm5, dword [_ps_log_c0]
    vaddss xmm6, xmm6, xmm7
    vcvtsi2ss xmm1, xmm1, edx
    vmulss xmm0, xmm0, xmm4
    vrcpss xmm6, xmm6, xmm6
    vmulss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm2
    vmulss xmm1, xmm1, xmm5
    vaddss xmm0, xmm0, xmm2
    vaddss xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
def asin_ps():
    """Assemble the packed-single ``fast_asin_ps`` routine.

    The listing first rescales the input: it forms (1 + x) * (1 - x), takes
    the reciprocal square root (``rsqrtps``) and multiplies x by it, then
    runs the section marked ``;atan`` on the result, which uses the
    ``_ps_atan_*`` coefficients.  The four input floats are read from xmm0
    and the result is left in xmm0; xmm1-xmm7 are overwritten.

    The VEX-encoded listing is assembled when ``proc.AVX`` is truthy, the
    legacy SSE listing otherwise.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global fast_asin_ps:
    movaps xmm1, oword [_ps_am_1]
    movaps xmm2, xmm1
    addps xmm1, xmm0
    subps xmm2, xmm0
    mulps xmm1, xmm2
    rsqrtps xmm1, xmm1
    mulps xmm0, xmm1
    ;atan
    movaps xmm5, oword [_ps_am_1]
    movaps xmm6, oword [_ps_am_m1]
    rcpps xmm4, xmm0
    cmpps xmm5, xmm0, 1
    cmpps xmm6, xmm0, 6
    movaps xmm1, oword [_ps_atan_s0]
    orps xmm5, xmm6
    andps xmm4, xmm5
    movaps xmm2, oword [_ps_atan_t0]
    movaps xmm7, xmm5
    andnps xmm5, xmm0
    movaps xmm3, oword [_ps_atan_s1]
    orps xmm4, xmm5
    movaps xmm0, xmm4
    movaps xmm6, oword [_ps_atan_t1]
    mulps xmm4, xmm4
    addps xmm1, xmm4
    movaps xmm5, oword [_ps_atan_s2]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    movaps xmm2, oword [_ps_atan_t2]
    addps xmm3, xmm4
    addps xmm1, xmm3
    movaps xmm3, oword [_ps_atan_s3]
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    movaps xmm6, oword [_ps_atan_t3]
    addps xmm5, xmm4
    addps xmm1, xmm5
    movaps xmm5, oword [_ps_am_sign_mask]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    addps xmm3, xmm4
    movaps xmm4, oword [_ps_am_pi_o_2]
    mulps xmm6, xmm0
    addps xmm1, xmm3
    andps xmm0, xmm5
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    orps xmm0, xmm4
    subps xmm0, xmm1
    andps xmm0, xmm7
    andnps xmm7, xmm1
    orps xmm0, xmm7
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = constants + """
    #CODE
    global fast_asin_ps:
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm2, xmm1
    vaddps xmm1, xmm1, xmm0
    vsubps xmm2, xmm2, xmm0
    vmulps xmm1, xmm1, xmm2
    vrsqrtps xmm1, xmm1
    vmulps xmm0, xmm0, xmm1
    ;atan
    vmovaps xmm5, oword [_ps_am_1]
    vmovaps xmm6, oword [_ps_am_m1]
    vrcpps xmm4, xmm0
    vcmpps xmm5, xmm5, xmm0, 1
    vcmpps xmm6, xmm6, xmm0, 6
    vmovaps xmm1, oword [_ps_atan_s0]
    vorps xmm5, xmm5, xmm6
    vandps xmm4, xmm4, xmm5
    vmovaps xmm2, oword [_ps_atan_t0]
    vmovaps xmm7, xmm5
    vandnps xmm5, xmm5, xmm0
    vmovaps xmm3, oword [_ps_atan_s1]
    vorps xmm4, xmm4, xmm5
    vmovaps xmm0, xmm4
    vmovaps xmm6, oword [_ps_atan_t1]
    vmulps xmm4, xmm4, xmm4
    vaddps xmm1, xmm1, xmm4
    vmovaps xmm5, oword [_ps_atan_s2]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vmovaps xmm2, oword [_ps_atan_t2]
    vaddps xmm3, xmm3, xmm4
    vaddps xmm1, xmm1, xmm3
    vmovaps xmm3, oword [_ps_atan_s3]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vmovaps xmm6, oword [_ps_atan_t3]
    vaddps xmm5, xmm5, xmm4
    vaddps xmm1, xmm1, xmm5
    vmovaps xmm5, oword [_ps_am_sign_mask]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vaddps xmm3, xmm3, xmm4
    vmovaps xmm4, oword [_ps_am_pi_o_2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm1, xmm1, xmm3
    vandps xmm0, xmm0, xmm5
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vorps xmm0, xmm0, xmm4
    vsubps xmm0, xmm0, xmm1
    vandps xmm0, xmm0, xmm7
    vandnps xmm7, xmm7, xmm1
    vorps xmm0, xmm0, xmm7
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
add dword [y], 1 jmp _bltrgba _endblt: ret """ return code def _blt_rgba_to_prgba_code(): bits = platform.architecture()[0] if bits == '64bit': return _blt_floatrgba_code64() else: return _blt_floatrgba_code32() _asm = Tdasm() _mc = _asm.assemble(_blt_rgba_to_prgba_code()) _runtime = Runtime() _data_section = _runtime.load("blt_rgba_to_prgba", _mc) def blt_rgba_to_prgba(src, dest): assert isinstance(src, ImageRGBA) assert isinstance(dest, ImagePRGBA) #TODO blt only part of image sa, spitch = src.address_info() da, dpitch = dest.address_info() dx = dy = sx = sy = 0 sw, sh = src.size()
mov ebx, dword [eax + hitpoint.t] mov edx, dword [esp + 16] ;populate new minimum distance mov dword [edx], ebx jmp _next_object _end_objects: add esp, 20 ret """ asm = Tdasm() renmas.shapes.multiple_isect_asm(runtime, "multiple_isect") mc = asm.assemble(ASM) def v4(v3): return (v3.x, v3.y, v3.z, 0.0) ds = runtime.load("test", mc) ray = ren.random_ray() ds["ray1.origin"] = v4(ray.origin) ds["ray1.dir"] = v4(ray.dir) ds["num"] = len(lst_shapes) ds["addrs"] = adrese runtime.run("test") print(ds["hp.t"], ds["clocks"])
def tan_ss():
    """Assemble the scalar-single ``fast_tan_ss`` routine.

    A shared ``#DATA`` section is combined with either a legacy SSE listing
    or a VEX-encoded (AVX) listing; the AVX listing is assembled when
    ``proc.AVX`` is truthy, the SSE one otherwise.

    Register usage visible in the listing: the input is the low lane of
    xmm0 and the result is left in xmm0; xmm1-xmm7, eax, ecx and edx are
    overwritten.  The routine has three exits: ``l_cont`` (taken when
    ``test eax, 0x2`` is zero), ``l_pole`` (taken when the ``comiss``
    comparison reports equality, which blends in ``_ps_tan_poleval``) and
    the fall-through path.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_4_o_pi[4] = 1.273239544735, 1.273239544735, 1.273239544735, 1.273239544735
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_pi_o_4[4] = 0.78539816339, 0.78539816339, 0.78539816339, 0.78539816339
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_tan_p0[4] = -17956525.197648, -17956525.197648, -17956525.197648, -17956525.197648
    float _ps_tan_q0[4] = -53869575.592945, -53869575.592945, -53869575.592945, -53869575.592945
    float _ps_tan_p1[4] = 1153516.64838587, 1153516.64838587, 1153516.64838587, 1153516.64838587
    float _ps_tan_q1[4] = 25008380.18233579, 25008380.18233579, 25008380.18233579, 25008380.18233579
    float _ps_tan_p2[4] = -13093.693918138, -13093.693918138, -13093.693918138, -13093.693918138
    float _ps_tan_q2[4] = -1320892.3444021, -1320892.3444021, -1320892.3444021, -1320892.3444021
    float _ps_tan_q3[4] = 13681.296347069, 13681.296347069, 13681.296347069, 13681.296347069
    float _ps_tan_poleval[4] = 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global fast_tan_ss:
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movd eax, xmm0
    andps xmm0, xmm1
    movaps xmm1, xmm0
    mulss xmm0, dword [_ps_am_4_o_pi]
    cvttss2si edx, xmm0
    and eax, 0x80000000
    mov ecx, 0x1
    movd xmm7, eax
    mov eax, 0x7
    movss xmm5, dword [_ps_am_1]
    and ecx, edx
    and eax, edx
    add edx, ecx
    add eax, ecx
    cvtsi2ss xmm0, edx
    xorps xmm6, xmm6
    mulss xmm0, dword [_ps_am_pi_o_4]
    subss xmm1, xmm0
    movss xmm2, dword [_ps_tan_p2]
    minss xmm1, xmm5
    movss xmm3, dword [_ps_tan_q3]
    movaps xmm0, xmm1
    mulss xmm1, xmm1
    mulss xmm2, xmm1
    addss xmm3, xmm1
    addss xmm2, dword [_ps_tan_p1]
    mulss xmm3, xmm1
    mulss xmm2, xmm1
    addss xmm3, dword [_ps_tan_q2]
    addss xmm2, dword [_ps_tan_p0]
    mulss xmm3, xmm1
    mulss xmm2, xmm1
    addss xmm3, dword [_ps_tan_q1]
    xorps xmm0, xmm7
    mulss xmm3, xmm1
    mulss xmm2, xmm0
    addss xmm3, dword [_ps_tan_q0]
    rcpss xmm4, xmm3
    mulss xmm3, xmm4
    mulss xmm3, xmm4
    addss xmm4, xmm4
    test eax, 0x2
    subss xmm4, xmm3
    mulss xmm2, xmm4
    jz l_cont
    addss xmm2, xmm0
    comiss xmm6, xmm1
    rcpss xmm4, xmm2
    movss xmm0, dword [_ps_am_sign_mask]
    jz l_pole
    mulss xmm2, xmm4
    mulss xmm2, xmm4
    addss xmm4, xmm4
    subss xmm4, xmm2
    xorps xmm0, xmm4
    ret
    l_pole:
    movss xmm1, dword [_ps_tan_poleval]
    movaps xmm3, xmm0
    andps xmm0, xmm2
    orps xmm0, xmm1
    xorps xmm0, xmm3
    ret
    l_cont:
    addss xmm0, xmm2
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = constants + """
    #CODE
    global fast_tan_ss:
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovd eax, xmm0
    vandps xmm0, xmm0, xmm1
    vmovaps xmm1, xmm0
    vmulss xmm0, xmm0, dword [_ps_am_4_o_pi]
    vcvttss2si edx, xmm0
    and eax, 0x80000000
    mov ecx, 0x1
    vmovd xmm7, eax
    mov eax, 0x7
    vmovss xmm5, dword [_ps_am_1]
    and ecx, edx
    and eax, edx
    add edx, ecx
    add eax, ecx
    vcvtsi2ss xmm0, xmm0, edx
    vxorps xmm6, xmm6, xmm6
    vmulss xmm0, xmm0, dword [_ps_am_pi_o_4]
    vsubss xmm1, xmm1, xmm0
    vmovss xmm2, dword [_ps_tan_p2]
    vminss xmm1, xmm1, xmm5
    vmovss xmm3, dword [_ps_tan_q3]
    vmovaps xmm0, xmm1
    vmulss xmm1, xmm1, xmm1
    vmulss xmm2, xmm2, xmm1
    vaddss xmm3, xmm3, xmm1
    vaddss xmm2, xmm2, dword [_ps_tan_p1]
    vmulss xmm3, xmm3, xmm1
    vmulss xmm2, xmm2, xmm1
    vaddss xmm3, xmm3, dword [_ps_tan_q2]
    vaddss xmm2, xmm2, dword [_ps_tan_p0]
    vmulss xmm3, xmm3, xmm1
    vmulss xmm2, xmm2, xmm1
    vaddss xmm3, xmm3, dword [_ps_tan_q1]
    vxorps xmm0, xmm0, xmm7
    vmulss xmm3, xmm3, xmm1
    vmulss xmm2, xmm2, xmm0
    vaddss xmm3, xmm3, dword [_ps_tan_q0]
    vrcpss xmm4, xmm4, xmm3
    vmulss xmm3, xmm3, xmm4
    vmulss xmm3, xmm3, xmm4
    vaddss xmm4, xmm4, xmm4
    test eax, 0x2
    vsubss xmm4, xmm4, xmm3
    vmulss xmm2, xmm2, xmm4
    jz l_cont
    vaddss xmm2, xmm2, xmm0
    vcomiss xmm6, xmm1
    vrcpss xmm4, xmm4, xmm2
    vmovss xmm0, dword [_ps_am_sign_mask]
    jz l_pole
    vmulss xmm2, xmm2, xmm4
    vmulss xmm2, xmm2, xmm4
    vaddss xmm4, xmm4, xmm4
    vsubss xmm4, xmm4, xmm2
    vxorps xmm0, xmm0, xmm4
    ret
    l_pole:
    vmovss xmm1, dword [_ps_tan_poleval]
    vmovaps xmm3, xmm0
    vandps xmm0, xmm0, xmm2
    vorps xmm0, xmm0, xmm1
    vxorps xmm0, xmm0, xmm3
    ret
    l_cont:
    vaddss xmm0, xmm0, xmm2
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
def sin_ss():
    """Assemble the scalar-single ``fast_sin_ss`` routine.

    A shared ``#DATA`` section is combined with either a legacy SSE listing
    or a VEX-encoded (AVX) listing; the AVX listing is assembled when
    ``proc.AVX`` is truthy, the SSE one otherwise.

    Register usage visible in the listing: the input is the low lane of
    xmm0 and the result is left in the low lane of xmm0; xmm1-xmm7 are
    overwritten.  The input magnitude is scaled by ``_ps_am_2_o_pi``, split
    into an integer part and a remainder, and the remainder is fed to a
    polynomial built from the ``_ps_sincos_p0`` .. ``_ps_sincos_p3``
    coefficients; the sign is reassembled with bitwise operations.

    Returns:
        The value returned by ``Tdasm.assemble(source, True)``.
        NOTE(review): the second argument is presumably the "assemble as a
        callable function" flag -- confirm against ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    # Legacy SSE encoding (two-operand instructions).
    sse_source = constants + """
    #CODE
    global fast_sin_ss:
    movaps xmm7, xmm0
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_sign_mask]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    andps xmm7, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, xmm1
    pslld xmm2, 30
    subss xmm0, xmm6
    movss xmm3, dword [_ps_sincos_p3]
    minss xmm0, xmm4
    subss xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    movss xmm4, dword [_ps_sincos_p2]
    mulss xmm0, xmm0
    xorps xmm2, xmm7
    movss xmm5, dword [_ps_sincos_p1]
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulss xmm0, xmm3
    movss xmm6, dword [_ps_sincos_p0]
    addss xmm0, xmm4
    mulss xmm0, xmm7
    addss xmm0, xmm5
    mulss xmm0, xmm7
    addss xmm0, xmm6
    mulss xmm0, xmm1
    ret
    """

    # Same instruction sequence in VEX (three-operand) form.
    avx_source = constants + """
    #CODE
    global fast_sin_ss:
    vmovaps xmm7, xmm0
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vandps xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, xmm1
    vpslld xmm2, xmm2, 30
    vsubss xmm0, xmm0, xmm6
    vmovss xmm3, dword [_ps_sincos_p3]
    vminss xmm0, xmm0, xmm4
    vsubss xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmovss xmm4, dword [_ps_sincos_p2]
    vmulss xmm0, xmm0, xmm0
    vxorps xmm2, xmm2, xmm7
    vmovss xmm5, dword [_ps_sincos_p1]
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulss xmm0, xmm0, xmm3
    vmovss xmm6, dword [_ps_sincos_p0]
    vaddss xmm0, xmm0, xmm4
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm5
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    return assembler.assemble(avx_source if proc.AVX else sse_source, True)
#CODE call _memcpy #END _memcpy: mov ecx, dword [n] mov esi, dword [sa] mov edi, dword [da] rep movs byte [edi], byte [esi] ret """ asm = Tdasm() m = asm.assemble(MEMCPY) run = Runtime() data_section = run.load("memcpy", m) def memcpy(da, sa, n): data_section["da"] = da data_section["sa"] = sa data_section["n"] = n run.run("memcpy") class DynamicArray: def __init__(self, struct, reserve=0): self.size = 0 self.struct = struct
if gamma < 0.0: return False if beta + gamma > 1.0: return False e3 = a * p - b * r + d * s t = e3 * inv_denom if t < 0.00001: return False # self-intersection return (beta, gamma, t) code = ray_triangle_intersection("ray_triangle_intersection") asm = Tdasm() mc = asm.assemble(code, True) runtime = Runtime() runtime.load('ray_triangle', mc) # xmm3 - origin # xmm4 - direction # xmm5 - p0 # xmm6 - p1 # xmm7 - p2 # edx - min_distance test_code = """ #DATA float p0[4] float p1[4]
code += lst_inst2[l] + "\n" for l in range(len(lst_inst2), len(lst_inst1)): code += lst_inst1[l] + "\n" return code def arth128_32(tokens): return arth_mix(tokens, 128, 32) def arth32_128(tokens): return arth_mix(tokens, 32, 128) def arth128_128(tokens): return arth_mix(tokens, 128, 128) def arth32_32(tokens): return arth_mix(tokens, 32, 32) if __name__ == "__main__": asm = Tdasm() asm.register_macro("arth128", arth128) asm.register_macro("arth32", arth32) mc = asm.assemble(ASM_CODE) run = Runtime() ds = run.load("test", mc) run.run("test") print(ds["rez"])
def pow_ss():
    """Assemble the scalar ``fast_pow_ss`` routine and return the result.

    Two instruction streams are built over one shared ``#DATA`` section:
    a legacy SSE encoding and a VEX (AVX) encoding that mirrors it
    instruction for instruction.  ``proc.AVX`` decides which of the two
    is handed to ``Tdasm.assemble``.

    What the assembly itself shows:

    * ``xmm0`` is run through a log2-style rational approximation
      (``_ps_log_*`` constants), the result is multiplied by ``xmm1``,
      and the product goes through an exp2-style approximation
      (``_ps_exp2_*`` constants); the final value is left in ``xmm0``.
      Presumably this means xmm0 = base and xmm1 = exponent -- confirm
      against the callers.
    * ``xmm2`` .. ``xmm7`` are overwritten.
    * The ``0 < xmm0`` comparison mask is spilled to ``[esp - 4]`` and
      reloaded near the end to mask the result.

    Returns:
        Whatever ``Tdasm.assemble(source, True)`` returns for the chosen
        source text.
    """
    # Constant tables referenced by both encodings of the routine.
    constants = """
    #DATA
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F
    float _ps_log_p0[4] = -0.7895802788, -0.7895802788, -0.7895802788, -0.7895802788
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.3866645699, 16.3866645699, 16.3866645699, 16.3866645699
    float _ps_log_q1[4] = 312.093766372, 312.093766372, 312.093766372, 312.093766372
    float _ps_log_p2[4] = -64.14099529, -64.14099529, -64.14099529, -64.14099529
    float _ps_log_q2[4] = -769.691943550, -769.691943550, -769.691943550, -769.691943550
    float _ps_log2_c0[4] = 1.442695040, 1.442695040, 1.442695040, 1.442695040
    float _ps_exp2_hi[4] = 127.4999961, 127.4999961, 127.4999961, 127.4999961
    float _ps_exp2_lo[4] = -127.4999961, -127.4999961, -127.4999961, -127.4999961
    float _ps_am_0p5[4] = 0.5, 0.5, 0.5, 0.5
    float _ps_exp2_p0[4] = 0.0230933477, 0.0230933477, 0.0230933477, 0.0230933477
    float _ps_exp2_q0[4] = 233.18421172, 233.18421172, 233.18421172, 233.18421172
    float _ps_exp2_p1[4] = 20.202065669, 20.202065669, 20.202065669, 20.202065669
    float _ps_exp2_q1[4] = 4368.211668, 4368.211668, 4368.211668, 4368.211668
    float _ps_exp2_p2[4] = 1513.90680, 1513.90680, 1513.90680, 1513.90680
    """

    # Legacy SSE encoding (two-operand instruction forms).
    sse_source = constants + """
    #CODE
    global fast_pow_ss:
    xorps xmm5, xmm5
    movss xmm2, dword [_ps_am_inv_mant_mask]
    cmpss xmm5, xmm0, 1
    maxss xmm0, dword [_ps_am_min_norm_pos] ;// cut off denormalized stuff
    movss xmm7, dword [_ps_am_1]
    movaps xmm3, xmm0
    andps xmm0, xmm2
    orps xmm0, xmm7
    movss dword [esp - 4], xmm5
    movaps xmm4, xmm0
    movd xmm2, dword [_epi32_0x7f]
    subss xmm0, xmm7
    addss xmm4, xmm7
    psrld xmm3, 23
    rcpss xmm4, xmm4
    mulss xmm0, xmm4
    psubd xmm3, xmm2
    addss xmm0, xmm0
    movaps xmm2, xmm0
    mulss xmm0, xmm0
    movss xmm4, dword [_ps_log_p0]
    movss xmm6, dword [_ps_log_q0]
    mulss xmm4, xmm0
    movss xmm5, dword [_ps_log_p1]
    mulss xmm6, xmm0
    movss xmm7, dword [_ps_log_q1]
    addss xmm4, xmm5
    addss xmm6, xmm7
    movss xmm5, dword [_ps_log_p2]
    mulss xmm4, xmm0
    movss xmm7, dword [_ps_log_q2]
    mulss xmm6, xmm0
    addss xmm4, xmm5
    movss xmm5, dword [_ps_log2_c0]
    addss xmm6, xmm7
    cvtdq2ps xmm7, xmm3
    mulss xmm0, xmm4
    rcpss xmm6, xmm6
    mulss xmm0, xmm6
    movss xmm4, dword [_ps_exp2_hi]
    mulss xmm0, xmm2
    movss xmm6, dword [_ps_exp2_lo]
    mulss xmm2, xmm5
    mulss xmm0, xmm5
    addss xmm2, xmm7
    movss xmm3, dword [_ps_am_0p5]
    addss xmm0, xmm2
    xorps xmm2, xmm2
    movd xmm5, dword [_epi32_1]
    mulss xmm0, xmm1
    minss xmm0, xmm4
    movss xmm4, dword [_ps_exp2_p0]
    maxss xmm0, xmm6
    movss xmm6, dword [_ps_exp2_q0]
    addss xmm3, xmm0
    cmpss xmm2, xmm3, 5
    pand xmm2, xmm5
    cvttps2dq xmm3, xmm3
    psubd xmm3, xmm2
    cvtdq2ps xmm2, xmm3
    subss xmm0, xmm2
    movaps xmm2, xmm0
    mulss xmm0, xmm0
    paddd xmm3, oword [_epi32_0x7f]
    mulss xmm4, xmm0
    mulss xmm6, xmm0
    addss xmm4, dword [_ps_exp2_p1]
    addss xmm6, dword [_ps_exp2_q1]
    mulss xmm4, xmm0
    addss xmm4, dword [_ps_exp2_p2]
    mulss xmm2, xmm4
    movss xmm0, dword [_ps_am_1]
    subss xmm6, xmm2
    pslld xmm3, 23
    rcpss xmm6, xmm6
    movss xmm5, dword [esp - 4]
    mulss xmm2, xmm6
    andps xmm3, xmm5
    addss xmm2, xmm2
    addss xmm0, xmm2
    mulss xmm0, xmm3
    ret
    """

    # VEX encoding: same instruction sequence, three-operand forms.
    avx_source = constants + """
    #CODE
    global fast_pow_ss:
    vxorps xmm5, xmm5, xmm5
    vmovss xmm2, dword [_ps_am_inv_mant_mask]
    vcmpss xmm5, xmm5, xmm0, 1
    vmaxss xmm0, xmm0, dword [_ps_am_min_norm_pos] ;// cut off denormalized stuff
    vmovss xmm7, dword [_ps_am_1]
    vmovaps xmm3, xmm0
    vandps xmm0, xmm0, xmm2
    vorps xmm0, xmm0, xmm7
    vmovss dword [esp - 4], xmm5
    vmovaps xmm4, xmm0
    vmovd xmm2, dword [_epi32_0x7f]
    vsubss xmm0, xmm0, xmm7
    vaddss xmm4, xmm4, xmm7
    vpsrld xmm3, xmm3, 23
    vrcpss xmm4, xmm4, xmm4
    vmulss xmm0, xmm0, xmm4
    vpsubd xmm3, xmm3, xmm2
    vaddss xmm0, xmm0, xmm0
    vmovaps xmm2, xmm0
    vmulss xmm0, xmm0, xmm0
    vmovss xmm4, dword [_ps_log_p0]
    vmovss xmm6, dword [_ps_log_q0]
    vmulss xmm4, xmm4, xmm0
    vmovss xmm5, dword [_ps_log_p1]
    vmulss xmm6, xmm6, xmm0
    vmovss xmm7, dword [_ps_log_q1]
    vaddss xmm4, xmm4, xmm5
    vaddss xmm6, xmm6, xmm7
    vmovss xmm5, dword [_ps_log_p2]
    vmulss xmm4, xmm4, xmm0
    vmovss xmm7, dword [_ps_log_q2]
    vmulss xmm6, xmm6, xmm0
    vaddss xmm4, xmm4, xmm5
    vmovss xmm5, dword [_ps_log2_c0]
    vaddss xmm6, xmm6, xmm7
    vcvtdq2ps xmm7, xmm3
    vmulss xmm0, xmm0, xmm4
    vrcpss xmm6, xmm6, xmm6
    vmulss xmm0, xmm0, xmm6
    vmovss xmm4, dword [_ps_exp2_hi]
    vmulss xmm0, xmm0, xmm2
    vmovss xmm6, dword [_ps_exp2_lo]
    vmulss xmm2, xmm2, xmm5
    vmulss xmm0, xmm0, xmm5
    vaddss xmm2, xmm2, xmm7
    vmovss xmm3, dword [_ps_am_0p5]
    vaddss xmm0, xmm0, xmm2
    vxorps xmm2, xmm2, xmm2
    vmovd xmm5, dword [_epi32_1]
    vmulss xmm0, xmm0, xmm1
    vminss xmm0, xmm0, xmm4
    vmovss xmm4, dword [_ps_exp2_p0]
    vmaxss xmm0, xmm0, xmm6
    vmovss xmm6, dword [_ps_exp2_q0]
    vaddss xmm3, xmm3, xmm0
    vcmpss xmm2, xmm2, xmm3, 5
    vpand xmm2, xmm2, xmm5
    vcvttps2dq xmm3, xmm3
    vpsubd xmm3, xmm3, xmm2
    vcvtdq2ps xmm2, xmm3
    vsubss xmm0, xmm0, xmm2
    vmovaps xmm2, xmm0
    vmulss xmm0, xmm0, xmm0
    vpaddd xmm3, xmm3, oword [_epi32_0x7f]
    vmulss xmm4, xmm4, xmm0
    vmulss xmm6, xmm6, xmm0
    vaddss xmm4, xmm4, dword [_ps_exp2_p1]
    vaddss xmm6, xmm6, dword [_ps_exp2_q1]
    vmulss xmm4, xmm4, xmm0
    vaddss xmm4, xmm4, dword [_ps_exp2_p2]
    vmulss xmm2, xmm2, xmm4
    vmovss xmm0, dword [_ps_am_1]
    vsubss xmm6, xmm6, xmm2
    vpslld xmm3, xmm3, 23
    vrcpss xmm6, xmm6, xmm6
    vmovss xmm5, dword [esp - 4]
    vmulss xmm2, xmm2, xmm6
    vandps xmm3, xmm3, xmm5
    vaddss xmm2, xmm2, xmm2
    vaddss xmm0, xmm0, xmm2
    vmulss xmm0, xmm0, xmm3
    ret
    """

    # Create the assembler before consulting proc.AVX, as the original did.
    assembler = Tdasm()
    selected_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(selected_source, True)