def test_pow_ps(self):
    """Compare the packed ``fast_pow_ps`` routine with ``math.pow``.

    Runs 1000 rounds. Each round draws four bases and four exponents
    uniformly from [0, 3), stores them in ``v1``/``v2``, runs the assembled
    ``pow_ps`` program and checks every lane of ``v1`` against the Python
    result to one decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_pow_ps", runtime)
    ds = runtime.load("pow_ps", machine_code)

    for _ in range(1000):
        # Bases are drawn first, then exponents (same draw order as before).
        bases = tuple(random.random() * 3 for _lane in range(4))
        exponents = tuple(random.random() * 3 for _lane in range(4))
        ds["v1"] = bases
        ds["v2"] = exponents
        runtime.run("pow_ps")

        # The test reads the result back from "v1".
        result = ds["v1"]
        expected = [math.pow(b, e) for b, e in zip(bases, exponents)]
        for lane, want in enumerate(expected):
            self.assertAlmostEqual(result[lane], want, 1)
def _create_struct(self, shape):
    """Assemble a program that only declares ``shape``'s struct and return it.

    The struct source comes from ``shape.asm_struct()``; the compiled struct
    is looked up on the machine code under ``shape.asm_struct_name()``.
    """
    struct_source = shape.asm_struct()
    code = " #DATA " + struct_source + """
    #CODE
    #END
    """
    assembler = Tdasm()
    machine_code = assembler.assemble(code)
    return machine_code.get_struct(shape.asm_struct_name())
def prepare(self, runtimes):
    """Load this shader and everything it depends on into ``runtimes``.

    Steps, in order: load the colour functions, call the optional
    ``self.loader``, prepare nested shaders, load the assembly functions
    listed in ``self._functions``, then load this shader's machine code into
    every runtime where the global ``self._name`` does not exist yet.
    Machine code is assembled at most once per ``self._name`` and kept in
    ``self._mc_cache``. ``self._ds`` is replaced only when at least one
    runtime was loaded.
    """
    self._load_color_funcs(runtimes)
    if self.loader:
        self.loader(runtimes)
    for dependency in self._shaders:
        dependency.prepare(runtimes)
    self._runtimes = runtimes

    assembler = Tdasm()
    module_name = 'shader' + str(id(self))
    for fun_name, fun_label, avx, bit in self._functions:
        load_asm_function(fun_name, fun_label, runtimes, avx, bit)

    data_sections = []
    for rt in runtimes:
        if rt.global_exists(self._name):
            # Already present in this runtime; nothing to load.
            continue
        if self._name in self._mc_cache:
            machine_code = self._mc_cache[self._name]
        else:
            machine_code = assembler.assemble(self._code, self._func)
            self._mc_cache[self._name] = machine_code
        data_sections.append(rt.load(module_name, machine_code))

    if data_sections:
        self._ds = data_sections
def test_sincos_ps(self):
    """Compare packed ``fast_sincos_ps`` with ``math.sin`` / ``math.cos``.

    Runs 1000 rounds. Each round stores four random values from [0, 2000)
    in ``v1``, runs ``sincos_ps``, then reads sines from ``v1`` and cosines
    from ``v2``. Every lane must match to three decimal places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_sincos_ps", runtime)
    ds = runtime.load("sincos_ps", machine_code)

    for _ in range(1000):
        angles = tuple(random.random() * 2000 for _lane in range(4))
        ds["v1"] = angles
        runtime.run("sincos_ps")

        sines = ds["v1"]
        cosines = ds["v2"]
        expected_sin = [math.sin(a) for a in angles]
        expected_cos = [math.cos(a) for a in angles]

        # All sine lanes are checked before any cosine lane.
        for lane in range(4):
            self.assertAlmostEqual(sines[lane], expected_sin[lane], 3)
        for lane in range(4):
            self.assertAlmostEqual(cosines[lane], expected_cos[lane], 3)
def _create_struct(self, struct_def, name):
    """Assemble ``struct_def`` in a data-only program and return struct ``name``.

    ``struct_def`` is the assembly source of the struct declaration; ``name``
    is the key passed to ``get_struct`` on the assembled machine code.
    """
    trailer = """
    #CODE
    #END
    """
    code = " #DATA \n" + struct_def + trailer
    machine_code = Tdasm().assemble(code)
    return machine_code.get_struct(name)
def __init__(self):
    """Assemble the three blit programs and load them into one runtime.

    ``MEMCPY``, ``BLTRGBA`` and ``BLTFLOATRGBA`` are assembled with the same
    assembler and loaded as "memcpy", "bltrgba" and "bltfloatrgba". The
    values returned by ``load`` are kept in ``self.ds``, ``self.ds2`` and
    ``self.ds3``.
    """
    assembler = Tdasm()
    memcpy_code = assembler.assemble(MEMCPY)
    self.r = Runtime()
    self.ds = self.r.load("memcpy", memcpy_code)

    blt_rgba_code = assembler.assemble(BLTRGBA)
    self.ds2 = self.r.load("bltrgba", blt_rgba_code)

    blt_float_code = assembler.assemble(BLTFLOATRGBA)
    self.ds3 = self.r.load("bltfloatrgba", blt_float_code)
def create_float_image(runtime):
    """Create a 150x150 ``ImageFloatRGBA`` and run the ``ASM`` program against it.

    The image's pixel setter is loaded into ``runtime`` under the label
    "set_pixel" before ``ASM`` is assembled, loaded as "write" and executed.
    Returns the image.
    """
    image = renmas.gui.ImageFloatRGBA(150, 150)
    image.set_pixel_asm(runtime, "set_pixel")

    machine_code = Tdasm().assemble(ASM)
    runtime.load("write", machine_code)
    runtime.run("write")
    return image
def compile(self, shaders=None):
    """Translate the shader source to assembly and assemble it.

    Parses ``self._code``, generates assembly with ``CodeGenerator`` and
    stores the assembly text in ``self._asm_code``, the reported return type
    in ``self._ret_type`` and the assembled machine code in ``self._mc``.

    Args:
        shaders: Optional list of shaders forwarded to
            ``CodeGenerator.generate_code``. ``None`` (the default) means
            an empty list.
    """
    # A fresh list per call: a mutable default would be shared between calls
    # and visible to whatever the code generator does with it.
    if shaders is None:
        shaders = []

    statements = parse(self._code)
    cgen = CodeGenerator()
    asm_source, ret_type = cgen.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
    )
    self._asm_code = asm_source
    self._ret_type = ret_type

    assembler = Tdasm()
    self._mc = assembler.assemble(self._asm_code, self._is_func)
def _create_struct(self, struct_def, name):
    """Assemble ``struct_def`` for the host word size and return struct ``name``.

    ``ia32`` is passed as False only when ``platform.architecture()`` reports
    '64bit'; any other value assembles in ia32 mode.
    """
    trailer = """
    #CODE
    #END
    """
    code = " #DATA \n" + struct_def + trailer
    use_ia32 = platform.architecture()[0] != '64bit'
    machine_code = Tdasm().assemble(code, ia32=use_ia32)
    return machine_code.get_struct(name)
def regular_sampler():
    """Set up a 2x2 ``RegularSampler`` with its test program loaded.

    The sampler's ``get_sample`` routine is loaded into a new runtime, the
    sampler is given a ``Tile(0, 0, 2, 2)`` after ``split(1)``, and
    ``ASM_CODE`` is assembled and loaded as "test".

    Returns:
        The tuple ``(sampler, runtime, 'test')``.
    """
    runtime = Runtime()
    sampler = renmas2.samplers.RegularSampler(2, 2, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, 2, 2)
    tile.split(1)
    sampler.set_tile(tile)

    machine_code = Tdasm().assemble(ASM_CODE)
    runtime.load("test", machine_code)
    return sampler, runtime, 'test'
def _conv_rgba_bgra_asm():
    """Assemble the RGBA/BGRA conversion program for the host word size.

    Uses the 64-bit source with ``ia32=False`` when
    ``platform.architecture()`` reports '64bit', otherwise the 32-bit source
    with ``ia32=True``. The machine code is loaded as "convert" into a new
    runtime.

    Returns:
        ``(runtime, ds)`` where ``ds`` is the value returned by ``load``.
    """
    is_64bit = platform.architecture()[0] == '64bit'
    if is_64bit:
        source = _conv_rgba_bgra_asm64()
    else:
        source = _conv_rgba_bgra_asm32()
    machine_code = Tdasm().assemble(source, ia32=not is_64bit)

    runtime = Runtime()
    data_section = runtime.load("convert", machine_code)
    return runtime, data_section
def prepare(self, runtimes):
    """Prepare nested shaders, then assemble this shader and load it everywhere.

    ``self._ds`` is reset to an empty list and receives one entry per
    runtime, in the order of ``runtimes``. The shader is loaded under a name
    derived from ``id(self)``.
    """
    for child in self._shaders:
        child.prepare(runtimes)

    self._ds = []
    machine_code = Tdasm().assemble(self._code, self._func)
    module_name = 'shader' + str(id(self))
    self._runtimes = runtimes

    # TODO check if shader already exists in runtime
    # TODO if shader is function load it as function
    self._ds.extend(rt.load(module_name, machine_code) for rt in runtimes)
def __init__(self, width, height, pitch, address):
    """Assemble the ``set_pixel`` program and seed its data section.

    Args:
        width: Value stored in ``self.width`` and the "width" data slot.
        height: Value stored in ``self.height`` and the "height" data slot.
        pitch: Value stored in the "pitch" data slot.
        address: Value stored in ``self.addr`` and the "address" data slot.
    """
    self.addr = address
    self.width = width
    self.height = height

    machine_code = Tdasm().assemble(ASM_STR)
    self.r = Runtime()
    self.ds = self.r.load("set_pixel", machine_code)

    # Default colour. The original comment calls this red.
    # NOTE(review): 0xFF00FF00 is not red when read as ARGB; confirm the
    # channel order the assembly expects.
    initial_values = (
        ("color", 0xFF00FF00),
        ("address", address),
        ("width", width),
        ("height", height),
        ("pitch", pitch),
    )
    for key, value in initial_values:
        self.ds[key] = value
def compile(self, shaders=None):
    """Generate assembly for the shader source and assemble it.

    The results are stored on the instance: ``self._asm_code`` (assembly
    text), ``self._ret_type`` (return type reported by the generator) and
    ``self._mc`` (assembled machine code).

    Args:
        shaders: Optional list of shaders passed through to
            ``CodeGenerator.generate_code``. Defaults to a new empty list.
    """
    # Avoid a shared mutable default: each call gets its own list.
    shaders = [] if shaders is None else shaders

    statements = parse(self._code)
    generator = CodeGenerator()
    generated = generator.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
    )
    asm_source, ret_type = generated
    self._asm_code = asm_source
    self._ret_type = ret_type
    self._mc = Tdasm().assemble(self._asm_code, self._is_func)
def test_atan(self):
    """Compare scalar ``fast_atan_ss`` with ``math.atan``.

    Runs 1000 rounds with a random input from [0, 2000). The input is
    written to the "x" slot, which is read back as the result and must match
    to three decimal places.
    """
    machine_code = Tdasm().assemble(ATAN_CODE)
    runtime = Runtime()
    load_math_func("fast_atan_ss", runtime)
    ds = runtime.load("atan", machine_code)

    for _ in range(1000):
        value = random.random() * 2000
        ds["x"] = value
        runtime.run("atan")
        self.assertAlmostEqual(ds["x"], math.atan(value), 3)
def test_exp(self):
    """Compare scalar ``fast_exp_ss`` with ``math.exp``.

    Runs 1000 rounds with a random input from [0, 4). The "x" slot carries
    the input and is read back as the result, which must match to two
    decimal places.
    """
    machine_code = Tdasm().assemble(EXP_CODE)
    runtime = Runtime()
    load_math_func("fast_exp_ss", runtime)
    ds = runtime.load("exp", machine_code)

    for _ in range(1000):
        value = random.random() * 4
        ds["x"] = value
        runtime.run("exp")
        self.assertAlmostEqual(ds["x"], math.exp(value), 2)
def test_log(self):
    """Compare scalar ``fast_log_ss`` with ``math.log``.

    Runs 1000 rounds with a random input from [0, 1). The "x" slot carries
    the input and is read back as the result, which must match to three
    decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE)
    runtime = Runtime()
    load_math_func("fast_log_ss", runtime)
    ds = runtime.load("log", machine_code)

    for _ in range(1000):
        value = random.random()
        ds["x"] = value
        runtime.run("log")
        self.assertAlmostEqual(ds["x"], math.log(value), 3)
def compile(self, shaders=None, color_mgr=None):
    """Generate assembly for the shader source and assemble it.

    Stores the assembly text in ``self._asm_code``, the return type in
    ``self._ret_type``, the third value returned by the generator in
    ``self._ext_functions`` and the machine code in ``self._mc``.
    Assembly uses ``naked=self._is_func`` and ``ia32=not cgen.BIT64``.

    Args:
        shaders: Optional list of shaders forwarded to
            ``CodeGenerator.generate_code``. Defaults to a new empty list.
        color_mgr: Forwarded unchanged to ``generate_code``.
    """
    # Avoid a shared mutable default: each call gets its own list.
    if shaders is None:
        shaders = []

    statements = parse(self._code)
    cgen = CodeGenerator()
    asm_source, ret_type, ext_functions = cgen.generate_code(
        statements,
        args=self._args,
        is_func=self._is_func,
        name=self._name,
        func_args=self._func_args,
        shaders=shaders,
        color_mgr=color_mgr,
    )
    self._asm_code = asm_source
    self._ret_type = ret_type
    self._ext_functions = ext_functions

    assembler = Tdasm()
    self._mc = assembler.assemble(
        self._asm_code, naked=self._is_func, ia32=not cgen.BIT64)
def set_pixel_asm(self, runtime, label):
    """Assemble and load a pixel-store routine exported as ``label``.

    Per the assembly below, the routine takes x in eax, y in ebx and the
    value in xmm0, and stores xmm0 at ``ptr_buffer + y * pitch + x * 16``.
    The 64-bit or 32-bit pointer form is chosen from
    ``platform.architecture()``, and the VEX-encoded store is used when
    ``util.AVX`` is truthy.

    Args:
        runtime: Runtime that receives the assembled routine.
        label: Global label under which the routine is exported.
    """
    if platform.architecture()[0] == "64bit":
        pointer_reg = "rcx"
        pointer_decl = "uint64 ptr_buffer"
        load_pointer = "mov rcx, qword [ptr_buffer]"
        add_offset = "add rcx, rax"
    else:
        pointer_reg = "ecx"
        pointer_decl = "uint32 ptr_buffer"
        load_pointer = "mov ecx, dword [ptr_buffer]"
        add_offset = "add ecx, eax"

    store_mnemonic = "vmovaps" if util.AVX else "movaps"
    store = store_mnemonic + " oword [" + pointer_reg + "], xmm0"

    parts = [
        """
        #DATA
        """,
        pointer_decl + """
        uint32 pitch
        #CODE
        ; eax = x , ebx = y, value = xmm0
        """,
        "global " + label + ": \n",
        """
        imul ebx, dword [pitch]
        imul eax , eax, 16
        """,
        load_pointer + """
        add eax, ebx
        """,
        add_offset + "\n",
        store + """
        ret
        """,
    ]
    asm_code = "".join(parts)

    machine_code = Tdasm().assemble(asm_code, True)
    name = "ImageFloatRGBA" + str(hash(self))
    self.ds = runtime.load(name, machine_code)
    self.ds["ptr_buffer"] = self.pixels.ptr()
    self.ds["pitch"] = self.pitch
def test_pow(self):
    """Compare scalar ``fast_pow_ss`` with ``math.pow``.

    Runs 1000 rounds. The base goes into "x" and the exponent into "y", both
    drawn from [0, 3). The result is read back from "x" and must match to
    one decimal place.
    """
    machine_code = Tdasm().assemble(POW_CODE)
    runtime = Runtime()
    load_math_func("fast_pow_ss", runtime)
    ds = runtime.load("pow", machine_code)

    for _ in range(1000):
        base = random.random() * 3
        exponent = random.random() * 3
        ds["x"] = base
        ds["y"] = exponent
        runtime.run("pow")
        self.assertAlmostEqual(ds["x"], math.pow(base, exponent), 1)
def test_sincos(self):
    """Compare scalar ``fast_sincos_ss`` with ``math.sin`` / ``math.cos``.

    Runs 1000 rounds with a random input from [0, 2000) written to "x". The
    sine is read back from "x" and the cosine from "y"; both must match to
    three decimal places.
    """
    machine_code = Tdasm().assemble(SINCOS_CODE)
    runtime = Runtime()
    load_math_func("fast_sincos_ss", runtime)
    ds = runtime.load("sincos", machine_code)

    for _ in range(1000):
        angle = random.random() * 2000
        ds["x"] = angle
        runtime.run("sincos")

        sine = ds["x"]
        cosine = ds["y"]
        expected_sin = math.sin(angle)
        expected_cos = math.cos(angle)
        self.assertAlmostEqual(sine, expected_sin, 3)
        self.assertAlmostEqual(cosine, expected_cos, 3)
def random_sampler():
    """Exercise the assembled ``get_sample`` routine of a 1x1 RandomSampler.

    Builds a RandomSampler with one sample per pixel, loads its
    ``get_sample`` routine into a fresh runtime, assigns a
    ``Tile(0, 0, width, height)`` after ``split(1)``, loads ``ASM_CODE`` as
    "test" and then calls the module-level ``get_sample`` helper.
    Returns nothing.
    """
    runtime = Runtime()
    width = 1
    height = 1
    spp = 1
    sampler = renmas2.samplers.RandomSampler(width, height, spp=spp, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')
    tile = renmas2.core.Tile(0, 0, width, height)
    tile.split(1)
    sampler.set_tile(tile)
    asm = Tdasm()
    mc = asm.assemble(ASM_CODE)
    runtime.load("test", mc)
    # One call per expected sample.
    nsamples = width * height * spp
    for x in range(nsamples):
        get_sample(sampler, runtime, "test")
    # NOTE(review): presumably one extra call after the samples are used up,
    # to exercise the end-of-samples path - confirm this call belongs outside
    # the loop.
    get_sample(sampler, runtime, "test")
class Structures:
    """Provide assembly struct definitions, with renderer-dependent ones built on demand.

    Static definitions come from the module-level ``structures`` mapping.
    "spectrum" and "hitpoint" are generated when they are not in that
    mapping, because the spectrum array size depends on the renderer's
    spectral settings.
    """

    def __init__(self, renderer):
        self.tdasm = Tdasm()
        self.renderer = renderer
        # Fixed first and last lines of the generated spectrum struct.
        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _spectrum_struct(self):
        """Return the spectrum struct source for the renderer's current mode."""
        if self.renderer.spectral_rendering:
            values_line = "float values[" + str(
                self.renderer.nspectrum_samples) + "] \n"
        else:
            values_line = "float values[4] \n"
        return self._line1 + values_line + self._line3

    def get_struct(self, name):
        """Return the assembly source for struct ``name``, or None if unknown."""
        if name in structures:
            return structures[name]
        if name == "spectrum":
            return self._spectrum_struct()
        if name == "hitpoint":
            # The hitpoint definition is emitted after the spectrum struct.
            return self._spectrum_struct() + HITPOINT
        return None

    def get_compiled_struct(self, name):
        """Assemble struct ``name`` and return the compiled struct.

        Returns None when ``name`` is not in the ``structures`` mapping.
        """
        if name not in structures:
            return None
        asm_code = """
            #DATA
        """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
        """
        machine_code = self.tdasm.assemble(asm_code)
        return machine_code.get_struct(name)

    def structs(self, names):
        """Concatenate the sources of all structs in ``names``, in order.

        Raises:
            ValueError: If any name has no definition.
        """
        pieces = []
        for name in names:
            source = self.get_struct(name)
            if source is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            pieces.append(source)
        return "".join(pieces)
def test_log_ps(self):
    """Compare packed ``fast_log_ps`` with ``math.log``.

    Runs 1000 rounds. Each round stores four random values from [0, 1) in
    ``v1``, runs ``log_ps`` and checks every lane of ``v1`` against the
    Python result to three decimal places.
    """
    machine_code = Tdasm().assemble(LOG_CODE_PS)
    runtime = Runtime()
    load_math_func("fast_log_ps", runtime)
    ds = runtime.load("log_ps", machine_code)

    for _ in range(1000):
        values = tuple(random.random() for _lane in range(4))
        ds["v1"] = values
        runtime.run("log_ps")

        result = ds["v1"]
        expected = [math.log(v) for v in values]
        for lane, want in enumerate(expected):
            self.assertAlmostEqual(result[lane], want, 3)
class Structures:
    """Look up or generate assembly struct definitions for a renderer.

    Definitions in the module-level ``structures`` mapping take precedence.
    "spectrum" and "hitpoint" are otherwise generated from the renderer's
    spectral settings.
    """

    def __init__(self, renderer):
        self.tdasm = Tdasm()
        self.renderer = renderer
        # Opening and closing lines of the generated spectrum struct.
        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _spectrum_struct(self):
        """Build the spectrum struct source for the current rendering mode."""
        renderer = self.renderer
        if renderer.spectral_rendering:
            middle = "float values[" + str(renderer.nspectrum_samples) + "] \n"
        else:
            middle = "float values[4] \n"
        return self._line1 + middle + self._line3

    def get_struct(self, name):
        """Return the assembly source of struct ``name`` or None when unknown."""
        if name in structures:
            return structures[name]
        if name == "spectrum":
            return self._spectrum_struct()
        if name == "hitpoint":
            # Spectrum struct first, followed by the hitpoint definition.
            return self._spectrum_struct() + HITPOINT
        return None

    def get_compiled_struct(self, name):
        """Return the compiled struct ``name``.

        Only names in the ``structures`` mapping are compiled; any other
        name returns None.
        """
        if name not in structures:
            return None
        asm_code = """
            #DATA
        """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
        """
        return self.tdasm.assemble(asm_code).get_struct(name)

    def structs(self, names):
        """Return the concatenated sources of ``names`` in the given order.

        Raises:
            ValueError: If a name has no struct definition.
        """
        sources = []
        for name in names:
            definition = self.get_struct(name)
            if definition is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            sources.append(definition)
        return "".join(sources)
def sin_ss():
    """Assemble the routine exported under the global label ``fast_sin_ss``.

    Judging by its name, the routine is a fast scalar single-precision sine
    approximation. The assembly reads its operand from xmm0, leaves the
    result in xmm0 and uses xmm1-xmm7 as scratch registers. An SSE and a
    VEX-encoded (AVX) listing share one data section; the AVX listing is
    assembled when ``proc.AVX`` is truthy.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants shared by both listings (each stored as a 4-element array).
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # SSE listing.
    asm_code = data + """
    #CODE
    global fast_sin_ss:
    movaps xmm7, xmm0
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_sign_mask]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    andps xmm7, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, xmm1
    pslld xmm2, 30
    subss xmm0, xmm6
    movss xmm3, dword [_ps_sincos_p3]
    minss xmm0, xmm4
    subss xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    movss xmm4, dword [_ps_sincos_p2]
    mulss xmm0, xmm0
    xorps xmm2, xmm7
    movss xmm5, dword [_ps_sincos_p1]
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulss xmm0, xmm3
    movss xmm6, dword [_ps_sincos_p0]
    addss xmm0, xmm4
    mulss xmm0, xmm7
    addss xmm0, xmm5
    mulss xmm0, xmm7
    addss xmm0, xmm6
    mulss xmm0, xmm1
    ret
    """
    # AVX listing: same instruction sequence in three-operand VEX form.
    avx_code = data + """
    #CODE
    global fast_sin_ss:
    vmovaps xmm7, xmm0
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vandps xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, xmm1
    vpslld xmm2, xmm2, 30
    vsubss xmm0, xmm0, xmm6
    vmovss xmm3, dword [_ps_sincos_p3]
    vminss xmm0, xmm0, xmm4
    vsubss xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmovss xmm4, dword [_ps_sincos_p2]
    vmulss xmm0, xmm0, xmm0
    vxorps xmm2, xmm2, xmm7
    vmovss xmm5, dword [_ps_sincos_p1]
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulss xmm0, xmm0, xmm3
    vmovss xmm6, dword [_ps_sincos_p0]
    vaddss xmm0, xmm0, xmm4
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm5
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm1
    ret
    """
    asm = Tdasm()
    # The second positional argument of assemble is True in both branches
    # (presumably the function/"naked" mode flag - confirm against Tdasm).
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
def tan_ps():
    """Assemble the routine exported under the global label ``fast_tan_ps``.

    Judging by its name, the routine is a fast packed single-precision
    tangent approximation. The assembly reads its operand from xmm0, leaves
    the result in xmm0, uses xmm1-xmm7 as scratch registers and overwrites
    eax. It has a separate ``l_pole`` exit path that substitutes
    ``_ps_tan_poleval``. The AVX listing is assembled when ``proc.AVX`` is
    truthy, otherwise the SSE listing.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants shared by both listings (each stored as a 4-element array).
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_4_o_pi[4] = 1.273239544735, 1.273239544735, 1.273239544735, 1.273239544735
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_pi_o_4[4] = 0.78539816339, 0.78539816339, 0.78539816339, 0.78539816339
    int32 _epi32_1[4] = 1, 1, 1, 1
    int32 _epi32_7[4] = 7, 7, 7, 7
    int32 _epi32_2[4] = 2, 2, 2, 2
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_tan_p0[4] = -17956525.197648, -17956525.197648, -17956525.197648, -17956525.197648
    float _ps_tan_q0[4] = -53869575.592945, -53869575.592945, -53869575.592945, -53869575.592945
    float _ps_tan_p1[4] = 1153516.64838587, 1153516.64838587, 1153516.64838587, 1153516.64838587
    float _ps_tan_q1[4] = 25008380.18233579, 25008380.18233579, 25008380.18233579, 25008380.18233579
    float _ps_tan_p2[4] = -13093.693918138, -13093.693918138, -13093.693918138, -13093.693918138
    float _ps_tan_q2[4] = -1320892.3444021, -1320892.3444021, -1320892.3444021, -1320892.3444021
    float _ps_tan_q3[4] = 13681.296347069, 13681.296347069, 13681.296347069, 13681.296347069
    float _ps_tan_poleval[4] = 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0
    """
    # SSE listing.
    asm_code = data + """
    #CODE
    global fast_tan_ps:
    movaps xmm7, xmm0
    andps xmm0, oword [_ps_am_inv_sign_mask]
    andps xmm7, oword [_ps_am_sign_mask]
    movaps xmm1, xmm0
    mulps xmm0, oword [_ps_am_4_o_pi]
    cvttps2dq xmm0, xmm0
    movdqa xmm4, oword [_epi32_1]
    movdqa xmm5, oword [_epi32_7]
    pand xmm4, xmm0
    pand xmm5, xmm0
    movaps xmm3, oword [_ps_am_1]
    paddd xmm0, xmm4
    paddd xmm5, xmm4
    cvtdq2ps xmm0, xmm0
    mulps xmm0, oword [_ps_am_pi_o_4]
    xorps xmm6, xmm6
    subps xmm1, xmm0
    movaps xmm2, oword [_ps_tan_p2]
    minps xmm1, xmm3
    movaps xmm3, oword [_ps_tan_q3]
    movaps xmm0, xmm1
    mulps xmm1, xmm1
    mulps xmm2, xmm1
    addps xmm3, xmm1
    addps xmm2, oword [_ps_tan_p1]
    mulps xmm3, xmm1
    mulps xmm2, xmm1
    addps xmm3, oword [_ps_tan_q2]
    addps xmm2, oword [_ps_tan_p0]
    mulps xmm3, xmm1
    mulps xmm2, xmm1
    addps xmm3, oword [_ps_tan_q1]
    xorps xmm0, xmm7
    mulps xmm3, xmm1
    pand xmm5, oword [_epi32_2]
    addps xmm3, oword [_ps_tan_q0]
    mulps xmm2, xmm0
    cmpps xmm6, xmm1, 4
    rcpps xmm4, xmm3
    pxor xmm7, xmm7
    mulps xmm3, xmm4
    pcmpeqd xmm5, xmm7
    mulps xmm3, xmm4
    addps xmm4, xmm4
    orps xmm6, xmm5
    subps xmm4, xmm3
    mulps xmm2, xmm4
    movaps xmm1, oword [_ps_am_sign_mask]
    movmskps eax, xmm6
    addps xmm2, xmm0
    rcpps xmm4, xmm2
    cmp eax, 0xf
    movaps xmm0, xmm2
    mulps xmm2, xmm4
    mulps xmm2, xmm4
    addps xmm4, xmm4
    subps xmm4, xmm2
    jne l_pole
    xorps xmm4, xmm1
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    ret
    l_pole:
    movaps xmm7, xmm1
    movaps xmm3, oword [_ps_tan_poleval]
    andps xmm1, xmm0
    orps xmm3, xmm1
    andps xmm4, xmm6
    andnps xmm6, xmm3
    orps xmm4, xmm6
    xorps xmm4, xmm7
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    ret
    """
    # AVX listing: same instruction sequence in three-operand VEX form.
    avx_code = data + """
    #CODE
    global fast_tan_ps:
    vmovaps xmm7, xmm0
    vandps xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vandps xmm7, xmm7, oword [_ps_am_sign_mask]
    vmovaps xmm1, xmm0
    vmulps xmm0, xmm0, oword [_ps_am_4_o_pi]
    vcvttps2dq xmm0, xmm0
    vmovdqa xmm4, oword [_epi32_1]
    vmovdqa xmm5, oword [_epi32_7]
    vpand xmm4, xmm4, xmm0
    vpand xmm5, xmm5, xmm0
    vmovaps xmm3, oword [_ps_am_1]
    vpaddd xmm0, xmm0, xmm4
    vpaddd xmm5, xmm5, xmm4
    vcvtdq2ps xmm0, xmm0
    vmulps xmm0, xmm0, oword [_ps_am_pi_o_4]
    vxorps xmm6, xmm6, xmm6
    vsubps xmm1, xmm1, xmm0
    vmovaps xmm2, oword [_ps_tan_p2]
    vminps xmm1, xmm1, xmm3
    vmovaps xmm3, oword [_ps_tan_q3]
    vmovaps xmm0, xmm1
    vmulps xmm1, xmm1, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, xmm1
    vaddps xmm2, xmm2, oword [_ps_tan_p1]
    vmulps xmm3, xmm3, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, oword [_ps_tan_q2]
    vaddps xmm2, xmm2, oword [_ps_tan_p0]
    vmulps xmm3, xmm3, xmm1
    vmulps xmm2, xmm2, xmm1
    vaddps xmm3, xmm3, oword [_ps_tan_q1]
    vxorps xmm0, xmm0, xmm7
    vmulps xmm3, xmm3, xmm1
    vpand xmm5, xmm5, oword [_epi32_2]
    vaddps xmm3, xmm3, oword [_ps_tan_q0]
    vmulps xmm2, xmm2, xmm0
    vcmpps xmm6, xmm6, xmm1, 4
    vrcpps xmm4, xmm3
    vpxor xmm7, xmm7, xmm7
    vmulps xmm3, xmm3, xmm4
    vpcmpeqd xmm5, xmm5, xmm7
    vmulps xmm3, xmm3, xmm4
    vaddps xmm4, xmm4, xmm4
    vorps xmm6, xmm6, xmm5
    vsubps xmm4, xmm4, xmm3
    vmulps xmm2, xmm2, xmm4
    vmovaps xmm1, oword [_ps_am_sign_mask]
    vmovmskps eax, xmm6
    vaddps xmm2, xmm2, xmm0
    vrcpps xmm4, xmm2
    cmp eax, 0xf
    vmovaps xmm0, xmm2
    vmulps xmm2, xmm2, xmm4
    vmulps xmm2, xmm2, xmm4
    vaddps xmm4, xmm4, xmm4
    vsubps xmm4, xmm4, xmm2
    jne l_pole
    vxorps xmm4, xmm4, xmm1
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    ret
    l_pole:
    vmovaps xmm7, xmm1
    vmovaps xmm3, oword [_ps_tan_poleval]
    vandps xmm1, xmm1, xmm0
    vorps xmm3, xmm3, xmm1
    vandps xmm4, xmm4, xmm6
    vandnps xmm6, xmm6, xmm3
    vorps xmm4, xmm4, xmm6
    vxorps xmm4, xmm4, xmm7
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    ret
    """
    asm = Tdasm()
    # The second positional argument of assemble is True in both branches
    # (presumably the function/"naked" mode flag - confirm against Tdasm).
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
_update_distance: mov eax, dword [esp + 8] mov ebx, dword [eax + hitpoint.t] mov edx, dword [esp + 16] ;populate new minimum distance mov dword [edx], ebx jmp _next_object _end_objects: add esp, 20 ret """ asm = Tdasm() renmas.shapes.multiple_isect_asm(runtime, "multiple_isect") mc = asm.assemble(ASM) def v4(v3): return (v3.x, v3.y, v3.z, 0.0) ds = runtime.load("test", mc) ray = ren.random_ray() ds["ray1.origin"] = v4(ray.origin) ds["ray1.dir"] = v4(ray.dir) ds["num"] = len(lst_shapes) ds["addrs"] = adrese runtime.run("test")
def asin_ps():
    """Assemble the routine exported under the global label ``fast_asin_ps``.

    Judging by its name, the routine is a fast packed single-precision
    arcsine approximation. The assembly first scales the operand in xmm0 by
    ``rsqrt((1 + x) * (1 - x))`` and then runs the section marked ``;atan``
    on that value. The result is left in xmm0 and xmm1-xmm7 are used as
    scratch registers. The AVX listing is assembled when ``proc.AVX`` is
    truthy, otherwise the SSE listing.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants shared by both listings (each stored as a 4-element array).
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    """
    # SSE listing.
    asm_code = data + """
    #CODE
    global fast_asin_ps:
    movaps xmm1, oword [_ps_am_1]
    movaps xmm2, xmm1
    addps xmm1, xmm0
    subps xmm2, xmm0
    mulps xmm1, xmm2
    rsqrtps xmm1, xmm1
    mulps xmm0, xmm1
    ;atan
    movaps xmm5, oword [_ps_am_1]
    movaps xmm6, oword [_ps_am_m1]
    rcpps xmm4, xmm0
    cmpps xmm5, xmm0, 1
    cmpps xmm6, xmm0, 6
    movaps xmm1, oword [_ps_atan_s0]
    orps xmm5, xmm6
    andps xmm4, xmm5
    movaps xmm2, oword [_ps_atan_t0]
    movaps xmm7, xmm5
    andnps xmm5, xmm0
    movaps xmm3, oword [_ps_atan_s1]
    orps xmm4, xmm5
    movaps xmm0, xmm4
    movaps xmm6, oword [_ps_atan_t1]
    mulps xmm4, xmm4
    addps xmm1, xmm4
    movaps xmm5, oword [_ps_atan_s2]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    movaps xmm2, oword [_ps_atan_t2]
    addps xmm3, xmm4
    addps xmm1, xmm3
    movaps xmm3, oword [_ps_atan_s3]
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    movaps xmm6, oword [_ps_atan_t3]
    addps xmm5, xmm4
    addps xmm1, xmm5
    movaps xmm5, oword [_ps_am_sign_mask]
    rcpps xmm1, xmm1
    mulps xmm1, xmm2
    addps xmm3, xmm4
    movaps xmm4, oword [_ps_am_pi_o_2]
    mulps xmm6, xmm0
    addps xmm1, xmm3
    andps xmm0, xmm5
    rcpps xmm1, xmm1
    mulps xmm1, xmm6
    orps xmm0, xmm4
    subps xmm0, xmm1
    andps xmm0, xmm7
    andnps xmm7, xmm1
    orps xmm0, xmm7
    ret
    """
    # AVX listing: same instruction sequence in three-operand VEX form.
    avx_code = data + """
    #CODE
    global fast_asin_ps:
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm2, xmm1
    vaddps xmm1, xmm1, xmm0
    vsubps xmm2, xmm2, xmm0
    vmulps xmm1, xmm1, xmm2
    vrsqrtps xmm1, xmm1
    vmulps xmm0, xmm0, xmm1
    ;atan
    vmovaps xmm5, oword [_ps_am_1]
    vmovaps xmm6, oword [_ps_am_m1]
    vrcpps xmm4, xmm0
    vcmpps xmm5, xmm5, xmm0, 1
    vcmpps xmm6, xmm6, xmm0, 6
    vmovaps xmm1, oword [_ps_atan_s0]
    vorps xmm5, xmm5, xmm6
    vandps xmm4, xmm4, xmm5
    vmovaps xmm2, oword [_ps_atan_t0]
    vmovaps xmm7, xmm5
    vandnps xmm5, xmm5, xmm0
    vmovaps xmm3, oword [_ps_atan_s1]
    vorps xmm4, xmm4, xmm5
    vmovaps xmm0, xmm4
    vmovaps xmm6, oword [_ps_atan_t1]
    vmulps xmm4, xmm4, xmm4
    vaddps xmm1, xmm1, xmm4
    vmovaps xmm5, oword [_ps_atan_s2]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vmovaps xmm2, oword [_ps_atan_t2]
    vaddps xmm3, xmm3, xmm4
    vaddps xmm1, xmm1, xmm3
    vmovaps xmm3, oword [_ps_atan_s3]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vmovaps xmm6, oword [_ps_atan_t3]
    vaddps xmm5, xmm5, xmm4
    vaddps xmm1, xmm1, xmm5
    vmovaps xmm5, oword [_ps_am_sign_mask]
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm2
    vaddps xmm3, xmm3, xmm4
    vmovaps xmm4, oword [_ps_am_pi_o_2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm1, xmm1, xmm3
    vandps xmm0, xmm0, xmm5
    vrcpps xmm1, xmm1
    vmulps xmm1, xmm1, xmm6
    vorps xmm0, xmm0, xmm4
    vsubps xmm0, xmm0, xmm1
    vandps xmm0, xmm0, xmm7
    vandnps xmm7, xmm7, xmm1
    vorps xmm0, xmm0, xmm7
    ret
    """
    asm = Tdasm()
    # The second positional argument of assemble is True in both branches
    # (presumably the function/"naked" mode flag - confirm against Tdasm).
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
add dword [sy], 1 add dword [y], 1 jmp _bltrgba _endblt: ret """ return code def _blt_floatrgba_code(bgra=True): bits = platform.architecture()[0] if bits == '64bit': return _blt_floatrgba_code64(bgra) else: return _blt_floatrgba_code32(bgra) _asm = Tdasm() _mc = _asm.assemble(_blt_floatrgba_code()) _runtime = Runtime() _data_section = _runtime.load("blt_prgba_to_bgra", _mc) _mc2 = _asm.assemble(_blt_floatrgba_code(bgra=False)) _data_section2 = _runtime.load("blt_prgba_to_rgba", _mc2) # blt float rgba to byte bgra def blt_prgba_to_bgra(src, dest): assert isinstance(src, ImagePRGBA) assert isinstance(dest, ImageBGRA) sa, spitch = src.address_info() da, dpitch = dest.address_info()
ds[name+ ".origin"] = (o.x, o.y, o.z, 0.0) ds[name+ ".dir"] = (d.x, d.y, d.z, 0.0) def sphere_ds(ds, sphere, name): o = sphere.origin ds[name+".origin"] = (o.x, o.y, o.z, 0.0) ds[name+".radius"] = sphere.radius ds[name+".mat_index"] = sphere.material ray = get_ray() sph = get_sphere() runtime = Runtime() sph.isect_asm([runtime], 'ray_sphere_intersection') asm = Tdasm() mc = asm.assemble(ASM_CODE) ds = runtime.load('test', mc) ray_ds(ds, ray, 'ray1') sphere_ds(ds, sph, 'sph1') runtime.run('test') hp = sph.isect(ray) if hp: print(hp.t, ds['hp1.t']) print(hp.hit_point) print(ds['hp1.hit']) print(hp.normal)
def get_asm():
    """Return the shared module-level assembler, creating it on first use.

    On the first call a ``Tdasm`` instance is stored in the module global
    ``assembler`` and the renmas macros are registered on it. Later calls
    return the same instance.
    """
    from renmas.macros import eq32, eq128, eq32_32, eq32_128, eq128_128, eq128_32
    from renmas.macros import dot_product, macro_if, broadcast

    global assembler
    if assembler is not None:
        return assembler

    assembler = Tdasm()
    # Registered in the same order as before; macro name -> handler.
    macro_table = (
        ("eq128", eq128),
        ("eq32", eq32),
        ("eq128_32", eq128_32),
        ("eq32_128", eq32_128),
        ("eq128_128", eq128_128),
        ("eq32_32", eq32_32),
        ("dot", dot_product),
        ("if", macro_if),
        ("broadcast", broadcast),
    )
    for macro_name, handler in macro_table:
        assembler.register_macro(macro_name, handler)
    return assembler
def cos_ss():
    """Assemble the routine exported under the global label ``fast_cos_ss``.

    Judging by its name, the routine is a fast scalar single-precision
    cosine approximation. The assembly reads its operand from xmm0, leaves
    the result in xmm0 and uses xmm1-xmm7 as scratch registers. The AVX
    listing is assembled when ``proc.AVX`` is truthy, otherwise the SSE
    listing.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants shared by both listings (each stored as a 4-element array).
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # SSE listing.
    asm_code = data + """
    #CODE
    global fast_cos_ss:
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_pi_o_2]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    addss xmm0, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, xmm1
    pslld xmm2, 30
    subss xmm0, xmm6
    movss xmm3, dword [_ps_sincos_p3]
    minss xmm0, xmm4
    subss xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    movss xmm4, dword [_ps_sincos_p2]
    mulss xmm0, xmm0
    movss xmm5, dword [_ps_sincos_p1]
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulss xmm0, xmm3
    movss xmm6, dword [_ps_sincos_p0]
    addss xmm0, xmm4
    mulss xmm0, xmm7
    addss xmm0, xmm5
    mulss xmm0, xmm7
    addss xmm0, xmm6
    mulss xmm0, xmm1
    ret
    """
    # AVX listing: same instruction sequence in three-operand VEX form.
    avx_code = data + """
    #CODE
    global fast_cos_ss:
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_pi_o_2]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vaddss xmm0, xmm0, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, xmm1
    vpslld xmm2, xmm2, 30
    vsubss xmm0, xmm0, xmm6
    vmovss xmm3, dword [_ps_sincos_p3]
    vminss xmm0, xmm0, xmm4
    vsubss xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmovss xmm4, dword [_ps_sincos_p2]
    vmulss xmm0, xmm0, xmm0
    vmovss xmm5, dword [_ps_sincos_p1]
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulss xmm0, xmm0, xmm3
    vmovss xmm6, dword [_ps_sincos_p0]
    vaddss xmm0, xmm0, xmm4
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm5
    vmulss xmm0, xmm0, xmm7
    vaddss xmm0, xmm0, xmm6
    vmulss xmm0, xmm0, xmm1
    ret
    """
    asm = Tdasm()
    # The second positional argument of assemble is True in both branches
    # (presumably the function/"naked" mode flag - confirm against Tdasm).
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
ret """ return code def _blt_rect_code(): bits = platform.architecture()[0] if bits == '64bit': return _blt_rect_code64() else: return _blt_rect_code32() bits = platform.architecture()[0] if bits == '64bit': _mc = Tdasm().assemble(_blt_rect_code(), ia32=False) else: _mc = Tdasm().assemble(_blt_rect_code(), ia32=True) _runtime = Runtime() _data_section = _runtime.load("bltrgba", _mc) def blt_image(src, dest, sx=0, sy=0, sw=-1, sh=-1, dx=0, dy=0, fliped=False): """ Transfer block of image from source to destination. @param src - source image @param dest - destination image @param sx - x position in source image @param sy - y position in source image @param sw - width of source image
def log_ps():
    """Assemble the routine exported under the global label ``fast_log_ps``.

    Judging by its name, the routine is a fast packed single-precision
    logarithm approximation. The assembly first clamps the operand in xmm0
    from below with ``_ps_am_min_norm_pos`` (marked "cut off denormalized
    stuff" in the listing), leaves the result in xmm0 and uses xmm1-xmm7 as
    scratch registers. The AVX listing is assembled when ``proc.AVX`` is
    truthy, otherwise the SSE listing.

    Returns:
        The machine code object produced by ``Tdasm.assemble``.
    """
    # Constants shared by both listings (each stored as a 4-element array).
    data = """
    #DATA
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F
    float _ps_log_p0[4] = -0.789580278884, -0.789580278884, -0.789580278884, -0.789580278884
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.38666456995, 16.38666456995, 16.38666456995, 16.38666456995
    float _ps_log_q1[4] = 312.0937663722, 312.0937663722, 312.0937663722, 312.0937663722
    float _ps_log_p2[4] = -64.14099529587, -64.14099529587, -64.14099529587, -64.14099529587
    float _ps_log_q2[4] = -769.69194355046, -769.69194355046, -769.69194355046, -769.69194355046
    float _ps_log_c0[4] = 0.6931471805599, 0.6931471805599, 0.6931471805599, 0.6931471805599
    """
    # SSE listing.
    asm_code = data + """
    #CODE
    global fast_log_ps:
    maxps xmm0, oword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    movaps xmm1, oword [_ps_am_1]
    movaps xmm3, xmm0
    andps xmm0, oword [_ps_am_inv_mant_mask]
    orps xmm0, xmm1
    movaps xmm4, xmm0
    subps xmm0, xmm1
    addps xmm4, xmm1
    psrld xmm3, 23
    rcpps xmm4, xmm4
    mulps xmm0, xmm4
    psubd xmm3, oword [_epi32_0x7f]
    addps xmm0, xmm0
    movaps xmm2, xmm0
    mulps xmm0, xmm0
    movaps xmm4, oword [_ps_log_p0]
    movaps xmm6, oword [_ps_log_q0]
    mulps xmm4, xmm0
    movaps xmm5, oword [_ps_log_p1]
    mulps xmm6, xmm0
    movaps xmm7, oword [_ps_log_q1]
    addps xmm4, xmm5
    addps xmm6, xmm7
    movaps xmm5, oword [_ps_log_p2]
    mulps xmm4, xmm0
    movaps xmm7, oword [_ps_log_q2]
    mulps xmm6, xmm0
    addps xmm4, xmm5
    movaps xmm5, oword [_ps_log_c0]
    addps xmm6, xmm7
    cvtdq2ps xmm1, xmm3
    mulps xmm0, xmm4
    rcpps xmm6, xmm6
    mulps xmm0, xmm6
    mulps xmm0, xmm2
    mulps xmm1, xmm5
    addps xmm0, xmm2
    addps xmm0, xmm1
    ret
    """
    # AVX listing: same instruction sequence in three-operand VEX form.
    avx_code = data + """
    #CODE
    global fast_log_ps:
    vmaxps xmm0, xmm0, oword [_ps_am_min_norm_pos] ; cut off denormalized stuff
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm3, xmm0
    vandps xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps xmm0, xmm0, xmm1
    vmovaps xmm4, xmm0
    vsubps xmm0, xmm0, xmm1
    vaddps xmm4, xmm4, xmm1
    vpsrld xmm3, xmm3, 23
    vrcpps xmm4, xmm4
    vmulps xmm0, xmm0, xmm4
    vpsubd xmm3, xmm3, oword [_epi32_0x7f]
    vaddps xmm0, xmm0, xmm0
    vmovaps xmm2, xmm0
    vmulps xmm0, xmm0, xmm0
    vmovaps xmm4, oword [_ps_log_p0]
    vmovaps xmm6, oword [_ps_log_q0]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm5, oword [_ps_log_p1]
    vmulps xmm6, xmm6, xmm0
    vmovaps xmm7, oword [_ps_log_q1]
    vaddps xmm4, xmm4, xmm5
    vaddps xmm6, xmm6, xmm7
    vmovaps xmm5, oword [_ps_log_p2]
    vmulps xmm4, xmm4, xmm0
    vmovaps xmm7, oword [_ps_log_q2]
    vmulps xmm6, xmm6, xmm0
    vaddps xmm4, xmm4, xmm5
    vmovaps xmm5, oword [_ps_log_c0]
    vaddps xmm6, xmm6, xmm7
    vcvtdq2ps xmm1, xmm3
    vmulps xmm0, xmm0, xmm4
    vrcpps xmm6, xmm6
    vmulps xmm0, xmm0, xmm6
    vmulps xmm0, xmm0, xmm2
    vmulps xmm1, xmm1, xmm5
    vaddps xmm0, xmm0, xmm2
    vaddps xmm0, xmm0, xmm1
    ret
    """
    asm = Tdasm()
    # The second positional argument of assemble is True in both branches
    # (presumably the function/"naked" mode flag - confirm against Tdasm).
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    return mc
def cos_ps():
    """Assemble the ``fast_cos_ps`` routine and return its machine code.

    The routine is written twice -- once with legacy SSE mnemonics and once
    with VEX-encoded (``v``-prefixed) mnemonics -- and ``proc.AVX`` selects
    the listing that gets assembled.

    The routine reads its argument from ``xmm0``, leaves its result in
    ``xmm0`` and overwrites ``xmm1``-``xmm7``.  It clears the sign bit of
    the input, adds pi/2, scales by 2/pi and evaluates a polynomial built
    from the ``_ps_sincos_p*`` coefficients; by its name it approximates
    the cosine of four packed floats -- NOTE(review): confirm accuracy and
    valid input range against callers.

    Returns:
        Whatever ``Tdasm.assemble`` returns for the selected listing.
    """
    # Data section shared by both listings.
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    # Legacy SSE encoding (two-operand form).
    sse_source = constants + """
    #CODE
    global fast_cos_ps:
    andps xmm0, oword [_ps_am_inv_sign_mask]
    addps xmm0, oword [_ps_am_pi_o_2]
    mulps xmm0, oword [_ps_am_2_o_pi]
    pxor xmm3, xmm3
    movdqa xmm5, oword [_epi32_1]
    movaps xmm4, oword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    pcmpeqd xmm5, xmm3
    cvtdq2ps xmm6, xmm2
    pand xmm2, oword [_epi32_2]
    pslld xmm2, 30
    subps xmm0, xmm6
    minps xmm0, xmm4
    subps xmm4, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    orps xmm0, xmm5
    movaps xmm1, xmm0
    mulps xmm0, xmm0
    orps xmm1, xmm2
    movaps xmm7, xmm0
    mulps xmm0, oword [_ps_sincos_p3]
    addps xmm0, oword [_ps_sincos_p2]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p1]
    mulps xmm0, xmm7
    addps xmm0, oword [_ps_sincos_p0]
    mulps xmm0, xmm1
    ret
    """

    # VEX encoding (three-operand form) of the very same instruction stream.
    avx_source = constants + """
    #CODE
    global fast_cos_ps:
    vandps xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vaddps xmm0, xmm0, oword [_ps_am_pi_o_2]
    vmulps xmm0, xmm0, oword [_ps_am_2_o_pi]
    vpxor xmm3, xmm3, xmm3
    vmovdqa xmm5, oword [_epi32_1]
    vmovaps xmm4, oword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vpcmpeqd xmm5, xmm5, xmm3
    vcvtdq2ps xmm6, xmm2
    vpand xmm2, xmm2, oword [_epi32_2]
    vpslld xmm2, xmm2, 30
    vsubps xmm0, xmm0, xmm6
    vminps xmm0, xmm0, xmm4
    vsubps xmm4, xmm4, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vorps xmm0, xmm0, xmm5
    vmovaps xmm1, xmm0
    vmulps xmm0, xmm0, xmm0
    vorps xmm1, xmm1, xmm2
    vmovaps xmm7, xmm0
    vmulps xmm0, xmm0, oword [_ps_sincos_p3]
    vaddps xmm0, xmm0, oword [_ps_sincos_p2]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p1]
    vmulps xmm0, xmm0, xmm7
    vaddps xmm0, xmm0, oword [_ps_sincos_p0]
    vmulps xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    selected_source = avx_source if proc.AVX else sse_source
    # NOTE(review): the second argument is True for both listings; presumably
    # it asks Tdasm to assemble the source as a callable function -- confirm.
    return assembler.assemble(selected_source, True)
#END """ else: code = """ #DATA uint32 sa, da, n #CODE mov ecx, dword [n] mov esi, dword [sa] mov edi, dword [da] rep movs byte [edi], byte [esi] #END """ return code _mc = Tdasm().assemble(_memcpy_code()) _runtime = Runtime() _data_section = _runtime.load("memcpy", _mc) def memcpy(da, sa, n): """ Copy n bytes from source address(sa) to destination address(da). """ _data_section["da"] = da _data_section["sa"] = sa _data_section["n"] = n _runtime.run("memcpy")
def sincos_ss():
    """Assemble the scalar ``fast_sincos_ss`` routine and return its machine code.

    The routine exists in a legacy SSE listing and in a VEX-encoded
    (``v``-prefixed) listing; ``proc.AVX`` selects which one is assembled.

    The routine reads its argument from the low lane of ``xmm0`` and leaves
    two results, one in ``xmm0`` and one in ``xmm6``; ``xmm1``-``xmm5`` and
    ``xmm7`` are overwritten as scratch.  Only the ``xmm0`` path re-applies
    the sign of the input, so ``xmm0`` is presumably the sine and ``xmm6``
    the cosine -- NOTE(review): confirm against callers.

    The memory stores of the results are commented out in both listings, so
    the caller has to pick the values up from the registers.
    ``_ps_am_pi_o_2`` is declared in the data section but not referenced by
    either listing.

    Returns:
        Whatever ``Tdasm.assemble`` returns for the selected listing.
    """
    # Data section shared by both listings.
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2
    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    # Legacy SSE encoding (two-operand form).
    sse_source = constants + """
    #CODE
    global fast_sincos_ss:
    movaps xmm7, xmm0
    movss xmm1, dword [_ps_am_inv_sign_mask]
    movss xmm2, dword [_ps_am_sign_mask]
    movss xmm3, dword [_ps_am_2_o_pi]
    andps xmm0, xmm1
    andps xmm7, xmm2
    mulss xmm0, xmm3
    pxor xmm3, xmm3
    movd xmm5, dword [_epi32_1]
    movss xmm4, dword [_ps_am_1]
    cvttps2dq xmm2, xmm0
    pand xmm5, xmm2
    movd xmm1, dword [_epi32_2]
    pcmpeqd xmm5, xmm3
    movd xmm3, dword [_epi32_1]
    cvtdq2ps xmm6, xmm2
    paddd xmm3, xmm2
    pand xmm2, xmm1
    pand xmm3, xmm1
    subss xmm0, xmm6
    pslld xmm2, 30
    minss xmm0, xmm4
    ;mov eax, [esp + 4 + 16]
    ;mov edx, [esp + 4 + 16 + 4]
    subss xmm4, xmm0
    pslld xmm3, 30
    movaps xmm6, xmm4
    xorps xmm2, xmm7
    movaps xmm7, xmm5
    andps xmm6, xmm7
    andnps xmm7, xmm0
    andps xmm0, xmm5
    andnps xmm5, xmm4
    movss xmm4, dword [_ps_sincos_p3]
    orps xmm6, xmm7
    orps xmm0, xmm5
    movss xmm5, dword [_ps_sincos_p2]
    movaps xmm1, xmm0
    movaps xmm7, xmm6
    mulss xmm0, xmm0
    mulss xmm6, xmm6
    orps xmm1, xmm2
    orps xmm7, xmm3
    movaps xmm2, xmm0
    movaps xmm3, xmm6
    mulss xmm0, xmm4
    mulss xmm6, xmm4
    movss xmm4, dword [_ps_sincos_p1]
    addss xmm0, xmm5
    addss xmm6, xmm5
    movss xmm5, dword [_ps_sincos_p0]
    mulss xmm0, xmm2
    mulss xmm6, xmm3
    addss xmm0, xmm4
    addss xmm6, xmm4
    mulss xmm0, xmm2
    mulss xmm6, xmm3
    addss xmm0, xmm5
    addss xmm6, xmm5
    mulss xmm0, xmm1
    mulss xmm6, xmm7
    ;use full stores since caller might reload with full loads
    ;movaps [eax], xmm0
    ;movaps [edx], xmm6
    ret
    """

    # VEX encoding (three-operand form) of the very same instruction stream.
    avx_source = constants + """
    #CODE
    global fast_sincos_ss:
    vmovaps xmm7, xmm0
    vmovss xmm1, dword [_ps_am_inv_sign_mask]
    vmovss xmm2, dword [_ps_am_sign_mask]
    vmovss xmm3, dword [_ps_am_2_o_pi]
    vandps xmm0, xmm0, xmm1
    vandps xmm7, xmm7, xmm2
    vmulss xmm0, xmm0, xmm3
    vpxor xmm3, xmm3, xmm3
    vmovd xmm5, dword [_epi32_1]
    vmovss xmm4, dword [_ps_am_1]
    vcvttps2dq xmm2, xmm0
    vpand xmm5, xmm5, xmm2
    vmovd xmm1, dword [_epi32_2]
    vpcmpeqd xmm5, xmm5, xmm3
    vmovd xmm3, dword [_epi32_1]
    vcvtdq2ps xmm6, xmm2
    vpaddd xmm3, xmm3, xmm2
    vpand xmm2, xmm2, xmm1
    vpand xmm3, xmm3, xmm1
    vsubss xmm0, xmm0, xmm6
    vpslld xmm2, xmm2, 30
    vminss xmm0, xmm0, xmm4
    ;mov eax, [esp + 4 + 16]
    ;mov edx, [esp + 4 + 16 + 4]
    vsubss xmm4, xmm4, xmm0
    vpslld xmm3, xmm3, 30
    vmovaps xmm6, xmm4
    vxorps xmm2, xmm2, xmm7
    vmovaps xmm7, xmm5
    vandps xmm6, xmm6, xmm7
    vandnps xmm7, xmm7, xmm0
    vandps xmm0, xmm0, xmm5
    vandnps xmm5, xmm5, xmm4
    vmovss xmm4, dword [_ps_sincos_p3]
    vorps xmm6, xmm6, xmm7
    vorps xmm0, xmm0, xmm5
    vmovss xmm5, dword [_ps_sincos_p2]
    vmovaps xmm1, xmm0
    vmovaps xmm7, xmm6
    vmulss xmm0, xmm0, xmm0
    vmulss xmm6, xmm6, xmm6
    vorps xmm1, xmm1, xmm2
    vorps xmm7, xmm7, xmm3
    vmovaps xmm2, xmm0
    vmovaps xmm3, xmm6
    vmulss xmm0, xmm0, xmm4
    vmulss xmm6, xmm6, xmm4
    vmovss xmm4, dword [_ps_sincos_p1]
    vaddss xmm0, xmm0, xmm5
    vaddss xmm6, xmm6, xmm5
    vmovss xmm5, dword [_ps_sincos_p0]
    vmulss xmm0, xmm0, xmm2
    vmulss xmm6, xmm6, xmm3
    vaddss xmm0, xmm0, xmm4
    vaddss xmm6, xmm6, xmm4
    vmulss xmm0, xmm0, xmm2
    vmulss xmm6, xmm6, xmm3
    vaddss xmm0, xmm0, xmm5
    vaddss xmm6, xmm6, xmm5
    vmulss xmm0, xmm0, xmm1
    vmulss xmm6, xmm6, xmm7
    ;use full stores since caller might reload with full loads
    ;movaps [eax], xmm0
    ;movaps [edx], xmm6
    ret
    """

    assembler = Tdasm()
    selected_source = avx_source if proc.AVX else sse_source
    # NOTE(review): the second argument is True for both listings; presumably
    # it asks Tdasm to assemble the source as a callable function -- confirm.
    return assembler.assemble(selected_source, True)
from tdasm import Tdasm
import renmas.core
from renmas.core import AsmStructures

# One assembler instance, used here to query the CPU capabilities at import.
asm = Tdasm()

# CPU feature flags consulted elsewhere when choosing an instruction set.
# The detected AVX value is immediately overridden, so AVX code paths are
# switched off regardless of what the CPU reports.
AVX = asm.avx_supported()
AVX = False
SSSE3 = asm.cpu["ssse3"]
#SSSE3 = False
SSE3 = asm.cpu["sse3"]
SSE41 = asm.cpu["sse41"]
#SSE41 = False
SSE2 = asm.cpu["sse2"]


def structs(*lst_structs):
    """Return the concatenated assembly definitions of the named structures.

    Each argument is looked up, in the order given, through
    ``AsmStructures.get_struct``; the returned definitions are joined
    without any separator.  Calling with no arguments yields ``""``.

    Raises:
        ValueError: as soon as a lookup returns ``None``.
    """
    registry = AsmStructures()
    definitions = []
    for struct_name in lst_structs:
        definition = registry.get_struct(struct_name)
        if definition is None:
            raise ValueError("Structure " + str(struct_name) + " doesn't exist!")
        definitions.append(definition)
    return "".join(definitions)


assembler = None
code += lst_inst2[l] + "\n" for l in range(len(lst_inst2), len(lst_inst1)): code += lst_inst1[l] + "\n" return code def arth128_32(tokens): return arth_mix(tokens, 128, 32) def arth32_128(tokens): return arth_mix(tokens, 32, 128) def arth128_128(tokens): return arth_mix(tokens, 128, 128) def arth32_32(tokens): return arth_mix(tokens, 32, 32) if __name__ == "__main__": asm = Tdasm() asm.register_macro("arth128", arth128) asm.register_macro("arth32", arth32) mc = asm.assemble(ASM_CODE) run = Runtime() ds = run.load("test", mc) run.run("test") print(ds["rez"])
if gamma < 0.0: return False if beta + gamma > 1.0: return False e3 = a * p - b * r + d * s t = e3 * inv_denom if t < 0.00001: return False # self-intersection return (beta, gamma, t) code = ray_triangle_intersection("ray_triangle_intersection") asm = Tdasm() mc = asm.assemble(code, True) runtime = Runtime() runtime.load('ray_triangle', mc) # xmm3 - origin # xmm4 - direction # xmm5 - p0 # xmm6 - p1 # xmm7 - p2 # edx - min_distance test_code = """ #DATA float p0[4]