def __init__(self, domain, potential=None, dat_dict=None, kernel=None):
    """Store the loop inputs, pick the kernel and compile the loop library.

    :param domain: stored as ``self._domain``.
    :param potential: optional; when given, its ``kernel`` attribute is
        used and the ``kernel`` argument is ignored.
    :param dat_dict: stored as ``self._dat_dict``.
    :param kernel: kernel used when ``potential`` is ``None``.
    :raises RuntimeError: if neither ``potential`` nor ``kernel`` is given.
    """
    self._domain = domain
    self._potential = potential
    self._dat_dict = dat_dict
    self._cc = build.TMPCC

    ##########
    # End of Rapaport initialisations.
    ##########

    self._temp_dir = runtime.BUILD_DIR
    if not os.path.exists(self._temp_dir):
        try:
            os.mkdir(self._temp_dir)
        except OSError:
            # Another process may have created the directory between the
            # existence check and the mkdir call; only a genuine failure
            # (the path is still not a directory) is re-raised.
            if not os.path.isdir(self._temp_dir):
                raise

    if potential is not None:
        self._kernel = self._potential.kernel
    elif kernel is not None:
        self._kernel = kernel
    else:
        # Printing and carrying on left self._kernel unset, so construction
        # failed later with an unrelated AttributeError. Fail here instead.
        raise RuntimeError("pairloop error, no kernel passed.")

    self.loop_timer = opt.LoopTimer()

    self._code_init()

    self._lib = build.simple_lib_creator(
        self._generate_header_source(),
        self._generate_impl_source(),
        self._kernel.name,
        CC=self._cc
    )
def __init__(self):
    """Compile the ``free_space_direct`` C routine and keep its handle.

    The generated routine sums ``Q[i] * Q[j] / r_ij`` over all pairs
    ``i < j`` of the ``N`` points in ``P`` (three coordinates per point)
    and writes the total to ``phi[0]``.
    """
    header_code = r"""
    #include <math.h>
    #define INT64 int64_t
    #define REAL double
    """

    # The template has no substitution fields, so it is written with plain
    # braces rather than doubled braces passed through str.format().
    src_code = r"""

    extern "C" int free_space_direct(
        const INT64 N,
        const REAL * RESTRICT P,
        const REAL * RESTRICT Q,
        REAL * RESTRICT phi
    ){
        REAL tmp_phi = 0.0;

        #pragma omp parallel for reduction(+:tmp_phi)
        for(INT64 ix=0 ; ix<N ; ix++){
            REAL tmp_inner_phi = 0.0;

            const REAL iq = Q[ix];
            const REAL ip0 = P[3*ix + 0];
            const REAL ip1 = P[3*ix + 1];
            const REAL ip2 = P[3*ix + 2];

            #pragma omp simd reduction(+:tmp_inner_phi)
            for(INT64 jx=(ix+1) ; jx<N ; jx++){

                const REAL jq = Q[jx];
                const REAL jp0 = P[3*jx + 0];
                const REAL jp1 = P[3*jx + 1];
                const REAL jp2 = P[3*jx + 2];

                const REAL d0 = ip0 - jp0;
                const REAL d1 = ip1 - jp1;
                const REAL d2 = ip2 - jp2;

                const REAL r2 = d0*d0 + d1*d1 + d2*d2;
                const REAL r = sqrt(r2);

                tmp_inner_phi += iq * jq / r;

            }

            tmp_phi += tmp_inner_phi;

        }

        phi[0] = tmp_phi;
        return 0;
    }
    """

    compiled = build.simple_lib_creator(
        header_code=header_code,
        src_code=src_code,
        name="kmc_fmm_free_space_direct"
    )
    self._lib = compiled['free_space_direct']
def __init__(self, kernel=None, dat_dict=None):
    """Generate and compile the loop library for ``kernel``.

    :param kernel: kernel object; its ``name`` is used to name the library.
    :param dat_dict: data passed to the kernel, wrapped in an
        ``access.DatArgStore`` restricted to ``self._get_allowed_types()``.

    After compilation ``self._group`` is set to the ``group`` of the first
    ``data.ParticleDat`` argument whose group is not ``None`` (or left as
    ``None`` when there is no such argument).
    """
    self._dat_dict = access.DatArgStore(
        self._get_allowed_types(), dat_dict)

    self._cc = build.TMPCC

    self._temp_dir = runtime.BUILD_DIR
    if not os.path.exists(self._temp_dir):
        try:
            os.mkdir(self._temp_dir)
        except OSError:
            # Another process may have created the directory between the
            # existence check and the mkdir call; only a genuine failure
            # (the path is still not a directory) is re-raised.
            if not os.path.isdir(self._temp_dir):
                raise

    self._kernel = kernel

    self.loop_timer = ppmd.modules.code_timer.LoopTimer()
    self.wrapper_timer = ppmd.opt.Timer(runtime.TIMER)

    self._components = None

    self._generate()

    self._lib = build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._kernel.name,
        CC=self._cc
    )

    self._group = None

    for pd in self._dat_dict.items():
        dat = pd[1][0]
        if issubclass(type(dat), data.ParticleDat):
            if dat.group is not None:
                self._group = dat.group
                break
def __init__(self, L, eps, domain, dtype, exclude_tuples=None):
    """Compile the PBC setup library and compute ``kappa``.

    :param L: stored as ``self.L``.
    :param eps: stored as ``self.eps``.
    :param domain: stored as ``self.domain``; its ``extent`` gives the
        volume used for ``kappa``.
    :param dtype: stored as ``self.dtype``.
    :param exclude_tuples: offsets to exclude; when ``None`` the 26
        non-zero offsets in ``{-1, 0, 1}^3`` are used.
    """
    self.L = L
    self.eps = eps
    self.domain = domain
    self.dtype = dtype

    source_root = str(_SRC_DIR)
    with open(source_root + '/FMMSource/PBCSource.cpp') as fh:
        cpp = fh.read()
    with open(source_root + '/FMMSource/PBCSource.h') as fh:
        hpp = fh.read()

    self._lib = build.simple_lib_creator(hpp, cpp, 'pbc_setup_lib')
    self._lib['test1']()

    extent = self.domain.extent
    vol = extent[0] * extent[1] * extent[2]
    self.kappa = math.sqrt(math.pi/(vol**(2./3.)))

    if exclude_tuples is None:
        # every neighbouring offset except the origin itself
        exclude_tuples = [
            tx for tx in itertools.product((-1, 0, 1), repeat=3)
            if tx != (0, 0, 0)
        ]

    self.exclude_tuples = exclude_tuples
def test_c_ephemeral_harm_1():
    """Check SphGenEphemeral against SphGen in generated C code.

    Both generators are evaluated at the same random ``(theta, phi)`` and
    the largest absolute difference over all real and imaginary parts must
    stay below ``1e-15``.
    """
    L = 12
    N = 100

    correct_gen = SphGen(L - 1, '_A', 'thetaA', 'phiA')
    to_test_gen = SphGenEphemeral(L - 1, '_B', 'thetaB', 'phiB')

    check = 'err = MAX(ABS({} - {}), err);'

    d = {}
    for lx in range(L):
        for mx in range(-lx, lx + 1):
            re_check = check.format(
                correct_gen.get_y_sym(lx, mx)[0],
                to_test_gen.get_y_sym(lx, mx)[0],
            )
            im_check = check.format(
                correct_gen.get_y_sym(lx, mx)[1],
                to_test_gen.get_y_sym(lx, mx)[1],
            )
            d[(lx, mx)] = (re_check + im_check, )

    m = to_test_gen(d)

    src = """
    #include <math.h>
    #define ABS(x) ((x) > (0) ? (x) : (-(x)))
    #define MAX(x, y) ((x) > (y) ? (x) : (y))

    extern "C" int test(
        const double thetaA,
        const double phiA,
        const double thetaB,
        const double phiB,
        double * err_out
    ){{
        double err = 0.0;

        {CORRECT_GEN}

        // ---------------

        {TO_TEST_GEN}

        *err_out = err;
        return 0;
    }}
    """.format(CORRECT_GEN=correct_gen.module, TO_TEST_GEN=m)

    lib = simple_lib_creator(header_code='', src_code=src)['test']

    rng = np.random.RandomState(149135315)

    for _ in range(N):
        p = rng.uniform(low=0, high=6, size=2)
        theta = p[0]
        phi = p[1]
        err = ctypes.c_double(0)

        # the same angles are passed to both the A and B generators
        lib(
            ctypes.c_double(theta),
            ctypes.c_double(phi),
            ctypes.c_double(theta),
            ctypes.c_double(phi),
            ctypes.byref(err)
        )

        assert abs(err.value) < 10.**-15
def __init__(self):
    """Compile the Wigner source files and keep ``get_matrix_set``."""
    source_root = str(_SRC_DIR)

    with open(source_root + '/FMMSource/WignerSource.cpp') as fh:
        cpp = fh.read()

    with open(source_root + '/FMMSource/WignerSource.h') as fh:
        hpp = fh.read()

    wigner_lib = simple_lib_creator(hpp, cpp, 'wigner_matrix')
    self._lib = wigner_lib['get_matrix_set']
def _generate_host_libs(self):
    """Generate and compile the ``multipole_exp`` routine.

    Sets ``self.create_multipole_header`` and ``self.create_multipole_src``
    (the ``static inline`` variant of the source) and stores the compiled
    ``extern "C"`` function in ``self._multipole_lib``.
    """
    sph_gen = self.sph_gen

    # One pair of accumulation statements (real, imaginary) per (l, m).
    # rhol is multiplied by radius once after each l.
    statements = [
        'double rhol = 1.0;\n',
        'double rholcharge = rhol * charge;\n',
    ]
    for lx in range(self.L):
        for mx in range(-lx, lx + 1):
            ind = lx * (lx + 1) + mx
            statements.append(
                'out[{ind}] += {ylmm} * rholcharge;\n'.format(
                    ind=ind,
                    ylmm=str(sph_gen.get_y_sym(lx, -mx)[0])
                )
            )
            statements.append(
                'out[IM_OFFSET + {ind}] += {ylmm} * rholcharge;\n'.format(
                    ind=ind,
                    ylmm=str(sph_gen.get_y_sym(lx, -mx)[1])
                )
            )
        statements.append('rhol *= radius;\n')
        statements.append('rholcharge = rhol * charge;\n')
    assign_gen = ''.join(statements)

    template = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int multipole_exp(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """

    header = str(sph_gen.header)

    fields = dict(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )
    src_lib = template.format(DECLARE=r'static inline', **fields)
    src = template.format(DECLARE=r'extern "C"', **fields)

    self.create_multipole_header = header
    self.create_multipole_src = src_lib

    self._multipole_lib = simple_lib_creator(
        header_code=header, src_code=src)['multipole_exp']
def get_timer_accuracy():
    """Return one tick of ``std::chrono::high_resolution_clock`` in seconds.

    A small C++ routine is compiled that converts a clock duration of one
    tick to ``std::chrono::duration<double>`` and writes its count out.
    """
    from ppmd.lib import build

    header = '''
    #include <chrono>
    extern "C" void get_chrono_tick(double *t);
    '''

    src = '''
    void get_chrono_tick(double *t){
        std::chrono::high_resolution_clock::duration t0(1);
        std::chrono::duration<double> t1 = t0;
        *t = (double) t1.count();
    }
    '''

    tick = ctypes.c_double(0.0)
    tick_lib = build.simple_lib_creator(header, src, 'opt_tick_test')
    tick_lib['get_chrono_tick'](ctypes.byref(tick))
    return tick.value
def __init__(self, name=None, seed=None):
    """Create a seeded C++ ``mt19937`` instance held in a byte buffer.

    :param name: required; stored as ``str(name)`` in ``self.name``.
    :param seed: seed passed to the generator; defaults to the current
        time in whole seconds.

    The generator state is copied into ``self._mt_buffer``, an ``int8``
    array of ``sizeof(mt19937)`` bytes.
    """
    assert name is not None, "name required"
    self.name = str(name)

    if seed is None:
        seed = int(time.time())

    header = '''
    #include <random>
    #include <memory.h>
    #include <cstdint>
    #include <iostream>
    using namespace std;

    extern "C" int get_size();
    extern "C" int get_mt_instance(uint seed, int size, mt19937 *mt_buffer);
    extern "C" uint64_t get_rand(mt19937 *mt_buffer);
    '''

    src = '''
    int get_size(){
        int size = -1;
        size = sizeof(mt19937);
        return size;
    }

    int get_mt_instance(uint seed, int size, mt19937 *mt_buffer){
        mt19937 mt_tmp(seed);
        if (sizeof(mt_tmp) != size){
            return -1;
        }
        memcpy(mt_buffer, &mt_tmp, size);
        return 0;
    }

    // development function
    uint64_t get_rand(mt19937 *mt_buffer){
        #define foo() (mt_buffer[0]())
        const uint64_t rr = foo();
        cout << rr << endl;
        return rr;
    }
    '''

    lib = build.simple_lib_creator(header, src, 'Cpp11MT19937Lib')

    # ask the compiled library how many bytes the generator state needs
    mt_size = lib['get_size']()
    assert mt_size > 0, "MT state size cannot be negative"

    self._mt_buffer = np.zeros(mt_size, dtype=ctypes.c_int8)
    buffer_pointer = self._mt_buffer.ctypes.data_as(
        ctypes.POINTER(ctypes.c_void_p))

    mt_flag = lib['get_mt_instance'](
        ctypes.c_uint(seed),
        ctypes.c_int(mt_size),
        buffer_pointer
    )
    assert mt_flag > -1, "failed to make MT instance"
def _build_lib(self, force_unit, energy_unit):
    """Compile ``LocalCells`` with the given units and keep the entry point.

    :param force_unit: substituted (as a string) for ``SUB_FORCE_UNIT`` in
        the header.
    :param energy_unit: substituted (as a string) for ``SUB_ENERGY_UNIT``
        in the header.
    """
    source_root = str(_SRC_DIR)

    with open(source_root + '/FMMSource/LocalCells.cpp') as fh:
        cpp = fh.read()

    with open(source_root + '/FMMSource/LocalCells.h') as fh:
        hpp = fh.read()

    substitutions = {
        'SUB_FORCE_UNIT': str(force_unit),
        'SUB_ENERGY_UNIT': str(energy_unit)
    }
    hpp = hpp % substitutions

    compiled = build.simple_lib_creator(hpp, cpp, 'fmm_local')
    self._lib = compiled['local_cell_by_cell']
def __init__(self, n, kernel, dat_dict):
    """Store the loop inputs and compile the library for ``kernel``.

    :param n: stored as ``self._N``.
    :param kernel: kernel object; its ``name`` is used to name the library.
    :param dat_dict: stored as ``self._dat_dict``.
    """
    self._N = n
    self._kernel = kernel
    self._dat_dict = dat_dict
    self._cc = build.TMPCC

    self.loop_timer = ppmd.modules.code_timer.LoopTimer()

    self._code_init()

    header_source = self._generate_header_source()
    impl_source = self._generate_impl_source()
    self._lib = build.simple_lib_creator(
        header_source,
        impl_source,
        self._kernel.name,
        CC=self._cc
    )
def test_exp_gen_1():
    """Check SphExpGen output against ``cmath.exp(1j * l * phi)``.

    The generated C code writes to indices ``-lmax..lmax`` of each output
    pointer, so pointers to element ``lmax`` of the arrays are passed in.
    """
    lmax = 40
    exp_gen = SphExpGen(lmax)

    statements = []
    for lx in range(-lmax, lmax+1):
        statements.append(
            're_out[{lx}] = {exp};\n'.format(
                lx=lx, exp=exp_gen.get_e_sym(lx)[0])
        )
        statements.append(
            'im_out[{lx}] = {exp};\n'.format(
                lx=lx, exp=exp_gen.get_e_sym(lx)[1])
        )
    assign_gen = ''.join(statements)

    src = """
    extern "C" int test_exp_gen(
        const double phi,
        double * RESTRICT re_out,
        double * RESTRICT im_out
    ){{
        {EXP_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """.format(
        EXP_GEN=str(exp_gen.module),
        ASSIGN_GEN=str(assign_gen)
    )
    header = str(exp_gen.header)

    lib = simple_lib_creator(header_code=header, src_code=src)['test_exp_gen']

    re_exp_c = np.zeros(2*lmax+1, dtype=c_double)
    im_exp_c = np.zeros_like(re_exp_c)

    rng = np.random.RandomState(1452)
    tolerance = 10.**-13

    for phi in rng.uniform(low=0.0, high=2.*math.pi, size=20):
        lib(
            c_double(phi),
            re_exp_c[lmax:].view().ctypes.get_as_parameter(),
            im_exp_c[lmax:].view().ctypes.get_as_parameter()
        )

        for lx in range(-lmax, lmax+1):
            correct = cmath.exp(lx*phi*1.j)
            re_err = abs(re_exp_c[lmax + lx] - correct.real)
            im_err = abs(im_exp_c[lmax + lx] - correct.imag)
            assert re_err < tolerance
            assert im_err < tolerance
def build_unpack_lib(state):
    """Generate and compile ``move_unpack`` for the dats of ``state``.

    For every name in ``state.particle_dats`` the C function takes one
    pointer argument ``D_<name>`` and copies ``ncomp`` elements per received
    particle out of the byte buffer ``_R_BUF``. Free slots are filled
    first, then positions after ``_prev_num_particles``.

    :param state: object exposing ``particle_dats`` and one attribute per
        dat name with ``ctype``, ``ncomp`` and ``dtype``.
    :returns: the compiled ``move_unpack`` function.
    """
    dats = state.particle_dats

    arg_decls = []
    for n in dats:
        arg_decls.append('{} * D_{}'.format(getattr(state, n).ctype, n))
    args = ','.join(arg_decls)

    copy_template = '''
        memcpy(&D_{0}[pos * {1}], _R_BUF, {2});
        _R_BUF += {2};
        '''
    copy_blocks = []
    for n in dats:
        dat = getattr(state, n)
        # bytes occupied by one particle's worth of this dat
        nbytes = dat.ncomp * ctypes.sizeof(dat.dtype)
        copy_blocks.append(
            copy_template.format(str(n), str(dat.ncomp), str(nbytes))
        )
    mvs = ''.join(copy_blocks)

    hsrc = '''
    #include <string.h>
    #include <stdint.h>
    '''

    src = '''
    extern "C"
    int move_unpack(
        const int _recv_count,
        const int _num_free_slots,
        const int _prev_num_particles,
        const int * _free_slots,
        const uint8_t * _R_BUF,
        %(ARGS)s
    ){
    for(int ix = 0; ix < _recv_count; ix++){
        int pos;
        // prioritise filling spaces in dat.
        if (ix < _num_free_slots) {pos = _free_slots[ix];}
        else {pos = _prev_num_particles + ix - _num_free_slots;}
        %(MVS)s
    }
    return 0;}
    ''' % {
        'ARGS': args,
        'MVS': mvs
    }

    compiled = build.simple_lib_creator(hsrc, src, 'move_unpack')
    return compiled['move_unpack']
def test_legendre_gen_1():
    """Check ALegendrePolynomialGen output against ``lpmv``.

    The error is relative when the reference magnitude is at least one and
    absolute otherwise.
    """
    lmax = 24
    lpmv_gen = ALegendrePolynomialGen(lmax)

    statements = []
    for lx in range(lmax+1):
        for mx in range(lx+1):
            target = 'out[LMAX * {lx} + {mx}] = '.format(lx=lx, mx=mx)
            statements.append(
                target + str(lpmv_gen.get_p_sym(lx, mx)) + ';\n')
    assign_gen = ''.join(statements)

    src = """
    #define LMAX ({LMAX})
    extern "C" int test_lpmv_gen(
        const double theta,
        double * RESTRICT out
    ){{
        {LPMV_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """.format(
        LPMV_GEN=str(lpmv_gen.module),
        ASSIGN_GEN=str(assign_gen),
        LMAX=lmax+1
    )
    header = str(lpmv_gen.header)

    lib = simple_lib_creator(header_code=header, src_code=src)['test_lpmv_gen']

    lpmv_c = np.zeros((lmax+1, lmax+1), dtype=c_double)

    rng = np.random.RandomState(14523)

    for theta in rng.uniform(low=-0.999999, high=0.999999, size=10):
        lib(c_double(theta), lpmv_c.ctypes.get_as_parameter())

        for lx in range(lmax+1):
            for mx in range(lx+1):
                correct = lpmv(mx, lx, theta)
                if abs(correct) < 1.0:
                    rel = 1
                else:
                    rel = abs(correct)
                err = abs(correct - lpmv_c[lx, mx]) / rel
                assert err < 10.**-12
def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None):
    """Generate and compile the pair-loop library and set up its state.

    :param kernel: kernel object; its ``name`` is used to name the library.
    :param dat_dict: data passed to the kernel, wrapped in an
        ``access.DatArgStore`` restricted to ``self._get_allowed_types()``.
    :param shell_cutoff: stored as ``self.shell_cutoff``.

    ``self._group`` is taken from the first ``data.PositionDat`` argument.
    When that group is not ``None`` a cell list is made for it.
    """
    self._dat_dict = access.DatArgStore(
        self._get_allowed_types(), dat_dict)
    self._cc = build.TMPCC
    self._kernel = kernel
    self.shell_cutoff = shell_cutoff

    self.loop_timer = modules.code_timer.LoopTimer()
    self.wrapper_timer = opt.Timer(runtime.TIMER)
    self.list_timer = opt.Timer(runtime.TIMER)

    self._gather_space = host.ThreadSpace(100, ctypes.c_uint8)
    self._generate()

    self._offset_list = host.Array(ncomp=27, dtype=ctypes.c_int)

    self._lib = build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._kernel.name,
        CC=self._cc
    )

    self._group = None
    for entry in self._dat_dict.items():
        dat = entry[1][0]
        if issubclass(type(dat), data.PositionDat):
            self._group = dat.group
            break

    #assert self._group is not None, "No cell to particle map found"
    if self._group is not None:
        self._make_cell_list(self._group)

    self._kernel_execution_count = INT64(0)
    self._invocations = 0

    # one integer scratch array per thread
    self._jstore = [
        host.Array(ncomp=100, dtype=ctypes.c_int)
        for _ in range(runtime.NUM_THREADS)
    ]
def _transfer_unpack(self):
    """
    pack and transfer the particle dat, rebuild cell list if needed

    On the first call the C routine ``HALO_EXCHANGE_PD`` is generated for
    this dat's dtype and ``ncomp``, compiled with ``build.MPI_CC`` and
    cached in ``self._exchange_lib``. Every call then invokes it with the
    halo manager and cell-to-particle-map data of ``self.group``.

    For each of the six directions the generated routine:

    * packs the particles found in the boundary cells of that direction
      into ``b_tmp``, adding the matching ``SHIFT`` entries only when
      ``POS`` is defined (i.e. when ``type(self) is PositionDat``),
    * starts a non-blocking send of the packed data and a non-blocking
      receive into ``DAT`` starting at particle index ``DAT_END``,
    * if ``sort_flag > 0``, links the halo cells of that direction into the
      cell list and reverse lookup while the exchange is in flight,
    * waits for the send and receive before moving to the next direction.
    """
    # Build the exchange library lazily; it is reused on later calls.
    if self._exchange_lib is None:

        # C argument list, shared by the declaration and the definition.
        _ex_args = '''
        %(DTYPE)s * RESTRICT DAT,         // DAT pointer
        int DAT_END,                      // end of dat.
        const double * RESTRICT SHIFT,    // position shifts
        const int f_MPI_COMM,             // F90 comm from mpi4py
        const int * RESTRICT SEND_RANKS,  // send directions
        const int * RESTRICT RECV_RANKS,  // recv directions
        const int * RESTRICT h_ind,       // halo indices
        const int * RESTRICT b_ind,       // local b indices
        const int * RESTRICT h_arr,       // h cell indices
        const int * RESTRICT b_arr,       // b cell indices
        const int * RESTRICT dir_counts,  // expected recv counts
        const int cell_offset,            // offset for cell list
        const int sort_flag,              // does the cl require updating
        int * RESTRICT ccc,               // cell contents count
        int * RESTRICT crl,               // cell reverse lookup
        int * RESTRICT cell_linked_list,  // cell list
        %(DTYPE)s * RESTRICT b_tmp        // tmp space for sending
        ''' % {
            'DTYPE': host.ctypes_map[self.dtype]
        }

        _ex_header = '''
        #include <generic.h>
        #include <mpi.h>
        #include <iostream>
        using namespace std;
        #define RESTRICT %(RESTRICT)s

        %(POS_ENABLE)s

        extern "C" void HALO_EXCHANGE_PD(%(ARGS)s);
        '''

        _ex_code = '''

        void HALO_EXCHANGE_PD(%(ARGS)s){

            // get mpi comm and rank
            MPI_Comm MPI_COMM = MPI_Comm_f2c(f_MPI_COMM);
            int rank = -1; MPI_Comm_rank( MPI_COMM, &rank );
            MPI_Status MPI_STATUS;
            MPI_Request sr;
            MPI_Request rr;

            //for( int dir=0 ; dir<6 ; dir++ ){
            // cout << "dir: " << dir << " count: " << dir_counts[dir] << endl;;
            //}

            for( int dir=0 ; dir<6 ; dir++ ){

                //for( int iy=0 ; iy<%(NCOMP)s ; iy++ ){
                // cout << "\tdir: " << dir << " comp " << iy << " shift " << SHIFT[dir*%(NCOMP)s + iy] << endl;
                //}

                const int b_s = b_ind[dir];
                const int b_e = b_ind[dir+1];
                const int b_c = b_e - b_s;

                const int h_s = h_ind[dir];
                const int h_e = h_ind[dir+1];
                const int h_c = h_e - h_s;

                //packing index;
                int p_index = -1;

                // packing loop
                for( int cx=0 ; cx<b_c ; cx++ ){

                    // cell index
                    const int ci = b_arr[b_s + cx];

                    // loop over contents of cell.
                    int ix = cell_linked_list[cell_offset + ci];
                    while(ix > -1){

                        p_index ++;
                        for( int iy=0 ; iy<%(NCOMP)s ; iy++ ){

                            b_tmp[p_index * %(NCOMP)s + iy] = DAT[ix*%(NCOMP)s + iy];

                            //cout << "packed: " << b_tmp[p_index * %(NCOMP)s +iy];

                            #ifdef POS
                                b_tmp[p_index * %(NCOMP)s + iy] += SHIFT[dir*%(NCOMP)s + iy];
                            #endif

                            //cout << " p_shifted: " << b_tmp[p_index * %(NCOMP)s +iy] << endl;
                        }

                    ix = cell_linked_list[ix];}
                }

                /*
                cout << " SEND | ";
                for( int tx=0 ; tx < (p_index + 1)*3; tx++){
                    cout << b_tmp[tx] << " |";
                }
                cout << endl;
                */

                // start the sendrecv as non blocking.
                if (( SEND_RANKS[dir] > -1 ) && ( p_index > -1 ) ){
                MPI_Isend((void *) b_tmp, (p_index + 1) * %(NCOMP)s, %(MPI_DTYPE)s,
                         SEND_RANKS[dir], rank, MPI_COMM, &sr);
                }

                if (( RECV_RANKS[dir] > -1 ) && ( dir_counts[dir] > 0 ) ){
                MPI_Irecv((void *) &DAT[DAT_END * %(NCOMP)s], %(NCOMP)s * dir_counts[dir],
                          %(MPI_DTYPE)s, RECV_RANKS[dir], RECV_RANKS[dir], MPI_COMM, &rr);
                }

                //cout << "DAT_END: " << DAT_END << endl;

                int DAT_END_T = DAT_END;
                DAT_END += dir_counts[dir];

                // build halo part of cell list whilst exchange occuring.

                //#ifdef POS
                if (sort_flag > 0){

                    for( int hxi=h_s ; hxi<h_e ; hxi++ ){

                        // index of a halo cell
                        const int hx = h_arr[ hxi ];

                        // number of particles in cell
                        const int hx_count = ccc[ hx ];

                        if (hx_count > 0) {

                            //cout << "\tsorting cell: " << hx << " ccc: " << hx_count << endl;

                            cell_linked_list[cell_offset + hx] = DAT_END_T;

                            for( int iy=0 ; iy<(hx_count-1) ; iy++ ){

                                cell_linked_list[ DAT_END_T+iy ] = DAT_END_T + iy + 1;
                                crl[ DAT_END_T+iy ] = hx;

                            }

                            cell_linked_list[ DAT_END_T + hx_count - 1 ] = -1;
                            crl[ DAT_END_T + hx_count -1 ] = hx;

                            DAT_END_T += hx_count;
                        }
                    }

                }
                //#endif

                // after send has completed move to next direction.
                if (( SEND_RANKS[dir] > -1 ) && ( p_index > -1 ) ){
                    MPI_Wait(&sr, MPI_STATUS_IGNORE);
                }

                if (( RECV_RANKS[dir] > -1 ) && ( dir_counts[dir] > 0 ) ){
                    MPI_Wait(&rr, MPI_STATUS_IGNORE);
                }

                //MPI_Barrier(MPI_COMM);
                //cout << "dir end " << dir << " -----------" << endl;
            }

            return;
        }
        '''

        # Only exactly PositionDat (not subclasses) has the position shift
        # applied while packing.
        if type(self) is PositionDat:
            _pos_enable = '#define POS'
        else:
            _pos_enable = ''

        # Substitutions applied to both the header and the implementation.
        _ex_dict = {
            'ARGS': _ex_args,
            'RESTRICT': build.MPI_CC.restrict_keyword,
            'DTYPE': host.ctypes_map[self.dtype],
            'POS_ENABLE': _pos_enable,
            'NCOMP': self.ncomp,
            'MPI_DTYPE': host.mpi_type_map[self.dtype]
        }

        _ex_header %= _ex_dict
        _ex_code %= _ex_dict

        self._exchange_lib = build.simple_lib_creator(
            _ex_header,
            _ex_code,
            'HALO_EXCHANGE_PD',
            CC=build.MPI_CC
        )['HALO_EXCHANGE_PD']

    # End of creation code -----------------------------------------

    comm = self.group.domain.comm
    _h = self.group._halo_manager.get_halo_cell_groups()
    _b = self.group._halo_manager.get_boundary_cell_groups()

    # Ask the C routine to rebuild the halo part of the cell list only when
    # the map's version_id is ahead of its halo_version_id.
    if self.group._cell_to_particle_map.version_id > self.group._cell_to_particle_map.halo_version_id:
        _sort_flag = ctypes.c_int(1)
    else:
        _sort_flag = ctypes.c_int(-1)

    # Arguments follow the order declared in _ex_args: element [1] of each
    # cell-group pair is passed as h_ind/b_ind and element [0] as h_arr/b_arr.
    self._exchange_lib(
        self.ctypes_data,
        ctypes.c_int(self.npart_local),
        self.group._halo_manager.get_position_shifts().ctypes_data,
        ctypes.c_int(comm.py2f()),
        self.group._halo_manager.get_send_ranks().ctypes_data,
        self.group._halo_manager.get_recv_ranks().ctypes_data,
        _h[1].ctypes_data,
        _b[1].ctypes_data,
        _h[0].ctypes_data,
        _b[0].ctypes_data,
        self.group._halo_manager.get_dir_counts().ctypes_data,
        self.group._cell_to_particle_map.offset,
        _sort_flag,
        self.group._cell_to_particle_map.cell_contents_count.ctypes_data,
        self.group._cell_to_particle_map.cell_reverse_lookup.ctypes_data,
        self.group._cell_to_particle_map.cell_list.ctypes_data,
        self._tmp_halo_space.ctypes_data
    )
def __init__(self, lmax):
    """Generate and compile the threaded ``sph_gen`` accumulation routine.

    :param lmax: largest degree ``l`` for which terms are generated.

    The output holds ``(lmax+1)**2`` real parts followed by the same number
    of imaginary parts (``self.ncomp`` values in total). Each term for
    degree ``l`` is scaled by ``(1/radius)**(l+1)``. A per-thread scratch
    array ``self._gtmp`` of ``ncomp * NUM_THREADS`` doubles is allocated.
    """
    self._lmax = lmax
    im_of = (lmax+1) ** 2
    self._ncomp = 2 * im_of
    self.ncomp = self._ncomp

    sph_gen = SphGen(lmax)

    # r{l+1} holds iradius**(l+1); one declaration is emitted per l.
    radius_lines = [
        'const double iradius = 1.0/radius;\nconst double r0 = 1.0;\n'
    ]
    assign_lines = []
    accumulate = 'tmp_out[{ind}] += {sym} * r{power};\n'

    for lx in range(lmax+1):
        radius_lines.append(
            'const double r{lxp1} = r{lx} * iradius;\n'.format(
                lxp1=lx+1, lx=lx)
        )
        for mx in range(-lx, lx+1):
            re_ind = lx * (lx + 1) + mx
            assign_lines.append(accumulate.format(
                ind=re_ind,
                sym=str(sph_gen.get_y_sym(lx, mx)[0]),
                power=lx+1
            ))
            assign_lines.append(accumulate.format(
                ind=re_ind + im_of,
                sym=str(sph_gen.get_y_sym(lx, mx)[1]),
                power=lx+1
            ))

    radius_gen = ''.join(radius_lines)
    assign_gen = ''.join(assign_lines)

    src = """
    #include <omp.h>
    #define STRIDE ({STRIDE})

    extern "C" int sph_gen(
        const int num_threads,
        const int N,
        const double * RESTRICT radius_set,
        const double * RESTRICT theta_set,
        const double * RESTRICT phi_set,
        double * RESTRICT gtmp_out,
        double * RESTRICT out
    ){{
        for(int tx=0 ; tx<(num_threads*STRIDE) ; tx++){{
            gtmp_out[tx] = 0;
        }}

        omp_set_num_threads(num_threads);
        #pragma omp parallel default(none) shared(radius_set, theta_set, phi_set, gtmp_out)
        {{
            const int threadid = omp_get_thread_num();
            const int inner_num_threads = omp_get_num_threads();

            const int lower = N*threadid/inner_num_threads;
            const int upper = (threadid == (inner_num_threads - 1)) ? N : N*(threadid+1)/inner_num_threads;

            double * RESTRICT tmp_out = gtmp_out + threadid * STRIDE;

            for (int ix=lower; ix<upper ; ix++){{
                const double radius = radius_set[ix];
                const double theta = theta_set[ix];
                const double phi = phi_set[ix];
                {RADIUS_GEN}
                {SPH_GEN}
                {ASSIGN_GEN}
            }}

        }}

        for(int tx=0 ; tx<num_threads ; tx++){{
            for(int ix=0 ; ix<STRIDE ; ix++){{
                out[ix] += gtmp_out[ix + tx*STRIDE];
            }}
        }}

        return 0;
    }}
    """.format(
        RADIUS_GEN=radius_gen,
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        STRIDE=self._ncomp
    )
    header = str(sph_gen.header)

    compiled = build.simple_lib_creator(header_code=header, src_code=src)
    self._lib = compiled['sph_gen']

    self._nthreads = runtime.NUM_THREADS
    self._gtmp = np.zeros(self._ncomp*self._nthreads, dtype=ctypes.c_double)
def _generate_host_libs(self):
    """Generate and compile the host-side expansion helper libraries.

    Five C routines are generated from ``self.sph_gen`` and ``self.L``:

    * ``multipole_exp`` -> ``self._multipole_lib``
    * ``local_dot_vec`` -> ``self._dot_vec_lib``
    * ``local_eval`` -> ``self._local_eval_lib``
    * ``create_local_exp`` -> ``self._local_create_lib``
    * ``local_dot_vec_multipole`` -> ``self._dot_vec_multipole_lib``

    For each routine the header and a ``static inline`` variant of the
    source are also stored on ``self`` (``create_*_header`` /
    ``create_*_src``). Flop estimates are stored in
    ``self.flop_count_create_local_exp`` and
    ``self.create_dot_vec_multipole_flops``.

    In every routine real parts are stored at ``l*(l+1) + m`` and imaginary
    parts at ``IM_OFFSET`` (``self.L**2``) plus that index.
    """
    sph_gen = self.sph_gen

    def cube_ind(L, M):
        return ((L) * ( (L) + 1 ) + (M) )

    # --- lib to create multipole expansions ---

    assign_gen = 'double rhol = 1.0;\n'
    assign_gen += 'double rholcharge = rhol * charge;\n'
    for lx in range(self.L):
        for mx in range(-lx, lx+1):
            assign_gen += 'out[{ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[0])
            )
            assign_gen += 'out[IM_OFFSET + {ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[1])
            )
        # advance radius**l once per l
        assign_gen += 'rhol *= radius;\n'
        assign_gen += 'rholcharge = rhol * charge;\n'

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int multipole_exp(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """
    header = str(sph_gen.header)

    src_lib = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'static inline'
    )

    src = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'extern "C"'
    )

    self.create_multipole_header = header
    self.create_multipole_src = src_lib

    self._multipole_lib = simple_lib_creator(header_code=header, src_code=src)['multipole_exp']

    # --- lib to create vector to dot product with local expansions ---

    assign_gen = 'double rhol = 1.0;\n'
    assign_gen += 'double rholcharge = rhol * charge;\n'
    for lx in range(self.L):
        for mx in range(-lx, lx+1):
            assign_gen += 'out[{ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, mx)[0])
            )
            # the imaginary part is accumulated with its sign flipped
            assign_gen += 'out[IM_OFFSET + {ind}] += (-1.0) * {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, mx)[1])
            )
        assign_gen += 'rhol *= radius;\n'
        assign_gen += 'rholcharge = rhol * charge;\n'

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int local_dot_vec(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """
    header = str(sph_gen.header)

    src_lib = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'static inline'
    )

    src = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'extern "C"'
    )

    self.create_dot_vec_header = header
    self.create_dot_vec_src = src_lib

    self._dot_vec_lib = simple_lib_creator(header_code=header, src_code=src)['local_dot_vec']

    # --- lib to evaluate local expansions ---

    assign_gen = ''
    for lx in range(self.L):
        for mx in range(-lx, lx+1):
            reL = SphSymbol('moments[{ind}]'.format(ind=cube_ind(lx, mx)))
            imL = SphSymbol('moments[IM_OFFSET + {ind}]'.format(ind=cube_ind(lx, mx)))
            reY, imY = sph_gen.get_y_sym(lx, mx)
            # only element [0] of the complex product is accumulated
            phi_sym = cmplx_mul(reL, imL, reY, imY)[0]
            assign_gen += 'tmp_energy += rhol * ({phi_sym});\n'.format(phi_sym=str(phi_sym))

        assign_gen += 'rhol *= radius;\n'

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int local_eval(
        const double radius,
        const double theta,
        const double phi,
        const double * RESTRICT moments,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        double rhol = 1.0;
        double tmp_energy = 0.0;
        {ASSIGN_GEN}

        out[0] = tmp_energy;
        return 0;
    }}
    """

    src_lib = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'static inline'
    )

    src = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'extern "C"'
    )
    header = str(sph_gen.header)

    self.create_local_eval_header = header
    self.create_local_eval_src = src_lib

    self._local_eval_lib = simple_lib_creator(header_code=header, src_code=src)['local_eval']

    # lib to create local expansions

    tflops = common.new_flop_dict()
    tflops = common.add_flop_dict(tflops, sph_gen.flops)

    assign_gen = 'const double iradius = 1.0/radius;\n'
    assign_gen += 'double rhol = iradius;\n'
    for lx in range(self.L):
        for mx in range(-lx, lx+1):
            assign_gen += 'out[{ind}] += {ylmm} * rhol * charge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[0])
            )
            assign_gen += 'out[IM_OFFSET + {ind}] += {ylmm} * rhol * charge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[1])
            )
            # two accumulation statements, each with two multiplications
            tflops['+'] += 2
            tflops['*'] += 4
        assign_gen += 'rhol *= iradius;\n'
        tflops['*'] += 1

    self.flop_count_create_local_exp = tflops

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    extern "C" int create_local_exp(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )
    header = str(sph_gen.header)

    self.create_local_exp_header = header
    self.create_local_exp_src = """
    #define IM_OFFSET ({IM_OFFSET})

    static inline void inline_local_exp(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return;
    }}
    """.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )

    self._local_create_lib = simple_lib_creator(header_code=header, src_code=src)['create_local_exp']

    # --- lib to create vector to dot product and mutlipole expansions ---

    assign_gen = 'double rhol = 1.0;\n'
    assign_gen += 'double rholcharge = rhol * charge;\n'
    flops = {'+': 0, '-': 0, '*': 0, '/': 0}

    for lx in range(self.L):
        for mx in range(-lx, lx+1):
            assign_gen += 'out_mul[{ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[0])
            )
            assign_gen += 'out_mul[IM_OFFSET + {ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[1])
            )
            assign_gen += 'out_vec[{ind}] += {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, mx)[0])
            )
            assign_gen += 'out_vec[IM_OFFSET + {ind}] += (-1.0) * {ylmm} * rholcharge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, mx)[1])
            )
            flops['+'] += 4
            flops['*'] += 5

        assign_gen += 'rhol *= radius;\n'
        assign_gen += 'rholcharge = rhol * charge;\n'
        flops['*'] += 2

    # Add the generator's own cost per operator. Previously every operator
    # was incremented by the multiplication count (sph_gen.flops['*']),
    # which mis-reported the '+', '-' and '/' totals.
    for op in ('+', '-', '*', '/'):
        flops[op] += sph_gen.flops.get(op, 0)

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int local_dot_vec_multipole(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out_vec,
        double * RESTRICT out_mul
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return 0;
    }}
    """
    header = str(sph_gen.header)

    src_lib = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE='static inline'
    )

    src = src.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
        DECLARE=r'extern "C"'
    )

    self.create_dot_vec_multipole_header = header
    self.create_dot_vec_multipole_src = src_lib
    self.create_dot_vec_multipole_flops = flops

    self._dot_vec_multipole_lib = simple_lib_creator(header_code=header, src_code=src)['local_dot_vec_multipole']
def _generate_host_libs(self):
    """Generate and compile the batched local-expansion helper libraries.

    Three C routines are generated from ``self.sph_gen`` and ``self.L``:

    * a batched ``local_eval`` (one expansion evaluated per index, inside
      an OpenMP parallel loop) -> ``self._local_eval_lib``
    * a batched ``create_local_exp`` -> ``self._local_create_lib``
    * a single-point ``local_eval`` -> ``self._single_local_eval_lib``

    The matching headers and ``static inline`` sources are stored in the
    ``create_*_header`` / ``create_*_src`` attributes. Real parts are stored
    at ``l*(l+1) + m`` and imaginary parts at ``IM_OFFSET`` (``self.L**2``)
    plus that index.
    """
    sph_gen = self.sph_gen

    # index of the real part for (L, M)
    def cube_ind(L, M):
        return ((L) * ((L) + 1) + (M))

    # --- lib to evaluate local expansions ---

    assign_gen = ''
    for lx in range(self.L):
        for mx in range(-lx, lx + 1):
            reL = SphSymbol('moments[{ind}]'.format(ind=cube_ind(lx, mx)))
            imL = SphSymbol(
                'moments[IM_OFFSET + {ind}]'.format(ind=cube_ind(lx, mx)))
            reY, imY = sph_gen.get_y_sym(lx, mx)
            # only element [0] of the complex product is accumulated
            phi_sym = cmplx_mul(reL, imL, reY, imY)[0]
            assign_gen += 'tmp_energy += rhol * ({phi_sym});\n'.format(
                phi_sym=str(phi_sym))

        # advance radius**l once per l
        assign_gen += 'rhol *= radius;\n'

    src = """
    #define IM_OFFSET ({IM_OFFSET})
    #define REAL double
    #define INT64 int64_t

    {DECLARE} int local_eval(
        const INT64 n,
        const REAL * RESTRICT hradius,
        const REAL * RESTRICT htheta,
        const REAL * RESTRICT hphi,
        const REAL * RESTRICT * RESTRICT hmoments,
        REAL * RESTRICT out
    ){{
        #pragma omp parallel for
        for(INT64 ix=0 ; ix<n ; ix++){{
            const REAL radius = hradius[ix];
            const REAL theta = htheta[ix];
            const REAL phi = hphi[ix];
            const REAL * RESTRICT moments = hmoments[ix];
            {SPH_GEN}
            REAL rhol = 1.0;
            REAL tmp_energy = 0.0;
            {ASSIGN_GEN}
            out[ix] = tmp_energy;
        }}
        return 0;
    }}
    """

    # extern "C" variant is compiled here; the static inline variant is
    # kept as text on self.
    src_internal = src.format(SPH_GEN=str(sph_gen.module),
                              ASSIGN_GEN=str(assign_gen),
                              IM_OFFSET=(self.L**2),
                              DECLARE=r'extern "C"')

    src_lib = src.format(SPH_GEN=str(sph_gen.module),
                         ASSIGN_GEN=str(assign_gen),
                         IM_OFFSET=(self.L**2),
                         DECLARE=r'static inline')

    header = str(sph_gen.header)

    self.create_local_eval_header = header
    self.create_local_eval_src = src_lib

    self._local_eval_lib = simple_lib_creator(
        header_code=header, src_code=src_internal)['local_eval']

    # lib to create local expansions

    assign_gen = 'const REAL iradius = 1.0/radius;\n'
    assign_gen += 'REAL rhol = iradius;\n'
    for lx in range(self.L):
        for mx in range(-lx, lx + 1):
            assign_gen += 'out[{ind}] += {ylmm} * rhol * charge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[0]))
            assign_gen += 'out[IM_OFFSET + {ind}] += {ylmm} * rhol * charge;\n'.format(
                ind=cube_ind(lx, mx),
                ylmm=str(sph_gen.get_y_sym(lx, -mx)[1]))
        assign_gen += 'rhol *= iradius;\n'

    # NOTE(review): OFFSET is declared in the template below but is not
    # referenced by the assign_gen statements built above; whether the
    # sph_gen module text uses it cannot be seen from here.
    src = """
    extern "C" int create_local_exp(
        const INT64 n,
        const REAL * RESTRICT hcharge,
        const REAL * RESTRICT hradius,
        const REAL * RESTRICT htheta,
        const REAL * RESTRICT hphi,
        REAL * RESTRICT * RESTRICT hout
    ){{
        #pragma omp parallel for
        for(INT64 ix=0 ; ix<n ; ix++){{
            const INT64 OFFSET = ix * IM_OFFSET * 2;
            const REAL charge = hcharge[ix];
            const REAL radius = hradius[ix];
            const REAL theta = htheta[ix];
            const REAL phi = hphi[ix];
            REAL * RESTRICT out = hout[ix];
            {SPH_GEN}
            {ASSIGN_GEN}
        }}
        return 0;
    }}
    """.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )
    # For this library the IM_OFFSET/REAL/INT64 macros live in the header.
    # Only the IM_OFFSET field appears in the header text; the other two
    # keyword arguments passed to format() are unused by it.
    header = str(sph_gen.header) + '''
    #define IM_OFFSET ({IM_OFFSET})
    #define REAL double
    #define INT64 int64_t
    '''.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )

    self.create_local_exp_header = header

    self.create_local_exp_src = """
    #define IM_OFFSET ({IM_OFFSET})

    static inline void inline_local_exp(
        const double charge,
        const double radius,
        const double theta,
        const double phi,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        {ASSIGN_GEN}
        return;
    }}
    """.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        IM_OFFSET=(self.L**2),
    )

    self._local_create_lib = simple_lib_creator(
        header_code=header, src_code=src)['create_local_exp']

    # --- lib to evaluate a single local expansion ---

    # Same statement generation as for the batched evaluator above.
    assign_gen = ''
    for lx in range(self.L):
        for mx in range(-lx, lx + 1):
            reL = SphSymbol('moments[{ind}]'.format(ind=cube_ind(lx, mx)))
            imL = SphSymbol(
                'moments[IM_OFFSET + {ind}]'.format(ind=cube_ind(lx, mx)))
            reY, imY = sph_gen.get_y_sym(lx, mx)
            phi_sym = cmplx_mul(reL, imL, reY, imY)[0]
            assign_gen += 'tmp_energy += rhol * ({phi_sym});\n'.format(
                phi_sym=str(phi_sym))

        assign_gen += 'rhol *= radius;\n'

    src = """
    #define IM_OFFSET ({IM_OFFSET})

    {DECLARE} int local_eval(
        const double radius,
        const double theta,
        const double phi,
        const double * RESTRICT moments,
        double * RESTRICT out
    ){{
        {SPH_GEN}
        double rhol = 1.0;
        double tmp_energy = 0.0;
        {ASSIGN_GEN}

        out[0] = tmp_energy;
        return 0;
    }}
    """

    src_lib = src.format(SPH_GEN=str(sph_gen.module),
                         ASSIGN_GEN=str(assign_gen),
                         IM_OFFSET=(self.L**2),
                         DECLARE=r'static inline')

    src = src.format(SPH_GEN=str(sph_gen.module),
                     ASSIGN_GEN=str(assign_gen),
                     IM_OFFSET=(self.L**2),
                     DECLARE=r'extern "C"')
    header = str(sph_gen.header)

    self.create_single_local_eval_header = header
    self.create_single_local_eval_src = src_lib

    self._single_local_eval_lib = simple_lib_creator(
        header_code=header, src_code=src)['local_eval']
def _init_host_old_lib(self):
    """Compile and return the ``old_long_range_energy`` routine.

    For each particle the generated code converts its position to spherical
    coordinates, evaluates the expansion in ``existing_multipole`` there
    with ``local_eval`` (source taken from ``self._lee``) and writes the
    result multiplied by the particle's charge to ``out``.

    :returns: the compiled ``old_long_range_energy`` function.
    """
    half_ncomp = self.L**2
    ncomp = half_ncomp * 2

    src = r'''

    {LOCAL_EVAL_HEADER}

    {LOCAL_EVAL_SRC}

    static inline void spherical(
        const REAL dx, const REAL dy, const REAL dz,
        REAL *radius, REAL *theta, REAL *phi
    ){{
        const REAL dx2 = dx*dx;
        const REAL dx2_p_dy2 = dx2 + dy*dy;
        const REAL d2 = dx2_p_dy2 + dz*dz;
        *radius = sqrt(d2);
        *theta = atan2(sqrt(dx2_p_dy2), dz);
        *phi = atan2(dy, dx);
        return;
    }}

    extern "C" int old_long_range_energy(
        const INT64 num_particles,
        const REAL * RESTRICT old_positions,
        const REAL * RESTRICT old_charges,
        const REAL * RESTRICT existing_multipole,
        REAL * RESTRICT out
    )
    {{
        #pragma omp parallel for schedule(dynamic)
        for(INT64 px=0 ; px<num_particles ; px++){{
            const REAL old_charge = old_charges[px];
            const REAL opx = old_positions[3*px + 0];
            const REAL opy = old_positions[3*px + 1];
            const REAL opz = old_positions[3*px + 2];

            REAL oradius, otheta, ophi;
            spherical(opx, opy, opz, &oradius, &otheta, &ophi);

            double tmp_energy = 0.0;
            local_eval(
                oradius, otheta, ophi,
                existing_multipole,
                &tmp_energy
            );
            out[px] = tmp_energy * old_charge;
        }}
        return 0;
    }}
    '''.format(
        LOCAL_EVAL_HEADER=self._lee.create_local_eval_header,
        LOCAL_EVAL_SRC=self._lee.create_local_eval_src
    )

    header_parts = [
        Include('omp.h'),
        Include('stdio.h'),
        Include('math.h'),
        Define('INT64', 'int64_t'),
        Define('REAL', 'double'),
        Define('NCOMP', str(ncomp)),
        Define('HALF_NCOMP', str(half_ncomp)),
        Define('DIPOLE_SX', str(self.lrc.dipole_correction[0])),
        Define('DIPOLE_SY', str(self.lrc.dipole_correction[1])),
        Define('DIPOLE_SZ', str(self.lrc.dipole_correction[2])),
    ]

    # Indices of the l = 1 terms: l**2 + l + m for the real part, shifted
    # by half_ncomp for the imaginary part.
    l1_terms = (('1P1', 1), ('1_0', 0), ('1N1', -1))
    for suffix, m in l1_terms:
        header_parts.append(Define('RE_' + suffix, str(1**2 + 1 + m)))
    for suffix, m in l1_terms:
        header_parts.append(
            Define('IM_' + suffix, str(1**2 + 1 + m + half_ncomp)))

    header = str(Module(tuple(header_parts)))

    compiled = simple_lib_creator(header_code=header, src_code=src)
    return compiled['old_long_range_energy']
def __init__(self, fmm, domain, boundary_condition, local_exp_eval, mirror_direction=None):
    """
    Generate and build the C library exposing ``self_interaction``.

    The C source is assembled from text fragments chosen by
    ``boundary_condition`` and, optionally, ``mirror_direction``. The
    compiled entry point is stored on ``self.lib``.

    :arg fmm: stored on ``self.fmm``; not otherwise used in this method.
    :arg domain: object whose ``extent`` supplies the ``EX``, ``EY`` and
        ``EZ`` defines and the image offsets.
    :arg boundary_condition: one of ``BCType.PBC``, ``BCType.FREE_SPACE``,
        ``BCType.NEAREST`` or ``BCType.FF_ONLY`` (asserted).
    :arg local_exp_eval: stored on ``self._lee``; not otherwise used in this
        method.
    :arg mirror_direction: optional indexable of three flags. A truthy flag
        yields a coefficient of -1.0 for that axis in the generated
        mirror-charge code, a falsy flag yields 1.0.
    """
    assert boundary_condition in \
        (BCType.PBC, BCType.FREE_SPACE, BCType.NEAREST, BCType.FF_ONLY)

    self.domain = domain
    self._lee = local_exp_eval
    self._bc = boundary_condition
    self.fmm = fmm

    # Sum of -1/|offset| over every non-zero entry of cell_offsets; emitted
    # into the header below as DOMAIN_27_ENERGY.
    self._new27direct = 0.0
    ex = self.domain.extent
    for ox in cell_offsets:
        # image of old pos
        dox = np.array((ex[0] * ox[0], ex[1] * ox[1], ex[2] * ox[2]))
        if ox != (0, 0, 0):
            self._new27direct -= 1.0 / np.linalg.norm(dox)

    mirror_block = ''
    mirror_preloop = ''

    # FREE_SPACE only uses the zero offset; every other boundary condition
    # uses all entries of cell_offsets.
    if self._bc == BCType.FREE_SPACE:
        co = ((0, 0, 0), )
    else:
        co = cell_offsets

    if mirror_direction is not None:
        # convert mirror directions to coefficients
        mcoeff = dict()
        mcoeff['mcoeffx'] = -1.0 if mirror_direction[0] else 1.0
        mcoeff['mcoeffy'] = -1.0 if mirror_direction[1] else 1.0
        mcoeff['mcoeffz'] = -1.0 if mirror_direction[2] else 1.0

        # compute position of old mirror charge
        mirror_preloop += '''
        const REAL mopx = opx * {mcoeffx};
        const REAL mopy = opy * {mcoeffy};
        const REAL mopz = opz * {mcoeffz};
        '''.format(**mcoeff)

        mirror_block += '''
        const REAL mnpx = npx * {mcoeffx};
        const REAL mnpy = npy * {mcoeffy};
        const REAL mnpz = npz * {mcoeffz};
        '''.format(**mcoeff)

        # One fragment per offset. oxi_zero is 0 only for the (0, 0, 0)
        # offset and is used in the fragment to skip one term for it.
        # NOTE(review): this fragment references o_bbp{oxi} and dpx{oxi},
        # which are only declared by the bc27 fragment generated below for
        # non-FREE_SPACE boundary conditions -- confirm that FREE_SPACE is
        # never combined with mirror_direction.
        for oxi, ox in enumerate(co):
            oxi_zero = 0 if (ox[0] == 0 and ox[1] == 0 and ox[2] == 0) else 1
            oxv = {'oxi': str(oxi), 'oxi_zero': str(oxi_zero)}
            oxv.update(mcoeff)
            mirror_block += '''
            // offset of the old charge
            const REAL mdpx{oxi} = dox{oxi} + mopx;
            const REAL mdpy{oxi} = doy{oxi} + mopy;
            const REAL mdpz{oxi} = doz{oxi} + mopz;
            // diff to the old mirror in offset
            const REAL mddx{oxi} = mdpx{oxi} - npx;
            const REAL mddy{oxi} = mdpy{oxi} - npy;
            const REAL mddz{oxi} = mdpz{oxi} - npz;
            // remove old energy
            energy27 -= 2.0 / sqrt(mddx{oxi}*mddx{oxi} + mddy{oxi}*mddy{oxi} + mddz{oxi}*mddz{oxi});
            // offset of the new charge
            const REAL mnpx{oxi} = dox{oxi} + mnpx;
            const REAL mnpy{oxi} = doy{oxi} + mnpy;
            const REAL mnpz{oxi} = doz{oxi} + mnpz;
            // diff to the new mirror in the offset
            const REAL mnddx{oxi} = mnpx{oxi} - npx;
            const REAL mnddy{oxi} = mnpy{oxi} - npy;
            const REAL mnddz{oxi} = mnpz{oxi} - npz;
            // add on the new contrib
            energy27 += 1.0 / sqrt(mnddx{oxi}*mnddx{oxi} + mnddy{oxi}*mnddy{oxi} + mnddz{oxi}*mnddz{oxi});
            // the factor 2 required for b_bp with the non-mirrors
            energy27 += o_bbp{oxi};
            // compute b_b, first with non-mirror
            const REAL do_opx{oxi} = opx - dpx{oxi};
            const REAL do_opy{oxi} = opy - dpy{oxi};
            const REAL do_opz{oxi} = opz - dpz{oxi};
            energy27 -= ({oxi_zero} == 0) ? 0.0 : 1.0 / sqrt(do_opx{oxi}*do_opx{oxi} + do_opy{oxi}*do_opy{oxi} + do_opz{oxi}*do_opz{oxi});
            // with the mirror
            const REAL do_mopx{oxi} = opx - mdpx{oxi};
            const REAL do_mopy{oxi} = opy - mdpy{oxi};
            const REAL do_mopz{oxi} = opz - mdpz{oxi};
            energy27 += 1.0 / sqrt(do_mopx{oxi}*do_mopx{oxi} + do_mopy{oxi}*do_mopy{oxi} + do_mopz{oxi}*do_mopz{oxi});
            '''.format(**oxv)

    preloop = ''
    bc27 = ''

    # Declare the per-offset displacement constants once, ahead of the
    # particle loop in the generated function.
    for oxi, ox in enumerate(co):
        preloop += '''
        const REAL dox{oxi} = EX * {OX};
        const REAL doy{oxi} = EY * {OY};
        const REAL doz{oxi} = EZ * {OZ};
        '''.format(
            oxi=str(oxi),
            OX=str(ox[0]),
            OY=str(ox[1]),
            OZ=str(ox[2]),
        )

    if self._bc == BCType.FREE_SPACE:
        pass
    else:
        # Restart energy27 from DOMAIN_27_ENERGY and add one term per
        # periodic image of the old position.
        bc27 = 'energy27 = (DOMAIN_27_ENERGY);\n'
        for oxi, ox in enumerate(cell_offsets):
            bc27 += '''
            const REAL dpx{oxi} = dox{oxi} + opx;
            const REAL dpy{oxi} = doy{oxi} + opy;
            const REAL dpz{oxi} = doz{oxi} + opz;
            const REAL ddx{oxi} = dpx{oxi} - npx;
            const REAL ddy{oxi} = dpy{oxi} - npy;
            const REAL ddz{oxi} = dpz{oxi} - npz;
            const REAL o_bbp{oxi} = 1.0 / sqrt(ddx{oxi}*ddx{oxi} + ddy{oxi}*ddy{oxi} + ddz{oxi}*ddz{oxi});
            energy27 += o_bbp{oxi};
            '''.format(oxi=str(oxi))

    # FF_ONLY zeroes energy27 after all other fragments have run.
    if self._bc == BCType.FF_ONLY:
        ff_only_block = 'energy27 = 0.0;'
    else:
        ff_only_block = ''

    src = r'''
    extern "C" int self_interaction(
        const INT64 store_stride,
        const INT64 num_particles,
        const INT64 * RESTRICT exclusive_sum,
        const REAL * RESTRICT old_positions,
        const REAL * RESTRICT old_charges,
        const REAL * RESTRICT new_positions,
        REAL * RESTRICT out
    )
    {{
        {preloop}
        #pragma omp parallel for schedule(dynamic)
        for(INT64 px=0 ; px<num_particles ; px++){{
            const REAL coeff = old_charges[px] * old_charges[px];
            const REAL opx = old_positions[3*px + 0];
            const REAL opy = old_positions[3*px + 1];
            const REAL opz = old_positions[3*px + 2];
            {mirror_preloop}
            const INT64 nprop = exclusive_sum[px+1] - exclusive_sum[px];
            #pragma omp simd
            for(INT64 movii=0 ; movii<nprop ; movii++){{
                const INT64 movi = movii + exclusive_sum[px];
                const REAL npx = new_positions[3*movi + 0];
                const REAL npy = new_positions[3*movi + 1];
                const REAL npz = new_positions[3*movi + 2];
                const REAL dx = opx - npx;
                const REAL dy = opy - npy;
                const REAL dz = opz - npz;
                REAL energy27 = (1.0 / sqrt(dx*dx + dy*dy + dz*dz));
                {bc27}
                {mirror_block}
                {ff_only_block}
                REAL tmp_energy = energy27;
                out[store_stride * px + movii] = coeff * tmp_energy;
            }}
        }}
        return 0;
    }}
    '''.format(bc27=bc27,
               preloop=preloop,
               mirror_block=mirror_block,
               mirror_preloop=mirror_preloop,
               ff_only_block=ff_only_block)

    header = str(
        Module((
            Include('stdio.h'),
            Include('math.h'),
            Define('DOMAIN_27_ENERGY', str(self._new27direct)),
            Define('INT64', 'int64_t'),
            Define('REAL', 'double'),
            Define('EX', str(self.domain.extent[0])),
            Define('EY', str(self.domain.extent[1])),
            Define('EZ', str(self.domain.extent[2])),
            Define(
                'PRINTF(A,B,C)',
                r'printf("%s:\t%f,\t%s:\t%f,\t%s:\t%f\n", #A, A, #B, B, #C, C);'
            ),
            Define('PRINTF1(A)', r'printf("%s:\t%f\n", #A, A);'),
        )))

    #print(src)
    self.lib = simple_lib_creator(header_code=header, src_code=src)['self_interaction']
def exchange_cell_counts(self):
    """
    Exchange the contents count of cells between processes. This is
    provided as a method in halo to avoid repeated exchanging of cell
    occupancy counts if multiple ParticleDat objects are being
    communicated.

    The exchange library is generated and compiled on the first call and
    cached on ``self._exchange_sizes_lib``.

    :returns: tuple ``(h_count, t_count)`` holding the values the library
        wrote to ``self._h_count`` (number of halo particles) and
        ``self._t_count`` (amount of tmp space needed).
    """
    self._update_domain()

    # Build the C library lazily; later calls reuse the cached handle.
    if self._exchange_sizes_lib is None:
        _es_args = '''
        const int f_MPI_COMM,             // F90 comm from mpi4py
        const int * RESTRICT SEND_RANKS,  // send directions
        const int * RESTRICT RECV_RANKS,  // recv directions
        const int * RESTRICT h_ind,       // halo indices
        const int * RESTRICT b_ind,       // local b indices
        const int * RESTRICT h_arr,       // h cell indices
        const int * RESTRICT b_arr,       // b cell indices
        int * RESTRICT ccc,               // cell contents count
        int * RESTRICT h_count,           // number of halo particles
        int * RESTRICT t_count,           // amount of tmp space needed
        int * RESTRICT h_tmp,             // tmp space for recving
        int * RESTRICT b_tmp,             // tmp space for sending
        int * RESTRICT dir_counts         // expected recv counts
        '''

        _es_header = '''
        #include <generic.h>
        #include <mpi.h>
        #include <iostream>
        using namespace std;
        #define RESTRICT %(RESTRICT)s

        extern "C" void HALO_ES_LIB(%(ARGS)s);
        '''

        _es_code = '''

        void HALO_ES_LIB(%(ARGS)s){
            *h_count = 0;
            *t_count = 0;

            // get mpi comm and rank
            MPI_Comm MPI_COMM = MPI_Comm_f2c(f_MPI_COMM);
            int rank = -1; MPI_Comm_rank( MPI_COMM, &rank );
            MPI_Status MPI_STATUS;

            // [W E] [N S] [O I]
            for( int dir=0 ; dir<6 ; dir++ ){

                //cout << "dir " << dir << "-------" << endl;

                const int dir_s = b_ind[dir];             // start index
                const int dir_c = b_ind[dir+1] - dir_s;   // cell count

                const int dir_s_r = h_ind[dir];             // start index
                const int dir_c_r = h_ind[dir+1] - dir_s_r; // cell count

                int tmp_count = 0;
                for( int ix=0 ; ix<dir_c ; ix++ ){
                    b_tmp[ix] = ccc[b_arr[dir_s + ix]];    // copy into
                                                           // send buffer

                    tmp_count += ccc[b_arr[dir_s + ix]];
                }

                *t_count = MAX(*t_count, tmp_count);

                if(rank == RECV_RANKS[dir]){

                    for( int tx=0 ; tx < dir_c ; tx++ ){
                        h_tmp[tx] = b_tmp[tx];
                    }

                } else {
                MPI_Sendrecv ((void *) b_tmp, dir_c, MPI_INT,
                              SEND_RANKS[dir], rank,
                              (void *) h_tmp, dir_c_r, MPI_INT,
                              RECV_RANKS[dir], RECV_RANKS[dir],
                              MPI_COMM, &MPI_STATUS);
                }

                tmp_count=0;
                for( int ix=0 ; ix<dir_c_r ; ix++ ){
                    ccc[h_arr[dir_s_r + ix]] = h_tmp[ix];
                    *h_count += h_tmp[ix];
                    tmp_count += h_tmp[ix];
                }
                dir_counts[dir] = tmp_count;
                *t_count = MAX(*t_count, tmp_count);

            }

            return;
        }
        '''

        # Substitute the shared argument list and the compiler-specific
        # restrict keyword into both the header and the implementation.
        _es_dict = {
            'ARGS': _es_args,
            'RESTRICT': build.MPI_CC.restrict_keyword
        }

        _es_header %= _es_dict
        _es_code %= _es_dict

        self._exchange_sizes_lib = build.simple_lib_creator(
            _es_header, _es_code, 'HALO_ES_LIB',
            CC=build.MPI_CC)['HALO_ES_LIB']

    # End of creation code -----------------------------------------------

    # update internal arrays
    if self._version < self._domain.cell_array.version:
        self._get_pairs()

    ccc = self._cell_to_particle_map.cell_contents_count

    # This if allows the host size exchange code to be used for the gpu
    if type(ccc) is host.Array:
        ccc_ptr = ccc.ctypes_data
    else:
        # Stage the counts in a host.Array, creating or growing it on
        # demand, so the library receives a ctypes pointer.
        if self._cell_contents_count_tmp is None:
            self._cell_contents_count_tmp = host.Array(ncomp=ccc.ncomp,
                                                       dtype=ctypes.c_int)
        elif self._cell_contents_count_tmp.ncomp < ccc.ncomp:
            self._cell_contents_count_tmp.realloc(ccc.ncomp)

        #make a local copy of the cell contents counts
        self._cell_contents_count_tmp[:] = ccc[:]
        ccc_ptr = self._cell_contents_count_tmp.ctypes_data

    assert ccc_ptr is not None, "No valid Cell Contents Count pointer found."

    # Argument order must match _es_args above.
    self._exchange_sizes_lib(
        ctypes.c_int(self._domain.comm.py2f()),
        self._send_ranks.ctypes_data,
        self._recv_ranks.ctypes_data,
        self._halo_groups_start_end_indices.ctypes_data,
        self._boundary_groups_start_end_indices.ctypes_data,
        self._halo_cell_groups.ctypes_data,
        self._boundary_cell_groups.ctypes_data,
        ccc_ptr,
        ctypes.byref(self._h_count),
        ctypes.byref(self._t_count),
        self._h_tmp.ctypes_data,
        self._b_tmp.ctypes_data,
        self.dir_counts.ctypes_data)

    # copy new sizes back to original array (eg for gpu)
    if type(ccc) is not host.Array:
        ccc[:] = self._cell_contents_count_tmp[:ccc.ncomp:]

    return self._h_count.value, self._t_count.value
def __init__(self, domain, boundary_condition, L, max_num_groups, mirror_mode=False, energy_unit=1.0):
    """
    Generate and build the C library exposing ``direct_from_dats``.

    For each flagged particle and each of its groups, the generated code
    assembles a small set of positions and charges (``GROUP_DECL``) and
    writes ``compute_energy(...) * energy_unit`` into ``group_energy``. The
    compiled entry point is stored on ``self._lib``.

    :arg domain: object whose ``extent`` gives the box size; the three
        extents are asserted to agree to within 1e-14.
    :arg boundary_condition: ``BCType`` value. ``NEAREST`` and ``PBC`` add
        the 26 neighbouring-image terms; ``PBC`` additionally adds the
        ``pbc_direct`` long-range term.
    :arg L: passed to ``LocalExpEval`` and ``LongRangeMTL``; ``NCOMP`` is
        ``2 * L**2``.
    :arg max_num_groups: value of the ``MAX_NUM_GROUPS`` define (stride of
        the per-particle group arrays in the generated code).
    :arg mirror_mode: if true, each set holds four entries: the particle and
        the group entry shifted in z by ``-0.25 * extent[2]``, plus copies
        with negated z and negated charge.
    :arg energy_unit: multiplier applied to each computed energy (converted
        with ``float``).
    """
    self.domain = domain
    self.bc = boundary_condition
    self.L = L

    exp_eval = LocalExpEval(L)
    self.lrc = LongRangeMTL(L, domain)

    ncomp = (self.L**2)*2
    half_ncomp = self.L**2

    def _re_lm(l, m): return l**2 + l + m

    # The image offsets below use a single box length E for all three
    # axes, hence the equal-extent assertions.
    E = self.domain.extent[0]
    assert abs(E - self.domain.extent[0]) < 10.**-14
    assert abs(E - self.domain.extent[1]) < 10.**-14
    assert abs(E - self.domain.extent[2]) < 10.**-14

    m_quater_extent_z = -0.25 * self.domain.extent[2]

    if mirror_mode:
        # This fragment goes through str.format, so C braces are doubled.
        group_decl = r"""
        const REAL bb_positions[12] = {{
            positions[ix*3 + 0],
            positions[ix*3 + 1],
            positions[ix*3 + 2] + {Z_SHIFT},
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 0],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 1],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 2] + {Z_SHIFT},
            positions[ix*3 + 0],
            positions[ix*3 + 1],
            -1.0 * (positions[ix*3 + 2] + {Z_SHIFT}),
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 0],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 1],
            -1.0 * (group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 2] + {Z_SHIFT})
        }};
        const REAL bb_charges[4] = {{
            charges[ix],
            group_charges[ix*MAX_NUM_GROUPS+gx],
            -1.0 * charges[ix],
            -1.0 * group_charges[ix*MAX_NUM_GROUPS+gx]
        }};
        const INT64 NG = 4;
        """.format(
            Z_SHIFT=m_quater_extent_z
        )
    else:
        # This fragment is NOT formatted here; it is only inserted as a
        # value into src below, so its braces stay single.
        group_decl = r"""
        const REAL bb_positions[6] = {
            positions[ix*3 + 0],
            positions[ix*3 + 1],
            positions[ix*3 + 2],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 0],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 1],
            group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 2]
        };
        const REAL bb_charges[2] = {
            charges[ix],
            group_charges[ix*MAX_NUM_GROUPS+gx]
        };
        const INT64 NG = 2;
        //printf("MAX_NUM_GROUPS %d, zpos %f gx %d\n", MAX_NUM_GROUPS, group_positions[ix * MAX_NUM_GROUPS * 3 + gx * 3 + 2], gx);
        //for(int ix=0 ; ix<NG ; ix++){
        //    printf("%d | P %f %f %f Q %f\n", ix, bb_positions[ix*3], bb_positions[ix*3+1], bb_positions[ix*3+2], bb_charges[ix]);
        //}
        """

    # One unrolled term per non-zero image offset, each weighted by 0.5;
    # left empty for boundary conditions other than NEAREST and PBC.
    inner_direct = ''
    if boundary_condition in (BCType.NEAREST, BCType.PBC):
        ox_range = tuple(range(-1, 2))
        for oxi, ox in enumerate(product(ox_range, ox_range, ox_range)):
            if ox[0] != 0 or ox[1] != 0 or ox[2] != 0:
                inner_direct += """
                d0 = jp0 - ip0 + {OX};
                d1 = jp1 - ip1 + {OY};
                d2 = jp2 - ip2 + {OZ};
                r2 = d0*d0 + d1*d1 + d2*d2;
                r = sqrt(r2);
                tmp_inner_phi += 0.5 * iq * jq / r;
                """.format(
                    OXI=oxi,
                    OX=ox[0] * E,
                    OY=ox[1] * E,
                    OZ=ox[2] * E
                )

    # Only PBC adds the long-range contribution from pbc_direct.
    pbc_call = ''
    if boundary_condition == BCType.PBC:
        pbc_call = r"""
        const REAL tmp_energy_lr = pbc_direct(N, positions, charges, linop_data, linop_indptr, linop_indices);
        tmp_energy += tmp_energy_lr;
        """

    src = r"""

    static inline REAL nearest_direct(
        const INT64 N,
        const REAL * RESTRICT P,
        const REAL * RESTRICT Q
    ){{
        REAL tmp_phi = 0.0;
        for(INT64 ix=0 ; ix<N ; ix++){{
            REAL tmp_inner_phi = 0.0;
            const REAL iq = Q[ix];
            const REAL ip0 = P[3*ix + 0];
            const REAL ip1 = P[3*ix + 1];
            const REAL ip2 = P[3*ix + 2];

            for(INT64 jx=(ix+1) ; jx<N ; jx++){{
                const REAL jq = Q[jx];
                const REAL jp0 = P[3*jx + 0];
                const REAL jp1 = P[3*jx + 1];
                const REAL jp2 = P[3*jx + 2];
                REAL d0 = ip0 - jp0;
                REAL d1 = ip1 - jp1;
                REAL d2 = ip2 - jp2;
                REAL r2 = d0*d0 + d1*d1 + d2*d2;
                REAL r = sqrt(r2);
                tmp_inner_phi += iq * jq / r;
            }}

            for(INT64 jx=0 ; jx<N ; jx++){{
                const REAL jq = Q[jx];
                const REAL jp0 = P[3*jx + 0];
                const REAL jp1 = P[3*jx + 1];
                const REAL jp2 = P[3*jx + 2];
                REAL d0;
                REAL d1;
                REAL d2;
                REAL r2;
                REAL r;
                {INNER_DIRECT}
            }}

            tmp_phi += tmp_inner_phi;
        }}
        return tmp_phi;
    }}

    static inline void spherical(
        const REAL dx, const REAL dy, const REAL dz,
        REAL *radius, REAL *theta, REAL *phi
    ){{
        const REAL dx2 = dx*dx;
        const REAL dx2_p_dy2 = dx2 + dy*dy;
        const REAL d2 = dx2_p_dy2 + dz*dz;
        *radius = sqrt(d2);
        *theta = atan2(sqrt(dx2_p_dy2), dz);
        *phi = atan2(dy, dx);
        return;
    }}

    {MULTIPOLE_HEADER}
    {MULTIPOLE_SRC}

    {EVEC_HEADER}
    {EVEC_SRC}

    static inline REAL linop_csr_both(
        const REAL * RESTRICT linop_data,
        const INT64 * RESTRICT linop_indptr,
        const INT64 * RESTRICT linop_indices,
        const REAL * RESTRICT x1,
        const REAL * RESTRICT E
    ){{
        INT64 data_ind = 0;
        REAL dot_tmp = 0.0;
        for(INT64 row=0 ; row<HALF_NCOMP ; row++){{
            REAL row_tmp_1 = 0.0;
            REAL row_tmp_2 = 0.0;
            for(INT64 col_ind=linop_indptr[row] ; col_ind<linop_indptr[row+1] ; col_ind++){{
                const INT64 col = linop_indices[data_ind];
                const REAL data = linop_data[data_ind];
                data_ind++;
                row_tmp_1 += data * x1[col];
                row_tmp_2 += data * x1[col + HALF_NCOMP];
            }}
            dot_tmp += row_tmp_1 * E[row] + row_tmp_2 * E[row + HALF_NCOMP];
        }}
        return dot_tmp;
    }}

    static inline REAL apply_dipole_correction_split(
        const REAL * RESTRICT M,
        const REAL * RESTRICT E
    ){{
        REAL tmp = 0.0;
        tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1P1];
        tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1N1];
        tmp -= (DIPOLE_SY * M[IM_1P1]) * E[IM_1P1];
        tmp += (DIPOLE_SY * M[IM_1P1]) * E[IM_1N1];
        tmp += (DIPOLE_SZ * M[RE_1_0]) * E[RE_1_0];
        return tmp;
    }}

    static inline REAL pbc_direct(
        const INT64 N,
        const REAL * RESTRICT positions,
        const REAL * RESTRICT charges,
        const REAL * RESTRICT linop_data,
        const INT64 * RESTRICT linop_indptr,
        const INT64 * RESTRICT linop_indices
    ){{
        REAL new_moments[NCOMP];
        REAL new_evector[NCOMP];
        for(int cx=0 ; cx<NCOMP ; cx++){{
            new_moments[cx] = 0.0;
            new_evector[cx] = 0.0;
        }}

        for(int ix=0 ; ix<N ; ix++){{
            REAL radius, theta, phi;
            const REAL px = positions[ix*3 + 0];
            const REAL py = positions[ix*3 + 1];
            const REAL pz = positions[ix*3 + 2];
            const REAL ch = charges[ix];
            spherical(px, py, pz, &radius, &theta, &phi);
            local_dot_vec(ch, radius, theta, phi, new_evector);
            multipole_exp(ch, radius, theta, phi, new_moments);
        }}

        REAL new_energy = 0.5 * linop_csr_both(
            linop_data, linop_indptr, linop_indices,
            new_moments,
            new_evector
        );
        new_energy += 0.5 * apply_dipole_correction_split(
            new_moments,
            new_evector
        );
        return new_energy;
    }}

    static inline REAL compute_energy(
        const INT64 N,
        const REAL * RESTRICT positions,
        const REAL * RESTRICT charges,
        const REAL * RESTRICT linop_data,
        const INT64 * RESTRICT linop_indptr,
        const INT64 * RESTRICT linop_indices
    ){{
        REAL tmp_energy = nearest_direct(N, positions, charges);
        {PBC_CALL}
        return tmp_energy;
    }}

    extern "C" int direct_from_dats(
        const INT64 N,
        const INT64 * RESTRICT flags,
        const REAL * RESTRICT positions,
        const REAL * RESTRICT charges,
        const INT64 * RESTRICT group_counts,
        const REAL * RESTRICT group_positions,
        const REAL * RESTRICT group_charges,
        const REAL * RESTRICT linop_data,
        const INT64 * RESTRICT linop_indptr,
        const INT64 * RESTRICT linop_indices,
        REAL * RESTRICT group_energy
    ){{
        #pragma omp parallel for
        for(INT64 ix=0 ; ix<N ; ix++ ){{
            if ( flags[ix] > 0 ){{
                for(INT64 gx=0 ; gx<group_counts[ix] ; gx++ ){{
                    {GROUP_DECL}
                    group_energy[ix*MAX_NUM_GROUPS+gx] = compute_energy(
                        NG,
                        bb_positions,
                        bb_charges,
                        linop_data,
                        linop_indptr,
                        linop_indices
                    ) * {ENERGY_UNIT};
                }}
            }}
        }}
        return 0;
    }}
    """.format(
        GROUP_DECL=group_decl,
        INNER_DIRECT=inner_direct,
        PBC_CALL=pbc_call,
        MULTIPOLE_HEADER=exp_eval.create_multipole_header,
        MULTIPOLE_SRC=exp_eval.create_multipole_src,
        EVEC_HEADER=exp_eval.create_dot_vec_header,
        EVEC_SRC=exp_eval.create_dot_vec_src,
        ENERGY_UNIT=float(energy_unit)
    )

    # Defines consumed by the C source above: array sizes, dipole
    # correction factors and the indices of the l=1 terms.
    header = str(
        Module((
            Include('omp.h'),
            Include('stdio.h'),
            Include('math.h'),
            Define('INT64', 'int64_t'),
            Define('REAL', 'double'),
            Define('NCOMP', str(ncomp)),
            Define('HALF_NCOMP', str(half_ncomp)),
            Define('DIPOLE_SX', str(self.lrc.dipole_correction[0])),
            Define('DIPOLE_SY', str(self.lrc.dipole_correction[1])),
            Define('DIPOLE_SZ', str(self.lrc.dipole_correction[2])),
            Define('RE_1P1', str(_re_lm(1, 1))),
            Define('RE_1_0', str(_re_lm(1, 0))),
            Define('RE_1N1', str(_re_lm(1,-1))),
            Define('IM_1P1', str(_re_lm(1, 1) + half_ncomp)),
            Define('IM_1_0', str(_re_lm(1, 0) + half_ncomp)),
            Define('IM_1N1', str(_re_lm(1,-1) + half_ncomp)),
            Define('MAX_NUM_GROUPS', str(max_num_groups))
        ))
    )

    self._lib = build.simple_lib_creator(header, src)['direct_from_dats']
def __init__(self, E, tuples=None):
    """
    Generate and build the C library exposing ``nearest_direct``.

    The generated function sums ``iq * jq / r`` over all distinct pairs and
    adds, for every pair (including a particle with itself), one term
    weighted by 0.5 per non-zero image offset.

    :arg E: box extent; a non-iterable value is used for all three axes.
    :arg tuples: optional iterable of integer offset triples. Defaults to
        every combination of -1, 0, 1 per axis. The all-zero triple is
        skipped.
    """
    if not isinstance(E, Iterable):
        E = (E, E, E)

    if tuples is None:
        steps = tuple(range(-1, 2))
        tuples = product(steps, steps, steps)

    image_term = """
            d0 = jp0 - ip0 + {OX};
            d1 = jp1 - ip1 + {OY};
            d2 = jp2 - ip2 + {OZ};
            r2 = d0*d0 + d1*d1 + d2*d2;
            r = sqrt(r2);
            tmp_inner_phi += 0.5 * iq * jq / r;
            """

    # Unroll one term per non-zero offset into the inner loop body.
    terms = []
    for shift in tuples:
        if shift[0] == 0 and shift[1] == 0 and shift[2] == 0:
            continue
        terms.append(image_term.format(
            OX=shift[0] * E[0],
            OY=shift[1] * E[1],
            OZ=shift[2] * E[2]))
    inner = ''.join(terms)

    header = r"""
    #include <math.h>
    #define INT64 int64_t
    #define REAL double
    """

    kernel_template = r"""
    extern "C" int nearest_direct(
        const INT64 N,
        const REAL * RESTRICT P,
        const REAL * RESTRICT Q,
        REAL * RESTRICT phi
    ){{
        REAL tmp_phi = 0.0;

        #pragma omp parallel for reduction(+:tmp_phi)
        for(INT64 ix=0 ; ix<N ; ix++){{
            REAL tmp_inner_phi = 0.0;
            const REAL iq = Q[ix];
            const REAL ip0 = P[3*ix + 0];
            const REAL ip1 = P[3*ix + 1];
            const REAL ip2 = P[3*ix + 2];

            for(INT64 jx=(ix+1) ; jx<N ; jx++){{
                const REAL jq = Q[jx];
                const REAL jp0 = P[3*jx + 0];
                const REAL jp1 = P[3*jx + 1];
                const REAL jp2 = P[3*jx + 2];
                REAL d0 = ip0 - jp0;
                REAL d1 = ip1 - jp1;
                REAL d2 = ip2 - jp2;
                REAL r2 = d0*d0 + d1*d1 + d2*d2;
                REAL r = sqrt(r2);
                tmp_inner_phi += iq * jq / r;
            }}

            for(INT64 jx=0 ; jx<N ; jx++){{
                const REAL jq = Q[jx];
                const REAL jp0 = P[3*jx + 0];
                const REAL jp1 = P[3*jx + 1];
                const REAL jp2 = P[3*jx + 2];
                REAL d0;
                REAL d1;
                REAL d2;
                REAL r2;
                REAL r;
                {INNER}
            }}

            tmp_phi += tmp_inner_phi;
        }}

        phi[0] = tmp_phi;
        return 0;
    }}
    """
    src = kernel_template.format(INNER=inner)

    compiled = build.simple_lib_creator(
        header_code=header, src_code=src, name="kmc_fmm_nearest_direct")
    self._lib = compiled['nearest_direct']
def _create_dat_lib(self):
    """
    Generate and build the C library that gathers proposed moves.

    Two entry points are stored on the instance:

    * ``self._dat_lib`` -- ``setup_move``: computes an exclusive sum of the
      proposed-move counts per particle, copies the data of the particles
      involved into the ``old_*`` arrays and the data of every proposed
      move into the ``new_*`` arrays, and returns the number of errors
      reported by ``check_move``.
    * ``self._get_fmm_lcell_lib`` -- ``get_fmm_lcell``: converts stored FMM
      cell indices into local cell indices.

    ``self.boundary_condition`` selects how ``get_cell`` produces the
    shifted position and whether ``check_move`` wraps the displacement by
    the extent before comparing against ``self.max_move`` squared.
    """
    if self.boundary_condition is BCType.FREE_SPACE:
        # No shifting of cells or positions, no wrapping of displacements.
        cell_gen = r"""
        shifted_position[0] = position[0];
        shifted_position[1] = position[1];
        shifted_position[2] = position[2];
        """
        check_mod = r""
    else:
        assert self.boundary_condition in (BCType.PBC, BCType.NEAREST, BCType.FF_ONLY)
        # Cells outside the allowed local range are moved by one whole
        # period and the position is shifted by the matching extent.
        cell_gen = r"""
        REAL offsets[3];
        offsets[0] = ((cell[0] - local_offset[2]) < lower_allowed[0]) ? 1.0 : 0.0;
        offsets[1] = ((cell[1] - local_offset[1]) < lower_allowed[1]) ? 1.0 : 0.0;
        offsets[2] = ((cell[2] - local_offset[0]) < lower_allowed[2]) ? 1.0 : 0.0;
        if ((cell[0] - local_offset[2]) > upper_allowed[0]) { offsets[0] = -1.0; };
        if ((cell[1] - local_offset[1]) > upper_allowed[1]) { offsets[1] = -1.0; };
        if ((cell[2] - local_offset[0]) > upper_allowed[2]) { offsets[2] = -1.0; };
        cell[0] += offsets[0] * fmm_cells_per_side[2];
        cell[1] += offsets[1] * fmm_cells_per_side[1];
        cell[2] += offsets[2] * fmm_cells_per_side[0];
        shifted_position[0] = position[0] + offsets[0] * extent[0];
        shifted_position[1] = position[1] + offsets[1] * extent[1];
        shifted_position[2] = position[2] + offsets[2] * extent[2];
        """
        check_mod = r"""
        if (d0 < (0.5*extent[0])) { d0 += extent[0]; }
        if (d1 < (0.5*extent[1])) { d1 += extent[1]; }
        if (d2 < (0.5*extent[2])) { d2 += extent[2]; }
        if (d0 > (0.5*extent[0])) { d0 -= extent[0]; }
        if (d1 > (0.5*extent[1])) { d1 -= extent[1]; }
        if (d2 > (0.5*extent[2])) { d2 -= extent[2]; }
        """

    # The template goes through str.format: C braces are doubled and the
    # fragments above are inserted as plain values.
    src = r"""
    #define REAL double
    #define INT64 int64_t

    static inline void get_cell(
        const REAL * RESTRICT position,
        const REAL * RESTRICT extent,
        const INT64 * fmm_cells_per_side,
        const INT64 * RESTRICT upper_allowed,
        const INT64 * RESTRICT lower_allowed,
        const INT64 * RESTRICT local_offset,
        INT64 * cell,
        REAL * shifted_position
    ){{
        shifted_position[0] = position[0] + 0.5 * extent[0];
        shifted_position[1] = position[1] + 0.5 * extent[1];
        shifted_position[2] = position[2] + 0.5 * extent[2];

        const REAL w0 = fmm_cells_per_side[0] / extent[0];
        const REAL w1 = fmm_cells_per_side[1] / extent[1];
        const REAL w2 = fmm_cells_per_side[2] / extent[2];

        cell[0] = (INT64) (shifted_position[0] * w0);
        cell[1] = (INT64) (shifted_position[1] * w1);
        cell[2] = (INT64) (shifted_position[2] * w2);

        if (cell[0] >= fmm_cells_per_side[2]) {{ cell[0] = fmm_cells_per_side[2] - 1; }}
        if (cell[1] >= fmm_cells_per_side[1]) {{ cell[1] = fmm_cells_per_side[1] - 1; }}
        if (cell[2] >= fmm_cells_per_side[0]) {{ cell[2] = fmm_cells_per_side[0] - 1; }}

        {CELL_GEN}

        return;
    }}

    static inline void get_fmm_cell(
        const INT64 cc,
        const INT64 * fmm_cells_per_side,
        INT64 * cell
    ){{
        const INT64 fx = fmm_cells_per_side[0];
        const INT64 fy = fmm_cells_per_side[1];
        const INT64 cx = cc % fx;
        const INT64 cycz = (cc - cx) / fx;
        const INT64 cy = cycz % fy;
        const INT64 cz = (cycz - cy) / fy;
        cell[0] = cx;
        cell[1] = cy;
        cell[2] = cz;
        return;
    }}

    static inline INT64 gcell_to_lcell(
        const INT64 * RESTRICT cell_data_offset,
        const INT64 * RESTRICT local_store_dims,
        const INT64 * cell
    ){{
        const INT64 c0 = cell[0] + cell_data_offset[2];
        const INT64 c1 = cell[1] + cell_data_offset[1];
        const INT64 c2 = cell[2] + cell_data_offset[0];
        return c0 + local_store_dims[2] * ( c1 + local_store_dims[1] * c2 );
    }}

    extern "C" int get_fmm_lcell(
        const INT64 N,
        const INT64 * RESTRICT FMM_CELLS_PER_SIDE,
        const INT64 * RESTRICT CELL_DATA_OFFSET,
        const INT64 * RESTRICT LOCAL_STORE_DIMS,
        const INT64 * RESTRICT IDS_ARRAY,
        const INT64 * RESTRICT FMM_CELLS,
        INT64 * RESTRICT OUT_BUFF
    ){{
        for(INT64 ix=0 ; ix<N ; ix++){{
            INT64 tmp_cell[3];
            get_fmm_cell(FMM_CELLS[IDS_ARRAY[ix]], FMM_CELLS_PER_SIDE, tmp_cell);
            OUT_BUFF[ix] = gcell_to_lcell(CELL_DATA_OFFSET, LOCAL_STORE_DIMS, tmp_cell);
        }}
        return 0;
    }}

    static inline void check_move(
        const REAL * RESTRICT extent,
        const REAL * RESTRICT p,
        const REAL * RESTRICT pp,
        int * RESTRICT err,
        const INT64 px,
        const INT64 movx
    ){{
        if (pp[0] < -0.5 * extent[0]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if (pp[1] < -0.5 * extent[1]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if (pp[2] < -0.5 * extent[2]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if (pp[0] > 0.5 * extent[0]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if (pp[1] > 0.5 * extent[1]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if (pp[2] > 0.5 * extent[2]) {{ err[0]++; printf("ERROR: Proposed position is outside domain (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}

        REAL d0 = p[0] - pp[0];
        REAL d1 = p[1] - pp[1];
        REAL d2 = p[2] - pp[2];

        {CHECK_MOD}

        if ( (d0*d0) > ({MAX_MOVE_2}) ){{ err[0]++; printf("ERROR: Proposed move violates max_move (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if ( (d1*d1) > ({MAX_MOVE_2}) ){{ err[0]++; printf("ERROR: Proposed move violates max_move (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
        if ( (d2*d2) > ({MAX_MOVE_2}) ){{ err[0]++; printf("ERROR: Proposed move violates max_move (%ld, %ld). %f %f %f -> %f %f %f\n", px, movx, p[0], p[1], p[2], pp[0], pp[1], pp[2]);}}
    }}

    extern "C" int setup_move(
        const INT64 npart_local,
        const INT64 max_prop,
        const INT64 * RESTRICT site_max_counts,
        const INT64 * RESTRICT current_sites,
        const REAL * RESTRICT current_positions,
        const REAL * RESTRICT current_charges,
        const INT64 * RESTRICT current_ids,
        const INT64 * RESTRICT current_fmm_cells,
        const REAL * RESTRICT prop_positions,
        const INT64 * RESTRICT prop_masks,
        const INT64 prop_charge_flag,
        const REAL * RESTRICT prop_charges,
        INT64 * RESTRICT rate_location,
        REAL * RESTRICT new_positions,
        REAL * RESTRICT new_charges,
        INT64 * RESTRICT new_ids,
        INT64 * RESTRICT new_fmm_cells,
        REAL * RESTRICT new_shifted_positions,
        REAL * RESTRICT old_positions,
        REAL * RESTRICT old_charges,
        INT64 * RESTRICT old_ids,
        INT64 * RESTRICT old_fmm_cells,
        INT64 * RESTRICT exclusive_sum,
        INT64 * RESTRICT num_particles,
        INT64 * RESTRICT total_movs,
        const REAL * RESTRICT extent,
        const INT64 * RESTRICT fmm_cells_per_side,
        const INT64 * RESTRICT cell_data_offset,
        const INT64 * RESTRICT local_store_dims,
        const INT64 * RESTRICT upper_allowed,
        const INT64 * RESTRICT lower_allowed,
        const INT64 * RESTRICT local_offset
    ){{
        int err = 0;
        INT64 es_tmp = 0;
        INT64 old_ind = 0;

        for(INT64 px=0 ; px<npart_local ; px++){{
            // Compute the exclusive sum
            INT64 es_inner = 0;
            INT64 prop_found = 0;
            INT64 max_prop_count = site_max_counts[current_sites[px]];
            for(INT64 movx=0 ; (movx<max_prop) && (prop_found<max_prop_count) ; movx++){{
                const INT64 mask = prop_masks[px*max_prop + movx];
                if (mask > 0){{
                    es_inner++;
                    prop_found++;
                }}
            }}

            // if moves involve this particle we need the data
            if (es_inner > 0){{
                exclusive_sum[old_ind] = es_tmp;
                es_tmp += es_inner;
                old_ids[old_ind] = px;
                old_ind++;
            }}
        }}

        exclusive_sum[old_ind] = es_tmp;
        *num_particles = old_ind;
        *total_movs = es_tmp;

        // now move the data
        #pragma omp parallel for schedule(dynamic) reduction(+:err)
        for(INT64 oind=0 ; oind<old_ind; oind++ ){{
            const INT64 px = old_ids[oind];

            old_positions[oind*3 + 0] = current_positions[px*3 + 0];
            old_positions[oind*3 + 1] = current_positions[px*3 + 1];
            old_positions[oind*3 + 2] = current_positions[px*3 + 2];
            old_charges[oind] = current_charges[px];
            old_ids[oind] = current_ids[px];

            INT64 tmp_cell[3] = {{0,0,0}};
            get_fmm_cell(current_fmm_cells[px], fmm_cells_per_side, tmp_cell);
            old_fmm_cells[oind] = gcell_to_lcell(cell_data_offset, local_store_dims, &tmp_cell[0]);

            const INT64 prop_count = exclusive_sum[oind+1] - exclusive_sum[oind];
            const INT64 nstart = exclusive_sum[oind];

            INT64 prop_found = 0;
            for(INT64 movx=0 ; ((movx<max_prop) && (prop_found < prop_count)) ; movx++){{
                const INT64 mask = prop_masks[px*max_prop + movx];
                if(mask > 0){{
                    const INT64 nind = nstart + prop_found;
                    const INT64 prop_ind = px*max_prop*3 + (movx*3);

                    new_positions[nind*3 + 0] = prop_positions[prop_ind + 0];
                    new_positions[nind*3 + 1] = prop_positions[prop_ind + 1];
                    new_positions[nind*3 + 2] = prop_positions[prop_ind + 2];

                    check_move(extent, &current_positions[px*3], &prop_positions[prop_ind], &err, px, movx);

                    INT64 tmp_cell[3] = {{0,0,0}};
                    REAL tmp_pos[3] = {{0.0, 0.0, 0.0}};
                    get_cell(
                        &prop_positions[prop_ind],
                        extent,
                        fmm_cells_per_side,
                        upper_allowed,
                        lower_allowed,
                        local_offset,
                        &tmp_cell[0],
                        &tmp_pos[0]);

                    new_fmm_cells[nind] = gcell_to_lcell(cell_data_offset, local_store_dims, &tmp_cell[0]);
                    new_shifted_positions[nind*3 + 0] = tmp_pos[0];
                    new_shifted_positions[nind*3 + 1] = tmp_pos[1];
                    new_shifted_positions[nind*3 + 2] = tmp_pos[2];

                    const INT64 charge_prop_ind = px * max_prop + movx;
                    new_charges[nind] = (prop_charge_flag > 0) ? prop_charges[charge_prop_ind] : current_charges[px];
                    new_ids[nind] = current_ids[px];
                    rate_location[nind] = px*max_prop + movx;

                    prop_found++;
                }}
            }}
        }}

        return err;
    }}
    """.format(CELL_GEN=cell_gen,
               CHECK_MOD=check_mod,
               MAX_MOVE_2=str(self.max_move * self.max_move))

    header = r"""
    #include <stdint.h>
    #include <stdio.h>
    """

    # Both entry points live in the same compiled library.
    _lib = simple_lib_creator(header, src)
    self._dat_lib = _lib["setup_move"]
    self._get_fmm_lcell_lib = _lib["get_fmm_lcell"]
def test_sph_gen_1():
    """
    Compare the spherical-harmonic values produced by the code generated
    from ``SphGen`` against a reference built from ``scipy.special.lpmv``.

    For 10 random angle pairs and every (l, m) with l <= 26 the real and
    imaginary parts must agree to within 1e-13.
    """
    lmax = 26
    N = 10
    row_width = 2*lmax + 1
    M = (lmax+1) * row_width

    sph_gen = SphGen(lmax)

    # One assignment per (l, m) and per real/imaginary part, real first.
    lhs = '{arr}[NSTRIDE * ix + LMAX * {lx} + LOFFSET + {mx}] = '
    statements = []
    for lx in range(lmax+1):
        for mx in range(-lx, lx+1):
            for part, arr in enumerate(('re_out', 'im_out')):
                statements.append(
                    lhs.format(arr=arr, lx=lx, mx=mx) +
                    str(sph_gen.get_y_sym(lx, mx)[part]) + ';\n'
                )
    assign_gen = ''.join(statements)

    src_template = """
    #define LMAX ({LMAX})
    #define LOFFSET ({LOFFSET})
    #define N ({N})
    #define NSTRIDE ({NSTRIDE})

    extern "C" int test_sph_gen(
        const double * RESTRICT theta_set,
        const double * RESTRICT phi_set,
        double * RESTRICT re_out,
        double * RESTRICT im_out
    ){{
        #pragma omp parallel for
        for (int ix=0; ix<N ; ix++){{
            const double theta = theta_set[ix];
            const double phi = phi_set[ix];
            {SPH_GEN}
            {ASSIGN_GEN}
        }}
        return 0;
    }}
    """
    src = src_template.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        LMAX=2*lmax+1,
        LOFFSET=lmax,
        N=N,
        NSTRIDE=M
    )
    header = str(sph_gen.header)

    lib = simple_lib_creator(header_code=header, src_code=src)['test_sph_gen']

    re_out = np.zeros((N, lmax+1, 2*lmax+1), dtype=c_double)
    im_out = np.zeros_like(re_out)

    rng = np.random.RandomState(1234)
    theta_set = np.array(rng.uniform(low=0.0, high=math.pi, size=N), dtype=c_double)
    phi_set = np.array(rng.uniform(low=0.0, high=2.*math.pi, size=N), dtype=c_double)

    lib(
        theta_set.ctypes.get_as_parameter(),
        phi_set.ctypes.get_as_parameter(),
        re_out.ctypes.get_as_parameter(),
        im_out.ctypes.get_as_parameter()
    )

    tol = 10.**-13
    for ix, (theta, phi) in enumerate(zip(theta_set, phi_set)):
        for lx in range(lmax + 1):
            orders = list(range(-1*lx, lx+1))
            # lpmv is evaluated at |m|, in the same order as ``orders``
            scipy_p = lpmv([abs(m) for m in orders], lx, np.cos(theta))

            for mxi, mx in enumerate(orders):
                val = math.sqrt(math.factorial(
                    lx - abs(mx))/math.factorial(lx + abs(mx)))
                val *= scipy_p[mxi]

                scipy_real = math.cos(mx * phi) * val
                scipy_imag = math.sin(mx * phi) * val

                re_err = abs(scipy_real - re_out[ix, lx, lmax + mx])
                im_err = abs(scipy_imag - im_out[ix, lx, lmax + mx])

                assert re_err < tol
                assert im_err < tol
def build_compress_lib(state):
    """
    Generate and build the C ``compress`` function for ``state``.

    The generated function fills the empty slots listed in ``slots`` by
    copying rows from the end of every particle dat, and writes the
    resulting row count to ``n_new_out[0]``.

    :arg state: object whose ``particle_dats`` lists dat attribute names;
        each named dat provides ``ctype`` and ``ncomp``.
    :returns: the ``compress`` entry of the library created by
        ``build.simple_lib_creator``.
    """
    names = state.particle_dats

    def _dat(name):
        return getattr(state, name)

    header_code = ''

    # one pointer argument per particle dat
    dat_args = ','.join(
        '{} * D_{}'.format(_dat(name).ctype, name) for name in names
    )

    # per-dat loop copying one row from index ``src`` to index ``dest``
    copy_template = '''
        for(int ix=0 ; ix<{0} ; ix++){{
        {1}[dest*{0}+ix] = {1}[src*{0}+ix];}}
        '''
    copy_code = '\n'.join(
        copy_template.format(str(_dat(name).ncomp), 'D_{}'.format(name))
        for name in names
    )

    src_template = '''
    extern "C"
    int compress(
        const int slots_to_fill_in,
        const int n_new_in,
        const int * slots,
        int * n_new_out,
        %(ARGS)s
    ){
        int slots_to_fill = slots_to_fill_in;
        int n_new = n_new_in;

        int last_slot;
        int last_slot_lookup_index = slots_to_fill - 1;
        int dest_index = 0;
        int dest = -1;

        // Whilst there are slots to fill and the current slot is not past the
        // end of the array.
        if (n_new > 0) {
            while ( (dest_index <= last_slot_lookup_index) && (slots[dest_index] < n_new) ){

                // get first empty slot in particle dats.
                dest = slots[dest_index];
                int src = -1;

                //loop from end to empty slot
                for (int iy = n_new - 1; iy > dest; iy--){
                    if (iy == slots[last_slot_lookup_index]){
                        n_new = iy;
                        last_slot_lookup_index--;
                        //printf("n_new=%%d \\n", n_new);
                    } else {
                        src = iy;
                        break;
                    }
                }

                if (src > 0){
                    \n%(DYN_DAT_CODE)s
                    n_new = src;
                } else {
                    n_new = slots[last_slot_lookup_index];
                    break;
                }

                dest_index++;
            }
        }

        n_new_out[0] = n_new;
        return 0;}
    '''
    src = src_template % {
        'DYN_DAT_CODE': copy_code,
        'ARGS': dat_args
    }

    compiled = build.simple_lib_creator(header_code, src, 'compress')
    return compiled['compress']
def build_pack_lib(state):
    """Build the C ``move_pack`` routine for the particle dats of ``state``.

    The generated function walks, for each of 26 send directions, a linked
    list stored in ``_direction_id_list`` and copies every listed particle's
    row of each dat into the byte buffer ``_S_BUF``. Rows of a
    ``data.PositionDat`` have ``SHIFT[3*_dir + k]`` added component-wise
    before being copied. The index of every packed particle is recorded in
    ``_empty_slot_store``.

    :param state: object exposing ``particle_dats`` (iterable of attribute
        names) and, for each name, an attribute with ``ctype``, ``dtype`` and
        ``ncomp``.
    :return: the ``move_pack`` entry of the library created by
        ``build.simple_lib_creator``.
    """
    dat_names = state.particle_dats

    arg_decls = []
    for name in dat_names:
        arg_decls.append(
            'const {} * D_{}'.format(getattr(state, name).ctype, name))
    args = ','.join(arg_decls)

    # One packing fragment per dat, emitted in particle_dats order.
    fragments = []
    for name in dat_names:
        dat = getattr(state, name)
        elem_bytes = ctypes.sizeof(dat.dtype)
        subs = {
            'DTYPE': dat.ctype,
            'DBYTE': str(elem_bytes),
            'TBYTE': str(dat.ncomp * elem_bytes),
            'NCOMP': str(dat.ncomp),
            'NAME': str(name)
        }

        if type(dat) is not data.PositionDat:
            assert dat.ncomp > 0, "move not defined for 0 component dats"
            fragments.append('''
            memcpy(_S_BUF, &D_%(NAME)s[_ix * %(NCOMP)s], %(TBYTE)s);
            _S_BUF += %(TBYTE)s;
            ''' % subs)
            continue

        # Positions are shifted into the receiver's frame before packing.
        assert dat.ncomp == 3, "move only defined in 3D"
        fragments.append('''
            %(DTYPE)s _pos_tmp[3];
            _pos_tmp[0]=D_%(NAME)s[_ix* %(NCOMP)s ]+SHIFT[(_dir*3) ];
            _pos_tmp[1]=D_%(NAME)s[_ix* %(NCOMP)s + 1]+SHIFT[(_dir*3) + 1];
            _pos_tmp[2]=D_%(NAME)s[_ix* %(NCOMP)s + 2]+SHIFT[(_dir*3) + 2];
            memcpy(_S_BUF, _pos_tmp, 3*%(DBYTE)s);
            _S_BUF += %(TBYTE)s;
            ''' % subs)

    header_src = '''
    #include<string.h>
    #include<stdint.h>
    '''

    lib_src = '''
    extern "C"
    int move_pack(
        uint8_t * _S_BUF,
        const double * SHIFT,
        const int * _direction_id_list,
        int * _empty_slot_store,
        %(ARGS)s
    ){
        // Next free space in send buffer.
        int _slot_index = 0;
        //loop over the send directions.
        for(int _dir = 0; _dir < 26; _dir++){
            //traverse linked list.
            int _ixd = _direction_id_list[_dir];
            while(_ixd > -1){
                //Generate code based on ParticleDats
                int _ix = _direction_id_list[_ixd];
                \n%(DYNAMIC_DATS)s
                _empty_slot_store[_slot_index] = _ix;
                _slot_index += 1;
                _ixd = _direction_id_list[_ixd+1];
            }
        }
        return 0;}''' % {
        'DYNAMIC_DATS': ''.join(fragments),
        'ARGS': args
    }

    compiled = build.simple_lib_creator(header_src, lib_src, 'move_pack')
    return compiled['move_pack']
def _init_host_point_eval(self):
    """Compile the host ``direct_point_eval`` kernel.

    The generated C function loops (OpenMP parallel) over ``N`` evaluation
    points. For each point it finds the containing cell with ``get_cell``,
    maps it to a local cell index with ``gcell_to_lcell``, visits the 27
    cells ``ic + offsets[j]`` and adds ``charge / r`` for every particle
    stored in ``d_pdata`` (5 reals per particle: x, y, z, charge, plus one
    value that is not read here). The sum is accumulated into
    ``d_potential[idx]``.

    The compiled entry point is stored on ``self._host_point_eval_lib``.

    NOTE(review): ``get_cell`` scales ``cell[d]`` with
    ``fmm_cells_per_side[d]`` but clamps it against
    ``fmm_cells_per_side[2 - d]``; these agree only when the number of cells
    is the same in every dimension -- confirm that is always the case.
    """
    include_names = ('stdint.h', 'stdio.h', 'math.h', 'omp.h')
    includes = tuple(Include(name) for name in include_names)
    defines = (
        Define('REAL', 'double'),
        Define('INT64', 'int64_t'),
    )
    header = str(Module(includes + defines))

    # Braces are doubled because the template is passed through str.format.
    kernel_template = r"""

    static inline INT64 gcell_to_lcell(
        const INT64 * RESTRICT cell_data_offset,
        const INT64 * RESTRICT local_store_dims,
        const INT64 * cell
    ){{
        const INT64 c0 = cell[0] + cell_data_offset[2];
        const INT64 c1 = cell[1] + cell_data_offset[1];
        const INT64 c2 = cell[2] + cell_data_offset[0];
        return c0 + local_store_dims[2] * ( c1 + local_store_dims[1] * c2 );
    }}

    static inline void get_cell(
        const REAL * RESTRICT position,
        const REAL * RESTRICT extent,
        const INT64 * fmm_cells_per_side,
        INT64 * cell
    ){{
        REAL shifted_position[3];
        shifted_position[0] = position[0] + 0.5 * extent[0];
        shifted_position[1] = position[1] + 0.5 * extent[1];
        shifted_position[2] = position[2] + 0.5 * extent[2];
        const REAL w0 = fmm_cells_per_side[0] / extent[0];
        const REAL w1 = fmm_cells_per_side[1] / extent[1];
        const REAL w2 = fmm_cells_per_side[2] / extent[2];
        cell[0] = (INT64) (shifted_position[0] * w0);
        cell[1] = (INT64) (shifted_position[1] * w1);
        cell[2] = (INT64) (shifted_position[2] * w2);
        if (cell[0] >= fmm_cells_per_side[2]) {{ cell[0] = fmm_cells_per_side[2] - 1; }}
        if (cell[1] >= fmm_cells_per_side[1]) {{ cell[1] = fmm_cells_per_side[1] - 1; }}
        if (cell[2] >= fmm_cells_per_side[0]) {{ cell[2] = fmm_cells_per_side[0] - 1; }}
        return;
    }}

    extern "C" int direct_point_eval(
        const INT64 N,
        const REAL * RESTRICT d_positions,
        const REAL * RESTRICT d_pdata,
        const INT64 * RESTRICT d_cell_occ,
        const INT64 d_cell_stride,
        const INT64 * RESTRICT offsets,
        const INT64 * RESTRICT cell_data_offset,
        const INT64 * RESTRICT local_store_dims,
        const INT64 * RESTRICT fmm_cells_per_side,
        const REAL * RESTRICT extent,
        REAL * RESTRICT d_potential
    ){{
        int err = 0;
        INT64 max_cell = fmm_cells_per_side[0] * fmm_cells_per_side[1] * fmm_cells_per_side[2];
        #pragma omp parallel for schedule(dynamic)
        for( INT64 idx=0 ; idx < N ; idx++ ) {{
            INT64 ict[3];
            get_cell(&d_positions[idx*3], extent, fmm_cells_per_side, ict);
            const INT64 ic = gcell_to_lcell(cell_data_offset, local_store_dims, ict);
            const REAL ipx = d_positions[idx*3];
            const REAL ipy = d_positions[idx*3+1];
            const REAL ipz = d_positions[idx*3+2];
            REAL energy_red = 0.0;
            // loop over the jcells
            for(INT64 jcx=0 ; jcx<27 ; jcx++){{
                const INT64 jc = ic + offsets[jcx];
                // compute the offset into the cell data
                const INT64 offset = jc * d_cell_stride;
                const INT64 offset5 = 5 * jc * d_cell_stride;
                // loop over the particles in the j cell
                for(INT64 jx=0 ; jx<d_cell_occ[jc] ; jx++){{
                    const REAL jpx = d_pdata[offset5 + jx*5+0];
                    const REAL jpy = d_pdata[offset5 + jx*5+1];
                    const REAL jpz = d_pdata[offset5 + jx*5+2];
                    const REAL jch = d_pdata[offset5 + jx*5+3];
                    const REAL dx = ipx - jpx;
                    const REAL dy = ipy - jpy;
                    const REAL dz = ipz - jpz;
                    const REAL r2 = dx*dx + dy*dy + dz*dz;
                    const REAL contrib = jch / sqrt(r2);
                    energy_red += contrib;
                }}
            }}
            d_potential[idx] += energy_red;
        }}
        return err;
    }}
    """
    src = kernel_template.format()

    compiled = build.simple_lib_creator(
        header_code=header,
        src_code=src,
        name='kmc_fmm_direct_point_eval'
    )
    self._host_point_eval_lib = compiled['direct_point_eval']
def _init_host_kernels(self):
    """Compile the host ``direct_new`` and ``direct_old`` kernels.

    Both generated C functions share one loop nest: an OpenMP parallel loop
    over ``num_movs`` entries which, for each entry, visits the 27 cells
    ``d_fmm_cells[idx] + offsets[j]`` and accumulates ``charge / r`` over the
    particles stored in ``d_pdata`` (5 reals per particle; x, y, z and charge
    are read). The accumulated value is multiplied by ``d_charges[idx]`` and
    written to ``d_energy[idx]``.

    * ``direct_new`` adds every contribution.
    * ``direct_old`` skips particles whose id in ``d_pdata_ids`` equals
      ``d_ids[idx]``.

    ``div_count[0]`` receives the total number of particle slots visited
    (the sum of ``d_cell_occ[jc]`` over all visited cells).

    The library is stored on ``self._host_lib`` and the two entry points on
    ``self._host_direct_new`` / ``self._host_direct_old``.
    """
    header = str(
        Module((
            Include('stdint.h'),
            Include('stdio.h'),
            Include('math.h'),
            Include('omp.h'),
            Define('REAL', 'double'),
            Define('INT64', 'int64_t')
        ))
    )

    LIB_PARAMETERS = """
            const INT64 num_movs,
            const INT64 * RESTRICT lsd,
            const INT64 * RESTRICT offsets,
            const REAL * RESTRICT d_positions,
            const REAL * RESTRICT d_charges,
            const INT64 * RESTRICT d_ids,
            const INT64 * RESTRICT d_fmm_cells,
            const REAL * RESTRICT d_pdata,
            const INT64 * RESTRICT d_pdata_ids,
            const INT64 * RESTRICT d_cell_occ,
            const INT64 d_cell_stride,
            REAL * RESTRICT d_energy,
            INT64 * RESTRICT div_count"""

    # Opens three nested loops (idx, jcx, jx); they are closed by common_2.
    common_1 = r"""
    INT64 tmp_div_count = 0;
    #pragma omp parallel for schedule(dynamic) reduction(+:tmp_div_count)
    for( INT64 idx=0 ; idx< num_movs ; idx++ ) {{
        const INT64 ic = d_fmm_cells[idx];
        const REAL ipx = d_positions[idx*3];
        const REAL ipy = d_positions[idx*3+1];
        const REAL ipz = d_positions[idx*3+2];
        REAL energy_red = 0.0;
        // loop over the jcells
        for(INT64 jcx=0 ; jcx<27 ; jcx++){{
            const INT64 jc = ic + offsets[jcx];
            // compute the offset into the cell data
            const INT64 offset = jc * d_cell_stride;
            const INT64 offset5 = 5 * jc * d_cell_stride;
            tmp_div_count += d_cell_occ[jc];
            // loop over the particles in the j cell
            for(INT64 jx=0 ; jx<d_cell_occ[jc] ; jx++){{
                const REAL jpx = d_pdata[offset5 + jx*5+0];
                const REAL jpy = d_pdata[offset5 + jx*5+1];
                const REAL jpz = d_pdata[offset5 + jx*5+2];
                const REAL jch = d_pdata[offset5 + jx*5+3];
                const REAL dx = ipx - jpx;
                const REAL dy = ipy - jpy;
                const REAL dz = ipz - jpz;
                const REAL r2 = dx*dx + dy*dy + dz*dz;
                const REAL contrib = jch / sqrt(r2);
    """.format()

    # new/old part goes here ->

    # The store to *div_count must come AFTER the parallel loop closes.
    # Inside the loop tmp_div_count is each thread's private reduction copy,
    # so writing it through the shared pointer there is a data race and
    # yields a per-thread partial sum rather than the reduced total.
    common_2 = r"""
            }}
        }}
        energy_red *= d_charges[idx];
        d_energy[idx] = energy_red;
    }}
    *div_count = tmp_div_count;
    return 0;
    """.format()

    src = r"""
    {HEADER}

    extern "C" int direct_new(
    {LIB_PARAMETERS}
    ) {{
    {COMMON_1}
    energy_red += contrib;
    {COMMON_2}
    }}

    extern "C" int direct_old(
    {LIB_PARAMETERS}
    ) {{
    {COMMON_1}
    energy_red += (d_pdata_ids[offset + jx] != d_ids[idx]) ? contrib : 0.0;
    {COMMON_2}
    }}
    """.format(
        HEADER=header,
        COMMON_1=common_1,
        COMMON_2=common_2,
        LIB_PARAMETERS=LIB_PARAMETERS
    )

    # The header is already embedded at the top of ``src``.
    self._host_lib = build.simple_lib_creator(
        header_code=' ', src_code=src, name='kmc_fmm_direct_host')

    self._host_direct_new = self._host_lib["direct_new"]
    self._host_direct_old = self._host_lib["direct_old"]