def local_search(points, weights, distances, upper_bound, lower_bound, random_states):
    """Randomly perturb one point coordinate by coordinate and keep the first improvement.

    Each thread works on the row ``points[cuda.threadIdx.x]``. A private copy
    of the first 20 coordinates is perturbed one coordinate at a time, moving
    a random fraction of the distance towards ``upper_bound`` or
    ``lower_bound``. After every perturbation the copy is scored with
    ``qap_device``; as soon as it scores lower than the stored point it is
    written back and the search stops. Perturbations accumulate in the copy
    when no improvement has been found yet.

    Args:
        points: 2-D array of candidate points, updated in place.
        weights: passed through to ``qap_device``.
        distances: passed through to ``qap_device``.
        upper_bound: upper limit used when stepping upwards.
        lower_bound: lower limit used when stepping downwards.
        random_states: xoroshiro128p RNG states, indexed by thread.
    """
    thread_id = cuda.threadIdx.x
    # Threads without a matching row must not touch ``points`` at all.
    if thread_id >= len(points):
        return

    # The size of a local array has to be a compile-time constant,
    # hence the literal 20.
    tmp_point = cuda.local.array(20, float64)
    for i in range(20):
        tmp_point[i] = points[thread_id][i]

    # The stored point does not change until an improvement is accepted
    # (and the loop exits right after), so its cost is evaluated once.
    # NOTE(review): assumes qap_device has no side effects - confirm.
    current_cost = qap_device(points[thread_id], weights, distances)

    for index in range(20):
        direction = xoroshiro128p_uniform_float32(random_states, thread_id) > 0.5
        step = xoroshiro128p_uniform_float32(random_states, thread_id)
        if direction:
            length = upper_bound - tmp_point[index]
            tmp_point[index] = tmp_point[index] + step * length
        else:
            length = tmp_point[index] - lower_bound
            tmp_point[index] = tmp_point[index] - step * length

        candidate_cost = qap_device(tmp_point, weights, distances)
        if candidate_cost < current_cost:
            for i in range(20):
                points[thread_id][i] = tmp_point[i]
            break
def loop(_, r, V, rngs, w, d, tavg, bold_state, bold_out, I, Delta, eta, tau, J, cr, cv, r_sigma, V_sigma):
    """Advance the delayed, coupled (r, V) system through one history buffer.

    Every thread (global x index ``it``) integrates its own copy of the
    network. For each time slot and each node the kernel

    1. accumulates the delayed coupling terms ``rc`` and ``Vc`` over all
       source nodes using ``cfpre`` / ``cfpost``,
    2. draws two uniform numbers and converts them to ``z0`` / ``z1`` with
       the Box-Muller formulas,
    3. calls ``rk4_rV`` and stores the values read back from ``nrV`` into
       the next time slot of ``r`` and ``V``,
    4. adds the new values, scaled by ``o_nh``, to ``tavg`` and writes the
       result of ``fmri_gpu`` to ``bold_out``.

    ``nh``, ``nn``, ``nto``, ``blockDim_x``, ``o_nh``, ``pi`` and ``dt`` are
    read from the enclosing scope.
    """
    # Global thread index along x, total thread count, and index in the block.
    it = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    nt = cuda.blockDim.x * cuda.gridDim.x
    itx = cuda.threadIdx.x
    o_tau = nb.float32(1 / tau)
    assert r.shape[0] == V.shape[0] == nh  # shape asserts help numba optimizer
    assert r.shape[1] == V.shape[1] == nn
    # Shared scratch buffer with one column per thread of the block.
    # presumably rk4_rV writes its result into nrV[:, itx] - verify against rk4_rV
    nrV = cuda.shared.array((2, blockDim_x), nb.float32)
    # Clear this thread's slice of the temporal-average accumulator.
    for j in range(nto):
        for i in range(nn):
            tavg[j, 0, i, it] = nb.float32(0.0)
            tavg[j, 1, i, it] = nb.float32(0.0)
    for t0 in range(-1, nh - 1):
        # The first iteration (t0 == -1) reads from the last buffer slot.
        t = nh - 1 if t0 < 0 else t0
        t1 = t0 + 1
        # NOTE(review): for t0 == -1 this floor division yields -1, i.e. a
        # negative first index into tavg - confirm that is intended.
        t0_nto = t0 // (nh // nto)
        for i in range(nn):
            rc = nb.float32(0)  # using array here costs 50%+
            Vc = nb.float32(0)
            for j in range(nn):
                # Delayed time index wrapped with a bit mask; this equals a
                # modulo only if nh is a power of two - TODO confirm.
                dij = (t - d[i, j] + nh) & (nh - 1)
                rc += w[i, j] * cfpre(r[dij, j, it], r[t, i, it])
                Vc += w[i, j] * cfpre(V[dij, j, it], V[t, i, it])
            rc = cfpost(rc)
            Vc = cfpost(Vc)
            # RNG + Box Muller
            pi_2 = nb.float32(np.pi * 2)
            # The RNG state index depends on (t1, i, it); u2 uses the state
            # nt entries after the one used for u1.
            u1 = xoroshiro128p_uniform_float32(
                rngs, t1 * nt * nn * 2 + i * nt * 2 + it)
            u2 = xoroshiro128p_uniform_float32(
                rngs, t1 * nt * nn * 2 + i * nt * 2 + it + nt)
            # NOTE(review): math.log(u1) is -inf if u1 can be exactly 0 -
            # confirm the generator's output range.
            z0 = math.sqrt(-nb.float32(2.0) * math.log(u1)) * math.cos(
                pi_2 * u2)
            z1 = math.sqrt(-nb.float32(2.0) * math.log(u1)) * math.sin(
                pi_2 * u2)
            # RK4
            rk4_rV(nrV, r[t, i, it], V[t, i, it], o_tau, pi, tau, Delta, eta, J, I, cr, rc, cv, Vc, r_sigma, V_sigma, z0, z1)
            r[t1, i, it] = nrV[0, itx]
            V[t1, i, it] = nrV[1, itx]
            # Accumulate the scaled new state into the temporal average.
            tavg[t0_nto, 0, i, it] += nrV[0, itx] * o_nh
            tavg[t0_nto, 1, i, it] += nrV[1, itx] * o_nh
            bold_out[i, it] = fmri_gpu(it, bold_state[i], nrV[0, itx], dt)
def mc_pi(states, iterations, out):
    """Write a per-thread Monte Carlo estimate of pi into ``out``.

    Each thread draws ``iterations`` pairs of uniform numbers from its own
    RNG state, counts the pairs with ``x**2 + y**2 <= 1.0`` and stores
    ``4.0 * count / iterations`` at ``out[tid]``.
    """
    tid = cuda.grid(1)
    hits = 0
    for _ in range(iterations):
        px = xoroshiro128p_uniform_float32(states, tid)
        py = xoroshiro128p_uniform_float32(states, tid)
        # Skip samples that fall outside the unit circle.
        if not (px**2 + py**2 <= 1.0):
            continue
        hits += 1
    out[tid] = 4.0 * hits / iterations
def simulate_pi(rng_states, iterations, out):
    """Estimate pi in every thread from random points and store it in ``out``.

    A thread samples ``iterations`` (x, y) pairs with its own RNG state; the
    share of pairs satisfying ``x**2 + y**2 <= 1.0``, multiplied by 4, is
    written to ``out[thread_id]``.
    """
    thread_id = cuda.grid(1)
    n_inside = 0
    for _sample in range(iterations):
        sx = xoroshiro128p_uniform_float32(rng_states, thread_id)
        sy = xoroshiro128p_uniform_float32(rng_states, thread_id)
        n_inside += 1 if sx**2 + sy**2 <= 1.0 else 0
    out[thread_id] = 4.0 * n_inside / iterations
def compute_pi(rng_states, iterations, out):
    """Store a per-thread Monte Carlo estimate of pi in ``out[thread_id]``.

    Draws ``iterations`` random (x, y) points per thread and uses the
    fraction whose squared distance from the origin is at most 1.
    """
    thread_id = cuda.grid(1)

    in_circle = 0
    for _ in range(iterations):
        x = xoroshiro128p_uniform_float32(rng_states, thread_id)
        y = xoroshiro128p_uniform_float32(rng_states, thread_id)
        radius_sq = x**2 + y**2
        if radius_sq <= 1.0:
            in_circle += 1

    # Four times the hit fraction approximates pi.
    out[thread_id] = 4.0 * in_circle / iterations
def compute_pi(rng_states, iterations, out):
    """Compute one Monte Carlo approximation of pi per thread.

    The thread's RNG state supplies ``iterations`` (x, y) samples; the count
    of samples with ``x**2 + y**2 <= 1.0`` is turned into the estimate
    ``4.0 * count / iterations`` and written to ``out`` at the thread index.
    """
    gid = cuda.grid(1)
    count = 0
    for _trial in range(iterations):
        x_coord = xoroshiro128p_uniform_float32(rng_states, gid)
        y_coord = xoroshiro128p_uniform_float32(rng_states, gid)
        lands_inside = x_coord**2 + y_coord**2 <= 1.0
        if lands_inside:
            count += 1
    out[gid] = 4.0 * count / iterations
def mutate_kernel(d_next_gen, d_is_elite, rng_states, mutate_prob):
    r"""Mutate non-elite rows by swapping two randomly chosen columns.

    One thread handles one row of ``d_next_gen``. Rows flagged in
    ``d_is_elite`` are left untouched. For the others a uniform draw below
    ``mutate_prob`` triggers a swap of two column positions, each picked
    with its own uniform draw (the two positions may coincide).
    """
    row = cuda.grid(1)
    if row >= d_next_gen.shape[0]:
        return
    if d_is_elite[row] != False:
        return

    draw = xoroshiro128p_uniform_float32(rng_states, row)
    if not (draw < mutate_prob):
        return

    n_cols = d_next_gen.shape[1]
    draw = xoroshiro128p_uniform_float32(rng_states, row)
    first = int(math.floor(draw * n_cols))
    draw = xoroshiro128p_uniform_float32(rng_states, row)
    second = int(math.floor(draw * n_cols))

    d_next_gen[row, first], d_next_gen[row, second] = (
        d_next_gen[row, second],
        d_next_gen[row, first],
    )
def Busqueda_MetropolisCUDA(M,individuo,probabilidades,AristMono,numColores,numNodos,rng_states,id):
    """Metropolis neighbourhood search that moves single nodes between bags.

    Runs up to ``busqueda_vecindario`` iterations while ``AristMono`` (the
    running count of monochromatic edges) is non-zero. Each iteration picks
    a bag, picks a node stored in it, evaluates the node's monochromatic
    edge count in the current bag and in a randomly chosen other bag, and
    applies the move when ``probAcepta_gpu`` accepts the difference.

    Args:
        M: passed to ``NAMBolsa_gpu`` (presumably the graph's adjacency
            data - verify against that helper).
        individuo: 2-D membership table ``individuo[bag][node]`` with 0/1
            entries, modified in place.
        probabilidades: per-bag selection probabilities.
        AristMono: current number of monochromatic edges.
        numColores: number of bags.
        numNodos: number of nodes.
        rng_states: xoroshiro128p RNG states.
        id: index of the RNG state used by this call.

    NOTE(review): the updated ``AristMono`` is a local value and is not
    returned; only ``individuo`` carries the result back to the caller.
    """
    if AristMono != 0:
        for i in prange(busqueda_vecindario):
            member_flag = 0
            usable = 0
            while usable == 0:
                # Pick a bag with probability related to its number of nodes.
                bag = int(bolsaProbabilidad_gpu(probabilidades,numColores,rng_states,id))
                # Checks that the bag is not empty.
                usable = esVacia_gpu(individuo[bag],numNodos)
            # Draw node indices until one that is stored in the bag is hit.
            while member_flag == 0:
                # Random index from zero up to the number of nodes.
                node_idx = int(xoroshiro128p_uniform_float32(rng_states, id)*numNodos)
                member_flag = individuo[bag][node_idx]
            # Monochromatic edges of the node inside the chosen bag.
            monoAct = NAMBolsa_gpu(individuo[bag], node_idx, M,numNodos)
            new_bag = bolsaAleatoria_gpu(bag, numColores,rng_states,id)
            monopost = NAMBolsa_gpu(individuo[new_bag], node_idx, M,numNodos)
            delta = monopost - monoAct
            if probAcepta_gpu(delta,rng_states,id):
                AristMono = AristMono + delta
                # The membership table is indexed by the node index. The
                # previous code indexed it with the 0/1 membership flag,
                # so it always rewrote entry 1 instead of the chosen node.
                individuo[bag][node_idx] = 0  # Remove the node from the bag
                individuo[new_bag][node_idx] = 1  # Insert it into the other bag
            if AristMono == 0:
                break
def Busqueda_EscalandoCUDA(M, individuo, probabilidades, AristMono, numColores, numNodos, rng_states, id):
    """Hill-climbing neighbourhood search that moves single nodes between bags.

    Runs up to ``busqueda_vecindario`` iterations while ``AristMono`` (the
    running count of monochromatic edges) is non-zero. Each iteration picks
    a bag, picks a node stored in it and a random other bag, and moves the
    node only when that strictly lowers the node's monochromatic edge
    count.

    Args:
        M: passed to ``NAMBolsa_gpu`` (presumably the graph's adjacency
            data - verify against that helper).
        individuo: 2-D membership table ``individuo[bag][node]`` with 0/1
            entries, modified in place.
        probabilidades: per-bag selection probabilities.
        AristMono: current number of monochromatic edges.
        numColores: number of bags.
        numNodos: number of nodes.
        rng_states: xoroshiro128p RNG states.
        id: index of the RNG state used by this call.

    NOTE(review): the updated ``AristMono`` is a local value and is not
    returned; only ``individuo`` carries the result back to the caller.
    """
    if AristMono != 0:
        for i in prange(busqueda_vecindario):
            r = 0
            vacia = 0
            while vacia == 0:
                # Pick a bag with probability related to its number of nodes.
                bolsaProbabilistica = int(
                    bolsaProbabilidad_gpu(probabilidades, numColores, rng_states, id)
                )
                # Checks that the bag is not empty.
                vacia = esVacia_gpu(individuo[bolsaProbabilistica], numNodos)
            # Draw node indices until one that is stored in the bag is hit.
            while r == 0:
                # Random index from zero up to the number of nodes.
                nodo = int(
                    xoroshiro128p_uniform_float32(rng_states, id) * numNodos
                )
                r = individuo[bolsaProbabilistica][nodo]
            # Monochromatic edges of the node inside the chosen bag.
            monoAct = NAMBolsa_gpu(individuo[bolsaProbabilistica], nodo, M, numNodos)
            BolsaNueva = bolsaAleatoria_gpu(bolsaProbabilistica, numColores, rng_states, id)
            monopost = NAMBolsa_gpu(individuo[BolsaNueva], nodo, M, numNodos)
            delta = monopost - monoAct
            # Accept only improving moves. The previous test
            # (monopost > monoAct) accepted only moves that raised the
            # count, so AristMono could never reach the zero that ends
            # the search.
            if monopost < monoAct:
                AristMono = AristMono + delta
                individuo[bolsaProbabilistica][nodo] = 0  # Remove the node from the bag
                individuo[BolsaNueva][nodo] = 1  # Insert it into the other bag
            if AristMono == 0:
                break
def packet(rng_states, thread_id, q):
    """Return how many uniform draws were needed until one exceeded ``q``.

    The count includes the successful draw, so the result is at least 1.
    """
    draws = 1
    while not (xoroshiro128p_uniform_float32(rng_states, thread_id) > q):
        draws += 1
    return draws
def cross_over_1p(parent1, parent2, rng_states, d_pop, d_next_gen, i):
    r"""Build child row ``i`` with a one-point crossover of two parents.

    A random cut position is drawn. The child's leading ``cut`` entries are
    copied from ``parent1``; the remaining slots are filled, in order, with
    the entries of ``parent2`` that do not already appear in that leading
    part.
    """
    n_genes = d_pop.shape[1]
    cut = int(math.floor(xoroshiro128p_uniform_float32(rng_states, i) * n_genes))

    # Head of the child comes straight from the first parent.
    for col in range(cut):
        d_next_gen[i, col] = d_pop[parent1, col]

    # Tail: walk the second parent and skip values already in the head.
    write_at = cut
    for col in range(n_genes):
        candidate = d_pop[parent2, col]
        already_used = False
        for head_col in range(cut):
            if d_next_gen[i, head_col] == candidate:
                already_used = True
                break
        if already_used:
            continue
        d_next_gen[i, write_at] = candidate
        write_at += 1
        if write_at == n_genes:
            break
def scatter2(threadindex, rng_states, neutron):
    """Scatter a neutron, sampling polar and azimuthal angle uniformly.

    The velocity stored in ``neutron[3:6]`` keeps its magnitude and gets a
    new direction; the last entry of ``neutron`` is multiplied by
    ``sin(theta) * (pi / 2)``.
    """
    # Draw the polar angle first, then the azimuth.
    polar = xoroshiro128p_uniform_float32(rng_states, threadindex) * pi
    azimuth = xoroshiro128p_uniform_float32(rng_states, threadindex) * (2 * pi)
    cos_polar = cos(polar)
    sin_polar = sin(polar)
    sin_az = sin(azimuth)
    cos_az = cos(azimuth)

    # Speed of the incoming neutron.
    old_vx = neutron[3]
    old_vy = neutron[4]
    old_vz = neutron[5]
    speed = sqrt(old_vx * old_vx + old_vy * old_vy + old_vz * old_vz)

    # Same speed, new direction.
    neutron[3] = speed * sin_polar * cos_az
    neutron[4] = speed * sin_polar * sin_az
    neutron[5] = speed * cos_polar

    neutron[-1] *= sin_polar * (pi / 2)
    return
def tournament(rng_states, i, pop_size, d_nonelite, num_elites, d_fitness_all, tournament_size):
    r"""Pick one parent from the non-elite individuals by tournament.

    ``tournament_size`` candidates are drawn uniformly (with replacement)
    from ``d_nonelite`` and the one with the lowest entry in
    ``d_fitness_all`` is returned.

    Args:
        rng_states: xoroshiro128p RNG states.
        i: index of the RNG state to draw from.
        pop_size: unused here; kept for interface compatibility.
        d_nonelite: indices of the non-elite individuals.
        num_elites: unused here; kept for interface compatibility.
        d_fitness_all: fitness per individual, indexed by individual id.
        tournament_size: number of candidates that compete.

    Returns:
        The index (taken from ``d_nonelite``) of the winning individual.
    """
    num_nonelite = d_nonelite.size

    rnd = xoroshiro128p_uniform_float32(rng_states, i)
    parent = d_nonelite[int(math.floor(rnd * num_nonelite))]
    min_fitness = d_fitness_all[parent]

    for _ in range(tournament_size - 1):
        rnd = xoroshiro128p_uniform_float32(rng_states, i)
        # Keep the challenger separate from the current winner. The former
        # chained assignment overwrote ``parent`` on every draw, so the
        # last candidate always won.
        candidate = d_nonelite[int(math.floor(rnd * num_nonelite))]
        candidate_fitness = d_fitness_all[candidate]
        # ``min_fitness`` tracks the running minimum, so the challenger
        # wins only with a strictly lower fitness.
        if candidate_fitness < min_fitness:
            parent = candidate
            min_fitness = candidate_fitness
    return parent
def scatter(threadindex, rng_states, neutron):
    """Scatter a neutron, sampling the direction uniformly in cos(theta) and phi.

    The velocity stored in ``neutron[3:6]`` keeps its magnitude and is
    pointed along the newly drawn direction.
    """
    # Cosine of the polar angle, then the azimuth.
    mu = xoroshiro128p_uniform_float32(rng_states, threadindex) * 2 - 1
    azimuth = xoroshiro128p_uniform_float32(rng_states, threadindex) * (2 * pi)
    if mu > 1:
        mu = 1
    sin_polar = sqrt(1 - mu * mu)
    sin_az = sin(azimuth)
    cos_az = cos(azimuth)

    # Speed of the incoming neutron.
    old_vx = neutron[3]
    old_vy = neutron[4]
    old_vz = neutron[5]
    speed = sqrt(old_vx * old_vx + old_vy * old_vy + old_vz * old_vz)

    neutron[3] = speed * sin_polar * cos_az
    neutron[4] = speed * sin_polar * sin_az
    neutron[5] = speed * mu
    return
def find_spread_gpu(graph, active, new_active, new_ones, mc, p, rng_states):
    """Run one cascade simulation per thread until no new node is activated.

    In every round each node flagged in ``new_active`` tries to reach each
    node ``k`` with a truthy ``graph[j][k]``; an attempt succeeds when ``p``
    exceeds a fresh uniform draw. Reached nodes that were not active yet
    become active and form the next round's ``new_active`` set. Rows of
    ``active``, ``new_active`` and ``new_ones`` are selected by thread
    index and updated in place.
    """
    # Absolute position of the current thread.
    tid = cuda.grid(1)
    # With fixed block sizes some threads have no simulation to run.
    if tid >= mc:
        return

    n_nodes = new_active[tid].shape[0]
    n_targets = graph.shape[0]

    spreading = True
    while spreading:
        spreading = False

        # Attempt activation along every edge leaving a freshly active node.
        for src in range(n_nodes):
            if not new_active[tid][src]:
                continue
            for dst in range(n_targets):
                if graph[src][dst] and p > xoroshiro128p_uniform_float32(
                        rng_states, tid):
                    new_ones[tid][dst] = True

        # Promote newly reached nodes and reset the scratch flags.
        for node in range(n_nodes):
            if new_ones[tid][node] and (not active[tid][node]):
                active[tid][node] = True
                new_active[tid][node] = True
                spreading = True
            else:
                new_active[tid][node] = False
            new_ones[tid][node] = False
def move(rng_states, start_x, start_y, out_x, out_y, doms, rs, domhits, domhitstimes):
    """Random-walk one particle per thread until it is absorbed.

    The particle starts at (``start_x``, ``start_y``) with a random heading
    and advances one unit step per iteration. In every step it re-draws its
    heading with probability 0.02 and is absorbed with probability 0.02.
    Whenever the position lies within radius ``rs[i]`` of ``doms[i]`` the
    hit counter and hit time for that entry are recorded and the walk ends
    after the current step. The final position goes to ``out_x`` /
    ``out_y``.
    """
    tid = cuda.grid(1)

    pos_x = start_x
    pos_y = start_y
    heading = xoroshiro128p_uniform_float32(rng_states, tid)*math.pi*2
    vel_x = math.cos(heading)
    vel_y = math.sin(heading)

    elapsed = 0
    alive = True
    while alive:
        # Occasionally pick a new direction.
        if xoroshiro128p_uniform_float32(rng_states, tid) < 0.02:
            heading = xoroshiro128p_uniform_float32(rng_states, tid)*math.pi*2
            vel_x = math.cos(heading)
            vel_y = math.sin(heading)
        # Occasionally get absorbed; the current step is still completed.
        if xoroshiro128p_uniform_float32(rng_states, tid) < 0.02:
            alive = False

        pos_x += vel_x
        pos_y += vel_y

        # Check every detector; all detectors in range register the hit.
        for k in range(len(doms)):
            dx = doms[k,0] - pos_x
            dy = doms[k,1] - pos_y
            if rs[k] >= (math.sqrt(dx**2 + dy**2)):
                domhits[tid, k] += 1
                domhitstimes[tid, k] = elapsed
                alive = False
        elapsed += 1

    out_x[tid] = pos_x
    out_y[tid] = pos_y
def rng_kernel_float32(states, out, count, distribution):
    """Fill this thread's ``count`` consecutive slots of ``out`` with random float32 values.

    ``distribution`` selects the generator: ``UNIFORM`` or ``NORMAL``. Any
    other value leaves ``out`` untouched.
    """
    tid = cuda.grid(1)
    first_slot = tid * count
    for offset in range(count):
        slot = first_slot + offset
        if distribution == UNIFORM:
            out[slot] = xoroshiro128p_uniform_float32(states, tid)
        elif distribution == NORMAL:
            out[slot] = xoroshiro128p_normal_float32(states, tid)
def _detect_gpu(matrix, vec, rng_states):
    """Draw a random index pair per thread and record it if its score is positive.

    Two indices ``x`` and ``y`` are drawn in ``[0, matrix.shape[0])``. For
    both orderings of the pair a score is accumulated over all columns
    other than the two indices themselves, from the distance difference
    ``abs(m - n) - abs(m_inv - n)`` times the entry difference
    ``matrix[m, n] - matrix[m_inv, n]``. If the total is positive the pair
    is written to ``vec[thread_id]``.
    """
    tid = cuda.grid(1)
    if tid >= vec.shape[0]:
        return

    size = matrix.shape[0]
    x = int(xoroshiro128p_uniform_float32(rng_states, tid) * size)
    y = int(xoroshiro128p_uniform_float32(rng_states, tid) * size)

    score = 0
    for row in (x, y):
        other = x + y - row  # the partner index of ``row`` within the pair
        for col in range(size):
            if col == row or col == other:
                continue
            if matrix[row, col] > 0 or matrix[other, col] > 0:
                score += (abs(row - col) - abs(other - col)) * (
                    matrix[row, col] - matrix[other, col])

    if score > 0:
        vec[tid, 0] = x
        vec[tid, 1] = y
def recombination(inp_weights, out_weights, n_inp_ia, tot_ia, n_weights, rng_states):
    """Create one output weight row per thread by single-point crossover.

    Args:
        inp_weights: weight rows of the input individuals.
        out_weights: destination rows, one per generated individual.
        n_inp_ia: number of individuals in the input.
        tot_ia: total number of individuals to generate.
        n_weights: number of weights per individual.
        rng_states: xoroshiro128p RNG states, indexed by thread.

    Two input rows and a cut position are drawn at random; weights before
    the cut come from the first row, the rest from the second.
    """
    pos = cuda.grid(1)
    if pos >= tot_ia:
        return

    first = int(xoroshiro128p_uniform_float32(rng_states, pos) * n_inp_ia)
    second = int(xoroshiro128p_uniform_float32(rng_states, pos) * n_inp_ia)
    cut = int(xoroshiro128p_uniform_float32(rng_states, pos) * n_weights)

    for w in range(n_weights):
        donor = first if w < cut else second
        out_weights[pos][w] = inp_weights[donor][w]
def sample_kernel(rng_states,weight,old_particle_pos,particle_pos):
    """Resample the particle that belongs to this block.

    The block index selects the row of ``particle_pos`` that is written.
    With probability 0.01 the particle is placed at a uniformly random
    position scaled by ``max_x`` / ``max_y``. Otherwise an old particle is
    chosen by walking through ``weight`` with a uniform draw and its
    position is copied; if the draw is never exhausted the row is left
    unchanged.
    """
    particle = int(cuda.blockIdx.x)
    gid = cuda.grid(1)

    if xoroshiro128p_uniform_float32(rng_states,gid)<0.01:
        # Random re-injection.
        particle_pos[particle][0]=xoroshiro128p_uniform_float32(rng_states,gid)*max_x
        particle_pos[particle][1]=xoroshiro128p_uniform_float32(rng_states,gid)*max_y
        return

    # Subtract weights from the draw until it would drop below zero.
    remaining=xoroshiro128p_uniform_float32(rng_states,gid)
    for src in range(len(weight)):
        if remaining-weight[src]<0:
            particle_pos[particle][0]=old_particle_pos[src][0]
            particle_pos[particle][1]=old_particle_pos[src][1]
            break
        remaining-=weight[src]
def bolsaAleatoriaProbabilidadCUDA(probabilidades, numColores, rng_states, id):
    """Pick a bag index at random according to ``probabilidades``.

    A uniform draw is compared against the running sum of the bag
    probabilities; the first bag whose interval contains the draw is
    returned. If no interval matches, the last bag index is returned.
    """
    # Uniform random number.
    draw = xoroshiro128p_uniform_float32(rng_states, id)
    lower = 0
    for i in range(numColores):
        upper = lower + probabilidades[i]
        # Does the draw fall inside the interval that belongs to bag i?
        if draw >= lower and draw < upper:
            return i
        lower = upper
    return i
def mutation(inp_weights, n_ia, n_weights, rng_states, prob=0.1):
    """Randomly perturb single weights in place.

    The block index selects the individual and the thread index within the
    block selects the weight. With probability ``prob`` a normally
    distributed sample divided by 5 is added to that weight.
    """
    weight_idx = cuda.threadIdx.x  # thread id in a 1D block
    ia_idx = cuda.blockIdx.x  # block id in a 1D grid
    if ia_idx >= n_ia or weight_idx >= n_weights:
        return

    gid = cuda.grid(1)
    if xoroshiro128p_uniform_float32(rng_states, gid) < prob:
        noise = xoroshiro128p_normal_float32(rng_states, gid)
        inp_weights[ia_idx][weight_idx] += noise / 5.
def recombination_2(inp_weights, out_weights, n_inp_ia, tot_ia, n_weights, rng_states):
    """Fill each output weight with the same-position weight of a random input row.

    The block index selects the output individual and the thread index
    within the block selects the weight; every weight draws its own donor
    row independently.
    """
    weight_idx = cuda.threadIdx.x  # thread id in a 1D block
    ia_idx = cuda.blockIdx.x  # block id in a 1D grid
    if ia_idx >= tot_ia or weight_idx >= n_weights:
        return

    donor = int(
        xoroshiro128p_uniform_float32(rng_states, cuda.grid(1)) * n_inp_ia)
    out_weights[ia_idx][weight_idx] = inp_weights[donor][weight_idx]
def probabilidadAceptarCUDA(delta, rng_states, id):
    """Decide whether a move with cost change ``delta`` is accepted (Metropolis rule).

    Improving moves (``delta < 0``) are always accepted. Any other move is
    accepted with probability ``exp(-delta / (k * T))``, where ``k`` and
    ``T`` are read from the enclosing scope.

    Args:
        delta: change in cost caused by the move.
        rng_states: xoroshiro128p RNG states.
        id: index of the RNG state to draw from.

    Returns:
        True if the move is accepted, False otherwise.
    """
    if delta < 0:
        return True
    # The divisor is the product k * T. The former ``-delta/k*T`` was
    # evaluated as ``(-delta / k) * T`` and multiplied by T instead.
    P = math.exp(-delta / (k * T))
    # Uniform random number to compare against the acceptance probability.
    r = xoroshiro128p_uniform_float32(rng_states, id)
    if r < P:
        return True
    return False
def crossoverUniform1(popvec_in, mother, father, ii, popvec_out, config_i, rng_states, tid, tmp):
    """Produce child ``ii`` by uniform crossover of ``mother`` and ``father``.

    Chromosomes are stored back to back in flat vectors, each
    ``config_i[cfg.CHROMO_SIZE]`` entries long. Every gene of the child is
    copied from the mother when a uniform draw is below 0.50 and from the
    father otherwise.
    """
    genes = config_i[cfg.CHROMO_SIZE]
    child_base = ii * genes
    mother_base = mother * genes
    father_base = father * genes

    # One coin flip per gene yields a single child.
    for g in range(0, genes):
        from_mother = xoroshiro128p_uniform_float32(rng_states, tid) < 0.50
        source = mother_base + g if from_mother else father_base + g
        popvec_out[child_base + g] = popvec_in[source]
def initialize_kernel(d_pop_init, rng_states):
    r"""Fill each row of ``d_pop_init`` with uniform random numbers.

    One thread handles one row and draws every entry from its own RNG
    state.
    """
    row = cuda.grid(1)
    if row >= d_pop_init.shape[0]:
        return
    n_cols = d_pop_init.shape[1]
    for col in range(n_cols):
        d_pop_init[row, col] = xoroshiro128p_uniform_float32(rng_states, row)
def sample(q_array, input, sigma, rng):
    """Apply randomly jittered Euler angles to one column of ``q_array`` per thread.

    Each of the three entries of ``input`` is offset by a uniform draw
    minus 0.5; the result is handed to ``quaternion_add_euler`` together
    with the thread's column of ``q_array``. ``sigma`` is accepted but not
    used.
    """
    pos = cuda.grid(1)
    if pos >= q_array.shape[1]:
        return

    perturbed = cuda.local.array(shape=(3), dtype=float32)
    for axis in range(perturbed.shape[0]):
        jitter = xoroshiro128p_uniform_float32(rng, pos) - 0.5
        perturbed[axis] = input[axis] + jitter

    quaternion_add_euler(q_array[:, pos], perturbed)
def propagate(
        threadindex, rng_states, in_neutron,
        square, width, height, radius,
        wl_distr, Lambda0, dLambda, E0, dE,
        xw, yh, dist, pmul
):
    """Generate one neutron: start position, velocity and weight.

    The start point lies in the z = 0 plane, either uniformly inside a
    ``width`` x ``height`` rectangle (``square`` true) or inside a disc of
    the given ``radius``. The target point is produced by
    ``randvec_target_rect`` for a rectangle ``xw`` x ``yh`` centred at
    distance ``dist`` on the z axis. The speed comes from a wavelength
    drawn in ``Lambda0 +/- dLambda`` (``wl_distr`` true) or from an energy
    drawn in ``E0 +/- dE``.

    Writes position to ``in_neutron[:3]``, velocity to ``in_neutron[3:6]``,
    0 to ``in_neutron[-2]`` and ``pmul * solidangle`` to ``in_neutron[-1]``.
    """
    # All five random numbers are drawn up front, in a fixed order.
    u_pos_a = xoroshiro128p_uniform_float32(rng_states, threadindex)
    u_pos_b = xoroshiro128p_uniform_float32(rng_states, threadindex)
    u_tgt_a = xoroshiro128p_uniform_float32(rng_states, threadindex)
    u_tgt_b = xoroshiro128p_uniform_float32(rng_states, threadindex)
    u_speed = xoroshiro128p_uniform_float32(rng_states, threadindex)

    # Start position on the source surface.
    if square:
        x = width * (u_pos_a - 0.5)
        y = height * (u_pos_b - 0.5)
    else:
        angle = 2*math.pi*u_pos_a
        rho = math.sqrt(u_pos_b)*radius
        x = rho*math.cos(angle)
        y = rho*math.sin(angle)
    in_neutron[0] = x
    in_neutron[1] = y
    in_neutron[2] = 0.

    # Choose the final vector.
    target = cuda.local.array(shape=3, dtype=NB_FLOAT)
    target[0] = 0.0
    target[1] = 0.0
    target[2] = dist
    vec_f = cuda.local.array(shape=3, dtype=NB_FLOAT)
    solidangle = randvec_target_rect(target, xw, yh, u_tgt_a, u_tgt_b, vec_f)

    # The vector from the start point to the final position is
    # (vec_f[0]-x, vec_f[1]-y, dist).
    dx = vec_f[0]-x
    dy = vec_f[1]-y
    path_len = math.sqrt(dx*dx+dy*dy+dist*dist)

    # Velocity scalar.
    if wl_distr:
        wavelength = Lambda0+dLambda*(u_speed*2-1)
        v = K2V*(2*math.pi/wavelength)
    else:
        energy = E0+dE*(u_speed*2-1)
        v = SE2V*math.sqrt(energy)

    in_neutron[3] = v*dx/path_len
    in_neutron[4] = v*dy/path_len
    in_neutron[5] = v*dist/path_len
    in_neutron[-2] = 0
    in_neutron[-1] = pmul*solidangle
    return
def move(points, best_point_index, forces, random_states):
    """Shift every point except the best one along its force vector.

    The thread index within the block selects the point. A single uniform
    draw scales the whole force vector, and the first 20 coordinates of
    the point are updated in place.
    """
    tid = cuda.threadIdx.x
    if tid != best_point_index:
        scale = xoroshiro128p_uniform_float32(random_states, tid)
        for dim in range(20):
            points[tid][dim] += scale * forces[tid][dim]
def initial_state(state_array: cuda.devicearray, initial_state: cuda.devicearray, state_prob: cuda.devicearray, rng_states):
    """Initialise one column of ``state_array`` per thread with jittered start values.

    For each of the first six rows, the entry in column ``pos`` is set to
    ``initial_state[i]`` plus a uniform random number.

    Args:
        state_array: 2-D array indexed as ``[state, position]``; written in
            place.
        initial_state: base value for each of the six states.
        state_prob: unused here; kept for interface compatibility.
        rng_states: xoroshiro128p RNG states.
    """
    tx = cuda.threadIdx.x
    # NOTE(review): the y block index is combined with the x block width.
    # That is only a distinct position per thread if the kernel is
    # launched with blocks laid out along y - confirm the launch config.
    ty = cuda.blockIdx.y
    bw = cuda.blockDim.x
    pos = tx + ty * bw

    # ``pos`` indexes the second axis of ``state_array``, so that is the
    # axis to guard. The former check against shape[0] tested the row
    # count instead.
    if pos < state_array.shape[1]:
        for i in range(6):
            # NOTE(review): the RNG state is selected by ``tx``, so threads
            # with the same in-block index in different blocks share a
            # state - confirm against how rng_states is allocated.
            state_array[i, pos] = initial_state[i] + xoroshiro128p_uniform_float32(rng_states, tx)
def random_3d(arr, rng_states):
    """Fill a 3-D array with uniform random numbers using a 3-D launch grid.

    Each thread starts at its own ``cuda.grid(3)`` position and steps
    through the array by ``cuda.gridsize(3)``; every value it writes is
    drawn from the RNG state at its linearised thread index.
    """
    # Per-dimension thread indices and strides.
    x0, y0, z0 = cuda.grid(3)
    step_x, step_y, step_z = cuda.gridsize(3)

    # Linearised thread index, used to select the RNG state.
    tid = (z0 * step_y * step_x) + (y0 * step_x) + x0

    depth = arr.shape[0]
    rows = arr.shape[1]
    cols = arr.shape[2]

    # The z index walks the first array axis, x walks the last one.
    for i in range(z0, depth, step_z):
        for j in range(y0, rows, step_y):
            for k in range(x0, cols, step_x):
                arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid)