def team_reduce(j: int, team_acc: pk.Acc[pk.double]):
    # Row j of the per-element matrix-vector product: reduce
    # A_vector[e][j] . x_vector[e] across the team's vector lanes,
    # then fold y_vector[e][j] * (row dot product) into the team result.
    def lane_reduce(k: int, lane_acc: pk.Acc[pk.double]):
        lane_acc += self.A_vector[e][j][k] * self.x_vector[e][k]

    row_dot: float = pk.parallel_reduce(
        pk.ThreadVectorRange(team_member, self.M), lane_reduce)
    team_acc += self.y_vector[e][j] * row_dot
def team_reduce(j: int, team_acc: pk.Acc[float]):
    # Same contraction over the free views A/x/y: dot row j of A[e]
    # with x[e] over the vector lanes, scale by y[e][j], accumulate.
    def lane_reduce(k: int, lane_acc: pk.Acc[float]):
        lane_acc += A[e][j][k] * x[e][k]

    row_dot: float = pk.parallel_reduce(
        pk.ThreadVectorRange(team_member, M), lane_reduce)
    team_acc += y[e][j] * row_dot
def team_thread_reduce(bi: int, PE_i: pk.Acc[pk.double]):
    # Accumulate into PE_i the Lennard-Jones potential-energy
    # contribution of the bi-th atom of this thread's bin (bx, by, bz),
    # scanning all neighbor bins with a vector-lane reduction per bin.
    i: int = self.permute_vector[i_offset + bi]
    if i >= self.N_local:
        return
    x_i: float = self.x[i][0]
    y_i: float = self.x[i][1]
    z_i: float = self.x[i][2]
    type_i: int = self.type[i]

    # Neighbor-bin index ranges, clamped to the bin grid in each dimension.
    bx_j_start: int = bx
    if bx > 0:
        bx_j_start = bx - 1
    bx_j_stop: int = bx + 1
    if bx + 1 < self.nbinx:
        bx_j_stop = bx + 2
    by_j_start: int = by
    if by > 0:
        by_j_start = by - 1
    by_j_stop: int = by + 1
    if by + 1 < self.nbiny:
        by_j_stop = by + 2
    bz_j_start: int = bz
    if bz > 0:
        bz_j_start = bz - 1
    bz_j_stop: int = bz + 1
    # BUG FIX: the z bound was checked against nbinx; it must use the
    # z-dimension bin count, matching the nbinx/nbiny checks above.
    if bz + 1 < self.nbinz:
        bz_j_stop = bz + 2

    for bx_j in range(bx_j_start, bx_j_stop):
        for by_j in range(by_j_start, by_j_stop):
            for bz_j in range(bz_j_start, bz_j_stop):
                j_offset: int = self.bin_offsets[bx_j][by_j][bz_j]

                def thread_vector_reduce(bj: int, PE_ibj: pk.Acc[pk.double]):
                    # Pair energy of atom i with the bj-th atom of the
                    # neighbor bin; 0.5 factor avoids double counting.
                    j: int = self.permute_vector[j_offset + bj]
                    dx: float = x_i - self.x[j][0]
                    dy: float = y_i - self.x[j][1]
                    dz: float = z_i - self.x[j][2]
                    type_j: int = self.type[j]
                    rsq: float = (dx * dx) + (dy * dy) + (dz * dz)
                    if rsq < self.cutsq[type_i][type_j] and i != j:
                        r2inv: float = 1.0 / rsq
                        r6inv: float = r2inv * r2inv * r2inv
                        PE_ibj += 0.5 * r6inv * \
                            (0.5 * self.lj1[type_i][type_j] * r6inv
                             - self.lj2[type_i][type_j]) / 6.0
                        if shift_flag:
                            # Shift the potential so it is zero at the cutoff.
                            r2invc: float = 1.0 / \
                                self.cutsq[type_i][type_j]
                            # BUG FIX: r6invc was computed from r2inv (the
                            # actual distance); the shift must use the
                            # cutoff-based r2invc.
                            r6invc: float = r2invc * r2invc * r2invc
                            PE_ibj -= 0.5 * r6invc * \
                                (0.5 * self.lj1[type_i][type_j] * r6invc
                                 - self.lj2[type_i][type_j]) / 6.0

                thread_vector_count: int = self.bin_count[bx_j][by_j][bz_j]
                PE_ibj: float = pk.parallel_reduce(pk.ThreadVectorRange(
                    team, thread_vector_count), thread_vector_reduce)
                PE_i += PE_ibj
def team_thread_for(bi: int):
    # Compute the Lennard-Jones force on the bi-th atom of this
    # thread's bin (bx, by, bz) and write it to self.f.  Each force
    # component is reduced separately over the neighbor bin's atoms via
    # a ThreadVectorRange (scalar reductions only).
    i: int = self.permute_vector[i_offset + bi]
    if i >= self.N_local:
        return
    x_i: float = self.x[i][0]
    y_i: float = self.x[i][1]
    z_i: float = self.x[i][2]
    type_i: int = self.type[i]
    f_i: t_scalar3 = t_scalar3()

    # Neighbor-bin index ranges, clamped to the bin grid in each dimension.
    bx_j_start: int = bx
    if bx > 0:
        bx_j_start = bx - 1
    bx_j_stop: int = bx + 1
    if bx + 1 < self.nbinx:
        bx_j_stop = bx + 2
    by_j_start: int = by
    if by > 0:
        by_j_start = by - 1
    by_j_stop: int = by + 1
    if by + 1 < self.nbiny:
        by_j_stop = by + 2
    bz_j_start: int = bz
    if bz > 0:
        bz_j_start = bz - 1
    bz_j_stop: int = bz + 1
    # BUG FIX: the z bound was checked against nbinx; it must use the
    # z-dimension bin count, matching the nbinx/nbiny checks above.
    if bz + 1 < self.nbinz:
        bz_j_stop = bz + 2

    for bx_j in range(bx_j_start, bx_j_stop):
        for by_j in range(by_j_start, by_j_stop):
            for bz_j in range(bz_j_start, bz_j_stop):
                j_offset: int = self.bin_offsets[bx_j][by_j][bz_j]

                def thread_vector_reduce_x(bj: int, lf_i: pk.Acc[pk.double]):
                    # x component of the pair force of atom i with the
                    # bj-th atom of the neighbor bin.
                    j: int = self.permute_vector[j_offset + bj]
                    dx: float = x_i - self.x[j][0]
                    dy: float = y_i - self.x[j][1]
                    dz: float = z_i - self.x[j][2]
                    type_j: int = self.type[j]
                    rsq: float = (dx * dx) + (dy * dy) + (dz * dz)
                    if rsq < self.cutsq[type_i][type_j] and i != j:
                        r2inv: float = 1.0 / rsq
                        r6inv: float = r2inv * r2inv * r2inv
                        fpair: float = (
                            r6inv * (self.lj1[type_i][type_j] * r6inv
                                     - self.lj2[type_i][type_j])) * r2inv
                        lf_i += dx * fpair

                def thread_vector_reduce_y(bj: int, lf_i: pk.Acc[pk.double]):
                    # y component; same pair criterion as the x reducer.
                    j: int = self.permute_vector[j_offset + bj]
                    dx: float = x_i - self.x[j][0]
                    dy: float = y_i - self.x[j][1]
                    dz: float = z_i - self.x[j][2]
                    type_j: int = self.type[j]
                    rsq: float = (dx * dx) + (dy * dy) + (dz * dz)
                    if rsq < self.cutsq[type_i][type_j] and i != j:
                        r2inv: float = 1.0 / rsq
                        r6inv: float = r2inv * r2inv * r2inv
                        fpair: float = (
                            r6inv * (self.lj1[type_i][type_j] * r6inv
                                     - self.lj2[type_i][type_j])) * r2inv
                        lf_i += dy * fpair

                def thread_vector_reduce_z(bj: int, lf_i: pk.Acc[pk.double]):
                    # z component; same pair criterion as the x reducer.
                    j: int = self.permute_vector[j_offset + bj]
                    dx: float = x_i - self.x[j][0]
                    dy: float = y_i - self.x[j][1]
                    dz: float = z_i - self.x[j][2]
                    type_j: int = self.type[j]
                    rsq: float = (dx * dx) + (dy * dy) + (dz * dz)
                    if rsq < self.cutsq[type_i][type_j] and i != j:
                        r2inv: float = 1.0 / rsq
                        r6inv: float = r2inv * r2inv * r2inv
                        fpair: float = (
                            r6inv * (self.lj1[type_i][type_j] * r6inv
                                     - self.lj2[type_i][type_j])) * r2inv
                        lf_i += dz * fpair

                # Removed an unused per-bin t_scalar3 temporary; the three
                # scalar reductions below are all that is accumulated.
                thread_vector_count: int = self.bin_count[bx_j][by_j][bz_j]
                f_i_tmp_x: float = pk.parallel_reduce(
                    pk.ThreadVectorRange(team, thread_vector_count),
                    thread_vector_reduce_x)
                f_i_tmp_y: float = pk.parallel_reduce(
                    pk.ThreadVectorRange(team, thread_vector_count),
                    thread_vector_reduce_y)
                f_i_tmp_z: float = pk.parallel_reduce(
                    pk.ThreadVectorRange(team, thread_vector_count),
                    thread_vector_reduce_z)
                f_i.x += f_i_tmp_x
                f_i.y += f_i_tmp_y
                f_i.z += f_i_tmp_z

    self.f[i][0] = f_i.x
    self.f[i][1] = f_i.y
    self.f[i][2] = f_i.z