Example #1
    def __init__(self):

        self.__device_inited = False

        assert isinstance(cfg.device_id,
                          (np.integer, int)) and cfg.device_id >= 0

        self.__device_id = cfg.device_id
        self.__init_device()

        dtype_c = {np.float32: 'float', np.float64: 'double'}[cfg.dtype]

        dtype_complex_c = {
            np.complex64: 'pycuda::complex<float>',
            np.complex128: 'pycuda::complex<double>'
        }[cfg.dtype_complex]

        # If gl_parameter is changed, this needs to change as well
        solveA = bool(not np.isposinf(cfg.gl_parameter))
        if solveA:
            self.reduction_vector_length = 17
        else:
            self.reduction_vector_length = 5

        # PyCUDA compiles a single CUDA source, so the files are
        # concatenated; the order of the files is therefore important
        cuda_files_parallel = [
            'common.h', 'block_reduction.h', 'reduction.h', 'utils.h'
        ]
        cuda_files_solvers_post = ['observables.h', 'td.h', 'cg.h']

        cuda_src_files = cuda_files_parallel + cuda_files_solvers_post

        this_dir = os.path.dirname(__file__)
        root_dir = os.path.abspath(os.path.join(this_dir, '..'))

        cuda_template = ''
        for ifile in cuda_src_files:
            with open(os.path.abspath(os.path.join(root_dir, 'cuda', ifile)),
                      'r') as f:
                cuda_template += f.read() + '\n'

        self.cuda_template_dict = {
            'real': dtype_c,
            'complex': dtype_complex_c,
            'Nx': cfg.Nx,
            'Ny': cfg.Ny,
            'dx': cfg.dx,
            'dy': cfg.dy,
            'reduction_vector_length': self.reduction_vector_length,
        }

        cuda_code = cuda_template % self.cuda_template_dict
        self._cuda_module = cuda_compiler.SourceModule(cuda_code,
                                                       options=['-std=c++11'])

        self.block_size = 128
        self.grid_size = Utils.intceil(cfg.N, self.block_size)
        self.grid_size_A = Utils.intceil(cfg.Nab, self.block_size)
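Example #1 renders a set of concatenated CUDA sources through Python's %-dict string formatting and compiles the result with pycuda.compiler.SourceModule. Below is a minimal, self-contained sketch of the same render-then-compile pattern; the kernel and the template key used here are made up for illustration and are not part of svirl.

# Minimal sketch of the template-then-compile pattern used above.
# The kernel and the 'real' template key are hypothetical, not svirl's.
import numpy as np
import pycuda.autoinit              # creates a context on the default device
from pycuda import gpuarray
from pycuda.compiler import SourceModule

template = """
__global__ void scale(%(real)s *x, %(real)s a, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
"""

code = template % {'real': 'double'}                 # same %-dict substitution
module = SourceModule(code, options=['-std=c++11'])
scale = module.get_function('scale')

x = gpuarray.to_gpu(np.arange(8, dtype=np.float64))
scale(x, np.float64(2.0), np.uint32(x.size),
      block=(128, 1, 1), grid=(1, 1, 1))
print(x.get())                                       # each element doubled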
Example #2
    def gsum(self,
             ga_in,
             ga_out=None,
             N=0,
             block_size=0,
             use_gpuarray_sum=False):
        """Reduce a scalar.

        Args:
        ga_in : Input GPUArray of type real

        Optional Args:
        ga_out:  Store the output here in GPU memory. If
                not provided, the reduced value is brought back to host.
        N     :  No. of elements in ga_in array that should be reduced.
        block_size: Use specified block size in the reduction kernel
        use_gpuarray_sum: Use PyCUDA's reduction function (can be slow)
        """

        if ga_in is None:
            return None

        if use_gpuarray_sum:
            if ga_out is not None:
                Utils.copy_dtod(ga_out, gpuarray.sum(ga_in))
                return
            return gpuarray.sum(ga_in).get().item()

        if N == 0:
            N = ga_in.size

        if block_size == 0:
            block_size = self.par.block_size

        block_size = int(self.__get_nearest_multiple(block_size, 32))
        grid_size = int(Utils.intceil(N, block_size))

        if self.gwork_s1 is None:
            self.gwork_s1 = gpuarray.zeros(grid_size, dtype=cfg.dtype)
        elif self.gwork_s1.size < grid_size:
            self.gwork_s1.gpudata.free()
            self.gwork_s1 = gpuarray.zeros(grid_size, dtype=cfg.dtype)

        if self.gwork_s2 is None:
            self.gwork_s2 = gpuarray.zeros(1, dtype=cfg.dtype)

        niterations = 1
        if grid_size > 1:
            niterations = 2

        arr_in = ga_in

        arr_out = self.gwork_s1
        if niterations == 1 and ga_out is not None:
            arr_out = ga_out

        self._sum_krnl(arr_in,
                       arr_out,
                       np.uint32(N),
                       grid=(grid_size, 1, 1),
                       block=(block_size, 1, 1))

        # Output of previous iteration is input for the next one
        if niterations == 2:

            arr_in = arr_out
            arr_out = ga_out if ga_out is not None else self.gwork_s2

            if block_size >= grid_size:
                block_size = int(self.__get_nearest_multiple(grid_size, 32))

            self._sum_krnl(arr_in,
                           arr_out,
                           np.uint32(grid_size),
                           block=(block_size, 1, 1))

        if ga_out is None:
            return arr_out.get()[0]
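The reduction kernel itself is not shown in these examples, but the structure of gsum is a standard two-pass block reduction: the first launch produces one partial sum per block in gwork_s1, and a second launch (only needed when grid_size > 1) reduces those partials to a single value. As a rough CPU model of that scheme (not the actual CUDA code):

# CPU sketch (NumPy) of the two-pass reduction gsum performs on the GPU:
# pass 1 gives one partial sum per block, pass 2 reduces the partials.
import numpy as np

def two_pass_sum(x, block_size=128):
    grid_size = (x.size + block_size - 1) // block_size      # Utils.intceil
    partials = np.empty(grid_size, dtype=x.dtype)
    for b in range(grid_size):                               # "pass 1": one entry per block
        partials[b] = x[b * block_size:(b + 1) * block_size].sum()
    if grid_size == 1:                                       # single block: done
        return partials[0]
    return partials.sum()                                    # "pass 2" over the partials

x = np.random.rand(10000).astype(np.float32)
assert np.isclose(two_pass_sum(x), x.sum(), rtol=1e-4)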
Example #3
    def gsum_v(self, ga_in, nv, ne, block_size=0):
        """Reduce a vector.

        Args:
        ga_in : Input GPUArray of type real that has 'nv * ne' elements
        nv: No. of vectors
        ne: No. of elements per vector

        Optional Args:
        block_size: Use specified block size in the reduction kernel
        """

        if ga_in is None:
            return None

        if (nv == 1):
            return ga_in.copy()

        if block_size == 0:
            block_size = self.par.block_size

        block_size = int(self.__get_nearest_multiple(block_size, 32))
        grid_size = int(Utils.intceil(nv, block_size))

        size = int(grid_size * ne)
        if self.gwork_v1 is None:
            self.gwork_v1 = gpuarray.zeros(size, dtype=cfg.dtype)
        elif self.gwork_v1.size < size:
            self.gwork_v1.gpudata.free()
            self.gwork_v1 = gpuarray.zeros(size, dtype=cfg.dtype)

        if self.gwork_v2 is None:
            self.gwork_v2 = gpuarray.zeros(ne, dtype=cfg.dtype)

        niterations = 1
        if grid_size > 1:
            niterations = 2

        self._sum_v_krnl(ga_in,
                         self.gwork_v1,
                         np.uint32(nv),
                         grid=(grid_size, 1, 1),
                         block=(block_size, 1, 1))

        # Output of previous iteration is input for the next one
        if niterations == 2:

            # reuse work array
            if self.gwork_v2.size != ne:
                self.gwork_v2.gpudata.free()
                self.gwork_v2 = gpuarray.zeros(ne, dtype=cfg.dtype)

            self._sum_v_krnl(self.gwork_v1,
                             self.gwork_v2,
                             np.uint32(grid_size),
                             block=(block_size, 1, 1))

            r = self.gwork_v2.get()
        else:
            r = self.gwork_v1.get()

        return r
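gsum_v applies the same two-pass idea per vector component: nv vectors of ne elements each are summed into a single ne-element vector. A CPU sketch of the end result follows; the memory layout (which index is contiguous) is fixed by the CUDA kernel, which is not shown, so the vector-major layout assumed here is only for illustration.

# CPU sketch of the element-wise sum computed by gsum_v, assuming a
# vector-major layout (element e of vector v stored at index v * ne + e).
import numpy as np

def sum_vectors(flat, nv, ne):
    return flat.reshape(nv, ne).sum(axis=0)          # result has ne elements

flat = np.arange(6, dtype=np.float64)                # nv=3 vectors of ne=2 elements
print(sum_vectors(flat, nv=3, ne=2))                 # [6. 9.] = [0+2+4, 1+3+5]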
Example #4
    def __get_nearest_multiple(self, a, multiplier):
        return multiplier * Utils.intceil(a, multiplier)
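Assuming Utils.intceil is integer ceiling division (which its use for grid sizes throughout these examples suggests), this helper rounds a up to the nearest multiple of multiplier. A standalone equivalent:

# Standalone equivalent of __get_nearest_multiple, assuming Utils.intceil
# is ceiling division: round `a` up to the nearest multiple of `multiplier`.
def nearest_multiple(a, multiplier):
    return multiplier * ((a + multiplier - 1) // multiplier)

print(nearest_multiple(100, 32))   # 128
print(nearest_multiple(128, 32))   # 128 (already a multiple)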
Example #5
File: td.py Project: skywo1f/svirl
    def __iterate_vector_potential_gpu(self):
        """Performs dtA-iteration of self.a/self.b on GPU"""

        # self.gabi += self.gab; no memory allocation
        if self.fixed_vortices._vpi is not None:
            self.__xpy_r_krnl(
                self.fixed_vortices.irregular_vector_potential_h(),
                self.vars.vector_potential_h(),
                np.uint32(cfg.N),
                block=(self.par.block_size, 1, 1),
                grid=(self.par.grid_size, 1, 1))
            # just a pointer
            gabi_gab = self.fixed_vortices.irregular_vector_potential_h()
        else:
            gabi_gab = self.vars.vector_potential_h()

        # similar to gab_rhs = gab.copy(), but does not allocate new array
        Utils.copy_dtod(self.vars._tmp_edge_var_h(),
                        self.vars.vector_potential_h())
        #self.vars._tmp_edge_var.need_dtoh_sync()

        # if self.ab_langevin_c > 1e-16:
        #     self.gab_rhs += self.ab_langevin_c*(curand(self.gab_rhs.shape, dtype=cfg.dtype) - 0.5)
        for j in range(1024):
            self.__gr2_max.fill(np.int32(0))

            self.__iterate_vector_potential_jacobi_step_krnl(
                self.dt,
                self.params.gl_parameter_squared_h(),
                self.params._rho,
                self.params.homogeneous_external_field,
                self.mesh.material_tiling_h(),
                self.vars.order_parameter_h(),
                gabi_gab,
                # ab for right-hand side; does not change during Jacobi iterations
                self.vars._tmp_edge_var_h(),
                self.vars.vector_potential_h(),  # ab^{j} in Jacobi method
                self.__gab_next,  # ab^{j+1} in Jacobi method
                self.params.vector_potential_Langevin_coefficient,
                np.uint32(j),
                self._random_t,
                cfg.stop_criterion_vector_potential,
                self.__gr2_max,
                grid=(self.par.grid_size, 1, 1),
                block=(self.par.block_size, 1, 1),
            )

            # swap pointers, does not change arrays
            self.vars._vp._gdata, self.__gab_next = self.__gab_next, self.vars._vp._gdata
            #self.vars.vector_potential_h(), self.__gab_next = self.__gab_next, self.vars.vector_potential_h()

            # r2_max_norm = residual/stop_criterion
            r2_max_norm = 1.0e-4 * cfg.dtype(self.__gr2_max.get()[0])

            # convergence criteria
            if r2_max_norm < 1.0:
                break

        self._random_t += np.uint32(1)

        self.vars._vp.need_dtoh_sync()

        # self.gabi -= self.gab; no memory allocation
        if self.fixed_vortices._vpi is not None:
            self.__xmy_r_krnl(
                self.fixed_vortices.irregular_vector_potential_h(),
                self.vars.vector_potential_h(),
                np.uint32(cfg.N),
                block=(self.par.block_size, 1, 1),
                grid=(self.par.grid_size, 1, 1))
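The loop above is a Jacobi iteration: the right-hand side is copied once into a temporary, each kernel launch writes the next iterate into a second buffer, the two buffers are swapped by exchanging pointers (no copy), and the loop exits early once the kernel-reported residual drops below the stop criterion. A control-flow sketch of that pattern in NumPy, with a stand-in update rule in place of the actual Jacobi-step kernel:

# Control-flow sketch of the Jacobi loop above. `update_step` stands in for
# the *_jacobi_step CUDA kernel; only the buffer-swap / early-exit pattern
# is the point here.
import numpy as np

def jacobi_solve(x0, update_step, stop_criterion, max_iter=1024):
    rhs = x0.copy()                        # fixed right-hand side
    cur = x0.copy()                        # x^{j}
    nxt = np.empty_like(cur)               # x^{j+1}
    for j in range(max_iter):
        update_step(rhs, cur, nxt)         # write x^{j+1} into nxt
        cur, nxt = nxt, cur                # swap buffers, no allocation or copy
        r_max = np.max(np.abs(cur - nxt))  # residual of this step
        if r_max / stop_criterion < 1.0:   # normalized residual, as in the loop above
            break
    return cur

Swapping the two references each step is what the `self.vars._vp._gdata, self.__gab_next = ...` line does on the GPU side, so no per-iteration allocation is needed.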
Example #6
File: td.py Project: skywo1f/svirl
    def __iterate_order_parameter_gpu(self, gab_gabi):
        """Performs dt-iteration of self.psi on GPU"""

        # similar to gpsi_rhs = gpsi.copy(), but does not allocate new array
        Utils.copy_dtod(self.vars._tmp_node_var_h(),
                        self.vars.order_parameter_h())
        #self.vars._tmp_node_var.need_dtoh_sync()

        for j in range(1024):
            self.__gr2_max.fill(np.int32(0))

            # TODO: prepare all cuda calls
            self.__iterate_order_parameter_jacobi_step_krnl(
                self.dt,
                self.params.linear_coefficient_scalar_h(),
                self.params.linear_coefficient_h(),
                self.mesh.material_tiling_h(),
                gab_gabi,
                # psi for right-hand side; does not change during Jacobi iterations
                self.vars._tmp_node_var_h(),
                self.vars.order_parameter_h(),  # psi^{j} in Jacobi method
                self.__gpsi_next,  # psi^{j+1} in Jacobi method
                self.params.order_parameter_Langevin_coefficient,
                np.uint32(j),
                self._random_t,
                cfg.stop_criterion_order_parameter,
                self.__gr2_max,
                grid=(self.par.grid_size, 1, 1),
                block=(self.par.block_size, 1, 1),
            )

            # swap pointers, does not change arrays
            # TODO: this is hard-wired for now since Python doesn't allow
            # assignment to a function call. Sync status is not updated here.

            self.vars._psi._gdata, self.__gpsi_next = self.__gpsi_next, self.vars._psi._gdata
            #self.vars.order_parameter_h(), self.__gpsi_next = self.__gpsi_next, self.vars.order_parameter_h()

            # residual = max{|b-M*psi|}
            # r2_max_norm = residual/stop_criterion
            r2_max_norm = 1.0e-4 * cfg.dtype(self.__gr2_max.get()[0])

            # convergence criteria
            if r2_max_norm < 1.0:
                break

        self._random_t += np.uint32(1)

        if self.fixed_vortices._phase_lock_ns is not None:
            block_size = 2
            grid_size = Utils.intceil(self.fixed_vortices._phase_lock_ns.size,
                                      block_size)

            self.__order_parameter_phase_lock_krnl(
                self.vars.order_parameter_h(),
                np.int32(self.fixed_vortices._phase_lock_ns.size),
                self.fixed_vortices._phase_lock_ns_h(),
                grid=(grid_size, 1, 1),
                block=(block_size, 1, 1),
            )

        self.vars._psi.need_dtoh_sync()
Example #7
    def superfluid_density(self):
        """Calculates the local superfluid density at grid vertices"""
        self.vars._psi.sync()
        return Utils.abs2(self.vars._psi.get_h())
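Assuming Utils.abs2 returns the element-wise squared magnitude |psi|^2 (which is what "superfluid density" means here), a NumPy equivalent is simply:

# NumPy equivalent of the superfluid density |psi|^2, assuming Utils.abs2
# is the element-wise squared magnitude.
import numpy as np

def abs2(psi):
    return psi.real**2 + psi.imag**2     # same as np.abs(psi)**2, without the sqrt

psi = np.array([1 + 1j, 2j], dtype=np.complex64)
print(abs2(psi))                         # [2. 4.]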
Example #8
File: cg.py Project: skywo1f/svirl
    def __free_energy_minimization(self, n_iter=1000):
        """Minimizes energy with respect to order parameter and vector potential"""
        # TODO: check material tiling
        # TODO: add external vector potential
        # TODO: add phase lock gridpoints

        # TODO: Ideally there should be one entry point for both minimizations

        self.vars._psi.sync()
        self.vars._vp.sync()

        self.cg_energies = []  # TMP

        #beta_psi = 0.0  # First iteration is steepest descent, so make beta = 0.0
        #beta_A = 0.0  # First iteration is steepest descent, so make beta = 0.0

        # gpu arrays:
        # (g)dir     : search direction
        # (g)jac     : gradient
        # (g)jac_prev: gradient from previous iteration

        #cuda.start_profiler()

        self.vars._alloc_free_temporary_gpu_storage('alloc')

        for i in range(n_iter):

            # 1. Compute jacobians
            self.__gjac_psi = self._free_energy_jacobian_psi
            self.__gjac_A = self._free_energy_jacobian_A

            # 2. Compute betas
            # use Polak–Ribière formula with resetting
            # TODO: consider other formulas, see e.g. https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
            if i > 0:
                self.__compute_beta_psi(self.__gjac_psi, self.__gjac_psi_prev)
                self.__compute_beta_A(self.__gjac_A, self.__gjac_A_prev)

            # 3. Update search directions
            self.__axmy_c_krnl(self.__gdir_psi,
                               self.__gjac_psi,
                               self.__gdir_psi,
                               self._beta_psi,
                               np.uint32(cfg.N),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size, 1, 1))

            self.__axmy_r_krnl(self.__gdir_A,
                               self.__gjac_A,
                               self.__gdir_A,
                               self._beta_A,
                               np.uint32(cfg.Nab),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size_A, 1, 1))

            # 4. Compute alphas
            self._free_energy_conjgrad_coef(self.__gdir_psi, self.__gdir_A)
            alpha_psi, alpha_A = self._cg_alpha_min()
            #print('iter: ', i, 'c: ', self.__c, 'alpha, beta: ', alpha_psi, alpha_A, beta_psi, beta_A, flush=True)

            # 5. Update variables
            self.__axpy_c_krnl(self.__gdir_psi,
                               self.vars.order_parameter_h(),
                               self.vars.order_parameter_h(),
                               cfg.dtype(alpha_psi),
                               np.uint32(cfg.N),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size, 1, 1))

            self.__axpy_r_krnl(self.__gdir_A,
                               self.vars.vector_potential_h(),
                               self.vars.vector_potential_h(),
                               cfg.dtype(alpha_A),
                               np.uint32(cfg.Nab),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size_A, 1, 1))

            # 6. Save previous step
            Utils.copy_dtod(self.__gjac_psi_prev, self.__gjac_psi)
            Utils.copy_dtod(self.__gjac_A_prev, self.__gjac_A)

            E0 = self.observables.free_energy  # TMP
            self.cg_energies.append(E0)  # TMP
            # if i%10 == 0:
            #     print('%3.d: E = %10.10f' % (i, E0)) # TMP

            if (i > 0
                    and np.abs(self.cg_energies[i] / self.cg_energies[i - 1] -
                               1.0) < self.__convergence_rtol):
                #print('CG converged in %d iterations with residual %g ' % ( i, np.abs(self.cg_energies[i]/self.cg_energies[i-1] - 1.0)))
                break

        #cuda.stop_profiler()

        self.vars._psi.need_dtoh_sync()
        self.vars._vp.need_dtoh_sync()

        self.vars._alloc_free_temporary_gpu_storage('free')
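The loop above is a nonlinear conjugate-gradient minimization: compute the gradient, form the Polak-Ribiere beta (the first step is steepest descent), update the search direction, pick the step size, update the variables, and stop when the relative energy change is below a tolerance. A self-contained sketch of that scheme for a real-valued objective follows; the backtracking line search is only a stand-in for svirl's closed-form _cg_alpha_min, and beta = max(0, beta_PR) is one common resetting rule, not necessarily the exact one in __compute_beta_psi.

# Self-contained sketch of Polak-Ribiere nonlinear CG with resetting, mirroring
# the numbered steps of the loop above (gradient -> beta -> direction -> step).
import numpy as np

def minimize_pr_cg(f, grad, x0, n_iter=1000, rtol=1e-8):
    x = x0.astype(float).copy()
    direction = np.zeros_like(x)
    jac_prev = None
    energies = []
    for i in range(n_iter):
        jac = grad(x)                                         # 1. gradient
        beta = 0.0                                            # 2. beta (0 -> steepest descent)
        if i > 0:
            beta = max(0.0, jac @ (jac - jac_prev) / (jac_prev @ jac_prev))
        direction = -jac + beta * direction                   # 3. search direction
        alpha, e0 = 1.0, f(x)                                 # 4. step size (backtracking stand-in)
        while f(x + alpha * direction) > e0 and alpha > 1e-12:
            alpha *= 0.5
        x = x + alpha * direction                             # 5. update variables
        jac_prev = jac                                        # 6. save previous gradient
        energies.append(f(x))
        if i > 0 and abs(energies[i] / energies[i - 1] - 1.0) < rtol:
            break                                             # relative energy change converged
    return x, energies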
Example #9
File: cg.py Project: skywo1f/svirl
    def __free_energy_minimization_psi(self, n_iter=1000):
        """Minimizes energy with respect to order parameter"""
        # NOTE: Tests show that
        #       - CG minimization is much faster than TD for 1-2 vortices (at least in the current implementation)
        #       - for ~30 vortices CG demonstrates performance similar to TD

        # NOTE: works with material tiling
        # TODO: add external vector potential
        # TODO: add phase lock gridpoints

        assert not self.params.solveA

        self.vars._psi.sync()
        self.vars._vp.sync()

        self.cg_energies = []  # TMP

        #beta = 0.0  # First iteration is steepest descent, so make beta = 0.0

        # gpu arrays:
        # (g)dir     : search direction
        # (g)jac     : gradient
        # (g)jac_prev: gradient from previous iteration

        self.vars._alloc_free_temporary_gpu_storage('alloc')

        self.__gdir_psi.fill(0.0)
        for i in range(n_iter):

            # 1. Compute jacobians
            self.__gjac_psi = self._free_energy_jacobian_psi

            # 2. Compute beta
            # Polak–Ribière formula
            # TODO: consider other formulas, see e.g. https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
            if i > 0:
                # d*(d-dp)/(dp*dp)
                # = [d.re, d.im]*([d.re-dp.re, d.im-dp.im])/([dp.re, dp.im]*[dp.re, dp.im])
                # = (d.re*(d.re-dp.re) + d.im*(d.im-dp.im))/(dp.re*dp.re + dp.im*dp.im)
                # = (j.re*(j.re-jp.re) + j.im*(j.im-jp.im))/(jp.re*jp.re + jp.im*jp.im)
                # where j = -d

                self.__compute_beta_psi(self.__gjac_psi, self.__gjac_psi_prev)

            # 3. Update search direction
            self.__axmy_c_krnl(self.__gdir_psi,
                               self.__gjac_psi,
                               self.__gdir_psi,
                               self._beta_psi,
                               np.uint32(cfg.N),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size, 1, 1))

            # 4. Compute alpha
            self._free_energy_conjgrad_coef_psi(self.__gdir_psi)
            alpha0 = self._cg_alpha_psi_min()
            #print('iter: ', i, 'F: ', c0, c1, c2, c3, c4, 'alpha: ', alpha0, 'beta: ', beta, flush=True)

            # 5. Update variables
            self.__axpy_c_krnl(self.__gdir_psi,
                               self.vars.order_parameter_h(),
                               self.vars.order_parameter_h(),
                               cfg.dtype(alpha0),
                               np.uint32(cfg.N),
                               block=(self.par.block_size, 1, 1),
                               grid=(self.par.grid_size, 1, 1))

            # 6. Save
            Utils.copy_dtod(self.__gjac_psi_prev, self.__gjac_psi)

            E0 = self.observables.free_energy  # TMP
            self.cg_energies.append(E0)  # TMP
            # if i%10 == 0:
            #     print('%3.d: E = %10.10f' % (i, E0)) # TMP

            if (i > 0
                    and np.abs(self.cg_energies[i] / self.cg_energies[i - 1] -
                               1.0) < self.__convergence_rtol):
                #print('CG converged in %d iterations with residual %g ' % ( i, np.abs(self.cg_energies[i]/self.cg_energies[i-1] - 1.0)))
                break

        self.vars._psi.need_dtoh_sync()

        self.vars._alloc_free_temporary_gpu_storage('free')
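The comment in step 2 of this example expands the Polak-Ribiere numerator and denominator in terms of the real and imaginary parts of the gradient. For reference, the same quantity can be written directly on complex NumPy arrays; the max(0, .) resetting is again an assumption about what __compute_beta_psi does.

# Polak-Ribiere beta for a complex gradient, matching the expansion in the
# step-2 comment above; max(0, .) is one common resetting rule.
import numpy as np

def beta_pr_complex(jac, jac_prev):
    num = np.sum(jac.real * (jac.real - jac_prev.real)
                 + jac.imag * (jac.imag - jac_prev.imag))
    den = np.sum(jac_prev.real**2 + jac_prev.imag**2)
    return max(0.0, num / den)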