# Example 1
def call_clu(obs_pts, tris, slips, nu, fnc):
    """Evaluate `fnc` pairwise on the GPU: one observation point per triangle.

    `fnc` is a (kernel_name, vec_dim) pair. Returns an (n, vec_dim) array
    where n is the number of (observation point, triangle) pairs.
    """
    fnc_name, vec_dim = fnc
    if obs_pts.shape[0] != tris.shape[0]:
        raise ValueError(
            "There must be one input observation point per triangle.")

    check_inputs(obs_pts, tris, slips)
    float_type, (obs_pts, tris, slips) = solve_types(obs_pts, tris, slips)

    n_pairs = obs_pts.shape[0]
    threads = backend.max_block_size(16)
    # Ceiling division: enough blocks to cover every pair.
    grid = int((n_pairs + threads - 1) // threads)
    tmpl_config = dict(
        block_size=threads,
        float_type=backend.np_to_c_type(float_type),
    )
    module = backend.load_module(
        "pairs.cu", tmpl_args=tmpl_config, tmpl_dir=source_dir
    )

    dev_out = backend.empty(n_pairs * vec_dim, float_type)
    dev_obs = backend.to(obs_pts, float_type)
    dev_tris = backend.to(tris, float_type)
    dev_slips = backend.to(slips, float_type)

    kernel = getattr(module, "pairs_" + fnc_name)
    kernel(
        dev_out,
        np.int32(n_pairs),
        dev_obs,
        dev_tris,
        dev_slips,
        float_type(nu),
        (grid, 1, 1),
        (threads, 1, 1),
    )
    return backend.get(dev_out).reshape((n_pairs, vec_dim))
# Example 2
def call_clu_matrix(obs_pts, tris, nu, fnc):
    """Build the dense interaction matrix on the GPU.

    `fnc` is a (kernel_name, vec_dim) pair. Returns an array with shape
    (n_obs, vec_dim, n_src, 3).
    """
    fnc_name, vec_dim = fnc
    check_inputs(obs_pts, tris, placeholder)
    float_type, (obs_pts, tris, _) = solve_types(obs_pts, tris, placeholder)

    n_obs = obs_pts.shape[0]
    n_src = tris.shape[0]
    team = backend.max_block_size(16)
    # 2D launch grid: observation points along x, source triangles along y.
    obs_blocks = int((n_obs + team - 1) // team)
    src_blocks = int((n_src + team - 1) // team)
    module = backend.load_module(
        "matrix.cu",
        tmpl_args=dict(float_type=backend.np_to_c_type(float_type)),
        tmpl_dir=source_dir,
    )

    dev_out = backend.empty(n_obs * vec_dim * n_src * 3, float_type)
    dev_obs = backend.to(obs_pts, float_type)
    dev_tris = backend.to(tris, float_type)

    kernel = getattr(module, "matrix_" + fnc_name)
    kernel(
        dev_out,
        np.int32(n_obs),
        np.int32(n_src),
        dev_obs,
        dev_tris,
        float_type(nu),
        (obs_blocks, src_blocks, 1),
        (team, team, 1),
    )
    return backend.get(dev_out).reshape((n_obs, vec_dim, n_src, 3))
# Example 3
def call_clu_free(obs_pts, tris, slips, nu, fnc):
    """Evaluate the all-pairs ("free") sum on the GPU.

    The source triangles are processed in fixed-size chunks so that a single
    kernel launch stays short (avoids locking up the display on a shared GPU
    machine); the per-chunk results are fetched and accumulated on the host.

    `fnc` is a (kernel_name, vec_dim) pair. Returns an (n_obs, vec_dim) array.
    """
    fnc_name, vec_dim = fnc
    check_inputs(obs_pts, tris, slips)
    float_type, (obs_pts, tris, slips) = solve_types(obs_pts, tris, slips)

    n_obs = obs_pts.shape[0]
    n_src = tris.shape[0]
    block_size = backend.max_block_size(256)

    gpu_obs_pts = backend.to(obs_pts, float_type)
    gpu_tris = backend.to(tris, float_type)
    gpu_slips = backend.to(slips, float_type)
    gpu_results = backend.zeros(n_obs * vec_dim, float_type)

    n_obs_blocks = int(np.ceil(n_obs / block_size))
    gpu_config = dict(float_type=backend.np_to_c_type(float_type))
    module = backend.load_module("free.cu",
                                 tmpl_args=gpu_config,
                                 tmpl_dir=source_dir)

    # Split up the sources into chunks so that we don't completely overwhelm a
    # single GPU machine and cause the screen to lock up.
    default_chunk_size = 64
    # np.ceil for consistency with the rest of this module; the previous bare
    # `ceil` relied on a separate `from math import ceil`.
    n_chunks = int(np.ceil(n_src / default_chunk_size))
    out = np.zeros((n_obs, vec_dim), dtype=float_type)
    for i in range(n_chunks):
        chunk_start = i * default_chunk_size
        chunk_size = min(n_src - chunk_start, default_chunk_size)
        chunk_end = chunk_start + chunk_size

        getattr(module, "free_" + fnc_name)(
            gpu_results,
            np.int32(n_obs),
            np.int32(n_src),
            np.int32(chunk_start),
            np.int32(chunk_end),
            gpu_obs_pts,
            gpu_tris,
            gpu_slips,
            float_type(nu),
            (n_obs_blocks, 1, 1),
            (block_size, 1, 1),
        )
        # Accumulate this chunk's contribution on the host. NOTE(review):
        # assumes the kernel writes (not accumulates into) gpu_results each
        # launch — confirm against free.cu.
        out += backend.get(gpu_results).reshape((n_obs, vec_dim))
    return out
# Example 4
def call_clu_block(obs_pts, tris, obs_start, obs_end, src_start, src_end, nu,
                   fnc):
    """Evaluate a set of (obs range x src range) blocks on the GPU.

    Each block i covers observation points obs_start[i]:obs_end[i] and source
    triangles src_start[i]:src_end[i]. Results are packed into one flat
    array. Returns (flat results, per-block start offsets); the offsets array
    carries one extra trailing entry equal to the total size.
    """
    fnc_name, vec_dim = fnc
    check_inputs(obs_pts, tris, placeholder)
    float_type, (obs_pts, tris, _) = solve_types(obs_pts, tris, placeholder)
    obs_start, obs_end, src_start, src_end = process_block_inputs(
        obs_start, obs_end, src_start, src_end)

    # Per-block output sizes and their exclusive prefix sums.
    sizes = vec_dim * 3 * (obs_end - obs_start) * (src_end - src_start)
    ends = np.cumsum(sizes)
    offsets = np.empty(ends.shape[0] + 1, dtype=ends.dtype)
    offsets[:-1] = ends - sizes
    offsets[-1] = ends[-1]

    n_launch_blocks = obs_end.shape[0]
    team_size = backend.max_block_size(16)
    module = backend.load_module(
        "blocks.cu",
        tmpl_args=dict(float_type=backend.np_to_c_type(float_type)),
        tmpl_dir=source_dir,
    )

    dev_out = backend.zeros(ends[-1], float_type)
    dev_obs = backend.to(obs_pts, float_type)
    dev_tris = backend.to(tris, float_type)
    dev_obs_start = backend.to(obs_start, np.int32)
    dev_obs_end = backend.to(obs_end, np.int32)
    dev_src_start = backend.to(src_start, np.int32)
    dev_src_end = backend.to(src_end, np.int32)
    dev_offsets = backend.to(offsets, np.int32)

    kernel = getattr(module, "blocks_" + fnc_name)
    kernel(
        dev_out,
        dev_obs,
        dev_tris,
        dev_obs_start,
        dev_obs_end,
        dev_src_start,
        dev_src_end,
        dev_offsets,
        float_type(nu),
        (n_launch_blocks, 1, 1),
        (team_size, 1, 1),
    )
    return backend.get(dev_out), offsets
# Example 5
def call_clu_aca(
    obs_pts,
    tris,
    obs_start,
    obs_end,
    src_start,
    src_end,
    nu,
    tol,
    max_iter,
    fnc,
    Iref0=None,
    Jref0=None,
):
    """Run the adaptive cross approximation (ACA) GPU kernel over a set of
    (obs range x src range) blocks.

    Each block is approximated by a low-rank product: the returned list
    contains one (U, V) pair per block, where U has shape (n_rows, n_terms)
    and V has shape (n_terms, n_cols), with n_rows = n_obs_in_block * vec_dim
    and n_cols = n_src_in_block * 3. Blocks are processed in chunks of at
    most 512 to bound the size of a single kernel launch.

    Iref0/Jref0 optionally supply the starting reference row/column index for
    each block; when None, random starting indices are drawn.
    """
    fnc_name, vec_dim = fnc
    check_inputs(obs_pts, tris, placeholder)
    float_type, (obs_pts, tris, _) = solve_types(obs_pts, tris, placeholder)
    obs_start, obs_end, src_start, src_end = process_block_inputs(
        obs_start, obs_end, src_start, src_end
    )
    tol, max_iter = check_tol_max_iter(obs_start, tol, max_iter, float_type)

    default_chunk_size = 512
    team_size = backend.max_block_size(32)
    n_blocks = obs_end.shape[0]

    verbose = False
    gpu_config = dict(float_type=backend.np_to_c_type(float_type), verbose=verbose)
    module = backend.load_module("aca.cu", tmpl_args=gpu_config, tmpl_dir=source_dir)

    # np.ceil for consistency with the rest of this module; the previous bare
    # `ceil` relied on a separate `from math import ceil`.
    n_chunks = int(np.ceil(n_blocks / default_chunk_size))
    appxs = []
    # `chunk_idx` (not `i`) so the post-processing loop below cannot shadow it.
    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * default_chunk_size
        chunk_size = min(n_blocks - chunk_start, default_chunk_size)
        chunk_end = chunk_start + chunk_size

        n_obs_per_block = (
            obs_end[chunk_start:chunk_end] - obs_start[chunk_start:chunk_end]
        )
        n_src_per_block = (
            src_end[chunk_start:chunk_end] - src_start[chunk_start:chunk_end]
        )
        n_rows = n_obs_per_block * vec_dim
        n_cols = n_src_per_block * 3
        block_sizes = n_rows * n_cols

        # Storage for the U, V output matrices. These will be in a packed format.
        gpu_buffer = backend.empty(block_sizes.sum(), float_type)

        # Storage for temporary rows and columns: RIref, RJref, RIstar, RJstar
        fworkspace_per_block = n_cols + n_rows + 3 * n_cols + vec_dim * n_rows
        fworkspace_ends = np.cumsum(fworkspace_per_block)
        fworkspace_starts = fworkspace_ends - fworkspace_per_block
        gpu_fworkspace = backend.empty(fworkspace_ends[-1], float_type)
        gpu_fworkspace_starts = backend.to(fworkspace_starts, np.int32)

        # uv_ptrs forms arrays that point to the start of each U/V vector pairs in
        # the main output buffer
        uv_ptrs_size = np.minimum(n_rows, n_cols)
        uv_ptrs_ends = np.cumsum(uv_ptrs_size)
        uv_ptrs_starts = uv_ptrs_ends - uv_ptrs_size
        gpu_uv_ptrs_starts = backend.to(uv_ptrs_starts, np.int32)
        gpu_uv_ptrs = backend.empty(uv_ptrs_ends[-1], np.int32)
        gpu_iworkspace = backend.empty(uv_ptrs_ends[-1], np.int32)

        # Output space for specifying the number of terms used for each
        # approximation.
        gpu_n_terms = backend.empty(chunk_size, np.int32)

        # Storage space for a pointer to the next empty portion of the output
        # buffer.
        gpu_next_ptr = backend.zeros(1, np.int32)

        # The index of the starting reference rows/cols.
        if Iref0 is None:
            Iref0_chunk = np.random.randint(0, n_rows, size=chunk_size, dtype=np.int32)
        else:
            Iref0_chunk = Iref0[chunk_start:chunk_end]
        if Jref0 is None:
            Jref0_chunk = np.random.randint(0, n_cols, size=chunk_size, dtype=np.int32)
        else:
            Jref0_chunk = Jref0[chunk_start:chunk_end]
        gpu_Iref0 = backend.to(Iref0_chunk, np.int32)
        gpu_Jref0 = backend.to(Jref0_chunk, np.int32)

        gpu_obs_pts = backend.to(obs_pts, float_type)
        gpu_tris = backend.to(tris, float_type)
        gpu_obs_start = backend.to(obs_start[chunk_start:chunk_end], np.int32)
        gpu_obs_end = backend.to(obs_end[chunk_start:chunk_end], np.int32)
        gpu_src_start = backend.to(src_start[chunk_start:chunk_end], np.int32)
        gpu_src_end = backend.to(src_end[chunk_start:chunk_end], np.int32)
        gpu_tol = backend.to(tol[chunk_start:chunk_end], float_type)
        gpu_max_iter = backend.to(max_iter[chunk_start:chunk_end], np.int32)

        if verbose:
            print(f"gpu_buffer.shape = {gpu_buffer.shape}")
            print(f"gpu_uv_ptrs.shape = {gpu_uv_ptrs.shape}")
            print(f"gpu_n_terms.shape = {gpu_n_terms.shape}")
            print(f"gpu_next_ptr.shape = {gpu_next_ptr.shape}")
            print(f"gpu_fworkspace.shape = {gpu_fworkspace.shape}")
            print(f"gpu_iworkspace.shape = {gpu_iworkspace.shape}")
            print(f"gpu_uv_ptrs_starts.shape = {gpu_uv_ptrs_starts.shape}")
            print(f"gpu_Iref0.shape = {gpu_Iref0.shape}")
            print(f"gpu_Jref0.shape = {gpu_Jref0.shape}")
            print(f"obs_pts.shape = {obs_pts.shape}")
            print(f"tris.shape = {tris.shape}")
            print(f"gpu_obs_start.shape = {gpu_obs_start.shape}")
            print(f"gpu_obs_end.shape = {gpu_obs_end.shape}")
            print(f"gpu_src_start.shape = {gpu_src_start.shape}")
            print(f"gpu_src_end.shape = {gpu_src_end.shape}")

        getattr(module, "aca_" + fnc_name)(
            gpu_buffer,
            gpu_uv_ptrs,
            gpu_n_terms,
            gpu_next_ptr,
            gpu_fworkspace,
            gpu_iworkspace,
            gpu_uv_ptrs_starts,
            gpu_fworkspace_starts,
            gpu_Iref0,
            gpu_Jref0,
            gpu_obs_pts,
            gpu_tris,
            gpu_obs_start,
            gpu_obs_end,
            gpu_src_start,
            gpu_src_end,
            gpu_tol,
            gpu_max_iter,
            float_type(nu),
            (chunk_size, 1, 1),
            (team_size, 1, 1),
        )

        # post-process the buffer to collect the U, V vectors
        buffer = backend.get(gpu_buffer)
        uv_ptrs = backend.get(gpu_uv_ptrs)
        n_terms = backend.get(gpu_n_terms)
        # Fixed: this loop used `i`, shadowing the chunk index, and assigned
        # dead `us = []` / `vs = []` lists that were immediately overwritten.
        for b in range(chunk_size):
            uv_ptr0 = uv_ptrs_starts[b]
            ptrs = uv_ptrs[uv_ptr0 + np.arange(n_terms[b])]
            us = buffer[ptrs[:, None] + np.arange(n_rows[b])[None, :]]
            vs = buffer[
                ptrs[:, None] + np.arange(n_rows[b], n_rows[b] + n_cols[b])[None, :]
            ]
            appxs.append((us.T, vs))
    return appxs