def t1():
    """Update block ``a[j, j]`` from block ``a[j, k]`` using ``update``.

    ``a``, ``j``, ``k``, ``update``, ``clone_here`` and ``copy`` are free
    variables resolved outside this function.
    """
    # Pull copies of both operands onto the current device.
    diag = clone_here(a[j, j])
    panel = clone_here(a[j, k])
    # The same panel is supplied as both factor arguments.
    diag = update(panel, panel, diag)
    # Push the result back into the global array.
    copy(a[j, j], diag)
def t1():
    """Subtract ``a[j, k] @ a[j, k].T`` from block ``a[j, j]``.

    ``a``, ``j``, ``k``, ``clone_here`` and ``copy`` are free variables
    resolved outside this function.
    """
    # Work on copies that live on the current device.
    diag = clone_here(a[j, j])
    panel = clone_here(a[j, k])
    gram = panel @ panel.T
    diag -= gram
    # Publish the updated block to the global array.
    copy(a[j, j], diag)
def _check_set(x):
    """Assign the enclosing scope's ``value`` to a single partition.

    Args:
        x: A ``(current, index)`` pair, where ``current`` is the object
            presently stored for the partition (or ``None``) and ``index``
            is the index sequence locating it in ``self._latest_view``.

    Raises:
        NotImplementedError: If the slot has to be rebound and ``index``
            has more than two components.

    ``self``, ``value``, ``is_array``, ``copy`` and ``warn`` are free
    variables resolved outside this function.
    """
    current, index = x

    # TODO (bozhi) need C pointers!
    def _hard_set(i, new_value):
        """Rebind the slot of ``self._latest_view`` addressed by ``i``."""
        if len(i) == 1:
            self._latest_view[i[0]] = new_value
        elif len(i) == 2:
            self._latest_view[i[0]][i[1]] = new_value
        else:
            raise NotImplementedError(
                "High-dimensional PartitionedTensor with None not supported!"
            )

    # Nothing is stored yet, so there is no buffer to copy into.
    if current is None:
        _hard_set(index, value)
        return

    is_to_array = is_array(current)
    is_from_array = is_array(value)

    if is_from_array and is_to_array:
        # Prefer an element-wise copy into the existing array; fall back to
        # rebinding the slot when the copy is rejected.
        try:
            copy(current, value)
        except ValueError:
            warn(
                "Incompatible arrays (e.g. different shapes). Overwritting."
            )
            _hard_set(index, value)
        return

    # TODO (bozhi): should not allow None assignment but implement free(index)
    # `current` cannot be None here because of the early return above.
    if not is_to_array:
        warn("Array partition was modified as %s object." % type(current))
    if not is_from_array and value is not None:
        warn("Modifying array partition with %s object!" % type(value))
    # Rebinding the local name (`x = value`) never reached the container, so
    # the assignment was silently lost; store through the view instead.
    _hard_set(index, value)
def t4():
    """Solve for block ``a[i, j]`` against ``a[j, j]``, printing debug output.

    The block is printed before the solve and again afterwards, together
    with the value returned by ``ltriang_solve``. ``a``, ``i``, ``j``,
    ``ltriang_solve``, ``clone_here`` and ``copy`` are free variables
    resolved outside this function.
    """
    diag = clone_here(a[j, j])
    block = clone_here(a[i, j])
    print(i, j, "Before", block, flush=True)
    solved = ltriang_solve(diag, block)
    # The input is printed again so any in-place change made by the solve
    # shows up next to the returned value.
    print(i, j, "Panel", block, flush=True)
    print(i, j, "Out", solved, flush=True)
    copy(a[i, j], solved)
def t3():
    """Update block ``a[i, j]`` from ``a[i, k]`` and ``a[j, k]`` using ``update``.

    ``a``, ``i``, ``j``, ``k``, ``update``, ``clone_here`` and ``copy`` are
    free variables resolved outside this function.
    """
    # Pull copies of the target and both operands onto the current device.
    target = clone_here(a[i, j])
    left = clone_here(a[i, k])
    right = clone_here(a[j, k])
    target = update(left, right, target)
    # Push the result back into the global array.
    copy(a[i, j], target)
async def run_jacobi():
    """Spawn ``steps`` rounds of per-block Jacobi tasks and time them.

    Each round spawns one task per block index in ``range(divisions)``.
    After the last round's tasks are awaited and the CUDA streams are
    synchronized, the elapsed time ``end - start`` is printed (``start`` is
    defined outside this function).
    """
    assert steps > 0

    # Two sets of row groups alternate between the input and output roles;
    # they are exchanged at the top of every step.
    in_blocks = a0_row_groups
    out_blocks = a1_row_groups

    # The first step's tasks depend on this initial CompletedTaskSpace.
    previous_block_tasks = CompletedTaskSpace()

    for step in range(steps):
        # Exchange the input and output roles for this step.
        in_blocks, out_blocks = out_blocks, in_blocks

        # Fresh labels for the tasks spawned in this step.
        step_tasks = TaskSpace(f"block_tasks[{step}]")

        for j in range(divisions):
            # The mapper used for partitioning chooses where block j runs.
            target_device = mapper.device(j)
            in_block = in_blocks[j]
            out_block = out_blocks[j]

            # Depend on the previous step's tasks for this block and its
            # immediate neighbors, clamped to the valid block range.
            first_dep = max(0, j - 1)
            stop_dep = min(divisions, j + 2)

            @spawn(
                step_tasks[j],
                dependencies=[previous_block_tasks[first_dep:stop_dep]],
                placement=target_device,
            )
            def device_local_jacobi_task():
                # Refresh boundary values from the adjacent blocks; this
                # may communicate across device boundaries.
                if j > 0:
                    copy(in_block[0], in_blocks[j - 1][-2])
                if j < divisions - 1:
                    copy(in_block[-1], in_blocks[j + 1][1])
                # Run the computation, dispatching to device specific code.
                jacobi(in_block, out_block)

        # The tasks just spawned become the dependencies of the next step.
        previous_block_tasks = step_tasks

    await previous_block_tasks
    cupy.cuda.get_current_stream().synchronize()
    cupy.cuda.Stream.null.synchronize()
    end = time.perf_counter()
    print(end - start)
    # This depends on all the tasks from the last iteration step.
# Copy every output block back into its slice of ``a1``.
for j in range(divisions):
    # Skip the first element of every block except block 0.
    start_index = 0 if j == 0 else 1
    # Skip the last element of every block except the final one; a slice
    # stop of None runs through the end of the dimension.
    end_index = None if j == divisions - 1 else -1
    copy(a1[mapper.slice(j, len(a1))], out_blocks[j][start_index:end_index])
def device_local_jacobi_task():
    """Refresh the boundary entries of ``in_block`` and run ``jacobi`` on it.

    ``j``, ``divisions``, ``in_block``, ``out_block``, ``in_blocks``,
    ``copy`` and ``jacobi`` are free variables resolved outside this
    function.
    """
    has_previous_block = j > 0
    has_next_block = j < divisions - 1

    # Pull boundary values from the adjacent blocks of the partition; this
    # may communicate across device boundaries.
    if has_previous_block:
        copy(in_block[0], in_blocks[j - 1][-2])
    if has_next_block:
        copy(in_block[-1], in_blocks[j + 1][1])

    # Run the computation, dispatching to device specific code.
    jacobi(in_block, out_block)
def t3():
    """Subtract ``a[i, k] @ a[j, k].T`` from block ``a[i, j]``.

    ``a``, ``i``, ``j``, ``k``, ``clone_here`` and ``copy`` are free
    variables resolved outside this function.
    """
    # Work on copies that live on the current device.
    target = clone_here(a[i, j])
    left = clone_here(a[i, k])
    right = clone_here(a[j, k])
    product = left @ right.T
    target -= product
    # Publish the updated block to the global array.
    copy(a[i, j], target)
def t3():
    """Apply ``a[i, j] -= a[i, k] @ a[j, k].T`` through local copies.

    All three blocks are cloned with ``clone_here``; the result is written
    back with ``copy``. ``a``, ``i``, ``j`` and ``k`` are free variables
    resolved outside this function.
    """
    acc = clone_here(a[i, j])
    lhs = clone_here(a[i, k])
    rhs = clone_here(a[j, k])
    correction = lhs @ rhs.T
    acc -= correction
    copy(a[i, j], acc)
def t2():
    """Replace block ``a[j, j]`` with ``cholesky`` applied to it.

    ``a``, ``j``, ``cholesky``, ``clone_here`` and ``copy`` are free
    variables resolved outside this function.
    """
    # Factor a local clone, then write the factor back to the same block.
    factor = cholesky(clone_here(a[j, j]))
    copy(a[j, j], factor)
def inner_local():
    """Store ``a_part[i] @ b_part[i]`` into ``partial_sums[i]``.

    ``i``, ``a_part``, ``b_part``, ``partial_sums`` and ``copy`` are free
    variables resolved outside this function.
    """
    # The matrix-multiply operator @ computes the local inner product.
    local_product = a_part[i] @ b_part[i]
    # The destination is a length-one slice rather than a scalar element.
    copy(partial_sums[i:i + 1], local_product)
def b():
    """Copy ``xp[j][j]`` into ``xp[i][j]``.

    ``xp``, ``i``, ``j`` and ``copy`` are free variables resolved outside
    this function.
    """
    destination = xp[i][j]
    source = xp[j][j]
    copy(destination, source)
def c():
    """Copy ``yp[i][i]`` into the part of ``y`` selected by the mapper.

    ``y``, ``yp``, ``i``, ``mapper`` and ``copy`` are free variables
    resolved outside this function.
    """
    # The mapper picks the index for partition i from y's leading extent.
    target_index = mapper.slice_x(i, y.shape[0])
    copy(y[target_index], yp[i][i])
def inner_local():
    """Write the product ``a_part[i] @ b_part[i]`` to ``partial_sums[i:i + 1]``.

    ``i``, ``a_part``, ``b_part``, ``partial_sums`` and ``copy`` are free
    variables resolved outside this function.
    """
    partial = a_part[i] @ b_part[i]
    copy(partial_sums[i:i + 1], partial)
def t1():
    """Apply ``a[j, j] -= a[j, k] @ a[j, k].T`` through local copies.

    Both blocks are cloned with ``clone_here``; the result is written back
    with ``copy``. ``a``, ``j`` and ``k`` are free variables resolved
    outside this function.
    """
    acc = clone_here(a[j, j])
    col = clone_here(a[j, k])
    correction = col @ col.T
    acc -= correction
    copy(a[j, j], acc)
def cholesky_inplace(a):
    """Overwrite ``a`` with the result of ``cupy.linalg.cholesky``.

    Args:
        a: Array-like with a ``shape`` whose first two extents must match.

    Raises:
        ValueError: If ``a.shape[0]`` differs from ``a.shape[1]``.
    """
    n_rows = a.shape[0]
    n_cols = a.shape[1]
    if n_rows != n_cols:
        raise ValueError("A square array is required.")

    # Factor a local clone, then copy the factor back into the caller's array.
    local = clone_here(a)
    local[:] = cupy.linalg.cholesky(local)
    copy(a, local)
def t4():
    """Replace block ``a[i, j]`` with ``ltriang_solve`` of it against ``a[j, j]``.

    ``a``, ``i``, ``j``, ``ltriang_solve``, ``clone_here`` and ``copy`` are
    free variables resolved outside this function.
    """
    diag = clone_here(a[j, j])
    block = clone_here(a[i, j])
    solved = ltriang_solve(diag, block)
    copy(a[i, j], solved)