def t3():
    """Update block ``a[i, j]`` using blocks ``a[i, k]`` and ``a[j, k]``.

    The three blocks are brought to the current device with ``clone_here``,
    combined by the ``update`` helper, and the result is written back into
    ``a[i, j]`` with ``copy``. ``a``, ``i``, ``j`` and ``k`` are read from the
    enclosing scope.
    """
    # Move the operands to the current device.
    target = clone_here(a[i, j])
    left = clone_here(a[i, k])
    right = clone_here(a[j, k])

    target = update(left, right, target)

    # Move the result back to the global array.
    copy(a[i, j], target)
def t3():
    """Subtract ``a[i, k] @ a[j, k].T`` from block ``a[i, j]``.

    Operands are fetched onto the current device with ``clone_here`` and the
    updated block is written back to ``a[i, j]`` with ``copy``. ``a``, ``i``,
    ``j`` and ``k`` are read from the enclosing scope.
    """
    # Move the operands to the current device.
    block = clone_here(a[i, j])
    lhs = clone_here(a[i, k])
    rhs = clone_here(a[j, k])

    # Augmented assignment is deliberate: it updates the cloned block itself
    # instead of rebinding the name to a new array (assuming ndarray-like
    # operands -- confirm for other array types).
    block -= lhs @ rhs.T

    # Move the result back to the global array.
    copy(a[i, j], block)
def matmul_task():
    """Store ``a_block @ b_block.T`` into ``c_block``.

    ``b_block`` is first brought to the current device with ``clone_here``.
    ``a_block``, ``b_block`` and ``c_block`` are read from the enclosing
    scope.
    """
    # Device handle constructed without arguments; it is not used anywhere in
    # the visible part of this function.
    # NOTE(review): confirm it is not needed further down, then drop it.
    device_handle = cp.cuda.Device()

    # (Timing hooks around the transfer -- perf_counter plus a stream
    # synchronize -- were disabled in the original and are left out here.)
    local_b = clone_here(b_block)

    # cupy doesn't support the ``out`` argument for matmul yet, so the product
    # is computed into a temporary and then copied into ``c_block``.
    product = a_block @ local_b.T
    c_block[:] = product
def __getitem__(self, index: IndexType):  # -> Union[Array, List[Array]]
    """
    Read partitions and make sure they are on the current device.

    A non-tuple ``index`` is treated as a one-element tuple. Every value that
    ``parse_index`` reaches is collected; values for which ``is_array`` is
    true are passed through ``clone_here`` first.

    :param index: index of the target partition(s).
    :return: the single selected partition, or a list when the index does not
        select exactly one. A warning is emitted in the list case, and also
        when the single partition is ``None``.

    .. todo:: Multiple partitions are currently returned as a Python list of
        partitions (ndarrays).
    """
    key = index if isinstance(index, tuple) else (index, )
    collected = []

    def _descend(container, position):
        # One indexing step into the nested view.
        return container[position]

    def _collect(leaf):
        # Arrays are cloned to the current device; anything else is kept as is.
        collected.append(clone_here(leaf) if is_array(leaf) else leaf)

    parse_index(self._latest_view, key, step=_descend, stop=_collect)

    if len(collected) != 1:
        warn(
            "Multiple partitions are currently returned as a Python list of partitions (ndarrays)."
        )
        return collected

    only = collected[0]
    if only is None:
        warn("Partition has been freed!")
    return only
def t2():
    """Replace block ``a[j, j]`` with the result of ``cholesky`` on it.

    The block is brought to the current device with ``clone_here``, passed
    through the ``cholesky`` helper, and written back with ``copy``. ``a`` and
    ``j`` are read from the enclosing scope.
    """
    diag = clone_here(a[j, j])
    factored = cholesky(diag)
    copy(a[j, j], factored)
def t1():
    """Update block ``a[j, j]`` using block ``a[j, k]``.

    Both blocks are brought to the current device with ``clone_here``. The
    ``update`` helper receives ``a[j, k]`` as both of its first two arguments,
    and its result is written back into ``a[j, j]`` with ``copy``. ``a``,
    ``j`` and ``k`` are read from the enclosing scope.
    """
    # Move the operands to the current device.
    diag = clone_here(a[j, j])
    panel = clone_here(a[j, k])

    # The same panel is passed as both operands.
    diag = update(panel, panel, diag)

    # Move the result back to the global array.
    copy(a[j, j], diag)
def get_gpu_memory(i: int, j: int, num_gpus: int):
    """Return ``clone_here`` of entry ``(i, j)`` from ``gpu_arrs``.

    Index ``i`` is split into a device id (``i % num_gpus``) and a per-device
    slot (``i // num_gpus``); the entry is then
    ``gpu_arrs[device][slot][j]``. ``gpu_arrs`` is read from the enclosing
    scope.

    :param i: first index, split across devices as described above.
    :param j: second index within the selected slot.
    :param num_gpus: divisor used to split ``i``.
    :return: the value returned by ``clone_here`` for the selected entry.
    """
    # divmod yields (i // num_gpus, i % num_gpus) in one step.
    slot, device = divmod(i, num_gpus)
    return clone_here(gpu_arrs[device][slot][j])
def t3():
    """Subtract ``a[i, k] @ a[j, k].T`` from block ``a[i, j]``.

    Operands are brought to the current device with ``clone_here`` and the
    updated block is written back to ``a[i, j]`` with ``copy``. ``a``, ``i``,
    ``j`` and ``k`` are read from the enclosing scope.
    """
    target = clone_here(a[i, j])
    row_panel = clone_here(a[i, k])
    col_panel = clone_here(a[j, k])

    correction = row_panel @ col_panel.T
    # Kept as an augmented assignment so the cloned block itself is updated.
    target -= correction

    copy(a[i, j], target)
def t1():
    """Subtract ``a[j, k] @ a[j, k].T`` from block ``a[j, j]``.

    Both blocks are brought to the current device with ``clone_here`` and the
    updated block is written back to ``a[j, j]`` with ``copy``. ``a``, ``j``
    and ``k`` are read from the enclosing scope.
    """
    diag = clone_here(a[j, j])
    panel = clone_here(a[j, k])

    gram = panel @ panel.T
    # Kept as an augmented assignment so the cloned block itself is updated.
    diag -= gram

    copy(a[j, j], diag)
def cholesky_inplace(a):
    """Overwrite ``a`` with the result of ``cupy.linalg.cholesky``.

    The array is cloned to the current device with ``clone_here``, the
    factorization is computed on the clone, and the clone is then written
    back into ``a`` with ``copy``.

    :param a: array whose first two dimensions must be equal.
    :raises ValueError: if ``a`` has fewer than two dimensions or its first
        two dimensions differ.
    """
    shape = a.shape
    # Check the rank before comparing dimensions: indexing ``shape[1]`` on a
    # 0-D or 1-D input would raise IndexError rather than the intended
    # ValueError.
    if len(shape) < 2 or shape[0] != shape[1]:
        raise ValueError("A square array is required.")

    ca = clone_here(a)
    ca[:] = cupy.linalg.cholesky(ca)
    copy(a, ca)
def t4():
    """Replace block ``a[i, j]`` with ``ltriang_solve(a[j, j], a[i, j])``.

    Both blocks are brought to the current device with ``clone_here`` and the
    solved block is written back to ``a[i, j]`` with ``copy``. ``a``, ``i``
    and ``j`` are read from the enclosing scope.
    """
    diag = clone_here(a[j, j])
    block = clone_here(a[i, j])

    # Presumably a lower-triangular solve, going by the helper's name --
    # confirm against its definition.
    solved = ltriang_solve(diag, block)

    copy(a[i, j], solved)
def t1():
    """Run ``qr_block`` on ``A_block`` and store both results.

    ``A_block`` is brought to the current device with ``clone_here``. The
    first result of ``qr_block`` is stored in ``Q1_blocked[i]`` and the second
    in ``R1[R1_lower:R1_upper]``. All of these names are read from the
    enclosing scope.
    """
    local_block = clone_here(A_block)

    q_part, r_part = qr_block(local_block)
    # Stores happen in the same order as the original tuple assignment.
    Q1_blocked[i] = q_part
    R1[R1_lower:R1_upper] = r_part
def t3():
    """Store ``matmul_block(Q1_blocked[i], Q2_block)`` in ``Q[Q_lower:Q_upper]``.

    Both operands are brought to the current device with ``clone_here``
    before the multiplication. All names are read from the enclosing scope.
    """
    left = clone_here(Q1_blocked[i])
    right = clone_here(Q2_block)

    product = matmul_block(left, right)
    Q[Q_lower:Q_upper] = product