async def test_tsqr_blocked(placement=cpu):
    """Drive the blocked TSQR benchmark for ``WARMUP + ITERS`` iterations.

    Every iteration resets ``perf_stats``, seeds NumPy with the iteration
    index, builds a random ``NROWS x NCOLS`` matrix and factors it with
    ``tsqr_blocked_puregpu`` (when ``PLACEMENT_STRING == 'puregpu'``) or
    ``tsqr_blocked`` otherwise.  The wall-clock time of the factorization is
    stored in ``perf_stats.tot_time``; statistics are printed only for the
    iterations after the warm-up phase.  When ``CHECK_RESULT`` is set the
    result is validated with ``check_result``.

    Args:
        placement: Not read inside this function.

    Raises:
        ValueError: In pure-GPU mode, if ``NROWS`` is not a multiple of
            ``NGPUS``.
    """
    for i in range(WARMUP + ITERS):
        # Start each iteration with clean timers and counters.
        perf_stats.reset()

        # Deterministic input: the seed is the iteration number.
        np.random.seed(i)
        A = np.random.rand(NROWS, NCOLS)

        if PLACEMENT_STRING != 'puregpu':
            # Regular version: run and time the algorithm.
            tot_start = time()
            Q, R = await tsqr_blocked(A, BLOCK_SIZE)
            elapsed = time() - tot_start
        else:
            if NROWS % NGPUS != 0:
                raise ValueError(
                    "Pure GPU version requires NROWS %% NGPUS == 0 (currently %i %% %i)"
                    % (NROWS, NGPUS))

            # Spread the matrix over the GPUs, one block per device.
            gpu_list = [gpu(dev) for dev in range(NGPUS)]
            mapper = LDeviceSequenceBlocked(NGPUS, placement=gpu_list)
            A_dev = mapper.partition_tensor(A)

            tot_start = time()
            Q_dev, R_dev = await tsqr_blocked_puregpu(A_dev, BLOCK_SIZE)
            elapsed = time() - tot_start

            # Bring the factors back to the host only if they get verified.
            if CHECK_RESULT:
                host_blocks = [np.empty(shape=(0, NCOLS))]
                for dev in range(NGPUS):
                    with cp.cuda.Device(dev):
                        host_blocks.append(cp.asnumpy(Q_dev[dev]))
                Q = np.vstack(host_blocks)
                R = cp.asnumpy(R_dev)

        perf_stats.tot_time = elapsed

        # Fold the per-task timings into totals for this iteration.
        perf_stats.consolidate_stats()

        if i >= WARMUP:
            report = perf_stats.print_stats_csv if CSV else perf_stats.print_stats
            report(i - WARMUP)

        # Verify the factorization.
        if CHECK_RESULT:
            if check_result(A, Q, R):
                print("\nCorrect result!\n")
            else:
                print("%***** ERROR: Incorrect final result!!! *****%")
def main():
    """Compute a blocked inner product on each MPI rank and cross-check it.

    Every rank draws two random vectors of 10,000,000 doubles, partitions
    them into 100 logical blocks, spawns one task per block to compute the
    block-local dot product, and sums the partial results sequentially.
    The per-rank results are then summed with ``comm.Reduce`` and collected
    with ``comm.Gather``; rank 0 asserts that both agree.
    """
    comm = MPI.COMM_WORLD
    print(comm.Get_rank(), comm.Get_size())

    a = np.random.rand(10000000).astype(dtype='d')
    b = np.random.rand(10000000).astype(dtype='d')
    divisions = 100

    comm.Barrier()
    start = time.perf_counter()

    # Map the divisions onto actual hardware locations.
    mapper = LDeviceSequenceBlocked(divisions)
    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1, dtype='d')

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)

        # All block-local products must finish before leaving this block.
        async with finish():
            for i in range(divisions):
                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        # Sequential reduction of the partial results.
        inner_result[0] = sum(partial_sums, 0.)

        # NOTE(review): SOURCE lost its indentation; the statements below are
        # kept inside the task because they read inner_result -- confirm.
        root_buffer = np.array(0.0, dtype='d')
        overall_result = root_buffer if comm.Get_rank() == 0 else None
        comm.Reduce([inner_result, MPI.DOUBLE],
                    [overall_result, MPI.DOUBLE],
                    op=MPI.SUM,
                    root=0)
        if overall_result is not None:
            result = float(overall_result)
            print(result)

        print(time.perf_counter() - start)

        assert np.allclose(np.inner(a, b), inner_result[0])

        if comm.Get_rank() == 0:
            other_results = np.empty(comm.Get_size(), dtype='d')
        else:
            other_results = None
        comm.Gather([inner_result, MPI.DOUBLE],
                    [other_results, MPI.DOUBLE],
                    root=0)
        if overall_result is not None:
            assert np.isclose(result, np.sum(other_results))
def main():
    """Blocked inner product of two random vectors with explicit resource requests.

    The vectors (300,000,000 elements each) are split into 100 logical
    blocks spread over the GPU devices followed by the CPU devices,
    optionally truncated to the first ``N_DEVICES`` entries when that
    environment variable is set.  One task per block computes the local
    dot product; a follow-up task prints the elapsed time and asserts the
    result against ``np.inner``.
    """
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)
    divisions = 100

    start = time.perf_counter()

    # Map the divisions onto actual hardware locations.
    devs = [*gpu.devices, *cpu.devices]
    device_cap = os.environ.get("N_DEVICES")
    if device_cap is not None:
        devs = devs[:int(device_cap)]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn()
    async def inner_part():
        # One slot per logical device for its partial sum.
        partial_sums = np.empty(divisions)

        # Every task started in this block must complete before it is left.
        async with finish():
            for i in range(divisions):
                # Request one thread on the block's device plus enough memory
                # for both operands of the local product.
                @spawn(devices=[
                    Req(mapper.device(i),
                        threads=1,
                        memory=storage_size(a_part[i], b_part[i]))
                ])
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        # Reduce the partial results (sequentially).
        inner_result[0] = sum(partial_sums, 0.)

    @spawn(None, [inner_part])
    def check():
        # Runs after inner_part: report the elapsed time and verify.
        print(time.perf_counter() - start)
        assert np.allclose(np.inner(a, b), inner_result[0])
def main():
    """Inner product example built from a task space and a dependent reduction.

    ``inner`` partitions both operands with a shared 10-way mapper, spawns
    one named task per block to compute its local product, then spawns a
    reduction task that depends on all of them and returns the sum.
    ``main_task`` runs it on two random vectors of length 3000 and checks
    the result against ``np.inner``.
    """
    divisions = 10
    mapper = LDeviceSequenceBlocked(divisions)

    async def inner(a, b):
        a_part = mapper.partition_tensor(a)
        b_part = mapper.partition_tensor(b)
        n_blocks = len(a_part)

        # One slot per block for its partial sum.
        partial_sums = np.empty(n_blocks)

        # Task names for the per-block product tasks.
        P = TaskSpace("P")
        for i in range(n_blocks):
            @spawn(P[i], data=[a_part[i], b_part[i]])
            def inner_local():
                # Block-local inner product via the @ operator.
                copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        # Runs once every product task in P has finished.
        @spawn(dependencies=P, data=[partial_sums])
        def reduce():
            return np.sum(partial_sums)

        total = await reduce
        return total

    @spawn()
    async def main_task():
        n = 3 * 1000
        a = np.random.rand(n)
        b = np.random.rand(n)
        print("Starting.", a.shape, b.shape)
        res = await inner(a, b)
        expected = np.inner(a, b)
        assert np.allclose(expected, res)
        print("Success.", res)
def main():
    """Benchmark in-place ``cp.exp`` over blocks resident on several GPUs.

    Command-line arguments (read from ``sys.argv``):
        1: number of GPUs to use,
        2: number of repetitions,
        3: number of blocks per GPU.

    For every repetition a float32 vector of ``20000 * 20000`` elements is
    generated on the host with seed 0, cut into equally sized blocks that
    are copied round-robin onto the GPUs, and exponentiated in place by one
    task per block.  The time from just after the copies until all tasks
    have completed is printed.
    """
    ngpus = int(sys.argv[1])
    runs = int(sys.argv[2])
    blocks_per_gpu = int(sys.argv[3])

    # 1D partition over the devices in use.  The mapper is constructed as in
    # the original script, although the blocks below are placed by hand.
    devices = gpu.devices[:ngpus]
    mapper = LDeviceSequenceBlocked(ngpus * blocks_per_gpu, placement=devices)

    # Total number of elements of the flat input vector.
    n = 20000 * 20000

    # Main task that generates the others.
    @spawn(placement=cpu)
    async def rerun_exp():
        for run in range(runs):
            @spawn(placement=cpu)
            async def launch_exp():
                np.random.seed(0)
                a_cpu = np.random.rand(n).astype(np.float32)

                nblocks = ngpus * blocks_per_gpu
                # Ceiling division so that the blocks cover all n elements.
                block_size = (n - 1) // nblocks + 1

                a_part = []
                for i in range(nblocks):
                    lo = i * block_size
                    hi = (i + 1) * block_size
                    # Round-robin assignment of blocks to GPUs.
                    with cp.cuda.Device(i % ngpus):
                        a_part.append(cp.asarray(a_cpu[lo:hi]))

                start = time.perf_counter()

                # Task names, used below to wait for all block tasks.
                exp_runs = TaskSpace("exp_runs")
                for i in range(nblocks):
                    # One asynchronous task per block, placed with its data.
                    @spawn(exp_runs[i], placement=a_part[i])
                    def run_exp():
                        # Exponentiate in place with CuPy.
                        cp.exp(a_part[i], out=a_part[i])

                # Wait for every exp task before taking the end time.
                await exp_runs
                stop = time.perf_counter()
                print(stop - start)

            # Finish this repetition before launching the next one.
            await launch_exp
def main():
    """Blocked inner product of two random vectors, one task per block.

    Two vectors of 300,000,000 elements are split into 100 logical blocks
    spread over the GPU devices followed by the CPU devices (optionally
    truncated to the first ``N_DEVICES`` entries when that environment
    variable is set).  A coordinating task on ``cpu(0)`` spawns one task per
    block, sums the partial products sequentially, prints the elapsed time
    and asserts the result against ``np.inner``.
    """
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)
    divisions = 100

    start = time.perf_counter()

    # Map the divisions onto actual hardware locations.
    devs = [*gpu.devices, *cpu.devices]
    device_cap = os.environ.get("N_DEVICES")
    if device_cap is not None:
        devs = devs[:int(device_cap)]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)

        # All block-local products must finish before leaving this block.
        async with finish():
            for i in range(divisions):
                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])

        # Sequential reduction of the partial results.
        inner_result[0] = sum(partial_sums, 0.)

        # NOTE(review): SOURCE lost its indentation; timing and the check are
        # kept inside the task because they read inner_result -- confirm.
        print(time.perf_counter() - start)
        assert np.allclose(np.inner(a, b), inner_result[0])
def main():
    """Run a blocked 4-point Jacobi stencil benchmark over the available devices.

    A random ``n x n`` grid is duplicated into two arrays that are
    partitioned row-wise (with one row of overlap) into twice as many blocks
    as there are devices.  A warm-up task calls ``jacobi`` once per block,
    then ``steps`` iterations are launched as tasks; each block task depends
    on the tasks of the same block and its neighbours from the previous
    iteration.  The elapsed time is printed and the final blocks are copied
    into ``a1``.

    NOTE(review): SOURCE lost its indentation; the nesting below was rebuilt
    from variable scoping and data flow -- confirm against the original file.
    """
    devs = list(gpu.devices) + list(cpu.devices)
    # Optionally restrict the run to the first N_DEVICES devices.
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    divisions = len(devs)*2

    # Set up an "n" x "n" grid of values and run
    # "steps" number of iterations of the 4 point stencil on it.
    n = 25000
    steps = 200

    # Set up two arrays containing the input data.
    # This demo uses the standard technique of computing
    # from one array into another then swapping the
    # input and output arrays for the next iteration.
    # These are the two arrays that will be swapped back
    # and forth as input and output.
    a0 = np.random.rand(n, n)
    a1 = a0.copy()

    # An object that distributes arrays across all the given devices.
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    # Partition a0 and a1.
    # Here we just partition the rows across the different devices.
    # Other partitioning schemes are possible.
    a0_row_groups = mapper.partition_tensor(a0, overlap=1)
    a1_row_groups = mapper.partition_tensor(a1, overlap=1)

    # Trigger JIT: call jacobi once per block before the timed region.
    @spawn(placement=cpu(0))
    async def warmups():
        warmup = TaskSpace()
        for i in range(divisions):
            @spawn(warmup[i], placement=mapper.device(i))
            async def w():
                jacobi(a1_row_groups[i], a0_row_groups[i])
                cupy.cuda.get_current_stream().synchronize()
                cupy.cuda.Stream.null.synchronize()
        await warmup

    # NOTE(review): presumably gives the warm-up tasks time to finish before
    # the timer starts; nothing here waits on `warmups` explicitly -- confirm.
    time.sleep(5)

    start = time.perf_counter()

    # Main parla task.
    @spawn(placement=cpu(0))
    async def run_jacobi():
        assert steps > 0
        # Specify which set of blocks is used as input or output
        # (they will be swapped for each iteration).
        in_blocks = a0_row_groups
        out_blocks = a1_row_groups
        # Dependencies for the first iteration step.  NOTE(review): presumably
        # CompletedTaskSpace behaves as an already-finished set of tasks, so
        # the first step has nothing to wait for -- confirm.
        previous_block_tasks = CompletedTaskSpace()
        # Now create the tasks for subsequent iteration steps.
        for i in range(steps):
            # Swap input and output blocks for the next step.
            # (The swap happens before the first step too, so step 0 reads
            # from a1_row_groups and writes to a0_row_groups.)
            in_blocks, out_blocks = out_blocks, in_blocks
            # Create a new set of labels for the tasks that do this iteration step.
            current_block_tasks = TaskSpace("block_tasks[{}]".format(i))
            # Create the tasks to do the i'th iteration.
            # As before, each task needs the following info:
            #  a block index "j"
            #  a "device" where it should execute (supplied by mapper used for partitioning)
            #  the "in_block" of data used as input
            #  the "out_block" to write the output to
            for j in range(divisions):
                device = mapper.device(j)
                in_block = in_blocks[j]
                out_block = out_blocks[j]
                # Make each task operating on each block depend on the tasks for
                # that block and its immediate neighbors from the previous iteration.
                @spawn(current_block_tasks[j],
                       dependencies=[previous_block_tasks[max(0, j-1):min(divisions, j+2)]],
                       placement=device)
                def device_local_jacobi_task():
                    # Read boundary values from adjacent blocks in the partition.
                    # This may communicate across device boundaries.
                    # The first and last rows of a block receive the
                    # second-to-last row of the previous block and the second
                    # row of the next block respectively.
                    if j > 0:
                        copy(in_block[0], in_blocks[j - 1][-2])
                    if j < divisions - 1:
                        copy(in_block[-1], in_blocks[j + 1][1])
                    # Run the computation, dispatching to device specific code.
                    jacobi(in_block, out_block)
            # For the next iteration, use the newly created tasks as
            # the tasks from the previous step.
            previous_block_tasks = current_block_tasks
        # Wait for the tasks of the last iteration step before stopping the timer.
        await previous_block_tasks
        cupy.cuda.get_current_stream().synchronize()
        cupy.cuda.Stream.null.synchronize()
        end = time.perf_counter()
        print(end - start)

        # This depends on all the tasks from the last iteration step.
        # Copy each block's rows back into a1, dropping the overlap row it
        # shares with its neighbours.
        for j in range(divisions):
            start_index = 1 if j > 0 else 0
            end_index = -1 if j < divisions - 1 else None  # None indicates the last element of the dimension
            copy(a1[mapper.slice(j, len(a1))], out_blocks[j][start_index:end_index])
async def tsqr_blocked_puregpu(A, block_size):
    """Tall-skinny QR of a GPU-partitioned matrix, keeping all data on GPUs.

    The algorithm runs in three task stages and records timings in
    ``perf_stats``:

    1. ``t1``: one task per GPU factors its block of ``A`` with
       ``cp.linalg.qr``, storing ``Q1[i]`` and the flattened ``R1[i]``.
    2. ``t2``: a single GPU task stacks all ``R1`` blocks and factors the
       stack, giving ``Q2`` (flattened) and the final ``R``.
    3. ``t3``: one task per GPU multiplies ``Q1[i]`` with its
       ``NCOLS x NCOLS`` block of ``Q2``.

    Args:
        A: Partitioned input matrix with one block per GPU; indexed as
            ``A[i]`` and ``A.base[i]``.  Its blocks are overwritten with
            ``None`` once factored.
        block_size: Not read by this variant; kept so the signature matches
            ``tsqr_blocked``.

    Returns:
        ``(Q, R)`` where ``Q`` is a list of ``NGPUS`` per-GPU blocks and
        ``R`` is the upper-triangular factor produced by ``t2``.
    """
    Q1 = [None] * NGPUS
    # CAVEAT: PartitionedTensor with None holes can be fragile! Be cautious!
    R1 = PartitionedTensor([None] * NGPUS)

    # Stage 1: QR factorization of each block on the GPU that holds it.
    t1_tot_start = time()
    T1 = TaskSpace()
    for i in range(NGPUS):
        # NB: A[i] dumbly moves the block here, hence placement via A.base[i].
        @spawn(taskid=T1[i], placement=A.base[i])
        def t1():
            perf_stats.t1_is_GPU_tasks[i] = True

            t1_ker_start = time()
            Q1[i], R1[i] = cp.linalg.qr(A[i])
            R1[i] = R1[i].flatten()
            t1_ker_end = time()
            perf_stats.t1_ker_tasks[i] = t1_ker_end - t1_ker_start

            A[i] = None  # Free up memory

    # Wait for the whole stage, not just the last-spawned task, so that
    # t1_tot covers every block (mirrors ``await T3`` below).
    await T1
    t1_tot_end = time()
    perf_stats.t1_tot = t1_tot_end - t1_tot_start

    # Stage 2: intermediate QR factorization on R1 to get Q2 and the final R.
    t2_tot_start = time()

    @spawn(dependencies=T1, placement=gpu)
    def t2():
        # Gather all R1 blocks onto this device.
        t2_D2D_start = time()
        # Must be a CuPy array: cp.vstack does not accept NumPy operands.
        R1_reduced = cp.empty(shape=(0, NCOLS))
        for dev in range(NGPUS):
            r_block = R1[dev].reshape(NCOLS, NCOLS)
            R1_reduced = cp.vstack((R1_reduced, r_block))
            R1[dev] = None  # Free up memory
        t2_D2D_end = time()
        perf_stats.t2_D2D = t2_D2D_end - t2_D2D_start

        # R here is the final R result.
        t2_ker_start = time()
        Q2, R = cp.linalg.qr(R1_reduced)
        Q2 = Q2.flatten()
        t2_ker_end = time()
        perf_stats.t2_ker = t2_ker_end - t2_ker_start

        return Q2, R

    Q2, R = await t2
    t2_tot_end = time()
    perf_stats.t2_tot = t2_tot_end - t2_tot_start

    # Split the flattened Q2 into one piece per GPU.
    mapper = LDeviceSequenceBlocked(NGPUS, placement=Q2)
    Q2p = mapper.partition_tensor(Q2)
    Q = [None] * NGPUS

    # Stage 3: Q1 @ Q2 matrix multiplication, block by block.
    t3_tot_start = time()
    T3 = TaskSpace()
    for i in range(NGPUS):
        @spawn(taskid=T3[i], dependencies=[T1[i], t2], placement=Q1[i])
        def t3():
            perf_stats.t3_is_GPU_tasks[i] = True

            # Q1 and Q2 must have an equal number of blocks, where
            # Q1 blocks' ncols = Q2 blocks' nrows.  Each piece of the
            # flattened Q2 is reshaped to NCOLS x NCOLS.
            t3_H2D_start = time()
            Q2_local = Q2p[i]
            Q2_local = Q2_local.reshape(NCOLS, NCOLS)
            t3_H2D_end = time()
            perf_stats.t3_H2D_tasks[i] = t3_H2D_end - t3_H2D_start

            # Run the kernel.
            t3_ker_start = time()
            Q[i] = cp.matmul(Q1[i], Q2_local)
            t3_ker_end = time()
            perf_stats.t3_ker_tasks[i] = t3_ker_end - t3_ker_start

    await T3
    t3_tot_end = time()
    perf_stats.t3_tot = t3_tot_end - t3_tot_start

    return Q, R
async def tsqr_blocked(A, block_size):
    """Blocked tall-skinny QR factorization of ``A`` using three task stages.

    1. ``t1``: one task per row block calls ``qr_block`` and stores the
       block's Q factor in ``Q1_blocked[i]`` and its R factor in rows
       ``[i * ncols, (i + 1) * ncols)`` of ``R1``.
    2. ``t2``: a CPU task factors the stacked ``R1`` with ``np.linalg.qr``,
       producing ``Q2`` and the final ``R``.
    3. ``t3``: one task per block calls ``matmul_block`` on ``Q1_blocked[i]``
       and the matching block of ``Q2`` and writes the product into ``Q``.

    Stage timings are recorded in ``perf_stats``.

    Args:
        A: 2-D host array (``A.shape`` is unpacked into rows and columns).
        block_size: Number of rows per block; must be at least the number
            of columns of ``A``.

    Returns:
        ``(Q, R)``: ``Q`` has the shape of ``A``; ``R`` is the factor
        returned by ``np.linalg.qr`` in stage 2.

    Raises:
        AssertionError: If ``block_size`` is smaller than the column count.
    """
    nrows, ncols = A.shape

    # Check for block_size > ncols
    assert ncols <= block_size, "Block size must be greater than or equal to the number of columns in the input matrix"

    # Calculate the number of blocks (ceiling division).
    nblocks = (nrows + block_size - 1) // block_size
    mapper = LDeviceSequenceBlocked(nblocks, placement=A)
    A_blocked = mapper.partition_tensor(A)  # Partition A into blocks

    # Storage for the stage-1 Q blocks uses the same partitioning scheme,
    # so it shares the mapper.
    Q1_blocked = mapper.partition_tensor(np.empty_like(A))
    R1 = np.empty([nblocks * ncols, ncols])  # Concatenated view
    # Q2 is allocated in t2
    Q = np.empty([nrows, ncols])  # Concatenated view

    on_gpu = PLACEMENT_STRING in ('gpu', 'both')

    # Stage 1: QR factorization on each block.
    t1_tot_start = time()
    T1 = TaskSpace()
    for i in range(nblocks):
        # Rows of R1 that receive this block's R factor.
        R1_lower = i * ncols
        R1_upper = (i + 1) * ncols

        t1_memory = None
        if on_gpu:
            # Estimate based on empirical evidence.
            t1_memory = int(4.2 * A_blocked[i:i + 1].nbytes)

        @spawn(taskid=T1[i], placement=PLACEMENT, memory=t1_memory, vcus=ACUS)
        def t1():
            # Copy the data to the processor.
            t1_H2D_start = time()
            A_block_local = A_blocked[i:i + 1]
            cp.cuda.get_current_stream().synchronize()
            t1_H2D_end = time()
            perf_stats.t1_H2D_tasks[i] = t1_H2D_end - t1_H2D_start

            # Run the kernel. (Data is copied back within this call;
            # timing annotations are added there.)
            Q1_blocked[i], R1[R1_lower:R1_upper] = qr_block(A_block_local, i)

    # Wait for the whole stage, not just the last-spawned task, so that
    # t1_tot covers every block (mirrors ``await T3`` below).
    await T1
    t1_tot_end = time()
    perf_stats.t1_tot = t1_tot_end - t1_tot_start

    # Stage 2: intermediate QR factorization on R1 to get Q2 and the final R.
    t2_tot_start = time()

    @spawn(dependencies=T1, placement=cpu)
    def t2():
        # R here is the final R result.
        # This step could be done recursively, but for small column counts
        # that's not necessary.
        Q2, R = np.linalg.qr(R1)
        # Q1 and Q2 must have an equal number of blocks, where Q1 blocks'
        # ncols = Q2 blocks' nrows.  Q2 is currently an
        # (ncols * nblocks) x ncols matrix; it is split into nblocks of
        # ncols rows each below.
        return Q2, R

    Q2, R = await t2
    t2_tot_end = time()
    perf_stats.t2_tot = t2_tot_end - t2_tot_start

    # Partition Q2 (same partitioning scheme, share the mapper).
    Q2_blocked = mapper.partition_tensor(Q2)

    # Stage 3: Q1 @ Q2 matrix multiplication, block by block.
    t3_tot_start = time()
    T3 = TaskSpace()
    for i in range(nblocks):
        # Q1 is already in blocks; rows of Q that receive this block's product.
        Q_lower = i * block_size  # first row in block, inclusive
        Q_upper = (i + 1) * block_size  # last row in block, exclusive

        t3_memory = None
        if on_gpu:
            t3_memory = 4 * Q1_blocked[i].nbytes  # This is a guess

        @spawn(taskid=T3[i],
               dependencies=[T1[i], t2],
               placement=PLACEMENT,
               memory=t3_memory,
               vcus=ACUS)
        def t3():
            # Copy the data to the processor.
            t3_H2D_start = time()
            Q1_block_local = Q1_blocked[i]
            Q2_block_local = Q2_blocked[i:i + 1]
            cp.cuda.get_current_stream().synchronize()
            t3_H2D_end = time()
            perf_stats.t3_H2D_tasks[i] = t3_H2D_end - t3_H2D_start

            # Run the kernel. (Data is copied back within this call;
            # timing annotations are added there.)
            Q[Q_lower:Q_upper] = matmul_block(Q1_block_local, Q2_block_local, i)

    await T3
    t3_tot_end = time()
    perf_stats.t3_tot = t3_tot_end - t3_tot_start

    return Q, R