def test_multiple_environments_free_assignment():
    # Dummy environments with dummy components for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0)], components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(1)], components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 3)
            assert set(task_results) == {"foo", "bar"}
def test_multiple_environments_tagged():
    # Dummy environments with dummy components for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0)], components=[DummyComponent("foo")], tags=(threading,)),
        TaskEnvironment(placement=[cpu(1)], components=[DummyComponent("bar")], tags=(logging,))
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []

            @spawn(tags=(threading,))
            def task():
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 1)
            assert task_results == ["foo"]

            task_results = []

            @spawn(tags=(logging,))
            def task():
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 1)
            assert task_results == ["bar"]
def test_placement(runtime_sched):
    devices = [cpu(0), cpu(1), cpu(2)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            @spawn(placement=dev)
            def task():
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == i+1)
        assert task_results == devices
def test_placement_multi():
    # Dummy environments with no components for testing.
    environments = [TaskEnvironment(placement=d, components=[])
                    for d in combinations(cpu.devices, 2)]
    with Parla(environments):
        devices = [frozenset((cpu(0), cpu(1))),
                   frozenset((cpu(1), cpu(2))),
                   frozenset((cpu(4), cpu(3)))]
        for rep in repetitions():
            task_results = []
            for (i, dev) in enumerate(devices):
                @spawn(placement=dev, ndevices=2)
                def task():
                    task_results.append(frozenset(get_current_devices()))
                sleep_until(lambda: len(task_results) == i+1)
            assert task_results == devices
def test_placement_await(runtime_sched):
    devices = [cpu(0), cpu(1), cpu(2)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            @spawn(placement=dev)
            async def task():
                task_results.append(get_current_devices()[0])
                await tasks()  # Await nothing to force a new task.
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == (i+1)*2)
        assert task_results == [cpu(0), cpu(0), cpu(1), cpu(1), cpu(2), cpu(2)]
def test_memory_aware_scheduling(runtime_sched):
    # Test memory restrictions.
    for rep in repetitions():
        task_results = []
        for i in range(8):
            @spawn(placement=cpu, memory=cpu(0).available_memory)
            def task():
                task_results.append(get_current_devices()[0])
                sleep(0.1)
        sleep_until(lambda: len(task_results) == 8)
        assert 8 >= len(set(task_results)) >= 4
def test_multiple_environments_fixed_assignment():
    # Dummy environments with dummy components for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0)], components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(1)], components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        task_results = []

        @spawn(placement=cpu(0))
        def task():
            task_results.append(thread_locals.value)

        @spawn(placement=cpu(1))
        def task():
            task_results.append(thread_locals.value)

        sleep_until(lambda: len(task_results) == 2)
        assert set(task_results) == {"foo", "bar"}
def test_placement_await():
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("CUDA required for this test.")
    devices = [cpu(0), gpu(0)]
    for rep in repetitions():
        task_results = []
        for i in range(2):
            @spawn(placement=devices[i])
            async def task():
                task_results.append(get_current_device())
                await tasks()  # Await nothing to force a new task.
                task_results.append(get_current_device())
            sleep_until(lambda: len(task_results) == (i + 1) * 2)
        assert task_results == [cpu(0), cpu(0), gpu(0), gpu(0)]
def main():
    comm = MPI.COMM_WORLD
    print(comm.Get_rank(), comm.Get_size())

    a = np.random.rand(10000000).astype(dtype='d')
    b = np.random.rand(10000000).astype(dtype='d')

    divisions = 100

    comm.Barrier()
    start = time.perf_counter()

    # Map the divisions onto actual hardware locations
    mapper = LDeviceSequenceBlocked(divisions)
    # print(mapper.devices)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1, dtype='d')

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):
                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i+1], a_part[i] @ b_part[i])
        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    overall_result = np.array(0.0, dtype='d') if comm.Get_rank() == 0 else None
    comm.Reduce([inner_result, MPI.DOUBLE], [overall_result, MPI.DOUBLE], op=MPI.SUM, root=0)
    if overall_result is not None:
        result = float(overall_result)
        print(result)

    end = time.perf_counter()
    print(end - start)

    assert np.allclose(np.inner(a, b), inner_result[0])

    other_results = np.empty(comm.Get_size(), dtype='d') if comm.Get_rank() == 0 else None
    comm.Gather([inner_result, MPI.DOUBLE], [other_results, MPI.DOUBLE], root=0)
    if overall_result is not None:
        assert np.isclose(result, np.sum(other_results))
def test_placement_options_memory(runtime_sched):
    # Test multiple options in the placement list; each task is assigned a single device.
    for rep in repetitions():
        task_results = []
        for i in range(4):
            @spawn(placement=[cpu(0), cpu(1)], memory=cpu(0).available_memory)
            def task():
                sleep(0.1)
                task_results.append(get_current_devices()[0])
        sleep_until(lambda: len(task_results) == 4)
        assert set(task_results) == {cpu(0), cpu(1)}
        assert task_results.count(cpu(0)) == 2
        assert task_results.count(cpu(1)) == 2
def test_dummy_environment_component():
    environments = [
        TaskEnvironment(placement=[cpu(0)], components=[DummyComponent("test")])
    ]
    with Parla(environments):
        task_results = []

        @spawn()
        def task():
            assert get_current_devices() == [cpu(0)]
            task_results.append(thread_locals.value)

        sleep_until(lambda: len(task_results) == 1)
        assert task_results == ["test"]
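# --- Illustrative sketch (not part of the test suite) ---
# A minimal sketch of the behavior the environment tests above assume from
# DummyComponent: while a task runs in an environment containing
# DummyComponent("foo"), the task observes thread_locals.value == "foo". The real
# helper is defined alongside the tests and plugs into Parla's environment-component
# machinery; the class below is an assumption shown only to make the tests easier
# to read.
import threading

thread_locals = threading.local()

class DummyComponent:
    def __init__(self, value):
        # The tag that tasks read back via thread_locals.value.
        self.value = value

    def __enter__(self):
        # Activating the component exposes its tag to the running task.
        thread_locals.value = self.value
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Deactivating the component clears the tag again.
        thread_locals.value = None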
def main():
    @spawn(placement=cpu(0))
    async def test_fox():
        comm = MPI.COMM_WORLD
        print(comm.Get_rank(), comm.Get_size())

        # Create test data at each rank
        comm.Barrier()
        size_factor = 1024*8
        A = np.random.rand(size_factor // comm.Get_size(), size_factor).astype(dtype='d')
        x = np.random.rand(size_factor // comm.Get_size()).astype(dtype='d')
        comm.Barrier()
        print("----", A.shape)

        # Perform multiplication
        y = await matvec_mpi(comm, A, x)
        print("++++", A.shape)
def test_placement_data(runtime_sched):
    try:
        from parla.cuda import gpu
    except (ImportError, AttributeError):
        skip("Test needs CUDA.")
        return
    devices = [cpu(0), gpu(0)]
    for rep in repetitions():
        task_results = []
        for (i, dev) in enumerate(devices):
            d = dev.memory()(np.array([1, 2, 3]))

            @spawn(placement=d)
            def task():
                task_results.append(get_current_devices()[0])
            sleep_until(lambda: len(task_results) == i+1)
        assert task_results == devices
def main():
    @spawn(placement=cpu(0))
    async def test_fox():
        size_factor = 1024
        A = np.random.rand(size_factor, size_factor)
        x = np.random.rand(size_factor)

        ## Perform single multiplication

        # Compute "golden" result
        res = A @ x
        print("----", A.shape)

        # Compute with Parla
        out = np.empty_like(x)
        out1 = await matvec_fox(out, A, x)
        assert out is out1

        # Compare Parla result to golden result
        print("++++", A.shape)
        print(np.linalg.norm(res - out, ord=np.inf))
        assert np.allclose(res, out), "Parallel fox failed"

        ## Perform double multiplication

        # Compute "golden" result
        res = A @ (A @ x)
        print("----", A.shape)

        # Compute with Parla
        out = np.empty_like(x)
        # Partition the data
        yp, Ap, xp = partition_fox(out, A, x)
        # Multiply twice without copying back to system memory.
        await matvec_fox_partitioned(yp, Ap, xp)
        await matvec_fox_partitioned(xp, Ap, yp)
        # Collect the final result to system memory.
        out1 = await collect_fox(out, xp)
        assert out is out1

        # Compare Parla result to golden result
        print("++++", A.shape)
        print(np.linalg.norm(res - out, ord=np.inf))
        assert np.allclose(res, out), "Parallel fox failed"

        print("Done")
def test_placement_options_vcus(runtime_sched):
    # Test multiple options in the placement list; each task is assigned a single device.
    for rep in repetitions():
        N = 4
        task_results = []
        for i in range(N):
            @spawn(placement=[cpu(0), cpu(1)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(get_current_devices()[0])
        sleep_until(lambda: len(task_results) == N)
        assert set(task_results) == {cpu(0), cpu(1)}
        assert task_results.count(cpu(0)) == N // 2
        assert task_results.count(cpu(1)) == N // 2
async def collect_fox(y, yp):
    """
    Collect the partitions in `yp` into `y`.

    :param yp: A 2d list of partitions.
    :param y: The output array.

    :return: `y`
    """
    C = TaskSpace()

    # Collect from the diagonal in parallel
    for i in range(0, partitions_y):  # rows
        @spawn(C[i], placement=cpu(0))
        def c():
            copy(y[mapper.slice_x(i, y.shape[0])], yp[i][i])

    # Wait for the collect tasks to complete.
    await C

    return y
def main():
    n = 3 * 100000000
    a = np.random.rand(n)
    b = np.random.rand(n)

    divisions = 100

    start = time.perf_counter()

    # Map the divisions onto actual hardware locations
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    a_part = mapper.partition_tensor(a)
    b_part = mapper.partition_tensor(b)

    inner_result = np.empty(1)

    @spawn(placement=cpu(0))
    async def inner_part():
        partial_sums = np.empty(divisions)
        async with finish():
            for i in range(divisions):
                @spawn(placement=mapper.device(i))
                def inner_local():
                    copy(partial_sums[i:i + 1], a_part[i] @ b_part[i])
        res = 0.
        for i in range(divisions):
            res += partial_sums[i]
        inner_result[0] = res

    end = time.perf_counter()
    print(end - start)
    assert np.allclose(np.inner(a, b), inner_result[0])
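# --- Illustrative sketch (not part of the example) ---
# For reference, a serial NumPy version of the blocked inner product that the
# inner_local tasks above compute in parallel: split both vectors into contiguous
# blocks, take the blockwise inner products, and sum the partial results. The name
# block_inner is hypothetical; only np.array_split, np.inner, and np.allclose are
# real NumPy calls.
import numpy as np

def block_inner(a, b, divisions=100):
    # Each (x, y) pair corresponds to one inner_local task's block.
    a_blocks = np.array_split(a, divisions)
    b_blocks = np.array_split(b, divisions)
    return sum(float(x @ y) for x, y in zip(a_blocks, b_blocks))

# Usage sketch:
#   a = np.random.rand(1000); b = np.random.rand(1000)
#   assert np.allclose(block_inner(a, b), np.inner(a, b))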
def main():
    devs = list(gpu.devices) + list(cpu.devices)
    if "N_DEVICES" in os.environ:
        devs = devs[:int(os.environ.get("N_DEVICES"))]
    divisions = len(devs)*2

    # Set up an "n" x "n" grid of values and run
    # "steps" number of iterations of the 4 point stencil on it.
    n = 25000
    steps = 200

    # Set up two arrays containing the input data.
    # This demo uses the standard technique of computing
    # from one array into another then swapping the
    # input and output arrays for the next iteration.
    # These are the two arrays that will be swapped back
    # and forth as input and output.
    a0 = np.random.rand(n, n)
    a1 = a0.copy()

    # An object that distributes arrays across all the given devices.
    mapper = LDeviceSequenceBlocked(divisions, devices=devs)

    # Partition a0 and a1.
    # Here we just partition the rows across the different devices.
    # Other partitioning schemes are possible.
    a0_row_groups = mapper.partition_tensor(a0, overlap=1)
    a1_row_groups = mapper.partition_tensor(a1, overlap=1)

    # Trigger JIT
    @spawn(placement=cpu(0))
    async def warmups():
        warmup = TaskSpace()
        for i in range(divisions):
            @spawn(warmup[i], placement=mapper.device(i))
            async def w():
                jacobi(a1_row_groups[i], a0_row_groups[i])
                cupy.cuda.get_current_stream().synchronize()
                cupy.cuda.Stream.null.synchronize()
        await warmup

    time.sleep(5)

    start = time.perf_counter()

    # Main parla task.
    @spawn(placement=cpu(0))
    async def run_jacobi():
        assert steps > 0
        # Specify which set of blocks is used as input or output
        # (they will be swapped for each iteration).
        in_blocks = a0_row_groups
        out_blocks = a1_row_groups
        # Create a set of labels for the tasks that perform the first
        # Jacobi iteration step.
        previous_block_tasks = CompletedTaskSpace()
        # Now create the tasks for subsequent iteration steps.
        for i in range(steps):
            # Swap input and output blocks for the next step.
            in_blocks, out_blocks = out_blocks, in_blocks
            # Create a new set of labels for the tasks that do this iteration step.
            current_block_tasks = TaskSpace("block_tasks[{}]".format(i))
            # Create the tasks to do the i'th iteration.
            # As before, each task needs the following info:
            #  a block index "j"
            #  a "device" where it should execute (supplied by mapper used for partitioning)
            #  the "in_block" of data used as input
            #  the "out_block" to write the output to
            for j in range(divisions):
                device = mapper.device(j)
                in_block = in_blocks[j]
                out_block = out_blocks[j]
                # Make each task operating on each block depend on the tasks for
                # that block and its immediate neighbors from the previous iteration.
                @spawn(current_block_tasks[j],
                       dependencies=[previous_block_tasks[max(0, j-1):min(divisions, j+2)]],
                       placement=device)
                def device_local_jacobi_task():
                    # Read boundary values from adjacent blocks in the partition.
                    # This may communicate across device boundaries.
                    if j > 0:
                        copy(in_block[0], in_blocks[j - 1][-2])
                    if j < divisions - 1:
                        copy(in_block[-1], in_blocks[j + 1][1])
                    # Run the computation, dispatching to device specific code.
                    jacobi(in_block, out_block)
            # For the next iteration, use the newly created tasks as
            # the tasks from the previous step.
            previous_block_tasks = current_block_tasks
        await previous_block_tasks
        cupy.cuda.get_current_stream().synchronize()
        cupy.cuda.Stream.null.synchronize()

        end = time.perf_counter()
        print(end - start)

        # This depends on all the tasks from the last iteration step.
        for j in range(divisions):
            start_index = 1 if j > 0 else 0
            end_index = -1 if j < divisions - 1 else None  # None indicates the last element of the dimension
            copy(a1[mapper.slice(j, len(a1))], out_blocks[j][start_index:end_index])
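# --- Illustrative sketch (not part of the example) ---
# A single-device NumPy sketch of the 4-point Jacobi update that
# jacobi(in_block, out_block) dispatches to above. The real example selects CPU- or
# GPU-specific kernels; the function name jacobi_step and this pure-NumPy body are
# assumptions for illustration only.
import numpy as np

def jacobi_step(a_in, a_out):
    # Each interior point becomes the average of its four neighbors.
    a_out[1:-1, 1:-1] = 0.25 * (a_in[:-2, 1:-1] + a_in[2:, 1:-1]
                                + a_in[1:-1, :-2] + a_in[1:-1, 2:])
    return a_out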
WARMUP = args.warmup
NTHREADS = args.threads
NGPUS = args.ngpus
PLACEMENT_STRING = args.placement
CHECK_RESULT = args.check_result
CSV = args.csv

# Set up the PLACEMENT variable
if PLACEMENT_STRING == 'cpu':
    PLACEMENT = cpu
    ACUS = None
elif PLACEMENT_STRING == 'gpu':
    PLACEMENT = [gpu(i) for i in range(NGPUS)]
    ACUS = None
elif PLACEMENT_STRING == 'both':
    PLACEMENT = [cpu(0)] + [gpu(i) for i in range(NGPUS)]
    ACUS = 1
elif PLACEMENT_STRING == 'puregpu':
    PLACEMENT = [gpu(i) for i in range(NGPUS)]
    ACUS = None
    BLOCK_SIZE = int(NROWS / NGPUS)
else:
    print("Invalid value for placement. Must be 'cpu', 'gpu', 'both', or 'puregpu'.")

perf_stats = perfStats(ITERS, NROWS, BLOCK_SIZE)

print('%**********************************************************************************************%\n')
def test_multiple_environments_less_good_fit():
    # Dummy environments with dummy components for testing.
    environments = [
        TaskEnvironment(placement=[cpu(0), cpu(1)], components=[DummyComponent("foo")]),
        TaskEnvironment(placement=[cpu(2), cpu(3), cpu(4)], components=[DummyComponent("bar")])
    ]
    with Parla(environments):
        for _ in repetitions():
            task_results = []

            # The first two tasks fit in the first environment, each using 0.5 of it.
            # The next two spill into the less good (0.33) fit of the second environment.
            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            @spawn(placement=[cpu(1), cpu(2)], vcus=1)
            def task():
                sleep(0.1)
                task_results.append(thread_locals.value)

            sleep_until(lambda: len(task_results) == 4)
            task_results.sort()
            assert task_results == ["bar", "bar", "foo", "foo"]