def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    """Accumulate the block products of g_a and g_b into g_c.

    Work is handed out dynamically: each process draws task indices from the
    shared counter ``g_counter`` with ``ga.read_inc``; no static partition of
    the task list is made.  The non-blocking gets for the next task are
    issued before the blocks of the current task are multiplied, so the
    transfer for one task can proceed while the previous one is computed.

    Args:
        g_a: Handle of the global array holding the left operand.
        g_b: Handle of the global array holding the right operand.
        g_c: Handle of the global array the block products are accumulated
            into with ``ga.acc``.
        chunk_size: Passed through to ``get_task_list``.
        multiplier: Passed through to ``get_task_list``; ``multiplier**3`` is
            used as the number of tasks.
        g_counter: Handle of the global array used as the task counter;
            element 0 is read and incremented.

    The call is collective: every process finishes with ``ga.sync()``.
    """
    # NOTE(review): assumes get_task_list returns multiplier**3 entries, each
    # exposing alo/ahi, blo/bhi and clo/chi -- confirm against its definition.
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3

    # Draw the first task.  If every task has already been claimed by other
    # processes there is nothing to do here, but the collective sync below
    # must still be joined so the remaining processes are not left waiting.
    task_id = ga.read_inc(g_counter, 0)
    if task_id >= ntasks:
        ga.sync()
        return

    # Start fetching the blocks of the first task.
    task_prev = task_list[task_id]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)

    task_id = ga.read_inc(g_counter, 0)
    while task_id < ntasks:
        # Issue the gets for the next task before computing the current one.
        task_next = task_list[task_id]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)

        # The current blocks must have arrived before they are multiplied.
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)

        # The prefetched task becomes the current one.
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
        task_id = ga.read_inc(g_counter, 0)

    # Finish the last task, whose blocks are still in flight.
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    """Accumulate the block products of g_a and g_b into g_c.

    The ``multiplier**3`` tasks are partitioned statically: the process with
    rank ``me`` handles a contiguous range of ``multiplier**3 // nproc``
    tasks, and the last rank additionally takes the remainder.  ``me`` and
    ``nproc`` are read from module scope.  The non-blocking gets for the next
    task are issued before the blocks of the current task are multiplied.

    Args:
        g_a: Handle of the global array holding the left operand.
        g_b: Handle of the global array holding the right operand.
        g_c: Handle of the global array the block products are accumulated
            into with ``ga.acc``.
        chunk_size: Passed through to ``get_task_list``.
        multiplier: Passed through to ``get_task_list``; ``multiplier**3`` is
            used as the number of tasks.

    The call is collective: every process finishes with ``ga.sync()``.
    """
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    total = multiplier**3
    ntasks = total // nproc
    start = me * ntasks
    stop = (me + 1) * ntasks
    if me + 1 == nproc:
        # The last rank also takes the tasks left over by the division.
        stop += total % nproc

    # With more processes than tasks a rank's range is empty.  Such a rank
    # must not touch task_list[start] (that task belongs to another rank and
    # would be accumulated twice); it only joins the collective sync.
    if start >= stop:
        ga.sync()
        return

    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start + 1, stop):
        # Issue the gets for the next task before computing the current one.
        task_next = task_list[i]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)

        # The current blocks must have arrived before they are multiplied.
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)

        # The prefetched task becomes the current one.
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next

    # Finish the last task, whose blocks are still in flight.
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    """Accumulate the block products of g_a and g_b into g_c.

    Task indices are drawn from the shared counter ``g_counter`` with
    ``ga.read_inc``.  The non-blocking gets for the next task are issued
    before the blocks of the current task are multiplied.

    Args:
        g_a: Handle of the global array holding the left operand.
        g_b: Handle of the global array holding the right operand.
        g_c: Handle of the global array the block products are accumulated
            into with ``ga.acc``.
        chunk_size: Passed through to ``get_task_list``.
        multiplier: Passed through to ``get_task_list``; ``multiplier**3`` is
            used as the number of tasks.
        g_counter: Handle of the global array used as the task counter;
            element 0 is read and incremented.

    The call is collective: every process finishes with ``ga.sync()``.
    """
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3

    # get first integer from g_counter and assign to 'task_id'
    task_id = ga.read_inc(g_counter, 0)
    if task_id >= ntasks:
        # No task is left for this process; it still has to take part in the
        # collective sync so the other processes are not left waiting.
        ga.sync()
        return

    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)

    # get next integer from g_counter and assign to 'task_id'
    task_id = ga.read_inc(g_counter, 0)
    while task_id < ntasks:
        # Issue the gets for the next task before computing the current one.
        task_next = task_list[task_id]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)

        # The current blocks must have arrived before they are multiplied.
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)

        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next

        # get next integer from g_counter and assign to 'task_id'
        task_id = ga.read_inc(g_counter, 0)

    # Finish the last task, whose blocks are still in flight.
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    """Accumulate the block products of g_a and g_b into g_c.

    Tasks are claimed dynamically from the shared counter ``g_counter``
    (``ga.read_inc`` on element 0) rather than being partitioned up front.
    While the blocks of one task are multiplied, the non-blocking gets for
    the next claimed task have already been issued.

    Args:
        g_a: Handle of the global array holding the left operand.
        g_b: Handle of the global array holding the right operand.
        g_c: Handle of the global array the block products are accumulated
            into with ``ga.acc``.
        chunk_size: Passed through to ``get_task_list``.
        multiplier: Passed through to ``get_task_list``; ``multiplier**3`` is
            used as the number of tasks.
        g_counter: Handle of the global array used as the task counter.

    The call is collective: every process finishes with ``ga.sync()``, also
    when it did not obtain any task.
    """
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3

    task_id = ga.read_inc(g_counter, 0)
    # A process whose first draw is already past the end has no work; it
    # skips straight to the sync instead of indexing outside the task list.
    if task_id < ntasks:
        task = task_list[task_id]
        a_blk, a_req = ga.nbget(g_a, task.alo, task.ahi)
        b_blk, b_req = ga.nbget(g_b, task.blo, task.bhi)

        while True:
            # Claim the next task and, if there is one, start its transfers
            # before working on the current blocks.
            task_id = ga.read_inc(g_counter, 0)
            have_next = task_id < ntasks
            if have_next:
                nxt = task_list[task_id]
                a_nxt, a_nxt_req = ga.nbget(g_a, nxt.alo, nxt.ahi)
                b_nxt, b_nxt_req = ga.nbget(g_b, nxt.blo, nxt.bhi)

            # The current blocks must have arrived before they are used.
            ga.nbwait(a_req)
            ga.nbwait(b_req)
            ga.acc(g_c, np.dot(a_blk, b_blk), task.clo, task.chi)

            if not have_next:
                break

            # The prefetched task becomes the current one.
            task = nxt
            a_blk, a_req = a_nxt, a_nxt_req
            b_blk, b_req = b_nxt, b_nxt_req

    ga.sync()
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    """Accumulate the block products of g_a and g_b into g_c.

    The ``multiplier**3`` tasks are split statically by rank: rank ``me``
    owns a contiguous range of ``multiplier**3 // nproc`` task indices and
    the last rank also takes the remainder (``me`` and ``nproc`` come from
    module scope).  The gets for each task are issued before the previous
    task's blocks are multiplied.

    Args:
        g_a: Handle of the global array holding the left operand.
        g_b: Handle of the global array holding the right operand.
        g_c: Handle of the global array the block products are accumulated
            into with ``ga.acc``.
        chunk_size: Passed through to ``get_task_list``.
        multiplier: Passed through to ``get_task_list``; ``multiplier**3`` is
            used as the number of tasks.

    The call is collective: every process finishes with ``ga.sync()``.  A
    rank whose task range is empty performs no gets and no accumulation.
    """

    def finish(task, a_blk, a_req, b_blk, b_req):
        # Wait for both transfers, then add the block product into g_c.
        ga.nbwait(a_req)
        ga.nbwait(b_req)
        ga.acc(g_c, np.dot(a_blk, b_blk), task.clo, task.chi)

    task_list = get_task_list(chunk_size, multiplier)
    total = multiplier**3
    per_proc = total // nproc
    start = me * per_proc
    stop = start + per_proc
    if me + 1 == nproc:
        # The last rank also takes the tasks left over by the division.
        stop += total % nproc

    # Task whose gets have been issued but whose product is not yet
    # accumulated.  It stays None when the range is empty, so a rank without
    # work never touches a task that belongs to another rank.
    in_flight = None
    for index in range(start, stop):
        task = task_list[index]
        a_blk, a_req = ga.nbget(g_a, task.alo, task.ahi)
        b_blk, b_req = ga.nbget(g_b, task.blo, task.bhi)
        if in_flight is not None:
            # Compute the previous task while this one's blocks are fetched.
            finish(*in_flight)
        in_flight = (task, a_blk, a_req, b_blk, b_req)

    if in_flight is not None:
        finish(*in_flight)
    ga.sync()
def matrix_multiply():
    """Multiply two random matrices stored in global arrays and verify it.

    Creates global arrays A, B and C of shape ``[TOTALELEMS] * NDIM``, fills
    A and B with random data from rank 0, lets every process compute the
    block of C it owns with ``np.dot``, checks the result with ``verify`` and
    destroys the arrays.  ``me``, ``nprocs``, ``TOTALELEMS`` and ``NDIM`` are
    read from module scope.  The indexing below uses two subscripts, so NDIM
    is presumably 2 -- confirm against the module constants.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS] * NDIM
    # Floor division keeps the chunk size an integer under any Python version.
    chunk = [TOTALELEMS // nprocs - 1] * NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c:
        ga.error("duplicate failed")
    if not me:
        print("Created Arrays B and C")

    # Initialize data in matrices a and b and copy it to g_a and g_b.
    # Only rank 0 writes the data, so only rank 0 needs to generate it.
    if not me:
        print("Initializing matrix A and B")
        a = np.random.rand(*dims) * 29
        b = np.random.rand(*dims) * 37
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo, hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c: rows lo[0]:hi[0] of A and columns lo[1]:hi[1] of B.
    a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))

    # Multiply a and b to get the locally owned block of c.
    c = np.dot(a, b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
def matrix_multiply():
    """Run a distributed matrix multiply C = A * B on global arrays.

    Steps: create the global arrays, let rank 0 fill A and B with random
    values, have each process multiply the row panel of A and the column
    panel of B that produce its own block of C, store that block, call
    ``verify`` and finally destroy the arrays.  Uses the module-level names
    ``me``, ``nprocs``, ``TOTALELEMS`` and ``NDIM``.
    """
    # Array shape and minimum block size.  The "- 1" forces an unequal data
    # distribution; "//" keeps the value an integer.
    dims = [TOTALELEMS] * NDIM
    chunk = [TOTALELEMS // nprocs - 1] * NDIM
    is_root = not me

    # Create g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if is_root:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not (g_b and g_c):
        ga.error("duplicate failed")
    if is_root:
        print("Created Arrays B and C")

    # Rank 0 alone generates the input data and copies it into g_a and g_b;
    # the other ranks never use it, so they do not allocate it.
    if is_root:
        print("Initializing matrix A and B")
        ga.put(g_a, np.random.rand(*dims) * 29)
        ga.put(g_b, np.random.rand(*dims) * 37)

    # Make sure the data is in place before anyone reads it.
    ga.sync()

    # The same block is locally owned for all three arrays.
    lo, hi = ga.distribution(g_c)

    # Rows lo[0]:hi[0] of A times columns lo[1]:hi[1] of B give the locally
    # owned block of C.
    a_rows = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b_cols = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))
    ga.put(g_c, np.dot(a_rows, b_cols), lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    for handle in (g_a, g_b, g_c):
        ga.destroy(handle)
def TRANSPOSE1D():
    """Reverse a distributed 1-D integer global array and verify the result.

    Array A is filled with 0..n-1 by rank 0.  Each process reverses the part
    of A it owns and writes it to the mirrored position in array B, after
    which rank 0 calls ``verify``.  ``me``, ``nprocs`` and ``TOTALELEMS`` are
    read from module scope.
    """
    # Total length; the extra nprocs/2 elements force an unequal distribution.
    length = nprocs * TOTALELEMS + nprocs / 2
    dims = [length]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Create g_a and duplicate it to get g_b.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if not me:
        print("Created Array B")

    # Rank 0 initializes the data in g_a.
    if not me:
        print("Initializing matrix A")
        initial = np.arange(dims[0], dtype=np.int32)
        ga.put(g_a, initial)

    # Everyone must see the data before proceeding.
    ga.sync()

    # Find out which part of g_a this process owns and fetch it.
    lo, hi = ga.distribution(g_a)
    local = ga.get(g_a, lo, hi)

    # Reverse the local data, then store it at the mirrored position in g_b.
    flipped = local[::-1]
    mirror_lo = dims[0] - hi[0]
    mirror_hi = dims[0] - lo[0]
    ga.put(g_b, flipped, mirror_lo, mirror_hi)

    # Make sure every process has written its part.
    ga.sync()

    # Check the result on rank 0.
    if not me:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
def TRANSPOSE1D():
    """Build global array B as the element-wise reversal of global array A.

    Rank 0 fills A with consecutive integers; every process then reverses
    its locally owned slice and puts it at the mirrored index range of B.
    Rank 0 finally calls ``verify`` and all processes destroy both arrays.
    Module-level ``me``, ``nprocs`` and ``TOTALELEMS`` are used.
    """
    is_root = not me

    # One dimension; the nprocs/2 surplus makes the distribution unequal.
    dims = [nprocs * TOTALELEMS + nprocs / 2]
    n = dims[0]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Source array.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if is_root:
        print("Created Array A")

    # Destination array with the same layout.
    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if is_root:
        print("Created Array B")

    # Initialize data in g_a from rank 0 only.
    if is_root:
        print("Initializing matrix A")
        ga.put(g_a, np.arange(n, dtype=np.int32))

    # Guarantee that everyone has data before the next step.
    ga.sync()

    # Fetch the locally owned slice, reverse it and copy it into its
    # mirrored position in g_b.
    lo, hi = ga.distribution(g_a)
    owned = ga.get(g_a, lo, hi)
    ga.put(g_b, owned[::-1], n - hi[0], n - lo[0])

    # Make sure the reversal is complete everywhere.
    ga.sync()

    # Check the result.
    if is_root:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
def verify(g_a, g_b, g_c):
    """Check g_c against the product of g_a and g_b computed by ga.gemm.

    A scratch array receives the reference product, is replaced by the
    difference ``g_c - reference``, and its dot product with itself is
    compared against ``TOLERANCE`` on rank 0, which prints the outcome or
    calls ``ga.error``.

    Args:
        g_a: Handle of the left operand.
        g_b: Handle of the right operand.
        g_c: Handle of the result to be checked.
    """
    # Scratch array with the same layout as g_a.
    g_ref = ga.duplicate(g_a, "array check")
    if not g_ref:
        ga.error("duplicate failed")
    ga.sync()

    # Reference product: g_ref = 1.0 * g_a * g_b + 0.0 * g_ref.
    size = TOTALELEMS
    ga.gemm(False, False, size, size, size, 1.0, g_a, g_b, 0.0, g_ref)
    ga.sync()

    # g_ref = g_c - g_ref, then the sum of its squared elements.
    ga.add(g_c, g_ref, g_ref, 1.0, -1.0)
    residual = ga.dot(g_ref, g_ref)

    if not me:
        print("Normed difference in matrices: %12.4f" % residual)
        if -TOLERANCE < residual < TOLERANCE:
            print("Matrix Multiply OK")
        else:
            ga.error("Matrix multiply verify failed")

    ga.destroy(g_ref)
def verify(g_a, g_b, g_c):
    """Compare g_c with a ga.gemm reference product of g_a and g_b.

    The reference is computed into a temporary global array, subtracted from
    g_c in place, and the dot product of the difference with itself is
    reported by rank 0.  If that value is not strictly inside
    (-TOLERANCE, TOLERANCE), rank 0 calls ``ga.error``.

    Args:
        g_a: Handle of the left operand.
        g_b: Handle of the right operand.
        g_c: Handle of the matrix product under test.
    """
    g_tmp = ga.duplicate(g_a, "array check")
    if not g_tmp:
        ga.error("duplicate failed")
    ga.sync()

    # Non-transposed operands, square TOTALELEMS problem, alpha 1, beta 0.
    ga.gemm(
        False, False,
        TOTALELEMS, TOTALELEMS, TOTALELEMS,
        1.0, g_a, g_b,
        0.0, g_tmp,
    )
    ga.sync()

    # Overwrite the reference with (g_c - reference) and reduce it.
    ga.add(g_c, g_tmp, g_tmp, 1.0, -1.0)
    diff = ga.dot(g_tmp, g_tmp)

    if not me:
        print("Normed difference in matrices: %12.4f" % diff)
        within_tolerance = -TOLERANCE < diff < TOLERANCE
        if not within_tolerance:
            ga.error("Matrix multiply verify failed")
        else:
            print("Matrix Multiply OK")

    ga.destroy(g_tmp)
val = ga.gop_add(val) return val == 0 if __name__ == '__main__': if nproc > MULTIPLIER**3: if 0 == me: print "You must use less than %s processors" % (MULTIPLIER**3+1) else: g_a = ga.create(ga.C_DBL, [N,N]) g_b = ga.create(ga.C_DBL, [N,N]) g_c = ga.create(ga.C_DBL, [N,N]) # put some fake data into input arrays A and B if me == 0: ga.put(g_a, np.random.random(N*N)) ga.put(g_b, np.random.random(N*N)) ga.sync() if me == 0: print "srumma...", srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER) if me == 0: print "done" if me == 0: print "verifying using ga.gemm...", ok = verify_using_ga(g_a, g_b, g_c) if me == 0: if ok: print "OKAY" else: print "FAILED" if me == 0: print "verifying using np.dot...",
x = h * (i - 0.5); s += 4.0 / (1.0 + x**2); return s * h def prn_pi(pi, PI): message = "pi is approximately %.16f, error is %.16f" print (message % (pi, abs(pi - PI))) ### assign total number of processors to variable 'nprocs' ### assign processor ID to the variable 'myrank' ### create a global array 'g_pi' of type double and a single value while True: if myrank == 0: n = get_n() ### broadcast the value of 'n' else: ### receive the broadcast of the value of 'n' if n == 0: break ### zero the global array 'g_pi' mypi = comp_pi(n, myrank, nprocs) ### accumulate local value 'mypi' into global array 'g_pi' ga.sync() if myrank == 0: ### get value of 'pi' from global array 'g_pi' prn_pi(pi, PI) ### destroy the global array 'g_pi'