def check_gather(gatype): if 0 == me: print '> Checking gather (might be slow)...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ijv = np.zeros((m,2), dtype=np.int64) random.seed(ga.nodeid()*51 + 1) # different seed for each proc for j in range(10): itmp = None if MIRROR: itmp = random.randint(0,lprocs-1) else: itmp = random.randint(0,nproc-1) if itmp == me: for loop in range(m): ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1)) #if ijv[loop,0] > ijv[loop,1]: # ijv[loop,:] = ijv[loop,::-1] # reverse result = ga.gather(g_a, ijv) for loop in range(m): value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten() if not result[loop] == value: ga.error('gather failed') if 0 == me: print 'OK' ga.destroy(g_a)
def verify(g_a, g_b):
    """Check that g_b holds the contents of g_a in reversed order.

    Exercise skeleton: the lines marked ### describe code that still has to
    be written.  Until "a" and "b" are assigned there, the comparison below
    refers to undefined names.

    Prints a mismatch message and calls ga.error when the reversed "a"
    differs from "b"; prints "Transpose OK" afterwards.
    """
    ### copy the entire block of data from the global array "g_a" into the
    ### local array "a" and similarly for "g_b" and "b".
    if not np.all(a[::-1] == b):
        print "Mismatch: a[::-1] is not equal to b"
        ga.error("verify failed")
    print "Transpose OK"
def verify(g_a, g_b): a = ga.get(g_a) b = ga.get(g_b) if not np.all(a[::-1] == b): print "Mismatch: a[::-1] is not equal to b" ga.error("verify failed") print "Transpose OK"
def check_dot(gatype): if 0 == me: print '> Checking dot ...', np.random.seed(12345) # everyone has same seed g_a = create_global_array(gatype) g_b = create_global_array(gatype) a = create_local_a(gatype) b = np.random.random_sample((n,n)) if MIRROR: if 0 == iproc: ga.put(g_b, b) ga.put(g_a, a) else: if 0 == me: ga.put(g_b, b) ga.put(g_a, a) ga.sync() sum1 = np.sum(a*b) sum2 = ga.dot(g_a, g_b) if mismatch(sum1, sum2): ga.error('dot wrong %s != %s' % (sum1, sum2)) if 0 == me: print 'OK' ga.destroy(g_a) ga.destroy(g_b)
def main(): if 4 != nproc and 0 == me: ga.error('Program requires 4 GA processes; nproc=%s' % nproc) test2D() test1D() if 0 == me: print 'All tests successful'
def check_put_disjoint(gatype): """each node fills in disjoint sections of the array""" if 0 == me: print '> Checking disjoint put ...', g_a = create_global_array(gatype) a = create_local_a(gatype) inc = (n-1)/20 + 1 ij = 0 for i in range(0,n,inc): for j in range(0,n,inc): check = False if MIRROR: check = ij % lprocs == iproc else: check = ij % nproc == me if check: lo = [i,j] hi = [min(i+inc,n), min(j+inc,n)] piece = a[ga.zip(lo,hi)] ga.put(g_a, piece, lo, hi) # the following check is not part of the original test.F result = ga.get(g_a, lo, hi) if not np.all(result == piece): ga.error("put followed by get failed", 1) ga.sync() ij += 1 ga.sync() # all nodes check all of a b = ga.get(g_a) if not np.all(a == b): ga.error('put failed, exiting') if 0 == me: print 'OK' ga.destroy(g_a)
def check_get(gatype): """check nloop random gets from each node""" if 0 == me: print '> Checking random get (%d calls)...' % nloop g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() nwords = 0 random.seed(ga.nodeid()*51+1) # different seed for each proc for loop in range(nloop): ilo,ihi = random.randint(0, nloop-1),random.randint(0, nloop-1) if ihi < ilo: ilo,ihi = ihi,ilo jlo,jhi = random.randint(0, nloop-1),random.randint(0, nloop-1) if jhi < jlo: jlo,jhi = jhi,jlo nwords += (ihi-ilo+1)*(jhi-jlo+1) ihi += 1 jhi += 1 result = ga.get(g_a, (ilo,jlo), (ihi,jhi)) if not np.all(result == a[ilo:ihi,jlo:jhi]): ga.error('random get failed') if 0 == me and loop % max(1,nloop/20) == 0: print ' call %d node %d checking get((%d,%d),(%d,%d)) total %f' % ( loop, me, ilo, ihi, jlo, jhi, nwords) if 0 == me: print 'OK' ga.destroy(g_a)
def create_global_array(gatype):
    """Create the n x n test array named 'a' and return its handle.

    The module flags choose how the array is built:
      NEW_API  -- handle/set_*/allocate calls, optionally restricted
                  (USE_RESTRICTED), block-cyclic (BLOCK_CYCLIC, with a
                  processor grid when USE_SCALAPACK_DISTR) and/or mirrored
      MIRROR   -- without NEW_API, ga.create_config on the mirror group
      neither  -- plain ga.create

    gatype -- GA type constant for the array elements

    Calls ga.error when the resulting handle is 0.  Ends with a ga.sync().
    """
    if NEW_API:
        g_a = ga.create_handle()
        ga.set_data(g_a, [n,n], gatype)
        ga.set_array_name(g_a, 'a')
        if USE_RESTRICTED:
            num_restricted = nproc/2 or 1
            restricted_list = np.arange(num_restricted) + num_restricted/2
            ga.set_restricted(g_a, restricted_list)
        if BLOCK_CYCLIC:
            if USE_SCALAPACK_DISTR:
                # The original test was inverted: it raised the error for
                # even process counts, i.e. exactly the valid case.
                if nproc % 2 != 0:
                    ga.error('Available procs must be divisible by 2',nproc)
                ga.set_block_cyclic_proc_grid(g_a, block_size, proc_grid)
            else:
                ga.set_block_cyclic(g_a, block_size)
        if MIRROR:
            p_mirror = ga.pgroup_get_mirror()
            ga.set_pgroup(g_a, p_mirror)
        ga.allocate(g_a)
    elif MIRROR:
        p_mirror = ga.pgroup_get_mirror()
        # The handle was previously discarded, leaving g_a undefined for the
        # check below.
        g_a = ga.create_config(gatype, (n,n), 'a', None, p_mirror)
    else:
        g_a = ga.create(gatype, (n,n), 'a')
    if 0 == g_a:
        ga.error('ga.create failed')
    # Query this process's patch.  The bounds themselves are not used; the
    # call is kept so the distribution query is still exercised.
    if MIRROR:
        lproc = me - ga.cluster_procid(inode, 0)
        ga.distribution(g_a, lproc)
    else:
        ga.distribution(g_a, me)
    ga.sync()
    return g_a
def check_scatter(gatype): nptype = ga.dtype(gatype) if 0 == me: print '> Checking scatter (might be slow)...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ijv = np.zeros((m,2), dtype=np.int64) v = np.zeros(m, dtype=nptype) random.seed(ga.nodeid()*51 + 1) # different seed for each proc for j in range(10): check = None if MIRROR: check = random.randint(0,lprocs-1) == iproc else: check = random.randint(0,nproc-1) == me if check: for loop in range(m): ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1)) v[loop] = ijv[loop,0]+ijv[loop,1] ga.scatter(g_a, v, ijv) for loop in range(m): value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten() if not v[loop] == value: ga.error('scatter failed') if 0 == me: print 'OK' ga.destroy(g_a)
def check_gop(nptype): if 0 == me: print '> checking ga.gop (%s)' % nptype, input = np.arange(n, dtype=nptype) + me sum = np.arange(n, dtype=nptype)*nproc + (nproc-1)*nproc/2 output = ga.gop(input, '+') if not np.all(output == sum): ga.error('ga.gop (%s) error' % nptype) if 0 == me: print 'OK'
def check_zero(gatype): if 0 == me: print '> Checking zero ...', g_a = create_global_array(gatype) ga.zero(g_a) a = ga.get(g_a) if not np.all(a == 0): ga.error('ga.zero failed') if 0 == me: print 'OK' ga.destroy(g_a)
def matrix_multiply(): # Configure array dimensions. Force an unequal data distribution. dims = [TOTALELEMS]*NDIM chunk = [TOTALELEMS/nprocs-1]*NDIM # Create a global array g_a and duplicate it to get g_b and g_c. g_a = ga.create(ga.C_DBL, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") g_c = ga.duplicate(g_a, "array C") if not g_b or not g_c: ga.eror("duplicate failed") if not me: print "Created Arrays B and C" # Initialize data in matrices a and b. if not me: print "Initializing matrix A and B" a = np.random.rand(*dims)*29 b = np.random.rand(*dims)*37 # Copy data to global arrays g_a and g_b. if not me: ga.put(g_a, a) ga.put(g_b, b) # Synchronize all processors to make sure everyone has data. ga.sync() # Determine which block of data is locally owned. Note that # the same block is locally owned for all GAs. lo,hi = ga.distribution(g_c) # Get the blocks from g_a and g_b needed to compute this block in # g_c and copy them into the local buffers a and b. a = ga.get(g_a, (lo[0],0), (hi[0],dims[0])) b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1])) # Do local matrix multiplication and store the result in local # buffer c. Start by evaluating the transpose of b. btrns = b.transpose() # Multiply a and b to get c. c = np.dot(a,b) # Copy c back to g_c. ga.put(g_c, c, lo, hi) verify(g_a, g_b, g_c) # Deallocate arrays. ga.destroy(g_a) ga.destroy(g_b) ga.destroy(g_c)
def TRANSPOSE1D():
    """Exercise skeleton: reverse a 1-D global array across all processes.

    The lines marked ### describe code that still has to be written.  Until
    they are filled in, g_a, g_b, a, lo and hi are undefined and the function
    cannot run.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs * TOTALELEMS + nprocs / 2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    ### create GA of integers with dimension "dims" with minimum block size
    ### "chunk" and name of "Array A" and assign the handle to the variable
    ### "g_a"
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    ### create a second global array assigned to the handle "g_b" by
    ### duplicating "g_a" and assigning the name "Array B"
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ### copy contents of a numpy range array into the remote
        ### global array "g_a"
        ### HINT: use numpy's arange() e.g. np.arange(###, dtype=np.int32)

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ### synchronize all processors

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    ### find out which block of data my node owns for the global array "g_a"
    ### and store the contents of the arrays into "lo" and "hi"

    # Get locally held data and copy it into local buffer a
    ### use the arrays "lo" and "hi" to copy the locally held block of data
    ### from the global array "g_a" into the local array "a".

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    lo2 = [dims[0] - hi[0]]
    hi2 = [dims[0] - lo[0]]
    ### copy data from the local array "b" into the block of the global
    ### array "g_b" described by the integer arrays "lo2" and "hi2"
    ### (the mirrored bounds computed just above; "g_b" is the array that
    ### verify() compares against the reversed "g_a")

    # Synchronize all processors to make sure inversion is complete
    ### synchronize all processors

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)
def TRANSPOSE1D():
    """Exercise skeleton for the 1-D transpose (array reversal) example.

    Only the scaffolding is present: each ### comment marks a statement that
    has to be supplied.  The names g_a, g_b, a, lo and hi are first assigned
    by those missing statements.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs*TOTALELEMS + nprocs/2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    ### create GA of integers with dimension "dims" with minimum block size
    ### "chunk" and name of "Array A" and assign the handle to the variable
    ### "g_a"
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    ### create a second global array assigned to the handle "g_b" by
    ### duplicating "g_a" and assigning the name "Array B"
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ### copy contents of a numpy range array into the remote
        ### global array "g_a"
        ### HINT: use numpy's arange() e.g. np.arange(###, dtype=np.int32)

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ### synchronize all processors

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    ### find out which block of data my node owns for the global array "g_a"
    ### and store the contents of the arrays into "lo" and "hi"

    # Get locally held data and copy it into local buffer a
    ### use the arrays "lo" and "hi" to copy the locally held block of data
    ### from the global array "g_a" into the local array "a".

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    lo2 = [dims[0]-hi[0]]
    hi2 = [dims[0]-lo[0]]
    ### copy data from the local array "b" into the block of the global
    ### array "g_b" described by the integer arrays "lo2" and "hi2"
    ### (lo2/hi2 are the mirrored bounds computed above; verify() expects
    ### the reversed data in "g_b")

    # Synchronize all processors to make sure inversion is complete
    ### synchronize all processors

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)
def check_broadcast(): if 0 == me: print '> Checking ga.brdcst', buf = [0,0] if nproc-1 == me: buf = [me,nproc] buf = ga.brdcst(buf,nproc-1) if buf[0] != nproc-1: ga.error('ga.brdcst buf[0] failed') if buf[1] != nproc: ga.error('ga.brdcst buf[1] failed') if 0 == me: print 'OK'
def check_fence_and_lock(gatype): if 0 == me: print '> Checking ga.fence and ga.lock', g_a = create_global_array(gatype) ga.zero(g_a) if not ga.create_mutexes(1): ga.error('ga.create_mutexes failed') if n < 2: ga.error('insufficient n to test ga.fence', n) ga.lock(0) a = ga.get(g_a) # get original values a[:,0] += 1 # add my contribution # need to use fence to assure that coms complete before leaving # critical section ga.init_fence() ga.put(g_a, a) ga.fence() ga.unlock(0) if not ga.destroy_mutexes(): ga.error('mutex not destroyed') ga.sync() if 0 == me: a = ga.get(g_a) if not np.all(a[:,0] == nproc): ga.error('fence failed') if 0 == me: print 'OK'
def check_copy(gatype): if 0 == me: print '> Checking copy ...', g_a = create_global_array(gatype) g_b = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.copy(g_a, g_b) if not np.all(a == ga.get(g_b)): ga.error('copy failed') if 0 == me: print 'OK' ga.destroy(g_a) ga.destroy(g_b)
def check_scale(gatype): if 0 == me: print '> Checking scale ...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ga.scale(g_a, 0.123) a *= 0.123 if np.any(np.vectorize(mismatch)(a,ga.get(g_a))): ga.error('add failed') if 0 == me: print 'OK' ga.destroy(g_a)
def test2D():
    """Time get/put/acc on square sections of a 1024 x 1024 double array.

    Runs TestPutGetAcc twice with the same buffer and section sizes: once
    with the final flag True (local ops) and once with it False (remote
    ops).  The array is destroyed afterwards.
    """
    n = 1024
    buf = np.zeros((n,n), dtype=np.float64)
    # edge lengths of the square sections to time
    chunk = np.asarray([1,3,4,9,16,24,30,48,64,91,128,171,256,353,440,512])
    g_a = ga.create(ga.C_DBL, (n,n), 'a')
    if 0 == g_a:
        ga.error('ga.create failed')
    buf[:] = 0.01
    ga.zero(g_a)
    if 0 == me:
        print (' Performance of GA get, put & acc'
               ' for square sections of array[%d,%d]' % (n,n))
    lo,hi = ga.distribution(g_a, me)
    # local ops
    TestPutGetAcc(g_a, n, chunk, buf, lo, hi, True)
    # remote ops
    TestPutGetAcc(g_a, n, chunk, buf, lo, hi, False)
    # release the array; it was previously left allocated when the test
    # returned
    ga.destroy(g_a)
def TRANSPOSE1D(): # Configure array dimensions. Force an unequal data distribution. dims = [nprocs * TOTALELEMS + nprocs / 2] chunk = [TOTALELEMS] # minimum data on each process # create a global array g_a and duplicate it to get g_b g_a = ga.create(ga.C_INT, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") if not g_b: ga.error("duplicate failed") if not me: print "Created Array B" # initialize data in g_a if not me: print "Initializing matrix A" ga.put(g_a, np.arange(dims[0], dtype=np.int32)) # Synchronize all processors to guarantee that everyone has data # before proceeding to the next step. ga.sync() # Start initial phase of inversion by inverting the data held locally on # each processor. Start by finding out which data each processor owns. lo, hi = ga.distribution(g_a) # Get locally held data and copy it into local buffer a a = ga.get(g_a, lo, hi) # Invert data locally b = a[::-1] # Invert data globally by copying locally inverted blocks into # their inverted positions in the GA ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0]) # Synchronize all processors to make sure inversion is complete ga.sync() # Check to see if inversion is correct if not me: verify(g_a, g_b) # Deallocate arrays ga.destroy(g_a) ga.destroy(g_b)
def TRANSPOSE1D(): # Configure array dimensions. Force an unequal data distribution. dims = [nprocs*TOTALELEMS + nprocs/2] chunk = [TOTALELEMS] # minimum data on each process # create a global array g_a and duplicate it to get g_b g_a = ga.create(ga.C_INT, dims, "array A", chunk) if not g_a: ga.error("create failed: A") if not me: print "Created Array A" g_b = ga.duplicate(g_a, "array B") if not g_b: ga.error("duplicate failed") if not me: print "Created Array B" # initialize data in g_a if not me: print "Initializing matrix A" ga.put(g_a, np.arange(dims[0], dtype=np.int32)) # Synchronize all processors to guarantee that everyone has data # before proceeding to the next step. ga.sync() # Start initial phase of inversion by inverting the data held locally on # each processor. Start by finding out which data each processor owns. lo,hi = ga.distribution(g_a) # Get locally held data and copy it into local buffer a a = ga.get(g_a, lo, hi) # Invert data locally b = a[::-1] # Invert data globally by copying locally inverted blocks into # their inverted positions in the GA ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0]) # Synchronize all processors to make sure inversion is complete ga.sync() # Check to see if inversion is correct if not me: verify(g_a, g_b) # Deallocate arrays ga.destroy(g_a) ga.destroy(g_b)
def test2D():
    """Benchmark GA get, put and acc on square patches of a 2-D array.

    A 1024 x 1024 array of doubles is created and zeroed, and TestPutGetAcc
    is run first with its last argument True (local ops) and then False
    (remote ops).  The array is released before returning.
    """
    n = 1024
    buf = np.zeros((n, n), dtype=np.float64)
    # edge lengths of the square sections to time
    chunk = np.asarray(
        [1, 3, 4, 9, 16, 24, 30, 48, 64, 91, 128, 171, 256, 353, 440, 512])
    g_a = ga.create(ga.C_DBL, (n, n), 'a')
    if 0 == g_a:
        ga.error('ga.create failed')
    buf[:] = 0.01
    ga.zero(g_a)
    if 0 == me:
        print(
            ' Performance of GA get, put & acc'
            ' for square sections of array[%d,%d]' % (n, n))
    lo, hi = ga.distribution(g_a, me)
    # local ops
    TestPutGetAcc(g_a, n, chunk, buf, lo, hi, True)
    # remote ops
    TestPutGetAcc(g_a, n, chunk, buf, lo, hi, False)
    # The handle used to be dropped without ga.destroy, leaving the array
    # allocated for the rest of the run.
    ga.destroy(g_a)
def verify_ga_gemm(ta, tb, num_m, num_n, num_k, alpha, g_a, g_b, beta, g_c):
    """Recompute a ga.gemm result locally with dgemm and compare.

    ta, tb              -- transpose flags, forwarded to dgemm as
                           trans_a/trans_b
    num_m, num_n, num_k -- sizes used for the local buffers:
                           A is (num_m, num_k), B is (num_k, num_n),
                           C is (num_m, num_n)
    alpha, beta         -- scalars forwarded to dgemm
    g_a, g_b, g_c       -- handles of the operand and result global arrays

    Calls ga.error if any element of g_c differs from the local result by
    more than 1 in absolute value.
    """
    tmpa = np.ndarray((num_m, num_k), dtype=np.float64)
    tmpb = np.ndarray((num_k, num_n), dtype=np.float64)
    tmpc = np.ndarray((num_m, num_n), dtype=np.float64)
    tmpa = ga.get(g_a, buffer=tmpa)
    tmpb = ga.get(g_b, buffer=tmpb)
    tmpc = ga.get(g_c, buffer=tmpc)
    # The original dispatched on (ta, tb) through four branches that all made
    # this same call, followed by an unreachable else; the transpose flags
    # are simply forwarded.
    result = dgemm(alpha, tmpa, tmpb, beta=beta, trans_a=ta, trans_b=tb)
    abs_value = np.abs(tmpc-result)
    if np.any(abs_value > 1):
        ga.error('verify ga.gemm failed')
def verify(g_a, g_b, g_c): g_chk = ga.duplicate(g_a, "array check") if not g_chk: ga.error("duplicate failed") ga.sync() ga.gemm(False, False, TOTALELEMS, TOTALELEMS, TOTALELEMS, 1.0, g_a, g_b, 0.0, g_chk); ga.sync() ga.add(g_c, g_chk, g_chk, 1.0, -1.0) rchk = ga.dot(g_chk, g_chk) if not me: print "Normed difference in matrices: %12.4f" % rchk if not (-TOLERANCE < rchk < TOLERANCE): ga.error("Matrix multiply verify failed") else: print "Matrix Multiply OK" ga.destroy(g_chk)
def test1D(): n = 1024*1024 buf = np.zeros(n/4, dtype=np.float64) chunk = np.asarray([1,9,16,81,256,576,900,2304,4096,8281, 16384,29241,65536,124609,193600,262144]) g_a = ga.create(ga.C_DBL, (n,), 'a') if 0 == g_a: ga.error('ga.create failed') buf[:] = 0.01 ga.zero(g_a) if 0 == me: print '' print '' print '' print (' Performance of GA get, put & acc' ' for 1-dimensional sections of array[%d]' % n) lo,hi = ga.distribution(g_a, me) # local ops TestPutGetAcc1(g_a, n, chunk, buf, lo, hi, True) # remote ops TestPutGetAcc1(g_a, n, chunk, buf, lo, hi, False)
def check_accumulate_overlap(gatype): if 0 == me: print '> Checking overlapping accumulate ...', g_a = create_global_array(gatype) ga.zero(g_a) ga.acc(g_a, [1], (n/2,n/2), (n/2+1,n/2+1), 1) ga.sync() if MIRROR: if 0 == iproc: x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - lprocs) if not 0 == x: ga.error('overlapping accumulate failed -- expected %s got %s'%( x, lprocs)) else: if 0 == me: x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - nproc) if not 0 == x: ga.error('overlapping accumulate failed -- expected %s got %s'%( x, nproc)) if 0 == me: print 'OK' ga.destroy(g_a)
def check_add(gatype): if 0 == me: print '> Checking add ...', g_a = create_global_array(gatype) g_b = create_global_array(gatype) a = create_local_a(gatype) b = create_local_b(gatype) alpha = None beta = None if 0 == me: ga.put(g_a, a) ga.sync(); np.random.seed(12345) # everyone has same seed if gatype in [ga.C_SCPL,ga.C_DCPL]: b_real = np.random.random_sample((n,n)) b_imag = np.random.random_sample((n,n)) b[:] = np.vectorize(complex)(b_real,b_imag) alpha = complex(0.1,-0.1) beta = complex(0.9,-0.9) else: b[:] = np.random.random_sample((n,n)) alpha = 0.1 beta = 0.9 a = alpha*a + beta*b if MIRROR: if 0 == iproc: ga.put(g_b, b) else: if 0 == me: ga.put(g_b, b) ga.sync() ga.add(g_a, g_b, g_b, alpha, beta) b = ga.get(g_b, buffer=b) if np.any(np.vectorize(mismatch)(b,a)): ga.error('add failed') if 0 == me: print 'OK' ga.destroy(g_a) ga.destroy(g_b)
def test1D(): n = 1024 * 1024 buf = np.zeros(n / 4, dtype=np.float64) chunk = np.asarray([ 1, 9, 16, 81, 256, 576, 900, 2304, 4096, 8281, 16384, 29241, 65536, 124609, 193600, 262144 ]) g_a = ga.create(ga.C_DBL, (n, ), 'a') if 0 == g_a: ga.error('ga.create failed') buf[:] = 0.01 ga.zero(g_a) if 0 == me: print '' print '' print '' print( ' Performance of GA get, put & acc' ' for 1-dimensional sections of array[%d]' % n) lo, hi = ga.distribution(g_a, me) # local ops TestPutGetAcc1(g_a, n, chunk, buf, lo, hi, True) # remote ops TestPutGetAcc1(g_a, n, chunk, buf, lo, hi, False)
def check_accumulate_disjoint(gatype): """Each node accumulates into disjoint sections of the array.""" if 0 == me: print '> Checking disjoint accumulate ...', g_a = create_global_array(gatype) a = create_local_a(gatype) b = np.fromfunction(lambda i,j: i+j+2, (n,n), dtype=ga.dtype(gatype)) if 0 == me: ga.put(g_a, a) ga.sync() inc = (n-1)/20 + 1 ij = 0 for i in range(0,n,inc): for j in range(0,n,inc): x = 10.0 lo = [i,j] hi = [min(i+inc,n), min(j+inc,n)] piece = b[ga.zip(lo,hi)] check = False if MIRROR: check = ij % lprocs == iproc else: check = ij % nproc == me if check: ga.acc(g_a, piece, lo, hi, x) ga.sync() ij += 1 # each process applies all updates to its local copy a[ga.zip(lo,hi)] += x * piece ga.sync() # all nodes check all of a if not np.all(ga.get(g_a) == a): ga.error('acc failed') if 0 == me: print 'OK' ga.destroy(g_a)
import mpi4py.MPI # initialize Message Passing Interface from ga4py import ga # initialize Global Arrays import numpy as np me = ga.nodeid() nproc = ga.nnodes() def print_distribution(g_a): for i in range(ga.nnodes()): lo, hi = ga.distribution(g_a, i) print "P=%s lo=%s hi=%s" % (i, lo, hi) # create some irregular arrays block = [3, 2] map = [0, 2, 6, 0, 5] if nproc < np.prod(block): raise ValueError, "ERROR: fewer procs than requested blocks" g_a = ga.create_irreg(ga.C_DBL, [8, 10], block, map, "Array A") if not g_a: ga.error("Could not create global array A", g_a) g_b = ga.create(ga.C_INT, (2, 3, 4, 5, 6)) if not me: print_distribution(g_a) print_distribution(g_b)
def matrix_multiply(): # Configure array dimensions. Force an unequal data distribution. dims = [TOTALELEMS]*NDIM chunk = [TOTALELEMS/nprocs-1]*NDIM # Create a global array g_a and duplicate it to get g_b and g_c. ### create GA of doubles with dimensions "dims", with minimum block size ### "chunk", and with name "array A", and assign the handle to the integer ### variable "g_a". if not g_a: ga.error("create failed: A") if not me: print "Created Array A" ### Duplicate array "g_a" to create arrays "g_b" and "g_c" with array ### names "array B" and "array C", respectively. if not g_b or not g_c: ga.eror("duplicate failed") if not me: print "Created Arrays B and C" # Initialize data in matrices a and b. if not me: print "Initializing matrix A and B" a = np.random.rand(*dims)*29 b = np.random.rand(*dims)*37 # Copy data to global arrays g_a and g_b. if not me: ### copy the contents of array "a" into the global array "g_a" ### similarly for "b" # Synchronize all processors to make sure everyone has data. ### Synchronize all processors # Determine which block of data is locally owned. Note that # the same block is locally owned for all GAs. ### find out which block of data my node owns for the global array "g_c" ### and store the contents in the integer arrays "lo" and "hi" # Get the blocks from g_a and g_b needed to compute this block in # g_c and copy them into the local buffers a and b. lo2 = (lo[0],0) hi2 = (hi[0],dims[0])) ### copy the block of data described by the arrays "lo2" and "hi2" from ### the global array "g_a" in to the local array "a" lo3 = (0,lo[1]) hi3 = (dims[1],hi[1])) ### copy the block of data described by the arrays "lo3" and "hi3" from ### the global array "g_b" in to the local array "b" # Do local matrix multiplication and store the result in local # buffer c. Start by evaluating the transpose of b. btrns = b.transpose() # Multiply a and b to get c. c = np.dot(a,b) # Copy c back to g_c. 
### copy data from the local array "c" into the block of the global array ### "g_c" described by the integer arrays "lo" and "hi". verify(g_a, g_b, g_c) # Deallocate arrays. ### destroy the global arrays "g_a", "g_b", "g_c" if __name__ == '__main__': if not me: print "\nUsing %d processes\n" % nprocs matrix_multiply() if not me: print "\nTerminating..."
def main(): # TODO there's got to be a loopless, more pythonic way to do this ii = 0 for i in range(num1*num1): ii += 1 if ii > num1: ii = 0 h0[i] = ii # compute times assuming 500 mflops and 5 second target time # ntimes = max(3.0, 5.0/(4.0-9*num**3)) ntimes = 5 for ii in range(howmany): num_m = nums_m[ii] num_n = nums_n[ii] num_k = nums_k[ii] a = 0.5/(num_m*num_n) if num_m > nummax or num_n > nummax or num_k > nummax: ga.error('Insufficient memory: check nummax') if BLOCK_CYCLIC: block_size = [128,128] g_c = ga.create_handle() ga.set_data(g_c, (num_m,num_n), ga.C_DBL) ga.set_array_name(g_c, 'g_c') ga.set_block_cyclic(g_c, block_size) if not ga.allocate(g_c): ga.error('create failed') block_size = [128,128] g_b = ga.create_handle() ga.set_data(g_b, (num_k,num_n), ga.C_DBL) ga.set_array_name(g_b, 'g_b') ga.set_block_cyclic(g_b, block_size) if not ga.allocate(g_b): ga.error('create failed') block_size = [128,128] g_a = ga.create_handle() ga.set_data(g_a, (num_m,num_k), ga.C_DBL) ga.set_array_name(g_a, 'g_a') ga.set_block_cyclic(g_a, block_size) if not ga.allocate(g_a): ga.error('create failed') else: g_a = ga.create(ga.C_DBL, (num_m,num_k), 'g_a') g_b = ga.create(ga.C_DBL, (num_k,num_n), 'g_b') g_c = ga.create(ga.C_DBL, (num_m,num_n), 'g_c') for handle in [g_a,g_b,g_c]: if 0 == handle: ga.error('create failed') # initialize matrices A and B if 0 == me: load_ga(g_a, h0, num_m, num_k) load_ga(g_b, h0, num_k, num_n) ga.zero(g_c) ga.sync() if 0 == me: print '\nMatrix Multiplication C = A[%d,%d] x B[%d,%d]\n' % ( num_m, num_k, num_k, num_n) print ' %4s %12s %12s %7s %7s'%( "Run#", "Time (seconds)", "mflops/proc", "A trans", "B trans") avg_t[:] = 0 avg_mf[:] = 0 for itime in range(ntimes): for i in range(ntrans): ga.sync() ta = transa[i] tb = transb[i] t1 = time.time() ga.gemm(ta,tb,num_m,num_n,num_k,1,g_a,g_b,0,g_c) t1 = time.time() - t1 if 0 == me: mf = 2*num_m*num_n*num_k/t1*10**-6/nproc avg_t[i] += t1 avg_mf[i] += mf print ' %4d %12.4f %12.1f %7s %7s'%( itime+1, t1, 
mf, ta, tb) if VERIFY and itime == 0: verify_ga_gemm(ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c) if 0 == me: print '' for i in range(ntrans): print 'Average: %12.4f seconds %12.1f mflops/proc %s %s'%( avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]) if VERIFY: print 'All ga.gemms are verified...O.K.'
import mpi4py.MPI # initialize Message Passing Interface from ga4py import ga # initialize Global Arrays import numpy as np me = ga.nodeid() nproc = ga.nnodes() def print_distribution(g_a): for i in range(ga.nnodes()): lo,hi = ga.distribution(g_a, i) print "P=%s lo=%s hi=%s" % (i,lo,hi) # create some irregular arrays block = [3,2] map = [0,2,6,0,5] if nproc < np.prod(block): raise ValueError, "ERROR: fewer procs than requested blocks" g_a = ga.create_irreg(ga.C_DBL, [8,10], block, map, "Array A") if not g_a: ga.error("Could not create global array A",g_a) g_b = ga.create(ga.C_INT, (2,3,4,5,6)) if not me: print_distribution(g_a) print_distribution(g_b)