def time_get(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ``ga.get`` of a 2-D patch.

    The region bounded by ``lo`` and ``hi`` is walked in patches of
    ``chunk`` x ``chunk`` elements and one ``ga.get`` is timed per patch.

    Args:
        g_a: global array handle, passed straight to ``ga.get``.
        lo, hi: two-element lower / upper bounds of the region to walk.
        buf: 2-D buffer; the slice matching each unshifted patch is handed
            to ``ga.get`` as its fourth argument.
        chunk: edge length of every patch.
        jump: extra gap inserted between consecutive patches.
        local: if true, fetch each patch at its own coordinates; otherwise
            fetch a patch displaced by the region height and/or width
            (presumably so that it lies in another process's block).

    Returns:
        Elapsed seconds divided by the number of gets, or 0.0 when the
        region is too small to hold a single patch.
    """
    rows = hi[0] - lo[0]
    cols = hi[1] - lo[1]
    # Displacements applied in rotation (indexed by count % 3) when
    # ``local`` is false.
    shifti = [rows, 0, rows]
    shiftj = [0, cols, cols]
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    stride = chunk + jump
    count = 0
    started = time.time()
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, stride):
        ihi = ilo + chunk
        for jlo in range(lo[1], hi[1] - chunk - jump + 1, stride):
            jhi = jlo + chunk
            count += 1
            if local:
                llo = [ilo, jlo]
                lhi = [ihi, jhi]
                ga.get(g_a, llo, lhi, buf[ga.zip(llo, lhi)])
            else:
                index = count % 3
                llo = [ilo + shifti[index], jlo + shiftj[index]]
                lhi = [ihi + shifti[index], jhi + shiftj[index]]
                ga.get(g_a, llo, lhi, buf[ilo:ihi, jlo:jhi])
    elapsed = time.time() - started
    if count == 0:
        # No patch fitted into the region: nothing was timed, so avoid
        # dividing by zero.
        return 0.0
    return elapsed / count
def time_get(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ``ga.get`` of a 2-D patch.

    The region bounded by ``lo`` and ``hi`` is walked in patches of
    ``chunk`` x ``chunk`` elements and one ``ga.get`` is timed per patch.

    Args:
        g_a: global array handle, passed straight to ``ga.get``.
        lo, hi: two-element lower / upper bounds of the region to walk.
        buf: 2-D buffer; the slice matching each unshifted patch is handed
            to ``ga.get`` as its fourth argument.
        chunk: edge length of every patch.
        jump: extra gap inserted between consecutive patches.
        local: if true, fetch each patch at its own coordinates; otherwise
            fetch a patch displaced by the region height and/or width
            (presumably so that it lies in another process's block).

    Returns:
        Elapsed seconds divided by the number of gets, or 0.0 when the
        region is too small to hold a single patch.
    """
    rows = hi[0] - lo[0]
    cols = hi[1] - lo[1]
    # Displacements applied in rotation (indexed by count % 3) when
    # ``local`` is false.
    shifti = [rows, 0, rows]
    shiftj = [0, cols, cols]
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    stride = chunk + jump
    count = 0
    started = time.time()
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, stride):
        ihi = ilo + chunk
        for jlo in range(lo[1], hi[1] - chunk - jump + 1, stride):
            jhi = jlo + chunk
            count += 1
            if local:
                llo = [ilo, jlo]
                lhi = [ihi, jhi]
                ga.get(g_a, llo, lhi, buf[ga.zip(llo, lhi)])
            else:
                index = count % 3
                llo = [ilo + shifti[index], jlo + shiftj[index]]
                lhi = [ihi + shifti[index], jhi + shiftj[index]]
                ga.get(g_a, llo, lhi, buf[ilo:ihi, jlo:jhi])
    elapsed = time.time() - started
    if count == 0:
        # No patch fitted into the region: nothing was timed, so avoid
        # dividing by zero.
        return 0.0
    return elapsed / count
def check_put_disjoint(gatype): """each node fills in disjoint sections of the array""" if 0 == me: print '> Checking disjoint put ...', g_a = create_global_array(gatype) a = create_local_a(gatype) inc = (n-1)/20 + 1 ij = 0 for i in range(0,n,inc): for j in range(0,n,inc): check = False if MIRROR: check = ij % lprocs == iproc else: check = ij % nproc == me if check: lo = [i,j] hi = [min(i+inc,n), min(j+inc,n)] piece = a[ga.zip(lo,hi)] ga.put(g_a, piece, lo, hi) # the following check is not part of the original test.F result = ga.get(g_a, lo, hi) if not np.all(result == piece): ga.error("put followed by get failed", 1) ga.sync() ij += 1 ga.sync() # all nodes check all of a b = ga.get(g_a) if not np.all(a == b): ga.error('put failed, exiting') if 0 == me: print 'OK' ga.destroy(g_a)
def verify(g_a, g_b):
    """Check that g_b holds the contents of g_a in reversed order.

    Fetches both global arrays in full, reports a mismatch through
    ``ga.error`` and prints a success line afterwards.
    """
    source = ga.get(g_a)
    result = ga.get(g_b)
    reversed_source = source[::-1]
    matches = np.all(reversed_source == result)
    if not matches:
        print("Mismatch: a[::-1] is not equal to b")
        ga.error("verify failed")
    print("Transpose OK")
def check_fence_and_lock(gatype): if 0 == me: print '> Checking ga.fence and ga.lock', g_a = create_global_array(gatype) ga.zero(g_a) if not ga.create_mutexes(1): ga.error('ga.create_mutexes failed') if n < 2: ga.error('insufficient n to test ga.fence', n) ga.lock(0) a = ga.get(g_a) # get original values a[:,0] += 1 # add my contribution # need to use fence to assure that coms complete before leaving # critical section ga.init_fence() ga.put(g_a, a) ga.fence() ga.unlock(0) if not ga.destroy_mutexes(): ga.error('mutex not destroyed') ga.sync() if 0 == me: a = ga.get(g_a) if not np.all(a[:,0] == nproc): ga.error('fence failed') if 0 == me: print 'OK'
def verify_using_np(g_a, g_b, g_c):
    """Return True if g_c equals the NumPy product of g_a and g_b everywhere.

    Each process fetches the three arrays in full, recomputes the product
    with ``np.dot`` and flags a failure when any element differs by more
    than 0.0001.  The per-process flags are combined with ``ga.gop_add``
    so every process returns the same verdict.
    """
    a = ga.get(g_a)
    b = ga.get(g_b)
    c = ga.get(g_c)
    expected = np.dot(a, b)
    # Compare element by element.  Summing the signed differences first
    # would let errors of opposite sign cancel, so a transposed or
    # shuffled result would pass.
    failed = int(np.any(np.abs(c - expected) > 0.0001))
    failed = ga.gop_add(failed)
    return failed == 0
def verify_using_np(g_a, g_b, g_c):
    """Return True if g_c equals the NumPy product of g_a and g_b everywhere.

    Each process fetches the three arrays in full, recomputes the product
    with ``np.dot`` and flags a failure when any element differs by more
    than 0.0001.  The per-process flags are combined with ``ga.gop_add``
    so every process returns the same verdict.
    """
    a = ga.get(g_a)
    b = ga.get(g_b)
    c = ga.get(g_c)
    expected = np.dot(a, b)
    # Compare element by element.  Summing the signed differences first
    # would let errors of opposite sign cancel, so a transposed or
    # shuffled result would pass.
    failed = int(np.any(np.abs(c - expected) > 0.0001))
    failed = ga.gop_add(failed)
    return failed == 0
def matrix_multiply():
    """Multiply two global matrices block-wise and verify the result.

    Creates global arrays A, B and C, lets process 0 fill A and B with
    random data, has every process compute its locally owned block of
    C = A . B with ``np.dot``, stores the block in C, verifies the result
    and destroys the arrays.
    """
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS] * NDIM
    chunk = [TOTALELEMS // nprocs - 1] * NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c:
        # was ``ga.eror``, which raised AttributeError instead of reporting
        ga.error("duplicate failed")
    if not me:
        print("Created Arrays B and C")

    # Initialize data in matrices a and b.
    if not me:
        print("Initializing matrix A and B")
    a = np.random.rand(*dims) * 29
    b = np.random.rand(*dims) * 37

    # Copy data to global arrays g_a and g_b (process 0 only).
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo, hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))

    # Multiply a and b to get the local block of c.
    c = np.dot(a, b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    # Make sure every process has stored its block before g_c is read
    # back for verification.
    ga.sync()

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
def check_get(gatype):
    """Check nloop random gets from each node.

    Process 0 fills the global array from the local reference copy.
    Every process then fetches ``nloop`` random rectangular patches and
    compares each with the same slice of its reference copy.  Patch
    corners are drawn from ``0 .. n-1``, the index range the sibling
    checks use for this array.
    """
    if 0 == me:
        print('> Checking random get (%d calls)...' % nloop)
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    nwords = 0
    random.seed(ga.nodeid() * 51 + 1)  # different seed for each proc
    report_every = max(1, nloop // 20)
    for loop in range(nloop):
        # Draw inclusive corners over the whole array (bounded by n, not
        # by the number of iterations) and order each pair.
        ilo, ihi = sorted((random.randint(0, n - 1), random.randint(0, n - 1)))
        jlo, jhi = sorted((random.randint(0, n - 1), random.randint(0, n - 1)))
        nwords += (ihi - ilo + 1) * (jhi - jlo + 1)
        # convert the inclusive upper corner to an exclusive bound
        ihi += 1
        jhi += 1
        result = ga.get(g_a, (ilo, jlo), (ihi, jhi))
        if not np.all(result == a[ilo:ihi, jlo:jhi]):
            ga.error('random get failed')
        if 0 == me and loop % report_every == 0:
            # arguments follow the (lo),(hi) layout of the format string
            print(' call %d node %d checking get((%d,%d),(%d,%d)) total %f' % (
                loop, me, ilo, jlo, ihi, jhi, nwords))
    if 0 == me:
        print('OK')
    ga.destroy(g_a)
def check_scatter(gatype): nptype = ga.dtype(gatype) if 0 == me: print '> Checking scatter (might be slow)...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ijv = np.zeros((m,2), dtype=np.int64) v = np.zeros(m, dtype=nptype) random.seed(ga.nodeid()*51 + 1) # different seed for each proc for j in range(10): check = None if MIRROR: check = random.randint(0,lprocs-1) == iproc else: check = random.randint(0,nproc-1) == me if check: for loop in range(m): ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1)) v[loop] = ijv[loop,0]+ijv[loop,1] ga.scatter(g_a, v, ijv) for loop in range(m): value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten() if not v[loop] == value: ga.error('scatter failed') if 0 == me: print 'OK' ga.destroy(g_a)
def check_gather(gatype): if 0 == me: print '> Checking gather (might be slow)...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ijv = np.zeros((m,2), dtype=np.int64) random.seed(ga.nodeid()*51 + 1) # different seed for each proc for j in range(10): itmp = None if MIRROR: itmp = random.randint(0,lprocs-1) else: itmp = random.randint(0,nproc-1) if itmp == me: for loop in range(m): ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1)) #if ijv[loop,0] > ijv[loop,1]: # ijv[loop,:] = ijv[loop,::-1] # reverse result = ga.gather(g_a, ijv) for loop in range(m): value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten() if not result[loop] == value: ga.error('gather failed') if 0 == me: print 'OK' ga.destroy(g_a)
def time_get1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ``ga.get`` of a 1-D patch.

    Args:
        g_a: global array handle, passed straight to ``ga.get``.
        lo, hi: one-element lower / upper bounds of the range to walk.
        buf: 1-D buffer; ``buf[ilo:ihi]`` is handed to ``ga.get`` as its
            fourth argument.
        chunk: length of every patch.
        jump: extra gap inserted between consecutive patches.
        local: if true, fetch each patch at its own coordinates; otherwise
            fetch a patch displaced by one to three times the range
            length (presumably so it lies in another process's block).

    Returns:
        Elapsed seconds divided by the number of gets, or 0.0 when the
        range is too small to hold a single patch.
    """
    rows = hi[0] - lo[0]
    # Displacements applied in rotation (indexed by count % 3) when
    # ``local`` is false.
    shift = [3 * rows, 2 * rows, rows]
    count = 0
    started = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, chunk + jump):
        ihi = ilo + chunk
        count += 1
        if local:
            ga.get(g_a, [ilo], [ihi], buf[ilo:ihi])
        else:
            offset = shift[count % 3]
            # bounds are wrapped in lists, the same form the local branch uses
            ga.get(g_a, [ilo + offset], [ihi + offset], buf[ilo:ihi])
    elapsed = time.time() - started
    if count == 0:
        # No patch fitted into the range: nothing was timed, so avoid
        # dividing by zero.
        return 0.0
    return elapsed / count
def time_get1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ``ga.get`` of a 1-D patch.

    Args:
        g_a: global array handle, passed straight to ``ga.get``.
        lo, hi: one-element lower / upper bounds of the range to walk.
        buf: 1-D buffer; ``buf[ilo:ihi]`` is handed to ``ga.get`` as its
            fourth argument.
        chunk: length of every patch.
        jump: extra gap inserted between consecutive patches.
        local: if true, fetch each patch at its own coordinates; otherwise
            fetch a patch displaced by one to three times the range
            length (presumably so it lies in another process's block).

    Returns:
        Elapsed seconds divided by the number of gets, or 0.0 when the
        range is too small to hold a single patch.
    """
    rows = hi[0] - lo[0]
    # Displacements applied in rotation (indexed by count % 3) when
    # ``local`` is false.
    shift = [3 * rows, 2 * rows, rows]
    count = 0
    started = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, chunk + jump):
        ihi = ilo + chunk
        count += 1
        if local:
            ga.get(g_a, [ilo], [ihi], buf[ilo:ihi])
        else:
            offset = shift[count % 3]
            # bounds are wrapped in lists, the same form the local branch uses
            ga.get(g_a, [ilo + offset], [ihi + offset], buf[ilo:ihi])
    elapsed = time.time() - started
    if count == 0:
        # No patch fitted into the range: nothing was timed, so avoid
        # dividing by zero.
        return 0.0
    return elapsed / count
def verify_ga_gemm(ta, tb, num_m, num_n, num_k, alpha, g_a, g_b, beta, g_c):
    """Check the contents of g_c against a locally computed ``dgemm``.

    Fetches g_a, g_b and g_c into local float64 buffers, recomputes the
    product with ``dgemm`` using the same alpha, beta and transpose flags,
    and calls ``ga.error`` if any element of g_c differs from the local
    result by more than 1.

    Args:
        ta, tb: transpose flags, forwarded as ``trans_a`` / ``trans_b``.
        num_m, num_n, num_k: sizes used to shape the local buffers.
        alpha, beta: scalars forwarded to ``dgemm``.
        g_a, g_b, g_c: global array handles of the operands and result.
    """
    tmpa = np.empty((num_m, num_k), dtype=np.float64)
    tmpb = np.empty((num_k, num_n), dtype=np.float64)
    tmpc = np.empty((num_m, num_n), dtype=np.float64)
    tmpa = ga.get(g_a, buffer=tmpa)
    tmpb = ga.get(g_b, buffer=tmpb)
    tmpc = ga.get(g_c, buffer=tmpc)
    # The call is the same for every combination of ta / tb, so the former
    # four-way branch (and its unreachable else) is collapsed into one.
    result = dgemm(alpha, tmpa, tmpb, beta=beta, trans_a=ta, trans_b=tb)
    abs_value = np.abs(tmpc - result)
    if np.any(abs_value > 1):
        ga.error('verify ga.gemm failed')
def check_zero(gatype): if 0 == me: print '> Checking zero ...', g_a = create_global_array(gatype) ga.zero(g_a) a = ga.get(g_a) if not np.all(a == 0): ga.error('ga.zero failed') if 0 == me: print 'OK' ga.destroy(g_a)
def check_accumulate_overlap(gatype): if 0 == me: print '> Checking overlapping accumulate ...', g_a = create_global_array(gatype) ga.zero(g_a) ga.acc(g_a, [1], (n/2,n/2), (n/2+1,n/2+1), 1) ga.sync() if MIRROR: if 0 == iproc: x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - lprocs) if not 0 == x: ga.error('overlapping accumulate failed -- expected %s got %s'%( x, lprocs)) else: if 0 == me: x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - nproc) if not 0 == x: ga.error('overlapping accumulate failed -- expected %s got %s'%( x, nproc)) if 0 == me: print 'OK' ga.destroy(g_a)
def check_scale(gatype): if 0 == me: print '> Checking scale ...', g_a = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.sync() ga.scale(g_a, 0.123) a *= 0.123 if np.any(np.vectorize(mismatch)(a,ga.get(g_a))): ga.error('add failed') if 0 == me: print 'OK' ga.destroy(g_a)
def check_copy(gatype): if 0 == me: print '> Checking copy ...', g_a = create_global_array(gatype) g_b = create_global_array(gatype) a = create_local_a(gatype) if 0 == me: ga.put(g_a, a) ga.copy(g_a, g_b) if not np.all(a == ga.get(g_b)): ga.error('copy failed') if 0 == me: print 'OK' ga.destroy(g_a) ga.destroy(g_b)
def TRANSPOSE1D():
    """Reverse a 1-D global array in parallel and verify the result.

    Process 0 fills array A with 0 .. N-1.  Each process reverses the
    block it owns and writes it to the mirrored position in array B;
    process 0 then verifies B against A.
    """
    # Configure array dimensions. Force an unequal data distribution.
    total = nprocs * TOTALELEMS + nprocs // 2
    dims = [total]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Create global array A and duplicate it to get B.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if not me:
        print("Created Array B")

    # Initialize data in A.
    if not me:
        print("Initializing matrix A")
        ga.put(g_a, np.arange(total, dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Find out which block this processor owns and fetch it.
    lo, hi = ga.distribution(g_a)
    local_block = ga.get(g_a, lo, hi)

    # Reverse the block locally ...
    reversed_block = local_block[::-1]

    # ... and store it at the mirrored position of B, which reverses the
    # array as a whole.
    mirrored_lo = total - hi[0]
    mirrored_hi = total - lo[0]
    ga.put(g_b, reversed_block, mirrored_lo, mirrored_hi)

    # Synchronize all processors to make sure inversion is complete.
    ga.sync()

    # Check to see if inversion is correct.
    if not me:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
def TRANSPOSE1D():
    """Reverse a 1-D global array in parallel and verify the result.

    Process 0 fills array A with 0 .. N-1.  Each process reverses the
    block it owns and writes it to the mirrored position in array B;
    process 0 then verifies B against A.
    """
    # Configure array dimensions. Force an unequal data distribution.
    total = nprocs * TOTALELEMS + nprocs // 2
    dims = [total]
    chunk = [TOTALELEMS]  # minimum data on each process

    # Create global array A and duplicate it to get B.
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a:
        ga.error("create failed: A")
    if not me:
        print("Created Array A")

    g_b = ga.duplicate(g_a, "array B")
    if not g_b:
        ga.error("duplicate failed")
    if not me:
        print("Created Array B")

    # Initialize data in A.
    if not me:
        print("Initializing matrix A")
        ga.put(g_a, np.arange(total, dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Find out which block this processor owns and fetch it.
    lo, hi = ga.distribution(g_a)
    local_block = ga.get(g_a, lo, hi)

    # Reverse the block locally ...
    reversed_block = local_block[::-1]

    # ... and store it at the mirrored position of B, which reverses the
    # array as a whole.
    mirrored_lo = total - hi[0]
    mirrored_hi = total - lo[0]
    ga.put(g_b, reversed_block, mirrored_lo, mirrored_hi)

    # Synchronize all processors to make sure inversion is complete.
    ga.sync()

    # Check to see if inversion is correct.
    if not me:
        verify(g_a, g_b)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
def check_add(gatype): if 0 == me: print '> Checking add ...', g_a = create_global_array(gatype) g_b = create_global_array(gatype) a = create_local_a(gatype) b = create_local_b(gatype) alpha = None beta = None if 0 == me: ga.put(g_a, a) ga.sync(); np.random.seed(12345) # everyone has same seed if gatype in [ga.C_SCPL,ga.C_DCPL]: b_real = np.random.random_sample((n,n)) b_imag = np.random.random_sample((n,n)) b[:] = np.vectorize(complex)(b_real,b_imag) alpha = complex(0.1,-0.1) beta = complex(0.9,-0.9) else: b[:] = np.random.random_sample((n,n)) alpha = 0.1 beta = 0.9 a = alpha*a + beta*b if MIRROR: if 0 == iproc: ga.put(g_b, b) else: if 0 == me: ga.put(g_b, b) ga.sync() ga.add(g_a, g_b, g_b, alpha, beta) b = ga.get(g_b, buffer=b) if np.any(np.vectorize(mismatch)(b,a)): ga.error('add failed') if 0 == me: print 'OK' ga.destroy(g_a) ga.destroy(g_b)
def check_accumulate_disjoint(gatype): """Each node accumulates into disjoint sections of the array.""" if 0 == me: print '> Checking disjoint accumulate ...', g_a = create_global_array(gatype) a = create_local_a(gatype) b = np.fromfunction(lambda i,j: i+j+2, (n,n), dtype=ga.dtype(gatype)) if 0 == me: ga.put(g_a, a) ga.sync() inc = (n-1)/20 + 1 ij = 0 for i in range(0,n,inc): for j in range(0,n,inc): x = 10.0 lo = [i,j] hi = [min(i+inc,n), min(j+inc,n)] piece = b[ga.zip(lo,hi)] check = False if MIRROR: check = ij % lprocs == iproc else: check = ij % nproc == me if check: ga.acc(g_a, piece, lo, hi, x) ga.sync() ij += 1 # each process applies all updates to its local copy a[ga.zip(lo,hi)] += x * piece ga.sync() # all nodes check all of a if not np.all(ga.get(g_a) == a): ga.error('acc failed') if 0 == me: print 'OK' ga.destroy(g_a)
# Build a local buffer with the shape of the patch [lo, hi).  Element
# (i, j) starts as j*NSIZE + i; the two in-place additions below then add
# offsets taken from lo (and dims[0]) to every element.
# NOTE(review): assumes lo and hi are 1-D NumPy arrays (they are
# subtracted and indexed with np.newaxis) -- confirm against the caller.
patch_shape = hi-lo
a_buf = np.fromfunction(lambda i,j: j*NSIZE + i, patch_shape,
        dtype=ga.dtype(ga.C_INT))
a_buf += lo[1,np.newaxis]
a_buf += lo[np.newaxis,0]*dims[0]

# Copy local data to GA
ga.put(g_a, a_buf, lo, hi)
ga.sync()
if me == 0:
    print "\nCopied values into Global Array from local buffer\n"

# Check data in GA to see if it is correct. Find data owned by this
# processor and then copy it to local buffer
lo,hi = ga.distribution(g_a, me)
b_buf = ga.get(g_a, lo, hi)
if me == 0:
    print "\nCopied values from Global Array to local buffer\n"

# Verify that data is correct: rebuild the expected values with the same
# formula used for a_buf and compare them with what was read back.
patch_shape = hi-lo
c_buf = np.fromfunction(lambda i,j: j*NSIZE + i, patch_shape,
        dtype=ga.dtype(ga.C_INT))
c_buf += lo[1,np.newaxis]
c_buf += lo[np.newaxis,0]*dims[0]
# chk stays 1 when the data matched and is set to 0 on a mismatch
chk = 1
if not np.all(b_buf == c_buf):
    print "Incorrect value found on process %d" % me
    chk = 0
# Driver: accumulate each process's contribution (from comp_pi) into a
# one-element global array and let rank 0 print the result.
nprocs = ga.nnodes()
myrank = ga.nodeid()
# one-element global array of doubles that collects the partial results
g_pi = ga.create(ga.C_DBL, [1])

# With exactly one command-line argument, n is taken from it and the loop
# body runs once; otherwise n is requested repeatedly.
one_time = False
if len(sys.argv) == 2:
    n = int(sys.argv[1])
    one_time = True

while True:
    if not one_time:
        # rank 0 obtains n via get_n() and broadcasts it; the other ranks
        # receive it through the same broadcast call
        if myrank == 0:
            n = get_n()
            n = ga.brdcst(n)
        else:
            n = ga.brdcst(0)
        # n == 0 ends the loop
        if n == 0:
            break
    ga.zero(g_pi)
    mypi = comp_pi(n, myrank, nprocs)
    # add this process's contribution to the shared element
    ga.acc(g_pi, mypi)
    ga.sync()
    if myrank == 0:
        pi = ga.get(g_pi)[0]
        prn_pi(pi, PI)
    if one_time:
        break

ga.destroy(g_pi)
d = dict([key, frames_seg[key]] for key in range(size)) start, stop = d[rank][0], d[rank][1] # Block-RMSD in Parallel start3 = time.time() out = block_rmsd(index, topology, trajectory, xref0) # Communication start4 = time.time() print(np.shape(out[0]), start, stop) ga.put(g_a, out[0], (start, 0), (stop, 2)) start5 = time.time() if rank == 0: buf = ga.get(g_a, lo=None, hi=None) start6 = time.time() if rank == 0: data = np.zeros([size, 5], dtype=float) else: data = None comm.Gather(np.array(out[1:], dtype=float), data, root=0) start7 = time.time() if rank == 0 and int(j) == 1: res = os.path.abspath( os.path.normpath(os.path.join(os.getcwd(),
# Iterate on g_a until convergence_test_L2 reports convergence, testing
# every HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST iterations.
set_boundary_conditions_access(g_a)
iteration = 0
start = ga.wtime()
while True:
    ga.sync()
    iteration += 1
    if iteration % HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST == 0:
        # check for convergence will occur, so make a copy of the GA
        ga.copy(g_a, g_b)
    # the iteration: refresh the ghost cells, re-apply the boundary
    # conditions, then replace every interior element of the local block
    # with the average of its four neighbours
    ga.update_ghosts(g_a)
    set_boundary_conditions_access(g_a)
    my_array = ga.access_ghosts(g_a)
    my_array[1:-1,1:-1] = (
            my_array[0:-2, 1:-1] +
            my_array[2:, 1:-1] +
            my_array[1:-1,0:-2] +
            my_array[1:-1, 2:]) / 4
    ga.release_ghosts(g_a)
    if iteration % HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST == 0:
        # compare against the copy taken before this iteration
        if convergence_test_L2(g_a, g_b):
            break
# NOTE(review): this parses as ``DEBUG or (True and rank == 0)``, so rank 0
# always prints the array -- confirm the ``or True`` is intentional.
if DEBUG or True and rank == 0:
    print ga.get(g_a)
if rank == 0:
    # iteration count and elapsed time
    print iteration
    print ga.wtime() - start, "seconds"
ga.sync() ga.copy(g_a, g_b) # the iteration if rlo == 0 and rhi == dim: # I own the top and bottom rows ga.sync() my_array = ga.access(g_a) my_array[1:-1,1:-1] = ( my_array[0:-2, 1:-1] + my_array[2:, 1:-1] + my_array[1:-1,0:-2] + my_array[1:-1, 2:]) / 4 ga.release(g_a) elif rlo == 0: # I own the top rows, so get top row of next domain next_domain_row = ga.get(g_a, (rhi,0), (rhi+1,dim)) ga.sync() my_array = ga.access(g_a) combined = np.vstack((my_array,next_domain_row)) my_array[1:,1:-1] = ( combined[0:-2, 1:-1] + combined[2:, 1:-1] + combined[1:-1,0:-2] + combined[1:-1, 2:]) / 4 ga.release(g_a) elif rhi == dim: # I own the bottom rows, so get bottom row of previous domain prev_domain_row = ga.get(g_a, (rlo-1,0), (rlo,dim)) ga.sync() my_array = ga.access(g_a) combined = np.vstack((prev_domain_row,my_array))