Esempio n. 1
0
def time_get(g_a, lo, hi, buf, chunk, jump, local):
    """Return the mean wall-clock seconds per ``ga.get`` of a 2-D patch.

    Square patches of side ``chunk`` are requested on a grid whose
    origins advance by ``chunk + jump`` along each axis inside the
    region ``[lo, hi)``.  When ``local`` is true each patch is fetched
    from the position it is walked at; otherwise the request is shifted
    by the row and/or column extent of the region (the shift cycles
    with the call number modulo 3).  In both cases the data is received
    into a slice of ``buf``.

    Raises ZeroDivisionError when the region is too small for even one
    patch, because the elapsed time is divided by the number of calls.
    """
    row_span = hi[0] - lo[0]
    col_span = hi[1] - lo[1]
    # Offsets for the non-local case, selected by (call number % 3).
    row_offsets = (row_span, 0, row_span)
    col_offsets = (0, col_span, col_span)
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    stride = chunk + jump
    n_calls = 0
    started = time.time()
    for row0 in range(lo[0], hi[0] - stride + 1, stride):
        row1 = row0 + chunk
        for col0 in range(lo[1], hi[1] - stride + 1, stride):
            col1 = col0 + chunk
            n_calls += 1
            if local:
                first = [row0, col0]
                last = [row1, col1]
                target = buf[ga.zip(first, last)]
            else:
                which = n_calls % 3
                d_row = row_offsets[which]
                d_col = col_offsets[which]
                first = [row0 + d_row, col0 + d_col]
                last = [row1 + d_row, col1 + d_col]
                target = buf[row0:row1, col0:col1]
            ga.get(g_a, first, last, target)
    elapsed = time.time() - started
    return elapsed / n_calls
Esempio n. 2
0
def time_get(g_a, lo, hi, buf, chunk, jump, local):
    """Time repeated 2-D ``ga.get`` calls and return seconds per call.

    Walks the region ``[lo, hi)`` in steps of ``chunk + jump`` on both
    axes and fetches one ``chunk`` x ``chunk`` patch per step into a
    slice of ``buf``.  With ``local`` true the patch is read from the
    walked position; otherwise the requested indices are displaced by
    the region's row extent, column extent, or both, chosen by the call
    counter modulo 3.

    Raises ZeroDivisionError if no patch fits in the region, since the
    total time is divided by the call count.
    """
    n_rows = hi[0] - lo[0]
    n_cols = hi[1] - lo[1]
    # (row shift, column shift) pairs indexed by call counter % 3.
    displacements = [(n_rows, 0), (0, n_cols), (n_rows, n_cols)]
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    step = chunk + jump
    calls = 0
    t0 = time.time()
    for top in range(lo[0], hi[0] - step + 1, step):
        bottom = top + chunk
        for left in range(lo[1], hi[1] - step + 1, step):
            right = left + chunk
            calls += 1
            if local:
                patch_lo = [top, left]
                patch_hi = [bottom, right]
                ga.get(g_a, patch_lo, patch_hi,
                       buf[ga.zip(patch_lo, patch_hi)])
            else:
                shift_row, shift_col = displacements[calls % 3]
                patch_lo = [top + shift_row, left + shift_col]
                patch_hi = [bottom + shift_row, right + shift_col]
                ga.get(g_a, patch_lo, patch_hi,
                       buf[top:bottom, left:right])
    total = time.time() - t0
    return total / calls
Esempio n. 3
0
def check_put_disjoint(gatype):
    """each node fills in disjoint sections of the array"""
    if 0 == me:
        print '> Checking disjoint put ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                lo = [i,j]
                hi = [min(i+inc,n), min(j+inc,n)]
                piece = a[ga.zip(lo,hi)]
                ga.put(g_a, piece, lo, hi)
                # the following check is not part of the original test.F
                result = ga.get(g_a, lo, hi)
                if not np.all(result == piece):
                    ga.error("put followed by get failed", 1)
            ga.sync()
            ij += 1
    ga.sync()
    # all nodes check all of a
    b = ga.get(g_a)
    if not np.all(a == b):
        ga.error('put failed, exiting')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def verify(g_a, g_b):
    a = ga.get(g_a)
    b = ga.get(g_b)
    if not np.all(a[::-1] == b):
        print "Mismatch: a[::-1] is not equal to b"
        ga.error("verify failed")
    print "Transpose OK"
Esempio n. 5
0
def check_fence_and_lock(gatype):
    if 0 == me:
        print '> Checking ga.fence and ga.lock',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    if not ga.create_mutexes(1):
        ga.error('ga.create_mutexes failed')
    if n < 2:
        ga.error('insufficient n to test ga.fence', n)
    ga.lock(0)
    a = ga.get(g_a) # get original values
    a[:,0] += 1 # add my contribution
    # need to use fence to assure that coms complete before leaving
    # critical section
    ga.init_fence()
    ga.put(g_a, a)
    ga.fence()
    ga.unlock(0)
    if not ga.destroy_mutexes():
        ga.error('mutex not destroyed')
    ga.sync()
    if 0 == me:
        a = ga.get(g_a)
        if not np.all(a[:,0] == nproc):
            ga.error('fence failed')
    if 0 == me:
        print 'OK'
def verify(g_a, g_b):
    a = ga.get(g_a)
    b = ga.get(g_b)
    if not np.all(a[::-1] == b):
        print "Mismatch: a[::-1] is not equal to b"
        ga.error("verify failed")
    print "Transpose OK"
Esempio n. 7
0
def verify_using_np(g_a, g_b, g_c):
    """Return True if g_c matches np.dot(g_a, g_b) on every process.

    Each process fetches all three global arrays, recomputes the product
    with NumPy and flags a failure when the absolute value of the summed
    difference exceeds 0.0001.  The flags are added up with
    ``ga.gop_add`` and the result is True only when that total is zero.
    """
    lhs, rhs, product = ga.get(g_a), ga.get(g_b), ga.get(g_c)
    expected = np.dot(lhs, rhs)
    # NOTE(review): the differences are summed before abs(), so errors of
    # opposite sign can cancel -- confirm this tolerance test is intended.
    residual = np.abs(np.sum(product - expected))
    failed_here = int(residual > 0.0001)
    failures = ga.gop_add(failed_here)
    return failures == 0
Esempio n. 8
0
def verify_using_np(g_a, g_b, g_c):
    """Compare global array g_c against a NumPy product of g_a and g_b.

    All three arrays are fetched in full.  A process counts as failing
    when abs(sum(c - dot(a, b))) is greater than 0.0001; the per-process
    flags are combined with ``ga.gop_add``.  Returns True when the
    combined count is zero.
    """
    mat_a = ga.get(g_a)
    mat_b = ga.get(g_b)
    mat_c = ga.get(g_c)
    # NOTE(review): signed differences are summed before taking abs(),
    # so opposite-sign errors may cancel -- confirm this is intended.
    drift = np.abs(np.sum(mat_c - np.dot(mat_a, mat_b)))
    bad = 1 if drift > 0.0001 else 0
    return ga.gop_add(bad) == 0
Esempio n. 9
0
def matrix_multiply():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS]*NDIM
    chunk = [TOTALELEMS/nprocs-1]*NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c: ga.eror("duplicate failed")
    if not me: print "Created Arrays B and C"

    # Initialize data in matrices a and b.
    if not me: print "Initializing matrix A and B"
    a = np.random.rand(*dims)*29
    b = np.random.rand(*dims)*37

    # Copy data to global arrays g_a and g_b.
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo,hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0],0), (hi[0],dims[0]))
    b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1]))

    # Do local matrix multiplication and store the result in local
    # buffer c. Start by evaluating the transpose of b.
    btrns = b.transpose()

    # Multiply a and b to get c.
    c = np.dot(a,b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
Esempio n. 10
0
def check_get(gatype):
    """check nloop random gets from each node"""
    if 0 == me:
        print '> Checking random get (%d calls)...' % nloop
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    nwords = 0
    random.seed(ga.nodeid()*51+1) # different seed for each proc
    for loop in range(nloop):
        ilo,ihi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if ihi < ilo: ilo,ihi = ihi,ilo
        jlo,jhi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if jhi < jlo: jlo,jhi = jhi,jlo
        nwords += (ihi-ilo+1)*(jhi-jlo+1)
        ihi += 1
        jhi += 1
        result = ga.get(g_a, (ilo,jlo), (ihi,jhi))
        if not np.all(result == a[ilo:ihi,jlo:jhi]):
            ga.error('random get failed')
        if 0 == me and loop % max(1,nloop/20) == 0:
            print ' call %d node %d checking get((%d,%d),(%d,%d)) total %f' % (
                    loop, me, ilo, ihi, jlo, jhi, nwords)
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 11
0
def check_scatter(gatype):
    nptype = ga.dtype(gatype)
    if 0 == me:
        print '> Checking scatter (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    v = np.zeros(m, dtype=nptype)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        check = None
        if MIRROR:
            check = random.randint(0,lprocs-1) == iproc
        else:
            check = random.randint(0,nproc-1) == me
        if check:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                v[loop] = ijv[loop,0]+ijv[loop,1]
            ga.scatter(g_a, v, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not v[loop] == value:
                    ga.error('scatter failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 12
0
def check_gather(gatype):
    if 0 == me:
        print '> Checking gather (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        itmp = None
        if MIRROR:
            itmp = random.randint(0,lprocs-1)
        else:
            itmp = random.randint(0,nproc-1)
        if itmp == me:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                #if ijv[loop,0] > ijv[loop,1]:
                #    ijv[loop,:] = ijv[loop,::-1] # reverse
            result = ga.gather(g_a, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not result[loop] == value:
                    ga.error('gather failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 13
0
def time_get1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the mean wall-clock seconds per 1-D ``ga.get`` call.

    Segments of length ``chunk`` are requested at origins that advance
    by ``chunk + jump`` inside ``[lo[0], hi[0])``.  With ``local`` true
    the segment is fetched from the walked position (bounds passed as
    one-element lists); otherwise the bounds are shifted by 3, 2 or 1
    times the region length, cycling with the call number modulo 3, and
    passed as plain integers.  Data is received into ``buf[ilo:ihi]``.

    Raises ZeroDivisionError when no segment fits in the region.
    """
    span = hi[0] - lo[0]
    offsets = (3*span, 2*span, span)
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    stride = chunk + jump
    n_calls = 0
    started = time.time()
    for begin in range(lo[0], hi[0] - stride + 1, stride):
        end = begin + chunk
        n_calls += 1
        if local:
            ga.get(g_a, [begin], [end], buf[begin:end])
        else:
            offset = offsets[n_calls % 3]
            ga.get(g_a, begin + offset, end + offset, buf[begin:end])
    elapsed = time.time() - started
    return elapsed / n_calls
Esempio n. 14
0
def time_get1(g_a, lo, hi, buf, chunk, jump, local):
    """Time repeated 1-D ``ga.get`` calls and return seconds per call.

    Steps through ``[lo[0], hi[0])`` in increments of ``chunk + jump``
    and fetches ``chunk`` elements per step into ``buf[ilo:ihi]``.  The
    ``local`` flag selects between reading at the stepped position
    (bounds given as one-element lists) and reading at a position
    displaced by a multiple (3, 2 or 1, by call counter modulo 3) of the
    region length (bounds given as integers).

    Raises ZeroDivisionError if the loop body never runs.
    """
    length = hi[0] - lo[0]
    displacement = [3 * length, 2 * length, length]
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    step = chunk + jump
    calls = 0
    t0 = time.time()
    for first in range(lo[0], hi[0] - step + 1, step):
        last = first + chunk
        calls += 1
        dest = buf[first:last]
        if not local:
            moved = displacement[calls % 3]
            ga.get(g_a, first + moved, last + moved, dest)
        else:
            ga.get(g_a, [first], [last], dest)
    return (time.time() - t0) / calls
Esempio n. 15
0
def verify_ga_gemm(ta, tb, num_m, num_n, num_k, alpha, g_a, g_b, beta, g_c):
    """Compare the contents of g_c with a locally computed dgemm result.

    g_a, g_b and g_c are fetched into float64 buffers of shape
    (num_m, num_k), (num_k, num_n) and (num_m, num_n).  ``dgemm`` is
    called once with the transpose flags ``ta`` and ``tb`` passed
    through; ``ga.error`` is called if any element of g_c differs from
    that result by more than 1 in absolute value.

    The original four-way branch on (ta, tb) issued the identical dgemm
    call in every arm and ended in an unreachable ``else``; it has been
    collapsed into a single call with the same arguments.
    """
    tmpa = ga.get(g_a, buffer=np.ndarray((num_m, num_k), dtype=np.float64))
    tmpb = ga.get(g_b, buffer=np.ndarray((num_k, num_n), dtype=np.float64))
    tmpc = ga.get(g_c, buffer=np.ndarray((num_m, num_n), dtype=np.float64))
    result = dgemm(alpha, tmpa, tmpb, beta=beta, trans_a=ta, trans_b=tb)
    # NOTE(review): the acceptance threshold is an absolute difference of
    # 1, which is loose -- confirm this tolerance is intended.
    abs_value = np.abs(tmpc-result)
    if np.any(abs_value > 1):
        ga.error('verify ga.gemm failed')
Esempio n. 16
0
def check_zero(gatype):
    if 0 == me:
        print '> Checking zero ...',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    a = ga.get(g_a)
    if not np.all(a == 0):
        ga.error('ga.zero failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 17
0
def check_accumulate_overlap(gatype):
    if 0 == me:
        print '> Checking overlapping accumulate ...',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    ga.acc(g_a, [1], (n/2,n/2), (n/2+1,n/2+1), 1)
    ga.sync()
    if MIRROR:
        if 0 == iproc:
            x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - lprocs)
            if not 0 == x:
                ga.error('overlapping accumulate failed -- expected %s got %s'%(
                        x, lprocs))
    else:
        if 0 == me:
            x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - nproc)
            if not 0 == x:
                ga.error('overlapping accumulate failed -- expected %s got %s'%(
                        x, nproc))
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 18
0
def check_scale(gatype):
    if 0 == me:
        print '> Checking scale ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ga.scale(g_a, 0.123)
    a *= 0.123
    if np.any(np.vectorize(mismatch)(a,ga.get(g_a))):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 19
0
def check_copy(gatype):
    if 0 == me:
        print '> Checking copy ...',
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.copy(g_a, g_b)
    if not np.all(a == ga.get(g_b)):
        ga.error('copy failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs*TOTALELEMS + nprocs/2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo,hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Esempio n. 21
0
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs * TOTALELEMS + nprocs / 2]
    chunk = [TOTALELEMS]  # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo, hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Esempio n. 22
0
def check_add(gatype):
    if 0 == me:
        print '> Checking add ...',
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    b = create_local_b(gatype)
    alpha = None
    beta = None
    if 0 == me:
        ga.put(g_a, a)
    ga.sync();
    np.random.seed(12345) # everyone has same seed
    if gatype in [ga.C_SCPL,ga.C_DCPL]:
        b_real = np.random.random_sample((n,n))
        b_imag = np.random.random_sample((n,n))
        b[:] = np.vectorize(complex)(b_real,b_imag)
        alpha = complex(0.1,-0.1)
        beta = complex(0.9,-0.9)
    else:
        b[:] = np.random.random_sample((n,n))
        alpha = 0.1
        beta = 0.9
    a = alpha*a + beta*b
    if MIRROR:
        if 0 == iproc:
            ga.put(g_b, b)
    else:
        if 0 == me:
            ga.put(g_b, b)
    ga.sync()
    ga.add(g_a, g_b, g_b, alpha, beta)
    b = ga.get(g_b, buffer=b)
    if np.any(np.vectorize(mismatch)(b,a)):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Esempio n. 23
0
def check_accumulate_disjoint(gatype):
    """Each node accumulates into disjoint sections of the array."""
    if 0 == me:
        print '> Checking disjoint accumulate ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.fromfunction(lambda i,j: i+j+2, (n,n), dtype=ga.dtype(gatype))
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            x = 10.0
            lo = [i,j]
            hi = [min(i+inc,n), min(j+inc,n)]
            piece = b[ga.zip(lo,hi)]
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                ga.acc(g_a, piece, lo, hi, x)
            ga.sync()
            ij += 1
            # each process applies all updates to its local copy
            a[ga.zip(lo,hi)] += x * piece
    ga.sync()
    # all nodes check all of a
    if not np.all(ga.get(g_a) == a):
        ga.error('acc failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Esempio n. 24
0
# Script fragment: fill the patch [lo, hi) of g_a with index-derived
# values, read a patch back and compare it with freshly generated values.
# NOTE(review): assumes lo and hi are 1-D NumPy arrays (element-wise
# subtraction and indexing with np.newaxis are used) -- confirm.
patch_shape = hi-lo
# Patch-relative values: element (i, j) holds j*NSIZE + i.
a_buf = np.fromfunction(lambda i,j: j*NSIZE + i,
        patch_shape, dtype=ga.dtype(ga.C_INT))
# Offset every element by lo[1] and by lo[0]*dims[0].
a_buf += lo[1,np.newaxis]
a_buf += lo[np.newaxis,0]*dims[0]

# Copy local data to GA
ga.put(g_a, a_buf, lo, hi)
ga.sync()
if me == 0:
    print "\nCopied values into Global Array from local buffer\n"

# Check data in GA to see if it is correct. Find data owned by this
# processor and then copy it to local buffer
lo,hi = ga.distribution(g_a, me)
b_buf = ga.get(g_a, lo, hi)
if me == 0:
    print "\nCopied values from Global Array to local buffer\n"

# Verify that data is correct
# Regenerate the expected values for the patch with the same formula
# that was used for a_buf above.
patch_shape = hi-lo
c_buf = np.fromfunction(lambda i,j: j*NSIZE + i,
        patch_shape, dtype=ga.dtype(ga.C_INT))
c_buf += lo[1,np.newaxis]
c_buf += lo[np.newaxis,0]*dims[0]

# chk stays 1 when the data read back matches, and becomes 0 otherwise.
chk = 1
if not np.all(b_buf == c_buf):
    print "Incorrect value found on process %d" % me
    chk = 0
Esempio n. 25
0
# Script: accumulate per-process contributions from comp_pi into a
# one-element global array and let rank 0 report the total.
nprocs = ga.nnodes()
myrank = ga.nodeid()

# One-element double-precision global array that receives the sum.
g_pi = ga.create(ga.C_DBL, [1])

# With exactly one command-line argument, n is taken from it and the
# loop below runs a single time.
one_time = False
if len(sys.argv) == 2:
    n = int(sys.argv[1])
    one_time = True

while True:
    if not one_time:
        # Rank 0 obtains n from get_n() and broadcasts it; the other
        # ranks pass 0 to ga.brdcst and use the value it returns.
        # NOTE(review): presumably ga.brdcst defaults to root 0 -- confirm.
        if myrank == 0:
            n = get_n()
            n = ga.brdcst(n)
        else:
            n = ga.brdcst(0)
        # n == 0 ends the loop.
        if n == 0:
            break
    ga.zero(g_pi)
    mypi = comp_pi(n, myrank, nprocs)
    # Every process accumulates its own contribution into g_pi.
    ga.acc(g_pi, mypi)
    ga.sync()
    if myrank == 0:
        pi = ga.get(g_pi)[0]
        prn_pi(pi, PI)
    if one_time:
        break

ga.destroy(g_pi)
# Script fragment: each rank computes its block with block_rmsd, writes
# the result into g_a, and the remaining outputs are gathered on rank 0.
# start3..start7 are wall-clock timestamps taken between the phases.

# Map every rank to its entry of frames_seg; this rank's entry supplies
# the (start, stop) pair used below.
d = dict([key, frames_seg[key]] for key in range(size))

start, stop = d[rank][0], d[rank][1]

# Block-RMSD in Parallel
start3 = time.time()
out = block_rmsd(index, topology, trajectory, xref0)

# Communication
start4 = time.time()
print(np.shape(out[0]), start, stop)
# Store out[0] in rows start..stop, columns 0..2 of the global array.
ga.put(g_a, out[0], (start, 0), (stop, 2))

start5 = time.time()
# Only rank 0 fetches the global array (lo and hi are passed as None).
# NOTE(review): no ga.sync() is visible between the put above and this
# get -- confirm that other ranks' data is guaranteed to be complete.
if rank == 0:
    buf = ga.get(g_a, lo=None, hi=None)

start6 = time.time()

# Receive buffer for the gather below: allocated on rank 0, None elsewhere.
if rank == 0:
    data = np.zeros([size, 5], dtype=float)
else:
    data = None

# Gather out[1:] from every rank into data on rank 0.
# NOTE(review): the (size, 5) buffer implies five values per rank -- confirm.
comm.Gather(np.array(out[1:], dtype=float), data, root=0)

start7 = time.time()

if rank == 0 and int(j) == 1:
    res = os.path.abspath(
        os.path.normpath(os.path.join(os.getcwd(),
Esempio n. 27
0
# Script: sum the per-process results of comp_pi in a one-element global
# array; rank 0 prints the total with prn_pi.
nprocs = ga.nnodes()
myrank = ga.nodeid()

# Global accumulator: a single double-precision element.
g_pi = ga.create(ga.C_DBL, [1])

# A single command-line argument supplies n and makes the loop below
# execute exactly once.
one_time = False
if len(sys.argv) == 2:
    n = int(sys.argv[1])
    one_time = True

while True:
    if not one_time:
        # Rank 0 asks get_n() for n and broadcasts it; other ranks call
        # ga.brdcst with 0 and take the returned value as n.
        # NOTE(review): presumably ga.brdcst defaults to root 0 -- confirm.
        if myrank == 0:
            n = get_n()
            n = ga.brdcst(n)
        else:
            n = ga.brdcst(0)
        # A value of 0 terminates the loop.
        if n == 0:
            break
    ga.zero(g_pi)
    mypi = comp_pi(n, myrank, nprocs)
    # Add this process's contribution to the global accumulator.
    ga.acc(g_pi, mypi)
    ga.sync()
    if myrank == 0:
        pi = ga.get(g_pi)[0]
        prn_pi(pi, PI)
    if one_time:
        break

ga.destroy(g_pi)
# Script fragment: iterate a four-neighbour averaging stencil on g_a
# using ghost cells, testing for convergence every
# HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST iterations.
set_boundary_conditions_access(g_a)
iteration = 0
start = ga.wtime()
while True:
    ga.sync()
    iteration += 1
    if iteration % HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST == 0:
        # check for convergence will occur, so make a copy of the GA
        ga.copy(g_a, g_b)
    # the iteration
    ga.update_ghosts(g_a)
    # Boundary conditions are re-applied after each ghost update.
    set_boundary_conditions_access(g_a)
    my_array = ga.access_ghosts(g_a)
    # Replace each interior element with the mean of its four neighbours
    # (up, down, left, right).
    my_array[1:-1,1:-1] = (
            my_array[0:-2, 1:-1] +
            my_array[2:, 1:-1] +
            my_array[1:-1,0:-2] +
            my_array[1:-1, 2:]) / 4
    ga.release_ghosts(g_a)
    if iteration % HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST == 0:
        # g_b holds the copy taken before this iteration's update.
        if convergence_test_L2(g_a, g_b):
            break

# `and` binds tighter than `or`, so this reads
# DEBUG or (True and rank == 0): the array is printed when DEBUG is
# truthy, or on rank 0.
# NOTE(review): the `or True` looks like a leftover debugging override
# -- confirm whether printing on rank 0 regardless of DEBUG is intended.
if DEBUG or True and rank == 0:
    print ga.get(g_a)

# Rank 0 reports the iteration count and the elapsed ga.wtime().
if rank == 0:
    print iteration
    print ga.wtime() - start, "seconds"
     ga.sync()
     ga.copy(g_a, g_b)
 # the iteration
 if rlo == 0 and rhi == dim:
     # I own the top and bottom rows
     ga.sync()
     my_array = ga.access(g_a)
     my_array[1:-1,1:-1] = (
             my_array[0:-2, 1:-1] +
             my_array[2:, 1:-1] +
             my_array[1:-1,0:-2] +
             my_array[1:-1, 2:]) / 4
     ga.release(g_a)
 elif rlo == 0:
     # I own the top rows, so get top row of next domain
     next_domain_row = ga.get(g_a, (rhi,0), (rhi+1,dim))
     ga.sync()
     my_array = ga.access(g_a)
     combined = np.vstack((my_array,next_domain_row))
     my_array[1:,1:-1] = (
             combined[0:-2, 1:-1] +
             combined[2:, 1:-1] +
             combined[1:-1,0:-2] +
             combined[1:-1, 2:]) / 4
     ga.release(g_a)
 elif rhi == dim:
     # I own the bottom rows, so get bottom row of previous domain
     prev_domain_row = ga.get(g_a, (rlo-1,0), (rlo,dim))
     ga.sync()
     my_array = ga.access(g_a)
     combined = np.vstack((prev_domain_row,my_array))