Ejemplo n.º 1
0
def check_put_disjoint(gatype):
    """each node fills in disjoint sections of the array"""
    if 0 == me:
        print '> Checking disjoint put ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                lo = [i,j]
                hi = [min(i+inc,n), min(j+inc,n)]
                piece = a[ga.zip(lo,hi)]
                ga.put(g_a, piece, lo, hi)
                # the following check is not part of the original test.F
                result = ga.get(g_a, lo, hi)
                if not np.all(result == piece):
                    ga.error("put followed by get failed", 1)
            ga.sync()
            ij += 1
    ga.sync()
    # all nodes check all of a
    b = ga.get(g_a)
    if not np.all(a == b):
        ga.error('put failed, exiting')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Ejemplo n.º 2
0
def time_put(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ga.put over a grid of patches.

    Patches of side ``chunk`` are taken from the 2-D region bounded by ``lo``
    and ``hi``; consecutive patch origins are ``chunk + jump`` apart in each
    dimension.

    g_a    -- handle of the global array being written
    lo, hi -- two-element lower / upper bounds of the region
    buf    -- local 2-D buffer the patch data is sliced from
    chunk  -- patch edge length
    jump   -- extra gap between patches (destroys locality of reference)
    local  -- if true, each patch is put at its own coordinates; otherwise
              the target is shifted by the region's row extent, column
              extent, or both, cycling with the call count

    Returns 0.0 when the region is too small to hold a single patch, rather
    than dividing by a zero call count.
    """
    count = 0
    rows = hi[0] - lo[0]
    cols = hi[1] - lo[1]
    shifti = [rows, 0, rows]
    shiftj = [0, cols, cols]
    seconds = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, chunk + jump):
        ihi = ilo + chunk
        for jlo in range(lo[1], hi[1] - chunk - jump + 1, chunk + jump):
            jhi = jlo + chunk
            count += 1
            if local:
                llo = [ilo, jlo]
                lhi = [ihi, jhi]
                ga.put(g_a, buf[ga.zip(llo, lhi)], llo, lhi)
            else:
                index = count % 3
                llo = [ilo + shifti[index], jlo + shiftj[index]]
                lhi = [ihi + shifti[index], jhi + shiftj[index]]
                ga.put(g_a, buf[ilo:ihi, jlo:jhi], llo, lhi)
    seconds = time.time() - seconds
    if count == 0:
        # no patch fitted into the region, so there is nothing to average
        return 0.0
    return seconds / count
Ejemplo n.º 3
0
def check_scatter(gatype):
    nptype = ga.dtype(gatype)
    if 0 == me:
        print '> Checking scatter (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    v = np.zeros(m, dtype=nptype)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        check = None
        if MIRROR:
            check = random.randint(0,lprocs-1) == iproc
        else:
            check = random.randint(0,nproc-1) == me
        if check:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                v[loop] = ijv[loop,0]+ijv[loop,1]
            ga.scatter(g_a, v, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not v[loop] == value:
                    ga.error('scatter failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Ejemplo n.º 4
0
def check_fence_and_lock(gatype):
    if 0 == me:
        print '> Checking ga.fence and ga.lock',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    if not ga.create_mutexes(1):
        ga.error('ga.create_mutexes failed')
    if n < 2:
        ga.error('insufficient n to test ga.fence', n)
    ga.lock(0)
    a = ga.get(g_a) # get original values
    a[:,0] += 1 # add my contribution
    # need to use fence to assure that coms complete before leaving
    # critical section
    ga.init_fence()
    ga.put(g_a, a)
    ga.fence()
    ga.unlock(0)
    if not ga.destroy_mutexes():
        ga.error('mutex not destroyed')
    ga.sync()
    if 0 == me:
        a = ga.get(g_a)
        if not np.all(a[:,0] == nproc):
            ga.error('fence failed')
    if 0 == me:
        print 'OK'
Ejemplo n.º 5
0
def check_get(gatype):
    """check nloop random gets from each node"""
    if 0 == me:
        print '> Checking random get (%d calls)...' % nloop
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    nwords = 0
    random.seed(ga.nodeid()*51+1) # different seed for each proc
    for loop in range(nloop):
        ilo,ihi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if ihi < ilo: ilo,ihi = ihi,ilo
        jlo,jhi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if jhi < jlo: jlo,jhi = jhi,jlo
        nwords += (ihi-ilo+1)*(jhi-jlo+1)
        ihi += 1
        jhi += 1
        result = ga.get(g_a, (ilo,jlo), (ihi,jhi))
        if not np.all(result == a[ilo:ihi,jlo:jhi]):
            ga.error('random get failed')
        if 0 == me and loop % max(1,nloop/20) == 0:
            print ' call %d node %d checking get((%d,%d),(%d,%d)) total %f' % (
                    loop, me, ilo, ihi, jlo, jhi, nwords)
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Ejemplo n.º 6
0
def check_gather(gatype):
    if 0 == me:
        print '> Checking gather (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        itmp = None
        if MIRROR:
            itmp = random.randint(0,lprocs-1)
        else:
            itmp = random.randint(0,nproc-1)
        if itmp == me:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                #if ijv[loop,0] > ijv[loop,1]:
                #    ijv[loop,:] = ijv[loop,::-1] # reverse
            result = ga.gather(g_a, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not result[loop] == value:
                    ga.error('gather failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Ejemplo n.º 7
0
def time_put(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ga.put over a grid of patches.

    Patches of side ``chunk`` are taken from the 2-D region bounded by ``lo``
    and ``hi``; consecutive patch origins are ``chunk + jump`` apart in each
    dimension.

    g_a    -- handle of the global array being written
    lo, hi -- two-element lower / upper bounds of the region
    buf    -- local 2-D buffer the patch data is sliced from
    chunk  -- patch edge length
    jump   -- extra gap between patches (destroys locality of reference)
    local  -- if true, each patch is put at its own coordinates; otherwise
              the target is shifted by the region's row extent, column
              extent, or both, cycling with the call count

    Returns 0.0 when the region is too small to hold a single patch, rather
    than dividing by a zero call count.
    """
    count = 0
    rows = hi[0]-lo[0]
    cols = hi[1]-lo[1]
    shifti = [rows, 0, rows]
    shiftj = [0, cols, cols]
    seconds = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0]-chunk-jump+1, chunk+jump):
        ihi = ilo + chunk
        for jlo in range(lo[1], hi[1]-chunk-jump+1, chunk+jump):
            jhi = jlo + chunk
            count += 1
            if local:
                llo = [ilo,jlo]
                lhi = [ihi,jhi]
                ga.put(g_a, buf[ga.zip(llo,lhi)], llo, lhi)
            else:
                index = count%3
                llo = [ilo+shifti[index],jlo+shiftj[index]]
                lhi = [ihi+shifti[index],jhi+shiftj[index]]
                ga.put(g_a, buf[ilo:ihi,jlo:jhi], llo, lhi)
    seconds = time.time() - seconds
    if count == 0:
        # no patch fitted into the region, so there is nothing to average
        return 0.0
    return seconds/count
def set_boundary_conditions_put(g_a):
    """Load the initial grid, with fixed boundary values, into ``g_a``.

    Only the process with rank 0 builds and puts the grid: a (dim, dim)
    float32 array of zeros whose top row, left column and right column are
    set in that order, so later assignments win at the shared corners.  The
    whole array is written, so any previous interior values are overwritten.
    Every process then synchronizes.
    """
    if rank == 0:
        grid = np.zeros((dim,dim), dtype=np.float32)
        last = grid.shape[0] - 1
        grid[0,:] = 100    # top row
        grid[:,0] = 75     # left column
        grid[:,last] = 50  # right column
        ga.put(g_a, grid)
    ga.sync()
Ejemplo n.º 9
0
def check_dot(gatype):
    if 0 == me:
        print '> Checking dot ...',
    np.random.seed(12345) # everyone has same seed
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.random.random_sample((n,n))
    if MIRROR:
        if 0 == iproc:
            ga.put(g_b, b)
            ga.put(g_a, a)
    else:
        if 0 == me:
            ga.put(g_b, b)
            ga.put(g_a, a)
    ga.sync()
    sum1 = np.sum(a*b)
    sum2 = ga.dot(g_a, g_b)
    if mismatch(sum1, sum2):
        ga.error('dot wrong %s != %s' % (sum1, sum2))
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Ejemplo n.º 10
0
def check_copy(gatype):
    if 0 == me:
        print '> Checking copy ...',
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.copy(g_a, g_b)
    if not np.all(a == ga.get(g_b)):
        ga.error('copy failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Ejemplo n.º 11
0
def check_scale(gatype):
    if 0 == me:
        print '> Checking scale ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ga.scale(g_a, 0.123)
    a *= 0.123
    if np.any(np.vectorize(mismatch)(a,ga.get(g_a))):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs*TOTALELEMS + nprocs/2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo,hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Ejemplo n.º 13
0
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs * TOTALELEMS + nprocs / 2]
    chunk = [TOTALELEMS]  # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo, hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Ejemplo n.º 14
0
def time_put1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ga.put over 1-D patches.

    Patches of length ``chunk`` are taken from the range bounded by ``lo[0]``
    and ``hi[0]``, with ``chunk + jump`` between consecutive patch origins.

    g_a    -- handle of the global array being written
    lo, hi -- one-element lower / upper bounds of the range
    buf    -- local buffer the patch data is sliced from
    chunk  -- patch length
    jump   -- extra gap between patches (destroys locality of reference)
    local  -- if true, each patch is put at its own position; otherwise the
              target is shifted by one, two or three times the range length,
              cycling with the call count

    Returns 0.0 when the range is too small to hold a single patch, rather
    than dividing by a zero call count.
    """
    count = 0
    rows = hi[0]-lo[0]
    shift = [rows, 2*rows, 3*rows]
    seconds = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0]-chunk-jump+1, chunk+jump):
        ihi = ilo+chunk
        count += 1
        if local:
            ga.put(g_a, buf[ilo:ihi], [ilo], [ihi])
        else:
            index = count%3
            llo = ilo+shift[index]
            lhi = ihi+shift[index]
            ga.put(g_a, buf[ilo:ihi], llo, lhi)
    seconds = time.time() - seconds
    if count == 0:
        # no patch fitted into the range, so there is nothing to average
        return 0.0
    return seconds/count
Ejemplo n.º 15
0
def time_put1(g_a, lo, hi, buf, chunk, jump, local):
    """Return the average wall-clock seconds per ga.put over 1-D patches.

    Patches of length ``chunk`` are taken from the range bounded by ``lo[0]``
    and ``hi[0]``, with ``chunk + jump`` between consecutive patch origins.

    g_a    -- handle of the global array being written
    lo, hi -- one-element lower / upper bounds of the range
    buf    -- local buffer the patch data is sliced from
    chunk  -- patch length
    jump   -- extra gap between patches (destroys locality of reference)
    local  -- if true, each patch is put at its own position; otherwise the
              target is shifted by one, two or three times the range length,
              cycling with the call count

    Returns 0.0 when the range is too small to hold a single patch, rather
    than dividing by a zero call count.
    """
    count = 0
    rows = hi[0] - lo[0]
    shift = [rows, 2 * rows, 3 * rows]
    seconds = time.time()
    # distance between consecutive patches increased by jump
    # to destroy locality of reference
    for ilo in range(lo[0], hi[0] - chunk - jump + 1, chunk + jump):
        ihi = ilo + chunk
        count += 1
        if local:
            ga.put(g_a, buf[ilo:ihi], [ilo], [ihi])
        else:
            index = count % 3
            llo = ilo + shift[index]
            lhi = ihi + shift[index]
            ga.put(g_a, buf[ilo:ihi], llo, lhi)
    seconds = time.time() - seconds
    if count == 0:
        # no patch fitted into the range, so there is nothing to average
        return 0.0
    return seconds / count
Ejemplo n.º 16
0
def check_accumulate_disjoint(gatype):
    """Each node accumulates into disjoint sections of the array."""
    if 0 == me:
        print '> Checking disjoint accumulate ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.fromfunction(lambda i,j: i+j+2, (n,n), dtype=ga.dtype(gatype))
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            x = 10.0
            lo = [i,j]
            hi = [min(i+inc,n), min(j+inc,n)]
            piece = b[ga.zip(lo,hi)]
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                ga.acc(g_a, piece, lo, hi, x)
            ga.sync()
            ij += 1
            # each process applies all updates to its local copy
            a[ga.zip(lo,hi)] += x * piece
    ga.sync()
    # all nodes check all of a
    if not np.all(ga.get(g_a) == a):
        ga.error('acc failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Ejemplo n.º 17
0
def matrix_multiply():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS]*NDIM
    chunk = [TOTALELEMS/nprocs-1]*NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c: ga.eror("duplicate failed")
    if not me: print "Created Arrays B and C"

    # Initialize data in matrices a and b.
    if not me: print "Initializing matrix A and B"
    a = np.random.rand(*dims)*29
    b = np.random.rand(*dims)*37

    # Copy data to global arrays g_a and g_b.
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo,hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0],0), (hi[0],dims[0]))
    b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1]))

    # Do local matrix multiplication and store the result in local
    # buffer c. Start by evaluating the transpose of b.
    btrns = b.transpose()

    # Multiply a and b to get c.
    c = np.dot(a,b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
Ejemplo n.º 18
0
def check_add(gatype):
    if 0 == me:
        print '> Checking add ...',
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    b = create_local_b(gatype)
    alpha = None
    beta = None
    if 0 == me:
        ga.put(g_a, a)
    ga.sync();
    np.random.seed(12345) # everyone has same seed
    if gatype in [ga.C_SCPL,ga.C_DCPL]:
        b_real = np.random.random_sample((n,n))
        b_imag = np.random.random_sample((n,n))
        b[:] = np.vectorize(complex)(b_real,b_imag)
        alpha = complex(0.1,-0.1)
        beta = complex(0.9,-0.9)
    else:
        b[:] = np.random.random_sample((n,n))
        alpha = 0.1
        beta = 0.9
    a = alpha*a + beta*b
    if MIRROR:
        if 0 == iproc:
            ga.put(g_b, b)
    else:
        if 0 == me:
            ga.put(g_b, b)
    ga.sync()
    ga.add(g_a, g_b, g_b, alpha, beta)
    b = ga.get(g_b, buffer=b)
    if np.any(np.vectorize(mismatch)(b,a)):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Ejemplo n.º 19
0
    print "\nSuccessfully created Global Array"

# Initialize data in GA. Find data owned by neighboring processor

nghbr = (me+1)%nprocs  # next rank, wrapping back to 0 after the last one
lo,hi = ga.distribution(g_a, nghbr)

# Create data in local buffer, assign unique value for each data element
# NOTE(review): hi-lo relies on lo and hi supporting element-wise
# subtraction (presumably NumPy arrays) -- confirm.
patch_shape = hi-lo
a_buf = np.fromfunction(lambda i,j: j*NSIZE + i,
        patch_shape, dtype=ga.dtype(ga.C_INT))
# offset the patch-local values by terms taken from the patch's lower corner
a_buf += lo[1,np.newaxis]
a_buf += lo[np.newaxis,0]*dims[0]

# Copy local data to GA
ga.put(g_a, a_buf, lo, hi)
ga.sync()
if me == 0:
    print "\nCopied values into Global Array from local buffer\n"

# Check data in GA to see if it is correct. Find data owned by this
# processor and then copy it to local buffer
lo,hi = ga.distribution(g_a, me)
b_buf = ga.get(g_a, lo, hi)
if me == 0:
    print "\nCopied values from Global Array to local buffer\n"

# Verify that data is correct
# Rebuild the patch-local part of the expected values for the patch this
# process owns, using the same formula as for a_buf.
patch_shape = hi-lo
c_buf = np.fromfunction(lambda i,j: j*NSIZE + i,
        patch_shape, dtype=ga.dtype(ga.C_INT))
Ejemplo n.º 20
0
"""Use ga.access() to sum locally per SMP node."""

import mpi4py.MPI
from ga4py import ga
import numpy as np

# Okay, we create the global array
# (4-dimensional, C_DBL elements, extents 3 x 4 x 5 x 6)
g_a = ga.create(ga.C_DBL, (3, 4, 5, 6))
# NOTE(review): world_id is not defined in the visible part of this script --
# confirm it is assigned before this point.
if world_id == 0:
    # a single process writes 0 .. 3*4*5*6-1 into the array
    ga.put(g_a, np.arange(3 * 4 * 5 * 6))
# every process waits here, so the data is in place before it is used
ga.sync()

# You're on your own!
Ejemplo n.º 21
0
"""Use ga.access() to sum locally per SMP node."""

import mpi4py.MPI
from ga4py import ga
import numpy as np

# Okay, we create the global array
# (4-dimensional, C_DBL elements, extents 3 x 4 x 5 x 6)
g_a = ga.create(ga.C_DBL, (3,4,5,6))
# NOTE(review): world_id is not defined in the visible part of this script --
# confirm it is assigned before this point.
if world_id == 0:
    # a single process writes 0 .. 3*4*5*6-1 into the array
    ga.put(g_a, np.arange(3*4*5*6))
# every process waits here, so the data is in place before it is used
ga.sync()

# You're on your own!
# Row i of frames_seg holds the pair (i * bsize, (i + 1) * bsize) for block i.
frames_seg = np.zeros([size, 2], dtype=int)
for iblock in range(size):
    frames_seg[iblock, :] = iblock * bsize, (iblock + 1) * bsize

# Map each block index to its row of frames_seg.
d = dict([key, frames_seg[key]] for key in range(size))

# The pair belonging to this process's rank.
start, stop = d[rank][0], d[rank][1]

# Block-RMSD in Parallel
start3 = time.time()
out = block_rmsd(index, topology, trajectory, xref0)

# Communication
start4 = time.time()
print(np.shape(out[0]), start, stop)
# Write this rank's result into rows start..stop, columns 0..2 of g_a.
# NOTE(review): assumes out[0] matches that patch shape -- confirm.
ga.put(g_a, out[0], (start, 0), (stop, 2))

start5 = time.time()
# Only rank 0 reads the array back; lo=None, hi=None are passed explicitly.
# NOTE(review): no ga.sync() is visible between the puts above and this get
# -- confirm that other ranks' data is guaranteed to have arrived.
if rank == 0:
    buf = ga.get(g_a, lo=None, hi=None)

start6 = time.time()

# Receive buffer for the gather below; only rank 0 allocates one.
# NOTE(review): the (size, 5) shape presumes out[1:] holds five values per
# rank -- confirm.
if rank == 0:
    data = np.zeros([size, 5], dtype=float)
else:
    data = None

comm.Gather(np.array(out[1:], dtype=float), data, root=0)

start7 = time.time()
Ejemplo n.º 23
0
    v = np.dot(a,b)
    val = int(np.abs(np.sum(c-v))>0.0001)
    val = ga.gop_add(val)
    return val == 0

if __name__ == '__main__':
    if nproc > MULTIPLIER**3:
        if 0 == me:
            print "You must use less than %s processors" % (MULTIPLIER**3+1)
    else:
        g_a = ga.create(ga.C_DBL, [N,N])
        g_b = ga.create(ga.C_DBL, [N,N])
        g_c = ga.create(ga.C_DBL, [N,N])
        # put some fake data into input arrays A and B
        if me == 0:
            ga.put(g_a, np.random.random(N*N))
            ga.put(g_b, np.random.random(N*N))
        ga.sync()
        if me == 0:
            print "srumma...",
        srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER)
        if me == 0:
            print "done"
        if me == 0:
            print "verifying using ga.gemm...",
        ok = verify_using_ga(g_a, g_b, g_c)
        if me == 0:
            if ok:
                print "OKAY"
            else:
                print "FAILED"
Ejemplo n.º 24
0
def load_ga(handle, h0, num_m, num_k):
    """Put the first num_m*num_k elements of h0 into the global array handle.

    NOTE(review): the ``else`` branch, which would put an arange of float64
    values instead, can never run because the condition is the constant True.
    """
    if True:
        ga.put(handle, h0[:num_m*num_k])
    else:
        a = np.arange(num_m*num_k, dtype=np.float64)
        ga.put(handle, a)
    # NOTE(review): ``value`` is never assigned in this function, so unless a
    # module-level ``value`` exists this line raises NameError.  It looks
    # like it belongs to a different function -- confirm.
    return value < EPSILON

# create GA, distribute entire rows
g_a = ga.create(ga.C_FLOAT, (dim,dim), chunk=(0,dim))
# create a duplicate GA for the convergence test
g_b = ga.duplicate(g_a)

# process 0 initializes global array
# Note: alternatively, each process could initialize its local data using
# ga.access() and ga.distribution()
# every process allocates the zero array, but only rank 0 fills and puts it
a = np.zeros((dim,dim), dtype=np.float32)
if rank == 0:
    # assignment order matters at the shared corners: later lines win
    a[0,:] = 100 #top row
    a[:,0] = 75 #left column
    a[:,a.shape[0] - 1] = 50 #right column (shape[0] equals shape[1]: a is square)
    ga.put(g_a, a)
ga.sync()

# which piece of array do I own?
# note that rhi and chi follow python range conventions i.e. [lo,hi)
(rlo,clo),(rhi,chi) = ga.distribution(g_a)

iteration = 0
start = ga.wtime()
# NOTE(review): no exit condition is visible in the part of the loop shown
# here -- confirm where it terminates.
while True:
    iteration += 1
    if iteration % HOW_MANY_STEPS_BEFORE_CONVERGENCE_TEST == 0:
        # check for convergence will occur, so make a copy of the GA
        ga.sync()
        ga.copy(g_a, g_b)
    # the iteration
Ejemplo n.º 26
0
    return val == 0


if __name__ == '__main__':
    if nproc > MULTIPLIER**3:
        if 0 == me:
            print "You must use less than %s processors" % (MULTIPLIER**3 + 1)
    else:
        g_a = ga.create(ga.C_DBL, [N, N])
        g_b = ga.create(ga.C_DBL, [N, N])
        g_c = ga.create(ga.C_DBL, [N, N])
        g_counter = ga.create(ga.C_INT, [1])
        ga.zero(g_counter)
        # put some fake data into input arrays A and B
        if me == 0:
            ga.put(g_a, np.random.random(N * N))
            ga.put(g_b, np.random.random(N * N))
        ga.sync()
        if me == 0:
            print "srumma...",
        srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER, g_counter)
        if me == 0:
            print "done"
        if me == 0:
            print "verifying using ga.gemm...",
        ok = verify_using_ga(g_a, g_b, g_c)
        if me == 0:
            if ok:
                print "OKAY"
            else:
                print "FAILED"