def check_gather(gatype):
    if 0 == me:
        print '> Checking gather (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        itmp = None
        if MIRROR:
            itmp = random.randint(0,lprocs-1)
        else:
            itmp = random.randint(0,nproc-1)
        if itmp == me:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                #if ijv[loop,0] > ijv[loop,1]:
                #    ijv[loop,:] = ijv[loop,::-1] # reverse
            result = ga.gather(g_a, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not result[loop] == value:
                    ga.error('gather failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def check_scatter(gatype):
    nptype = ga.dtype(gatype)
    if 0 == me:
        print '> Checking scatter (might be slow)...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ijv = np.zeros((m,2), dtype=np.int64)
    v = np.zeros(m, dtype=nptype)
    random.seed(ga.nodeid()*51 + 1) # different seed for each proc
    for j in range(10):
        check = None
        if MIRROR:
            check = random.randint(0,lprocs-1) == iproc
        else:
            check = random.randint(0,nproc-1) == me
        if check:
            for loop in range(m):
                ijv[loop,:] = (random.randint(0,n-1),random.randint(0,n-1))
                v[loop] = ijv[loop,0]+ijv[loop,1]
            ga.scatter(g_a, v, ijv)
            for loop in range(m):
                value = ga.get(g_a, ijv[loop], ijv[loop]+1).flatten()
                if not v[loop] == value:
                    ga.error('scatter failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def check_dot(gatype):
    if 0 == me:
        print '> Checking dot ...',
    np.random.seed(12345) # everyone has same seed
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.random.random_sample((n,n))
    if MIRROR:
        if 0 == iproc:
            ga.put(g_b, b)
            ga.put(g_a, a)
    else:
        if 0 == me:
            ga.put(g_b, b)
            ga.put(g_a, a)
    ga.sync()
    sum1 = np.sum(a*b)
    sum2 = ga.dot(g_a, g_b)
    if mismatch(sum1, sum2):
        ga.error('dot wrong %s != %s' % (sum1, sum2))
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Exemple #4
0
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    task_list = get_task_list(chunk_size, multiplier)
    ### get first integer from g_counter and assign to 'task_id'
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    ### get next integer from g_counter and assign to 'task_id'
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
        ### get next integer from g_counter and assign to 'task_id'
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Exemple #5
0
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3 // nproc
    start = me * ntasks
    stop = (me + 1) * ntasks
    if me + 1 == nproc:
        stop += multiplier**3 % nproc
    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start + 1, stop):
        task_next = task_list[i]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def check_get(gatype):
    """check nloop random gets from each node"""
    if 0 == me:
        print '> Checking random get (%d calls)...' % nloop
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    nwords = 0
    random.seed(ga.nodeid()*51+1) # different seed for each proc
    for loop in range(nloop):
        ilo,ihi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if ihi < ilo: ilo,ihi = ihi,ilo
        jlo,jhi = random.randint(0, nloop-1),random.randint(0, nloop-1)
        if jhi < jlo: jlo,jhi = jhi,jlo
        nwords += (ihi-ilo+1)*(jhi-jlo+1)
        ihi += 1
        jhi += 1
        result = ga.get(g_a, (ilo,jlo), (ihi,jhi))
        if not np.all(result == a[ilo:ihi,jlo:jhi]):
            ga.error('random get failed')
        if 0 == me and loop % max(1,nloop/20) == 0:
            print ' call %d node %d checking get((%d,%d),(%d,%d)) total %f' % (
                    loop, me, ilo, ihi, jlo, jhi, nwords)
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Exemple #7
0
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    task_id = ga.read_inc(g_counter, 0)
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    task_id = ga.read_inc(g_counter, 0)
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
        task_id = ga.read_inc(g_counter, 0)
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def check_put_disjoint(gatype):
    """each node fills in disjoint sections of the array"""
    if 0 == me:
        print '> Checking disjoint put ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                lo = [i,j]
                hi = [min(i+inc,n), min(j+inc,n)]
                piece = a[ga.zip(lo,hi)]
                ga.put(g_a, piece, lo, hi)
                # the following check is not part of the original test.F
                result = ga.get(g_a, lo, hi)
                if not np.all(result == piece):
                    ga.error("put followed by get failed", 1)
            ga.sync()
            ij += 1
    ga.sync()
    # all nodes check all of a
    b = ga.get(g_a)
    if not np.all(a == b):
        ga.error('put failed, exiting')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def create_global_array(gatype):
    if NEW_API:
        g_a = ga.create_handle()
        ga.set_data(g_a, [n,n], gatype)
        ga.set_array_name(g_a, 'a')
        if USE_RESTRICTED:
            num_restricted = nproc/2 or 1
            restricted_list = np.arange(num_restricted) + num_restricted/2
            ga.set_restricted(g_a, restricted_list)
        if BLOCK_CYCLIC:
            if USE_SCALAPACK_DISTR:
                if nproc % 2 == 0:
                    ga.error('Available procs must be divisible by 2',nproc)
                ga.set_block_cyclic_proc_grid(g_a, block_size, proc_grid)
            else:
                ga.set_block_cyclic(g_a, block_size)
        if MIRROR:
            p_mirror = ga.pgroup_get_mirror()
            ga.set_pgroup(g_a, p_mirror)
        ga.allocate(g_a)
    else:
        if MIRROR:
            p_mirror = ga.pgroup_get_mirror()
            ga.create_config(gatype, (n,n), 'a', None, p_mirror)
        else:
            g_a = ga.create(gatype, (n,n), 'a')
    if 0 == g_a:
        ga.error('ga.create failed')
    if MIRROR:
        lproc = me - ga.cluster_procid(inode, 0)
        lo,hi = ga.distribution(g_a, lproc)
    else:
        lo,hi = ga.distribution(g_a, me)
    ga.sync()
    return g_a
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    task_id = ga.read_inc(g_counter, 0)
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    task_id = ga.read_inc(g_counter, 0)
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
        task_id = ga.read_inc(g_counter, 0)
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Exemple #11
0
def check_fence_and_lock(gatype):
    if 0 == me:
        print '> Checking ga.fence and ga.lock',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    if not ga.create_mutexes(1):
        ga.error('ga.create_mutexes failed')
    if n < 2:
        ga.error('insufficient n to test ga.fence', n)
    ga.lock(0)
    a = ga.get(g_a) # get original values
    a[:,0] += 1 # add my contribution
    # need to use fence to assure that coms complete before leaving
    # critical section
    ga.init_fence()
    ga.put(g_a, a)
    ga.fence()
    ga.unlock(0)
    if not ga.destroy_mutexes():
        ga.error('mutex not destroyed')
    ga.sync()
    if 0 == me:
        a = ga.get(g_a)
        if not np.all(a[:,0] == nproc):
            ga.error('fence failed')
    if 0 == me:
        print 'OK'
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3 // nproc
    start = me*ntasks
    stop = (me+1)*ntasks
    if me+1 == nproc:
        stop += multiplier**3 % nproc
    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start+1,stop):
        task_next = task_list[i]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
def set_boundary_conditions_put(g_a):
    # process 0 initializes global array
    # this would only set the initial conditions since we are putting an entire
    # zeros array with the outer elements changed
    if rank == 0:
        a = np.zeros((dim,dim), dtype=np.float32)
        a[0,:] = 100 #top row
        a[:,0] = 75 #left column
        a[:,a.shape[0] - 1] = 50 #right column
        ga.put(g_a, a)
    ga.sync()
Exemple #14
0
def matrix_multiply():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS]*NDIM
    chunk = [TOTALELEMS/nprocs-1]*NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c: ga.eror("duplicate failed")
    if not me: print "Created Arrays B and C"

    # Initialize data in matrices a and b.
    if not me: print "Initializing matrix A and B"
    a = np.random.rand(*dims)*29
    b = np.random.rand(*dims)*37

    # Copy data to global arrays g_a and g_b.
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo,hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0],0), (hi[0],dims[0]))
    b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1]))

    # Do local matrix multiplication and store the result in local
    # buffer c. Start by evaluating the transpose of b.
    btrns = b.transpose()

    # Multiply a and b to get c.
    c = np.dot(a,b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
def set_boundary_conditions_access(g_a):
    # this will reset the outer (ghost) elements back to the boundary cond.
    if rlo == 0 or clo == 0 or chi == dim:
        a = ga.access_ghosts(g_a)
        if rlo == 0:
            a[0,:] = 100 # I own a top row
        if clo == 0:
            a[:,0] = 75 # I own a left column
        if chi == dim:
            a[:,-1] = 50 # I own a right column
        ga.release_update_ghosts(g_a)
    ga.sync()
Exemple #16
0
def check_scale(gatype):
    if 0 == me:
        print '> Checking scale ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    ga.scale(g_a, 0.123)
    a *= 0.123
    if np.any(np.vectorize(mismatch)(a,ga.get(g_a))):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs*TOTALELEMS + nprocs/2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo,hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs * TOTALELEMS + nprocs / 2]
    chunk = [TOTALELEMS]  # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo, hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Exemple #19
0
def TestPutGetAcc(g_a, n, chunk, buf, lo, hi, local):
    if 0 == me:
        print ''
        if local:
            print 'Local 2-D Array Section'
        else:
            print 'Remote 2-D Array Section'
        print '%15s %19s %19s %19s' % ('section', 'get', 'put', 'accumulate')
        print '%7s %7s %9s %9s %9s %9s %9s %9s' % (
                'bytes','dim','usec','MB/s','usec','MB/s','usec','MB/s')
    ga.sync()
    bytes = 0
    jump = 0
    num_chunks = len(chunk)
    for loop in range(num_chunks):
        tg = 0.0
        tp = 0.0
        ta = 0.0
        bytes = 8*chunk[loop]*chunk[loop] # how much data is accessed
        jump = n/(60*(loop+1)) # jump between consecutive patches
        if loop+1 == num_chunks:
            jump = 0
        # everybody touches own data
        ga.fill(g_a, me*(loop+1), [0,0], [n,n])
        if 0 == me:
            tg = time_get(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        # everybody touches own data
        ga.fill(g_a, me*(loop+1), [0,0], [n,n])
        if 0 == me:
            tp = time_put(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        # everybody touches own data
        ga.fill(g_a, me*(loop+1), [0,0], [n,n])
        if 0 == me:
            ta = time_acc(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        if 0 == me:
            print '%7d %7d %8.3e %8.3e %8.3e %8.3e %8.3e %8.3e' % (
                    bytes, chunk[loop],
                    tg/1e-6, 1e-6*bytes/tg,
                    tp/1e-6, 1e-6*bytes/tp,
                    ta/1e-6, 1e-6*bytes/ta)
Exemple #20
0
def TestPutGetAcc(g_a, n, chunk, buf, lo, hi, local):
    if 0 == me:
        print ''
        if local:
            print 'Local 2-D Array Section'
        else:
            print 'Remote 2-D Array Section'
        print '%15s %19s %19s %19s' % ('section', 'get', 'put', 'accumulate')
        print '%7s %7s %9s %9s %9s %9s %9s %9s' % (
            'bytes', 'dim', 'usec', 'MB/s', 'usec', 'MB/s', 'usec', 'MB/s')
    ga.sync()
    bytes = 0
    jump = 0
    num_chunks = len(chunk)
    for loop in range(num_chunks):
        tg = 0.0
        tp = 0.0
        ta = 0.0
        bytes = 8 * chunk[loop] * chunk[loop]  # how much data is accessed
        jump = n / (60 * (loop + 1))  # jump between consecutive patches
        if loop + 1 == num_chunks:
            jump = 0
        # everybody touches own data
        ga.fill(g_a, me * (loop + 1), [0, 0], [n, n])
        if 0 == me:
            tg = time_get(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        # everybody touches own data
        ga.fill(g_a, me * (loop + 1), [0, 0], [n, n])
        if 0 == me:
            tp = time_put(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        # everybody touches own data
        ga.fill(g_a, me * (loop + 1), [0, 0], [n, n])
        if 0 == me:
            ta = time_acc(g_a, lo, hi, buf, chunk[loop], jump, local)
        else:
            time.sleep(1)
        if 0 == me:
            print '%7d %7d %8.3e %8.3e %8.3e %8.3e %8.3e %8.3e' % (
                bytes, chunk[loop], tg / 1e-6, 1e-6 * bytes / tg, tp / 1e-6,
                1e-6 * bytes / tp, ta / 1e-6, 1e-6 * bytes / ta)
Exemple #21
0
def verify(g_a, g_b, g_c):
    g_chk = ga.duplicate(g_a, "array check")
    if not g_chk: ga.error("duplicate failed")
    ga.sync()

    ga.gemm(False, False, TOTALELEMS, TOTALELEMS, TOTALELEMS, 1.0, g_a, g_b,
            0.0, g_chk);
    ga.sync()

    ga.add(g_c, g_chk, g_chk, 1.0, -1.0)
    rchk = ga.dot(g_chk, g_chk)

    if not me:
        print "Normed difference in matrices: %12.4f" % rchk
        if not (-TOLERANCE < rchk < TOLERANCE):
            ga.error("Matrix multiply verify failed")
        else:
            print "Matrix Multiply OK"

    ga.destroy(g_chk)
Exemple #22
0
def check_accumulate_overlap(gatype):
    if 0 == me:
        print '> Checking overlapping accumulate ...',
    g_a = create_global_array(gatype)
    ga.zero(g_a)
    ga.acc(g_a, [1], (n/2,n/2), (n/2+1,n/2+1), 1)
    ga.sync()
    if MIRROR:
        if 0 == iproc:
            x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - lprocs)
            if not 0 == x:
                ga.error('overlapping accumulate failed -- expected %s got %s'%(
                        x, lprocs))
    else:
        if 0 == me:
            x = abs(ga.get(g_a, (n/2,n/2), (n/2+1,n/2+1))[0,0] - nproc)
            if not 0 == x:
                ga.error('overlapping accumulate failed -- expected %s got %s'%(
                        x, nproc))
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
Exemple #23
0
def check_add(gatype):
    if 0 == me:
        print '> Checking add ...',
    g_a = create_global_array(gatype)
    g_b = create_global_array(gatype)
    a = create_local_a(gatype)
    b = create_local_b(gatype)
    alpha = None
    beta = None
    if 0 == me:
        ga.put(g_a, a)
    ga.sync();
    np.random.seed(12345) # everyone has same seed
    if gatype in [ga.C_SCPL,ga.C_DCPL]:
        b_real = np.random.random_sample((n,n))
        b_imag = np.random.random_sample((n,n))
        b[:] = np.vectorize(complex)(b_real,b_imag)
        alpha = complex(0.1,-0.1)
        beta = complex(0.9,-0.9)
    else:
        b[:] = np.random.random_sample((n,n))
        alpha = 0.1
        beta = 0.9
    a = alpha*a + beta*b
    if MIRROR:
        if 0 == iproc:
            ga.put(g_b, b)
    else:
        if 0 == me:
            ga.put(g_b, b)
    ga.sync()
    ga.add(g_a, g_b, g_b, alpha, beta)
    b = ga.get(g_b, buffer=b)
    if np.any(np.vectorize(mismatch)(b,a)):
        ga.error('add failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
    ga.destroy(g_b)
Exemple #24
0
def check_accumulate_disjoint(gatype):
    """Each node accumulates into disjoint sections of the array."""
    if 0 == me:
        print '> Checking disjoint accumulate ...',
    g_a = create_global_array(gatype)
    a = create_local_a(gatype)
    b = np.fromfunction(lambda i,j: i+j+2, (n,n), dtype=ga.dtype(gatype))
    if 0 == me:
        ga.put(g_a, a)
    ga.sync()
    inc = (n-1)/20 + 1
    ij = 0
    for i in range(0,n,inc):
        for j in range(0,n,inc):
            x = 10.0
            lo = [i,j]
            hi = [min(i+inc,n), min(j+inc,n)]
            piece = b[ga.zip(lo,hi)]
            check = False
            if MIRROR:
                check = ij % lprocs == iproc
            else:
                check = ij % nproc == me
            if check:
                ga.acc(g_a, piece, lo, hi, x)
            ga.sync()
            ij += 1
            # each process applies all updates to its local copy
            a[ga.zip(lo,hi)] += x * piece
    ga.sync()
    # all nodes check all of a
    if not np.all(ga.get(g_a) == a):
        ga.error('acc failed')
    if 0 == me:
        print 'OK'
    ga.destroy(g_a)
me = ga.nodeid()
nproc = ga.nnodes()

def parallel_task():
    me = ga.pgroup_nodeid()
    nproc = ga.pgroup_nnodes()
    if not me:
        print "This is process 0 on group %s" % ga.pgroup_get_default()
    g_a = ga.create(ga.C_DBL, (3,4,5))
    ga.randomize(g_a)
    if me == 0:
        print np.sum(ga.access(g_a))

midproc = nproc//2
proclist_first = range(0,midproc)
proclist_last  = range(midproc,nproc)
group_id_first = ga.pgroup_create(proclist_first)
group_id_last  = ga.pgroup_create(proclist_last)
if me in proclist_first:
    ga.pgroup_set_default(group_id_first)
    parallel_task()
ga.pgroup_set_default(ga.pgroup_get_world())
ga.sync()
if me in proclist_last:
    ga.pgroup_set_default(group_id_last)
    parallel_task()
ga.pgroup_set_default(ga.pgroup_get_world())
ga.sync()
if not me:
    print "All done with groups"
Exemple #26
0
"""Use ga.access() to sum locally per SMP node."""

import mpi4py.MPI
from ga4py import ga
import numpy as np

# Okay, we create the global array
g_a = ga.create(ga.C_DBL, (3, 4, 5, 6))
if world_id == 0:
    ga.put(g_a, np.arange(3 * 4 * 5 * 6))
ga.sync()

# You're on your own!
def print_sync(obj):
    for proc in range(size):
        if rank == proc:
            print "%d: %s" % (proc, obj)
        ga.sync()
Exemple #28
0
def main():
    # TODO there's got to be a loopless, more pythonic way to do this
    ii = 0
    for i in range(num1*num1):
        ii += 1
        if ii > num1:
            ii = 0
        h0[i] = ii
    # compute times assuming 500 mflops and 5 second target time
    # ntimes = max(3.0, 5.0/(4.0-9*num**3))
    ntimes = 5

    for ii in range(howmany):
        num_m = nums_m[ii]
        num_n = nums_n[ii]
        num_k = nums_k[ii]
        a = 0.5/(num_m*num_n)
        if num_m > nummax or num_n > nummax or num_k > nummax:
            ga.error('Insufficient memory: check nummax')
        
        if BLOCK_CYCLIC:
            block_size = [128,128]
            g_c = ga.create_handle()
            ga.set_data(g_c, (num_m,num_n), ga.C_DBL)
            ga.set_array_name(g_c, 'g_c')
            ga.set_block_cyclic(g_c, block_size)
            if not ga.allocate(g_c):
                ga.error('create failed')
            block_size = [128,128]
            g_b = ga.create_handle()
            ga.set_data(g_b, (num_k,num_n), ga.C_DBL)
            ga.set_array_name(g_b, 'g_b')
            ga.set_block_cyclic(g_b, block_size)
            if not ga.allocate(g_b):
                ga.error('create failed')
            block_size = [128,128]
            g_a = ga.create_handle()
            ga.set_data(g_a, (num_m,num_k), ga.C_DBL)
            ga.set_array_name(g_a, 'g_a')
            ga.set_block_cyclic(g_a, block_size)
            if not ga.allocate(g_a):
                ga.error('create failed')
        else:
            g_a = ga.create(ga.C_DBL, (num_m,num_k), 'g_a')
            g_b = ga.create(ga.C_DBL, (num_k,num_n), 'g_b')
            g_c = ga.create(ga.C_DBL, (num_m,num_n), 'g_c')
            for handle in [g_a,g_b,g_c]:
                if 0 == handle:
                    ga.error('create failed')

        # initialize matrices A and B
        if 0 == me:
            load_ga(g_a, h0, num_m, num_k)
            load_ga(g_b, h0, num_k, num_n)
        ga.zero(g_c)
        ga.sync()

        if 0 == me:
            print '\nMatrix Multiplication C = A[%d,%d] x B[%d,%d]\n' % (
                    num_m, num_k, num_k, num_n)
            print ' %4s  %12s  %12s  %7s  %7s'%(
                    "Run#", "Time (seconds)", "mflops/proc",
                    "A trans", "B trans")
        avg_t[:] = 0
        avg_mf[:] = 0
        for itime in range(ntimes):
            for i in range(ntrans):
                ga.sync()
                ta = transa[i]
                tb = transb[i]
                t1 = time.time()
                ga.gemm(ta,tb,num_m,num_n,num_k,1,g_a,g_b,0,g_c)
                t1 = time.time() - t1
                if 0 == me:
                    mf = 2*num_m*num_n*num_k/t1*10**-6/nproc
                    avg_t[i] += t1
                    avg_mf[i] += mf
                    print ' %4d  %12.4f  %12.1f  %7s  %7s'%(
                            itime+1, t1, mf, ta, tb)
                    if VERIFY and itime == 0:
                        verify_ga_gemm(ta, tb, num_m, num_n, num_k,
                                1.0, g_a, g_b, 0.0, g_c)
        if 0 == me:
            print ''
            for i in range(ntrans):
                print 'Average: %12.4f seconds %12.1f mflops/proc %s %s'%(
                            avg_t[i]/ntimes, avg_mf[i]/ntimes,
                            transa[i], transb[i])
            if VERIFY:
                print 'All ga.gemms are verified...O.K.'