Example #1
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    # build the full task list; tasks are claimed dynamically via the shared counter g_counter
    task_list = get_task_list(chunk_size, multiplier)
    task_id = ga.read_inc(g_counter, 0)
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    task_id = ga.read_inc(g_counter, 0)
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
        task_id = ga.read_inc(g_counter, 0)
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
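The counter-based versions of srumma (Examples #1, #3, and #4) expect g_counter to be a one-element integer global array that has been zeroed before the call. A minimal driver sketch, assuming ga.zero is available alongside the ga.create/ga.sync/ga.destroy calls shown in these examples (the array name and the CHUNK_SIZE/MULTIPLIER constants are illustrative):

# Hypothetical driver for the counter-based srumma (sketch, not part of the example).
g_counter = ga.create(ga.C_INT, [1], "task counter")  # shared atomic task counter
ga.zero(g_counter)                                     # reset so task ids start at 0
ga.sync()                                              # every process sees the zeroed counter
srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER, g_counter)
ga.destroy(g_counter)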
Example #2
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3 // nproc
    start = me*ntasks
    stop = (me+1)*ntasks
    if me+1 == nproc:
        stop += multiplier**3 % nproc
    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start+1,stop):
        task_next = task_list[i]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Example #3
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    task_list = get_task_list(chunk_size, multiplier)
    ### get first integer from g_counter and assign to 'task_id'
    task_id = ga.read_inc(g_counter, 0)
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev,a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev,b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    ### get next integer from g_counter and assign to 'task_id'
    task_id = ga.read_inc(g_counter, 0)
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next,a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next,b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev,b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev,a_nb_prev = a_next,a_nb_next
        b_prev,b_nb_prev = b_next,b_nb_next
        ### get next integer from g_counter and assign to 'task_id'
        task_id = ga.read_inc(g_counter, 0)
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev,b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Example #4
def srumma(g_a, g_b, g_c, chunk_size, multiplier, g_counter):
    # build the full task list; tasks are claimed dynamically via the shared counter g_counter
    task_list = get_task_list(chunk_size, multiplier)
    task_id = ga.read_inc(g_counter, 0)
    # the srumma algorithm, more or less
    task_prev = task_list[task_id]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    task_id = ga.read_inc(g_counter, 0)
    while task_id < multiplier**3:
        task_next = task_list[task_id]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
        task_id = ga.read_inc(g_counter, 0)
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Example #5
def srumma(g_a, g_b, g_c, chunk_size, multiplier):
    # statically partition the task list among nprocs
    task_list = get_task_list(chunk_size, multiplier)
    ntasks = multiplier**3 // nproc
    start = me * ntasks
    stop = (me + 1) * ntasks
    if me + 1 == nproc:
        stop += multiplier**3 % nproc
    # the srumma algorithm, more or less
    task_prev = task_list[start]
    a_prev, a_nb_prev = ga.nbget(g_a, task_prev.alo, task_prev.ahi)
    b_prev, b_nb_prev = ga.nbget(g_b, task_prev.blo, task_prev.bhi)
    for i in range(start + 1, stop):
        task_next = task_list[i]
        a_next, a_nb_next = ga.nbget(g_a, task_next.alo, task_next.ahi)
        b_next, b_nb_next = ga.nbget(g_b, task_next.blo, task_next.bhi)
        ga.nbwait(a_nb_prev)
        ga.nbwait(b_nb_prev)
        result = np.dot(a_prev, b_prev)
        ga.acc(g_c, result, task_prev.clo, task_prev.chi)
        task_prev = task_next
        a_prev, a_nb_prev = a_next, a_nb_next
        b_prev, b_nb_prev = b_next, b_nb_next
    ga.nbwait(a_nb_prev)
    ga.nbwait(b_nb_prev)
    result = np.dot(a_prev, b_prev)
    ga.acc(g_c, result, task_prev.clo, task_prev.chi)
    ga.sync()
Example #6
def matrix_multiply():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS] * NDIM
    chunk = [TOTALELEMS // nprocs - 1] * NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c: ga.error("duplicate failed")
    if not me: print("Created Arrays B and C")

    # Initialize data in matrices a and b.
    if not me: print "Initializing matrix A and B"
    a = np.random.rand(*dims) * 29
    b = np.random.rand(*dims) * 37

    # Copy data to global arrays g_a and g_b.
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo, hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0], 0), (hi[0], dims[0]))
    b = ga.get(g_b, (0, lo[1]), (dims[1], hi[1]))

    # Do local matrix multiplication and store the result in local
    # buffer c.
    c = np.dot(a, b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
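These matrix_multiply examples rely on module-level names (me, nprocs, TOTALELEMS, NDIM, TOLERANCE) that are not defined in the snippets. A minimal setup sketch, assuming the ga4py import path and the standard ga.initialize/ga.nnodes/ga.nodeid calls (the constant values are illustrative):

# Hypothetical module-level setup assumed by these examples (sketch).
from ga4py import ga   # assumed import path for the GA Python bindings
import numpy as np

NDIM = 2               # illustrative: 2-D (square) arrays
TOTALELEMS = 200       # illustrative edge length
TOLERANCE = 0.1        # illustrative verification tolerance

ga.initialize()        # assumed one-time GA startup
nprocs = ga.nnodes()   # number of participating processes
me = ga.nodeid()       # rank of this process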
Example #7
def matrix_multiply():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [TOTALELEMS]*NDIM
    chunk = [TOTALELEMS//nprocs - 1]*NDIM

    # Create a global array g_a and duplicate it to get g_b and g_c.
    g_a = ga.create(ga.C_DBL, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    g_c = ga.duplicate(g_a, "array C")
    if not g_b or not g_c: ga.error("duplicate failed")
    if not me: print("Created Arrays B and C")

    # Initialize data in matrices a and b.
    if not me: print "Initializing matrix A and B"
    a = np.random.rand(*dims)*29
    b = np.random.rand(*dims)*37

    # Copy data to global arrays g_a and g_b.
    if not me:
        ga.put(g_a, a)
        ga.put(g_b, b)

    # Synchronize all processors to make sure everyone has data.
    ga.sync()

    # Determine which block of data is locally owned. Note that
    # the same block is locally owned for all GAs.
    lo,hi = ga.distribution(g_c)

    # Get the blocks from g_a and g_b needed to compute this block in
    # g_c and copy them into the local buffers a and b.
    a = ga.get(g_a, (lo[0],0), (hi[0],dims[0]))
    b = ga.get(g_b, (0,lo[1]), (dims[1],hi[1]))

    # Do local matrix multiplication and store the result in local
    # buffer c.
    c = np.dot(a,b)

    # Copy c back to g_c.
    ga.put(g_c, c, lo, hi)

    verify(g_a, g_b, g_c)

    # Deallocate arrays.
    ga.destroy(g_a)
    ga.destroy(g_b)
    ga.destroy(g_c)
Example #8
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs * TOTALELEMS + nprocs // 2]
    chunk = [TOTALELEMS]  # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo, hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0] - hi[0], dims[0] - lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
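TRANSPOSE1D calls a verify(g_a, g_b) helper that is not shown in this listing. A hypothetical check (an illustrative sketch, not the listing's verify): fetch both arrays and confirm that g_b holds g_a reversed.

# Hypothetical verify for the 1-D transpose (illustrative sketch).
def verify(g_a, g_b):
    a = ga.get(g_a)    # assumes get with no lo/hi returns the whole array
    b = ga.get(g_b)
    if np.all(a[::-1] == b):
        print("Transpose OK")
    else:
        print("Transpose verify failed")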
Example #9
def TRANSPOSE1D():
    # Configure array dimensions. Force an unequal data distribution.
    dims = [nprocs*TOTALELEMS + nprocs//2]
    chunk = [TOTALELEMS] # minimum data on each process

    # create a global array g_a and duplicate it to get g_b
    g_a = ga.create(ga.C_INT, dims, "array A", chunk)
    if not g_a: ga.error("create failed: A")
    if not me: print "Created Array A"

    g_b = ga.duplicate(g_a, "array B")
    if not g_b: ga.error("duplicate failed")
    if not me: print "Created Array B"

    # initialize data in g_a
    if not me:
        print "Initializing matrix A"
        ga.put(g_a, np.arange(dims[0], dtype=np.int32))

    # Synchronize all processors to guarantee that everyone has data
    # before proceeding to the next step.
    ga.sync()

    # Start initial phase of inversion by inverting the data held locally on
    # each processor. Start by finding out which data each processor owns.
    lo,hi = ga.distribution(g_a)

    # Get locally held data and copy it into local buffer a
    a = ga.get(g_a, lo, hi)

    # Invert data locally
    b = a[::-1]

    # Invert data globally by copying locally inverted blocks into
    # their inverted positions in the GA
    ga.put(g_b, b, dims[0]-hi[0], dims[0]-lo[0])

    # Synchronize all processors to make sure inversion is complete
    ga.sync()

    # Check to see if inversion is correct
    if not me: verify(g_a, g_b)

    # Deallocate arrays
    ga.destroy(g_a)
    ga.destroy(g_b)
Example #10
def verify(g_a, g_b, g_c):
    g_chk = ga.duplicate(g_a, "array check")
    if not g_chk: ga.error("duplicate failed")
    ga.sync()

    ga.gemm(False, False, TOTALELEMS, TOTALELEMS, TOTALELEMS, 1.0, g_a, g_b,
            0.0, g_chk)
    ga.sync()

    ga.add(g_c, g_chk, g_chk, 1.0, -1.0)
    rchk = ga.dot(g_chk, g_chk)

    if not me:
        print "Normed difference in matrices: %12.4f" % rchk
        if not (-TOLERANCE < rchk < TOLERANCE):
            ga.error("Matrix multiply verify failed")
        else:
            print "Matrix Multiply OK"

    ga.destroy(g_chk)
Example #11
def verify(g_a, g_b, g_c):
    g_chk = ga.duplicate(g_a, "array check")
    if not g_chk: ga.error("duplicate failed")
    ga.sync()

    ga.gemm(False, False, TOTALELEMS, TOTALELEMS, TOTALELEMS, 1.0, g_a, g_b,
            0.0, g_chk)
    ga.sync()

    ga.add(g_c, g_chk, g_chk, 1.0, -1.0)
    rchk = ga.dot(g_chk, g_chk)

    if not me:
        print "Normed difference in matrices: %12.4f" % rchk
        if not (-TOLERANCE < rchk < TOLERANCE):
            ga.error("Matrix multiply verify failed")
        else:
            print "Matrix Multiply OK"

    ga.destroy(g_chk)
Example #12
    val = ga.gop_add(val)
    return val == 0

if __name__ == '__main__':
    if nproc > MULTIPLIER**3:
        if 0 == me:
            print "You must use less than %s processors" % (MULTIPLIER**3+1)
    else:
        g_a = ga.create(ga.C_DBL, [N,N])
        g_b = ga.create(ga.C_DBL, [N,N])
        g_c = ga.create(ga.C_DBL, [N,N])
        # put some fake data into input arrays A and B
        if me == 0:
            ga.put(g_a, np.random.random(N*N))
            ga.put(g_b, np.random.random(N*N))
        ga.sync()
        if me == 0:
            print "srumma...",
        srumma(g_a, g_b, g_c, CHUNK_SIZE, MULTIPLIER)
        if me == 0:
            print "done"
        if me == 0:
            print "verifying using ga.gemm...",
        ok = verify_using_ga(g_a, g_b, g_c)
        if me == 0:
            if ok:
                print("OKAY")
            else:
                print("FAILED")
        if me == 0:
            print "verifying using np.dot...",
Example #13
        x = h * (i - 0.5)
        s += 4.0 / (1.0 + x**2)
    return s * h

def prn_pi(pi, PI):
    message = "pi is approximately %.16f, error is %.16f"
    print(message % (pi, abs(pi - PI)))

### assign total number of processors to variable 'nprocs'
### assign processor ID to the variable 'myrank'

### create a global array 'g_pi' of type double and a single value

while True:
    if myrank == 0:
        n = get_n()
        ### broadcast the value of 'n'
    else:
        ### receive the broadcast of the value of 'n'
        pass
    if n == 0:
        break
    ### zero the global array 'g_pi'
    mypi = comp_pi(n, myrank, nprocs)
    ### accumulate local value 'mypi' into global array 'g_pi'
    ga.sync()
    if myrank == 0:
        ### get value of 'pi' from global array 'g_pi'
        prn_pi(pi, PI)

### destroy the global array 'g_pi'
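One way to fill in the ### placeholders in this skeleton (a sketch under the assumption that the bindings provide ga.nnodes, ga.nodeid, ga.brdcst, ga.zero, ga.acc, and ga.get with the usual GA semantics):

# Sketch of a completed skeleton; the calls marked below are assumptions about the binding's API.
nprocs = ga.nnodes()                   # assumed: total number of processes
myrank = ga.nodeid()                   # assumed: rank of this process

g_pi = ga.create(ga.C_DBL, [1], "pi")  # single double used as the accumulator

while True:
    if myrank == 0:
        n = get_n()
        n = ga.brdcst(n)               # assumed: root broadcasts the chosen n
    else:
        n = ga.brdcst(0)               # assumed: everyone else receives it
    if n == 0:
        break
    ga.zero(g_pi)                      # assumed: reset the accumulator each round
    mypi = comp_pi(n, myrank, nprocs)
    ga.acc(g_pi, [mypi])               # accumulate the local partial sum
    ga.sync()
    if myrank == 0:
        pi = ga.get(g_pi)[0]           # read the accumulated value of pi
        prn_pi(pi, PI)

ga.destroy(g_pi)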