def verify_using_ga(g_a, g_b, g_c): g_v = ga.duplicate(g_c) ga.gemm(False, False, N, N, N, 1, g_a, g_b, 0, g_v) c = ga.access(g_c) v = ga.access(g_v) if c is not None: val = int(np.abs(np.sum(c - v)) > 0.0001) else: val = 0 val = ga.gop_add(val) ga.destroy(g_v) return val == 0
def verify_using_ga(g_a, g_b, g_c): g_v = ga.duplicate(g_c) ga.gemm(False,False,N,N,N,1,g_a,g_b,0,g_v) c = ga.access(g_c) v = ga.access(g_v) if c is not None: val = int(np.abs(np.sum(c-v))>0.0001) else: val = 0 val = ga.gop_add(val) ga.destroy(g_v) return val == 0
def verify(g_a, g_b, g_c): g_chk = ga.duplicate(g_a, "array check") if not g_chk: ga.error("duplicate failed") ga.sync() ga.gemm(False, False, TOTALELEMS, TOTALELEMS, TOTALELEMS, 1.0, g_a, g_b, 0.0, g_chk); ga.sync() ga.add(g_c, g_chk, g_chk, 1.0, -1.0) rchk = ga.dot(g_chk, g_chk) if not me: print "Normed difference in matrices: %12.4f" % rchk if not (-TOLERANCE < rchk < TOLERANCE): ga.error("Matrix multiply verify failed") else: print "Matrix Multiply OK" ga.destroy(g_chk)
def main(): # TODO there's got to be a loopless, more pythonic way to do this ii = 0 for i in range(num1*num1): ii += 1 if ii > num1: ii = 0 h0[i] = ii # compute times assuming 500 mflops and 5 second target time # ntimes = max(3.0, 5.0/(4.0-9*num**3)) ntimes = 5 for ii in range(howmany): num_m = nums_m[ii] num_n = nums_n[ii] num_k = nums_k[ii] a = 0.5/(num_m*num_n) if num_m > nummax or num_n > nummax or num_k > nummax: ga.error('Insufficient memory: check nummax') if BLOCK_CYCLIC: block_size = [128,128] g_c = ga.create_handle() ga.set_data(g_c, (num_m,num_n), ga.C_DBL) ga.set_array_name(g_c, 'g_c') ga.set_block_cyclic(g_c, block_size) if not ga.allocate(g_c): ga.error('create failed') block_size = [128,128] g_b = ga.create_handle() ga.set_data(g_b, (num_k,num_n), ga.C_DBL) ga.set_array_name(g_b, 'g_b') ga.set_block_cyclic(g_b, block_size) if not ga.allocate(g_b): ga.error('create failed') block_size = [128,128] g_a = ga.create_handle() ga.set_data(g_a, (num_m,num_k), ga.C_DBL) ga.set_array_name(g_a, 'g_a') ga.set_block_cyclic(g_a, block_size) if not ga.allocate(g_a): ga.error('create failed') else: g_a = ga.create(ga.C_DBL, (num_m,num_k), 'g_a') g_b = ga.create(ga.C_DBL, (num_k,num_n), 'g_b') g_c = ga.create(ga.C_DBL, (num_m,num_n), 'g_c') for handle in [g_a,g_b,g_c]: if 0 == handle: ga.error('create failed') # initialize matrices A and B if 0 == me: load_ga(g_a, h0, num_m, num_k) load_ga(g_b, h0, num_k, num_n) ga.zero(g_c) ga.sync() if 0 == me: print '\nMatrix Multiplication C = A[%d,%d] x B[%d,%d]\n' % ( num_m, num_k, num_k, num_n) print ' %4s %12s %12s %7s %7s'%( "Run#", "Time (seconds)", "mflops/proc", "A trans", "B trans") avg_t[:] = 0 avg_mf[:] = 0 for itime in range(ntimes): for i in range(ntrans): ga.sync() ta = transa[i] tb = transb[i] t1 = time.time() ga.gemm(ta,tb,num_m,num_n,num_k,1,g_a,g_b,0,g_c) t1 = time.time() - t1 if 0 == me: mf = 2*num_m*num_n*num_k/t1*10**-6/nproc avg_t[i] += t1 avg_mf[i] += mf print ' %4d %12.4f %12.1f %7s %7s'%( itime+1, t1, mf, ta, tb) if VERIFY and itime == 0: verify_ga_gemm(ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c) if 0 == me: print '' for i in range(ntrans): print 'Average: %12.4f seconds %12.1f mflops/proc %s %s'%( avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]) if VERIFY: print 'All ga.gemms are verified...O.K.'