def test_edge_cases(device_id):
    """
    Test several edge cases related to min/max bin, and rounding.

    Also test backend dump_hist_data functionality.

    Arguments:
        device_id: id of the GPU device the NervanaGPU backend is created on.
                   The compute capability check is run against this same
                   device.

    Raises:
        RuntimeError: if the device reports a CUDA compute capability
                      below 3.0.
    """
    # Check the device that is actually used below, not a hard-coded device 0.
    if not check_gpu.get_compute_capability(device_id) >= 3.0:
        raise RuntimeError("Device does not have CUDA compute capability 3.0 or greater")

    ng = NervanaGPU(device_id=device_id)
    nc = NervanaCPU()

    # edge case test: each input is histogrammed by the reference
    # implementation and by both backends under the same tag
    np_ref = dict()
    inputs = [
        ("edges", np.array([2 ** -48, 2 ** 15], dtype=np.float32)),
        ("rounding", np.array([2 ** 5, 63.99998856, 2 ** 6, 2 ** -3, 2 ** -4,
                               0.11262291, 92.22483826], dtype=np.float32)),
        ("fp16 rounding", np.array([45.21875], dtype=np.float16))
    ]
    for tag, inp in inputs:
        np_ref[tag] = ref_hist(inp)
        for be in [ng, nc]:
            be_inp = be.array(inp)
            be_hist = be_inp.hist(tag)
            assert tensors_allclose(np_ref[tag], be_hist), tag + str(be)

    # dump_hist_data test: the histograms recorded above must be retrievable
    # by tag through the map returned alongside the data
    for be in [ng, nc]:
        be_hist_data, be_hist_map = be.dump_hist_data()
        for tag, _ in inputs:
            be_data = be_hist_data[be_hist_map[tag]]
            assert tensors_allclose(np_ref[tag], be_data), tag + str(be)

    del ng
    del nc
def test_gradients(backend_tests, custom_args):
    """
    Compare a function's value and gradient between numpy and the backend.

    The function value is computed with numpy and with the backend; the
    gradient is computed numerically on the numpy tensors and through the
    backend's autodiff on the backend tensors.

    Arguments:
        backend_tests: fixture that parameterizes over cpu and gpu backends
                       as well as float16 and float32; the active backend is
                       read from NervanaObject.be.
        custom_args: tuple of (test_idx, f, flag, dim) describing the function
                     under test and how its input tensors are generated.
    """
    _test_idx, f, flag, dim = custom_args

    # pull the be and dtype from the actions of the fixture
    be = NervanaObject.be
    dtype = be.default_dtype

    # tensors: index 0 holds the numpy set, index 1 the backend set
    tensors = gen_backend_tensors([np, be], [dim] * 5, [flag] * 5, dtype=dtype)

    # compare function value and gradient
    numpy_func_val = call_func(f, np, tensors[0])
    backend_func_val = call_func(f, be, tensors[1])
    numerical_gradient = get_numerical_gradient(f, tensors[0])
    ad = get_audiff_gradient(f, be, tensors[1])
    try:
        autodiff_gradient = ad.get_grad_asnumpyarray(tensors[1])

        # TODO: stricter test to fix numerical issues
        assert tensors_allclose(numpy_func_val, backend_func_val, rtol=1e-2, atol=1e-2)
        assert tensors_allclose(numerical_gradient, autodiff_gradient, rtol=1e-02, atol=1e-3)
    finally:
        # cleanup diff tree even when a comparison above fails, so a failing
        # case does not skip the cleanup step
        ad.cleanup()
def test_cpu_randomstate():
    """
    Check that the cpu backend's random stream is reproducible.

    The same sequence of binary masks must be produced by a freshly seeded
    backend, by the same backend after rng_reset(), and by a second backend
    created with the same seed.
    """
    def draw_two_masks(backend, buf):
        # Twice: draw a host-side threshold, fill buf with a binary mask and
        # take a host snapshot of it.
        snapshots = []
        for _ in range(2):
            backend.make_binary_mask(buf, keepthresh=backend.rng.rand())
            snapshots.append(buf.get())
        return snapshots

    # run 1
    be = gen_backend(backend='cpu', rng_seed=100)
    a = be.empty((3, 3))
    first_run = draw_two_masks(be, a)

    # run 2, using reset
    be.rng_reset()
    after_reset = draw_two_masks(be, a)

    del be

    # run 3, using a new backend
    be = gen_backend(backend='cpu', rng_seed=100)
    a = be.empty((3, 3))
    new_backend = draw_two_masks(be, a)

    # check equality (exact: zero tolerance)
    assert tensors_allclose(first_run, after_reset, rtol=0., atol=0.)
    assert tensors_allclose(first_run, new_backend, rtol=0., atol=0.)

    del be
def test_slicing(fargs_tests, device_id):
    """
    Compare tensor slicing between the gpu and cpu backends.

    Arguments:
        fargs_tests: tuple of (dims, dtype) for the random input array.
        device_id: id of the GPU device used for the NervanaGPU backend.
    """
    dims, dtype = fargs_tests

    gpu = NervanaGPU(default_dtype=dtype, device_id=device_id)
    cpu = NervanaCPU(default_dtype=dtype)

    array_np = np.random.uniform(-1, 1, dims).astype(dtype)
    array_ng = gpu.array(array_np, dtype=dtype)
    array_nc = cpu.array(array_np, dtype=dtype)

    def matches(gpu_view, cpu_view):
        # every comparison in this test uses the same absolute tolerance
        return tensors_allclose(gpu_view, cpu_view, rtol=0, atol=1e-3)

    # read access through integer, negative and slice indices
    assert matches(array_ng[0], array_nc[0])
    assert matches(array_ng[-1], array_nc[-1])
    assert matches(array_ng[0, :], array_nc[0, :])
    assert matches(array_ng[0:], array_nc[0:])
    assert matches(array_ng[:-1], array_nc[:-1])
    assert matches(array_ng[:, 0], array_nc[:, 0])
    assert matches(array_ng[:, 0:1], array_nc[:, 0:1])
    # NOTE(review): the gpu side is indexed with -1 but the cpu side with -1:
    # -- confirm this asymmetry is intentional.
    assert matches(array_ng[-1, 0:], array_nc[-1:, 0:])

    # write access: assigning through an index must affect both the same way
    array_ng[0] = 0
    array_nc[0] = 0
    assert matches(array_ng, array_nc)

    del gpu
def test_gpu_randomstate(device_id):
    """
    Check that the gpu backend's random streams are reproducible.

    The same interleaved sequence of device-side and host-side random draws
    must be produced by a freshly seeded backend, by the same backend after
    rng_reset(), and by a second backend created with the same seed.

    Arguments:
        device_id: id of the GPU device used for the backend.
    """
    def draw_sequence(backend, buf):
        # Interleave device and host draws so both streams are exercised.
        buf[:] = backend.rand()  # gpu rand
        dev_first = buf.get()
        host_draw = backend.rng.rand(3, 3)  # host rand
        buf[:] = backend.rand()  # gpu rand
        dev_second = buf.get()
        backend.make_binary_mask(buf, keepthresh=backend.rng.rand())
        mask = buf.get()
        return [dev_first, host_draw, dev_second, mask]

    def check_single_live_context(backend):
        # Exactly one context entry is expected, and it must be flagged alive.
        assert len(backend.context_rand_state_map) == 1 and \
            len(backend.context_rand_state_alive) == 1
        for ctx in backend.context_rand_state_alive:
            assert backend.context_rand_state_alive[ctx] is True

    # run 1
    be = gen_backend(backend='gpu', rng_seed=100, device_id=device_id)
    a = be.empty((3, 3))
    first_run = draw_sequence(be, a)
    check_single_live_context(be)

    # run 2, using reset
    be.rng_reset()
    after_reset = draw_sequence(be, a)
    check_single_live_context(be)

    del be

    # run 3, using a new backend
    be = gen_backend(backend='gpu', rng_seed=100, device_id=device_id)
    a = be.empty((3, 3))
    new_backend = draw_sequence(be, a)

    # check equality (exact: zero tolerance)
    assert tensors_allclose(first_run, after_reset, rtol=0., atol=0.)
    assert tensors_allclose(first_run, new_backend, rtol=0., atol=0.)

    del be
def test_hist(nbin_offset_dim_dtype_inp, device_id):
    """
    Compare the nervanagpu and nervanacpu hist implementation to the reference
    implementation above.

    Parameterized test case, uses pytest_generate_test to enumerate
    nbin_offset_dim_dtype_inp tuples that drive the test.

    Arguments:
        nbin_offset_dim_dtype_inp: tuple of ((nbins, offset), dim, dtype,
                                   (name, inp_gen)) where inp_gen(dim)
                                   produces the input array and name is the
                                   tag passed to hist().
        device_id: id of the GPU device the NervanaGPU backend is created on.
                   The compute capability check is run against this same
                   device.

    Raises:
        RuntimeError: if the device reports a CUDA compute capability
                      below 3.0.
    """
    (nbins, offset), dim, dtype, (name, inp_gen) = nbin_offset_dim_dtype_inp

    # Check the device that is actually used below, not a hard-coded device 0.
    if not check_gpu.get_compute_capability(device_id) >= 3.0:
        raise RuntimeError("Device does not have CUDA compute capability 3.0 or greater")

    ng = NervanaGPU(hist_bins=nbins, hist_offset=offset, device_id=device_id)
    nc = NervanaCPU(hist_bins=nbins, hist_offset=offset)

    np_inp = inp_gen(dim).astype(dtype)
    np_hist = ref_hist(np_inp, nbins=nbins, offset=offset)

    # both backends must reproduce the reference histogram
    for be in [ng, nc]:
        be_inp = be.array(np_inp, dtype=dtype)
        be_hist = be_inp.hist(name)
        assert tensors_allclose(np_hist, be_hist)

    del ng
    del nc
def test_batched_dot(device_id):
    """
    Compare batched dot results of the gpu and cpu backends against numpy.

    Arguments:
        device_id: id of the GPU device used for the NervanaGPU backend.
    """
    # compact number formatting for printed arrays
    print_formatter = {
        'int': lambda x: "%2d" % x,
        'float': lambda x: "%2.0f" % x
    }
    np.set_printoptions(threshold=8192 * 4, linewidth=600, formatter=print_formatter)

    ng = NervanaGPU(stochastic_round=False, bench=1, device_id=device_id)
    nc = NervanaCPU()

    dtype = np.float32  # np.float16 or np.float32

    X = 100   # Batch Size
    N = 32    # Minibatch Size
    C = 1536  # Input Features
    K = 768   # Output Features

    cpuI, cpuE, cpuW = setup_test_data(X, N, C, K, dtype)

    # same inputs through the gpu backend, the cpu backend and plain numpy
    gpu_out, gpu_bprop, gpu_update = run_batched_dot(ng, cpuI, cpuE, cpuW, X, dtype)
    cpu_out, cpu_bprop, cpu_update = run_batched_dot(nc, cpuI, cpuE, cpuW, X, dtype)
    ref_out, ref_bprop, ref_update = run_batched_dot(np, cpuI, cpuE, cpuW, X, dtype)

    # numpy is the reference; gpu results are checked first, then cpu
    comparisons = [
        (ref_out, gpu_out),
        (ref_bprop, gpu_bprop),
        (ref_update, gpu_update),
        (ref_out, cpu_out),
        (ref_bprop, cpu_bprop),
        (ref_update, cpu_update),
    ]
    for expected, actual in comparisons:
        assert tensors_allclose(expected, actual, rtol=0, atol=1e-3)

    del ng
def test_vs_numpy(backend_tests, custom_args):
    """
    Compare a function's value between numpy and the active backend.

    Arguments:
        backend_tests: fixture that sets up the backend; the active backend
                       is read from NervanaObject.be.
        custom_args: tuple of (test_idx, f, flag, dim) describing the function
                     under test and how its input tensors are generated.
    """
    _, func, tensor_flag, tensor_dim = custom_args

    # backend and its default dtype
    backend = NervanaObject.be
    default_dtype = backend.default_dtype

    # tensors: index 0 holds the numpy set, index 1 the backend set
    tensor_sets = gen_backend_tensors([np, backend],
                                      [tensor_dim] * 4,
                                      [tensor_flag] * 4,
                                      dtype=default_dtype)

    # compare function values
    expected = call_func(func, np, tensor_sets[0])
    actual = call_func(func, backend, tensor_sets[1])

    assert tensors_allclose(expected, actual, rtol=1e-2, atol=1e-2)
def test_copy_transpose(shape_dtype_inp, device_id):
    """
    Compare copy_transpose of the gpu and cpu backends against np.transpose.

    Parameterized test case, uses pytest_generate_test to enumerate
    shape_dtype_inp tuples that drive the test.

    Arguments:
        shape_dtype_inp: tuple of (shape, dtype, (name, inp_gen)) where
                         inp_gen(shape) produces the input array.
        device_id: id of the GPU device used for the NervanaGPU backend.
    """
    shape, dtype, (name, inp_gen) = shape_dtype_inp

    ng = NervanaGPU(default_dtype=dtype, device_id=device_id)
    nc = NervanaCPU(default_dtype=dtype)
    np_inp = inp_gen(shape).astype(dtype)

    # Axes orders to try: None plus every permutation except the identity.
    ndims = len(shape)
    identity = tuple(range(ndims))
    axes = [None]
    axes += [perm for perm in itt.permutations(range(ndims), ndims) if perm != identity]

    for be in [ng, nc]:
        for ax in axes:
            be_inp = be.array(np_inp, dtype=dtype)
            np_trans = np.transpose(np_inp, axes=ax)
            # destination buffer shaped like the numpy result
            be_trans = be.zeros(np_trans.shape)
            be.copy_transpose(be_inp, be_trans, axes=ax)
            assert tensors_allclose(np_trans, be_trans)

    del ng
    del nc
def pool_helper(dtype, ones, cpu, repeat, alpha, beta, ng, pool, config, op):
    """
    Run fprop/bprop pooling on the backend and optionally check the results
    against a numpy reference computed in this function.

    Arguments:
        dtype: dtype used for the device tensors created through ng.array.
        ones: if true, all host arrays are filled with ones; otherwise they
              are drawn uniformly from [-1, 1) and rounded through float16.
        cpu: if true, compute the numpy reference and compare it with the
             device output and deltas.
        repeat: forwarded to ng.fprop_pool / ng.bprop_pool.
        alpha: forwarded to the pooling calls; the reference scales each newly
               computed contribution by alpha (except in the "l2" branch).
        beta: forwarded to the pooling calls; the reference scales the
              pre-existing output and delta arrays by beta.
        ng: backend object providing array, empty, fprop_pool and bprop_pool.
        pool: pooling layer object; dimI, dimO, C, JTRS, DHW, K, N, MPQ,
              padding, strides and op are read from it.
        config: only used in the assertion error message.
        op: only used in the assertion error message; the reference itself
            dispatches on pool.op.
    """
    err_string = "Error in dtype: '%s' op: '%s' config: '%s'" % (str(dtype), op, config)
    dimI = pool.dimI
    dimO = pool.dimO

    # collapse pooling dimensions into one
    # this allows for easy cpu pooling in numpy
    def slicable(dim, pad=0):
        # product of all but the last dimension (plus pad), last dimension kept
        dim0 = reduce(mul, dim[:-1], 1) + pad
        return (dim0, dim[-1])

    # cpu input arrays
    # Note that we truncate these to 16 bits so that the cpu and gpu will agree
    # on an index if there is a tie.
    if ones:
        cpuI = np.ones(slicable(dimI), dtype=np.float32)
        cpuB = np.ones(slicable(dimI), dtype=np.float32)
        cpuE = np.ones(dimO, dtype=np.float32)
        cpuO = np.ones(dimO, dtype=np.float32)
    else:
        # round through float16, then keep float32 storage on the host
        cpuI = np.random.uniform(-1.0, 1.0, slicable(dimI)).astype(
            np.float16).astype(np.float32)
        cpuB = np.random.uniform(-1.0, 1.0, slicable(dimI)).astype(
            np.float16).astype(np.float32)
        cpuE = np.random.uniform(-1.0, 1.0, dimO).astype(np.float16).astype(np.float32)
        cpuO = np.random.uniform(-1.0, 1.0, dimO).astype(np.float16).astype(np.float32)

    # reference argmax indices, filled in by the "max" branch below
    cpuA = np.empty(dimO, dtype=np.int32)

    # give gpu the input array without zero padding (not needed)
    devI = ng.array(cpuI.reshape(dimI), dtype=dtype)
    devB = ng.array(cpuB.reshape(dimI), dtype=dtype)
    devE = ng.array(cpuE, dtype=dtype)
    devO = ng.array(cpuO, dtype=dtype)
    devA = ng.empty(dimO, dtype=np.uint8)

    ng.fprop_pool(pool, devI, devO, devA, alpha=alpha, beta=beta, repeat=repeat)
    ng.bprop_pool(pool, devE, devB, devA, alpha=alpha, beta=beta, repeat=repeat)

    # the reference starts from the beta-scaled previous contents
    cpuO *= beta
    cpuB *= beta

    def pixel_indices(kj, mt, pr, qs):
        # Return the flattened input indices covered by the pooling window
        # whose origin is (kj, mt, pr, qs); positions outside the valid
        # ranges [0, C), [0, D), [0, H), [0, W) are skipped.
        C = pool.C
        J, T, R, S = pool.JTRS
        D, H, W = pool.DHW
        HW = H * W
        DHW = D * H * W
        idx = []

        for j in range(J):
            c = kj + j
            ci = c * DHW
            cb = c >= 0 and c < C

            for t in range(T):
                z = mt + t
                zi = ci + z * HW
                zb = cb and z >= 0 and z < D

                for r in range(R):
                    y = pr + r
                    yi = zi + y * W
                    yb = zb and y >= 0 and y < H

                    for s in range(S):
                        x = qs + s
                        if yb and x >= 0 and x < W:
                            xi = yi + x
                            idx.append(xi)
        return idx

    # numpy pooling implementation
    # NOTE(review): everything from here to the end of the function is assumed
    # to belong to the `if cpu:` branch -- confirm against the original layout.
    if cpu:

        # the reference dispatches on the layer's op, not on the `op` argument
        op = pool.op
        K = pool.K
        N = pool.N
        M, P, Q = pool.MPQ
        pad_j, pad_d, pad_h, pad_w = pool.padding
        str_j, str_d, str_h, str_w = pool.strides

        for k in range(K):
            kj = k * str_j - pad_j

            for m in range(M):
                mt = m * str_d - pad_d

                for p in range(P):
                    pr = p * str_h - pad_h

                    for q in range(Q):
                        qs = q * str_w - pad_w

                        idx = pixel_indices(kj, mt, pr, qs)

                        if op == "max":
                            cpuO[k, m, p, q, :] += np.max(cpuI[idx, :], axis=0) * alpha

                            # position of the maximum within the window, per column
                            b_idx = np.argmax(cpuI[idx, :], axis=0)
                            cpuA[k, m, p, q, :] = b_idx.astype(np.int32)

                            # There's probably a more elegant numpy way to do
                            # this..
                            # route each error value to the input row that held the max
                            for n in range(N):
                                cpuB[idx[b_idx[n]], n] += cpuE[k, m, p, q, n] * alpha

                        elif op == "avg":
                            cpuO[k, m, p, q, :] += np.mean(cpuI[idx, :], axis=0) * alpha
                            # spread the error evenly over the window entries
                            cpuB[idx, :] += cpuE[k, m, p, q, :] * \
                                (1.0/len(idx)) * alpha

                        elif op == "l2":
                            # bprop not implemented yet
                            # (this branch overwrites cpuO and does not apply alpha)
                            cpuO[k, m, p, q, :] = np.sqrt(
                                np.sum(cpuI[idx, :]**2, axis=0))

        # drop zero padding
        cpuI = cpuI.reshape(dimI)
        cpuB = cpuB.reshape(dimI)

        devA = devA.get().astype(np.int32)
        devO = devO.get().astype(np.float32)
        devB = devB.get().astype(np.float32)

        # NOTE: the argmax arrays (cpuA, devA) are not compared here; only the
        # outputs and the deltas are checked.
        # difA = np.absolute(cpuA - devA)

        # np.savetxt("out_cpuB.txt", cpuB.reshape((-1,pool.N))[:,0:8], fmt='%5.2f')
        # np.savetxt("out_devB.txt", devB.reshape((-1,pool.N))[:,0:8], fmt='%5.2f')

        # fprop check
        difO = np.absolute(cpuO - devO)
        maxD = difO.max()
        maxO = np.absolute(cpuO).max()
        print("difO max: %.6f cpuO max: %5.2f ratio: %.6f" % (maxD, maxO, maxD / maxO))
        assert tensors_allclose(cpuO, devO, rtol=0, atol=1e-2), "fprop:" + err_string

        # bprop check
        difB = np.absolute(cpuB - devB)
        maxD = difB.max()
        maxB = np.absolute(cpuB).max()
        print("difB max: %.6f cpuB max: %5.2f ratio: %.6f" % (maxD, maxB, maxD / maxB))
        assert tensors_allclose(cpuB, devB, rtol=0, atol=1e-2), "bprop:" + err_string