def test_slicing(fargs_tests):
    """
    Check that GPU and CPU tensors built from the same random data agree
    under a series of indexing / slicing expressions and after a row
    assignment.

    Arguments:
        fargs_tests: parametrized value that unpacks into (dims, dtype).
    """
    dims, dtype = fargs_tests

    gpu = NervanaGPU(default_dtype=dtype)
    cpu = NervanaCPU(default_dtype=dtype)

    host_data = np.random.uniform(-1, 1, dims).astype(dtype)
    dev_tensor = gpu.array(host_data, dtype=dtype)
    cpu_tensor = cpu.array(host_data, dtype=dtype)

    def check(gpu_view, cpu_view):
        # every comparison in this test uses the same absolute tolerance
        assert_tensors_allclose(gpu_view, cpu_view, rtol=0, atol=1e-3)

    check(dev_tensor[0], cpu_tensor[0])
    check(dev_tensor[-1], cpu_tensor[-1])
    check(dev_tensor[0, :], cpu_tensor[0, :])
    check(dev_tensor[0:], cpu_tensor[0:])
    check(dev_tensor[:-1], cpu_tensor[:-1])
    check(dev_tensor[:, 0], cpu_tensor[:, 0])
    check(dev_tensor[:, 0:1], cpu_tensor[:, 0:1])
    # NOTE(review): the GPU side indexes with -1 while the CPU side slices
    # with -1: -- presumably both yield the same view; confirm it is intended.
    check(dev_tensor[-1, 0:], cpu_tensor[-1:, 0:])

    # in-place assignment of the first row must behave the same on both
    dev_tensor[0] = 0
    cpu_tensor[0] = 0
    check(dev_tensor, cpu_tensor)

    del gpu
def test_pool_layer(poolargs):
    """
    Compare pooling fprop and bprop results between the GPU backend, the CPU
    backend and the numpy reference (run_numpy_pool).

    Arguments:
        poolargs: parametrized value; only element 0 (the pooling op passed
                  to pool_layer) is read.
    """
    op = poolargs[0]

    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    pool_ng = ng.pool_layer(dtype, op, N, C, D, H, W, J, T, R, S,
                            padding_j, padding_d, padding_h, padding_w,
                            strides_j, strides_d, strides_h, strides_w)
    pool_nc = nc.pool_layer(dtype, op, N, C, D, H, W, J, T, R, S,
                            padding_j, padding_d, padding_h, padding_w,
                            strides_j, strides_d, strides_h, strides_w)

    # both backends must derive the same layer geometry
    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors
    # (inputs are rounded through float16 before being cast to dtype)
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1)).astype(np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # pad the last row of cpu input for the sake of numpy: the most negative
    # finite value for max pooling, zero otherwise
    if op == "max":
        cpuI[-1, :] = np.finfo(dtype).min
    else:
        cpuI[-1, :] = 0

    # =========GPU and CPU and numpy ==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    for opA, ngA, ncA, cpuA in (
            ("fprop", ngO, ncO, cpuO),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI))):
        # print() with a single argument behaves the same on Python 2 and 3
        print(opA)
        assert np.allclose(ngA.get(), ncA.get(), rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    del ng, nc
def test_pooling(device_id):
    """
    Build an LRN layer on the GPU and CPU backends with the same geometry and
    hand both to lrn_helper for comparison.

    Arguments:
        device_id: device on which the GPU backend is created.
    """
    gpu_be = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)
    cpu_be = NervanaCPU()

    layer_args = dict(dtype=np.float32, N=122, C=16, D=1, H=32, W=32, J=5)

    # GPU layer is created first, then the CPU one
    gpu_layer = gpu_be.lrn_layer(**layer_args)  # returns a pool layer
    cpu_layer = cpu_be.lrn_layer(**layer_args)

    lrn_helper(ones=0,
               cpu=1,
               ng=gpu_be,
               nc=cpu_be,
               alpha=1.0,  # not supported in CPU
               ascale=1.2,
               beta=0.0,  # not supported in CPU
               bpower=0.5,
               layer_g=gpu_layer,
               layer_c=cpu_layer,
               **layer_args)
def test_edge_cases(device_id):
    """
    Test several edge cases related to min/max bin, and rounding.

    Also test backend dump_hist_data functionality.

    Arguments:
        device_id: device used both for the compute-capability check and for
                   the GPU backend under test.

    Raises:
        RuntimeError: if the selected device reports a CUDA compute
                      capability below 3.0.
    """
    # Check the device the test actually runs on (not a hard-coded device 0),
    # and use truthiness rather than an identity test against False so the
    # guard also fires when the comparison yields a non-builtin bool.
    if not (check_gpu.get_compute_capability(device_id) >= 3.0):
        raise RuntimeError("Device does not have CUDA compute capability 3.0 or greater")

    ng = NervanaGPU(device_id=device_id)
    nc = NervanaCPU()

    # edge case test
    np_ref = dict()
    inputs = [
        ("edges", np.array([2 ** -48, 2 ** 15], dtype=np.float32)),
        ("rounding", np.array([2 ** 5, 63.99998856, 2 ** 6, 2 ** -3, 2 ** -4,
                               0.11262291, 92.22483826], dtype=np.float32)),
        ("fp16 rounding", np.array([45.21875], dtype=np.float16))
    ]
    for tag, inp in inputs:
        np_ref[tag] = ref_hist(inp)
        for be in [ng, nc]:
            be_inp = be.array(inp)
            be_hist = be_inp.hist(tag)
            assert tensors_allclose(np_ref[tag], be_hist), tag + str(be)

    # dump_hist_data test: each histogram recorded above must be retrievable
    # through the tag -> index map returned by the backend
    for be in [ng, nc]:
        be_hist_data, be_hist_map = be.dump_hist_data()
        for tag, inp in inputs:
            be_data = be_hist_data[be_hist_map[tag]]
            assert tensors_allclose(np_ref[tag], be_data), tag + str(be)

    del ng
    del nc
def test_hist(nbin_offset_dim_dtype_inp):
    """
    Compare the nervanagpu and nervanacpu hist implementation to the
    ref_hist reference implementation.

    Parameterized test case, uses pytest_generate_test to enumerate
    dim_dtype_inp tuples that drive the test.

    Arguments:
        nbin_offset_dim_dtype_inp: unpacks into
            ((nbins, offset), dim, dtype, (name, inp_gen)).

    Raises:
        RuntimeError: if device 0 reports a CUDA compute capability
                      below 3.0.
    """
    (nbins, offset), dim, dtype, (name, inp_gen) = nbin_offset_dim_dtype_inp

    # Truthiness instead of `is False`: an identity test silently passes when
    # the comparison result is falsy but not the builtin False singleton.
    if not (check_gpu.get_compute_capability(0) >= 3.0):
        raise RuntimeError(
            "Device does not have CUDA compute capability 3.0 or greater")

    ng = NervanaGPU(hist_bins=nbins, hist_offset=offset)
    nc = NervanaCPU(hist_bins=nbins, hist_offset=offset)

    np_inp = inp_gen(dim).astype(dtype)
    np_hist = ref_hist(np_inp, nbins=nbins, offset=offset)

    for be in [ng, nc]:
        be_inp = be.array(np_inp, dtype=dtype)
        be_hist = be_inp.hist(name)
        assert_tensors_allclose(np_hist, be_hist)

    del ng
    del nc
def test_batched_dot():
    """
    Run run_batched_dot on the GPU backend, the CPU backend and numpy with
    identical inputs and check both backends against the numpy results.
    """
    # changes numpy's global print options (compact output for debugging)
    np.set_printoptions(threshold=8192 * 4,
                        linewidth=600,
                        formatter={'int': lambda x: "%2d" % x,
                                   'float': lambda x: "%2.0f" % x})

    ng = NervanaGPU(stochastic_round=False, bench=1)
    nc = NervanaCPU()

    dtype = np.float32  # np.float16 or np.float32

    X = 100   # Batch Size
    N = 32    # Minibatch Size
    C = 1536  # Input Features
    K = 768   # Output Features

    cpuI, cpuE, cpuW = setup_test_data(X, N, C, K, dtype)

    gpu_results = run_batched_dot(ng, cpuI, cpuE, cpuW, X, dtype)
    cpu_results = run_batched_dot(nc, cpuI, cpuE, cpuW, X, dtype)
    # the numpy module itself is passed in place of a backend
    ref_results = run_batched_dot(np, cpuI, cpuE, cpuW, X, dtype)

    # all GPU outputs are compared first, then all CPU outputs
    for backend_results in (gpu_results, cpu_results):
        ref_o, ref_b, ref_u = ref_results
        be_o, be_b, be_u = backend_results
        assert_tensors_allclose(ref_o, be_o, rtol=0, atol=1e-3)
        assert_tensors_allclose(ref_b, be_b, rtol=0, atol=1e-3)
        assert_tensors_allclose(ref_u, be_u, rtol=0, atol=1e-3)

    ng.ctx.detach()
    del ng
def test_cpu():
    """
    Time 10000 runs of the reference NMS (py_cpu_nms) on a fixed set of five
    boxes whose score column is re-randomized on every iteration, and print
    the total elapsed time in microseconds.

    The CPU backend is constructed but its own NMS call is still disabled
    (see the commented-out code below).
    """
    from neon.backends.nervanacpu import NervanaCPU
    # ng = NervanaGPU()
    nc = NervanaCPU()
    # test_nms((ng, nc), (0.1, 5))

    import datetime
    time_old = datetime.datetime.now()

    score = 4       # column holding the box score
    box_count = 5
    thre = 0.1

    for i in range(10000):
        dets = np.asarray(
            [[1, 2, 3, 4, 0.6],
             [1, 3, 3, 4, 0.7],
             [1, 1, 4, 4, 0.9],
             [2, 1, 4, 4, 0.85],
             [1, 1, 3, 4, 0.85]], dtype=np.float32)
        print(dets)
        # random scores, sorted in descending order
        dets[:, score] = np.sort(np.random.random((box_count,)))[::-1]

        # call reference nms
        keep_ref = py_cpu_nms(dets, thre)

        # call cpu nms
        # dets_nc = nc.array(dets)
        # tic_cpu = nc.init_mark()
        # toc_cpu = nc.init_mark()
        # nc.record_mark(tic_cpu)
        # keep_nc = nc.nms(dets_nc, 0.1)
        # print("keep_nc nms", keep_nc)

    # timedelta.microseconds is only the sub-second component (0..999999);
    # fold days and seconds in so the printed value is the full elapsed time.
    elapsed = datetime.datetime.now() - time_old
    elapsed_us = (elapsed.days * 86400 + elapsed.seconds) * 10 ** 6 + elapsed.microseconds
    print('requests cpu', elapsed_us)
def compare_helper(op, inA, inB, dtype):
    """
    Evaluate op on inA / inB with numpy (in float32), convert the result to
    dtype, then evaluate the same op on the GPU backend (float32 / float16
    only) and on the CPU backend and compare each against the numpy result.

    Arguments:
        op: operation forwarded to math_helper.
        inA, inB: operands forwarded to math_helper.
        dtype: target dtype of the comparison.
    """
    expected = math_helper(np, op, inA, inB, dtype=np.float32)

    # integer targets: round, then saturate to the representable range
    if np.dtype(dtype).kind in ('i', 'u'):
        expected = np.around(expected)
        limits = np.iinfo(dtype)
        expected = expected.clip(limits.min, limits.max)
    expected = expected.astype(dtype)

    # NOTE(review): the np.allclose results below are computed and discarded,
    # so this helper currently cannot fail on a mismatch -- confirm whether
    # an assert was intended before relying on it.
    if dtype in (np.float32, np.float16):
        gpu = NervanaGPU(default_dtype=dtype)
        gpu_result = math_helper(gpu, op, inA, inB, dtype=dtype).get()
        np.allclose(expected, gpu_result, rtol=0, atol=1e-5)

    cpu = NervanaCPU(default_dtype=dtype)
    cpu_result = math_helper(cpu, op, inA, inB, dtype=dtype).get()
    np.allclose(expected, cpu_result, rtol=0, atol=1e-5)
def test_hist(nbin_offset_dim_dtype_inp):
    """
    Check the GPU and CPU backend hist implementations against the ref_hist
    reference implementation.

    Parameterized test case; pytest_generate_test enumerates the tuples that
    drive it. The argument unpacks into
    ((nbins, offset), dim, dtype, (name, inp_gen)).
    """
    bin_cfg, dim, dtype, named_gen = nbin_offset_dim_dtype_inp
    nbins, offset = bin_cfg
    name, inp_gen = named_gen

    gpu_be = NervanaGPU(hist_bins=nbins, hist_offset=offset)
    cpu_be = NervanaCPU(hist_bins=nbins, hist_offset=offset)

    host_input = inp_gen(dim).astype(dtype)
    expected_hist = ref_hist(host_input, nbins=nbins, offset=offset)

    # GPU first, then CPU
    for backend in (gpu_be, cpu_be):
        backend_hist = backend.array(host_input, dtype=dtype).hist(name)
        assert_tensors_allclose(expected_hist, backend_hist)
def test_copy_transpose(shape_dtype_inp):
    """
    Check backend copy_transpose against np.transpose for axes=None and for
    every non-identity axis permutation of the input shape.

    Parameterized test case; pytest_generate_test enumerates the tuples that
    drive it. The argument unpacks into (shape, dtype, (name, inp_gen)).
    """
    shape, dtype, (name, inp_gen) = shape_dtype_inp

    ng = NervanaGPU(default_dtype=dtype)
    nc = NervanaCPU(default_dtype=dtype)

    host_input = inp_gen(shape).astype(dtype)
    rank = len(shape)

    # None, followed by all permutations except the identity ordering
    identity = tuple(range(rank))
    axes_to_try = [None]
    for perm in itt.permutations(range(rank), rank):
        if perm != identity:
            axes_to_try.append(perm)

    for backend in [ng, nc]:
        for axes in axes_to_try:
            src = backend.array(host_input, dtype=dtype)
            expected = np.transpose(host_input, axes=axes)
            dst = backend.zeros(expected.shape)
            backend.copy_transpose(src, dst, axes=axes)
            assert_tensors_allclose(expected, dst)

    del ng
    del nc
def test_edge_cases():
    """
    Exercise histogram edge cases (min/max bin and rounding) on the GPU and
    CPU backends, then verify the same histograms through the backend
    dump_hist_data functionality.
    """
    gpu_be = NervanaGPU()
    cpu_be = NervanaCPU()
    backends = [gpu_be, cpu_be]

    cases = [
        ("edges", np.array([2**-48, 2**15], dtype=np.float32)),
        ("rounding", np.array(
            [2**5, 63.99998856, 2**6, 2**-3, 2**-4, 0.11262291, 92.22483826],
            dtype=np.float32)),
        ("fp16 rounding", np.array([45.21875], dtype=np.float16))
    ]

    # edge case test: reference histogram per tag, compared on each backend
    expected = dict()
    for tag, values in cases:
        expected[tag] = ref_hist(values)
        for backend in backends:
            hist = backend.array(values).hist(tag)
            assert_tensors_allclose(expected[tag], hist,
                                    err_msg=tag + str(backend))

    # dump_hist_data test: look each tag up through the returned map
    for backend in backends:
        hist_data, hist_map = backend.dump_hist_data()
        for tag, _ in cases:
            dumped = hist_data[hist_map[tag]]
            assert_tensors_allclose(expected[tag], dumped,
                                    err_msg=tag + str(backend))
from neon.backends.convolution import (_ceil_div,
    FpropCuda, BpropCuda, UpdateCuda,
    FpropDirect, BpropDirect, UpdateDirect)
from neon.backends.winograd_conv import (
    FpropWinograd_2x2_3x3, BpropWinograd_2x2_3x3, UpdateWinograd_3x3_2x2,
    FpropWinograd_4x4_3x3, BpropWinograd_4x4_3x3, UpdateWinograd_3x3_4x4,
    FpropWinograd_2x2_5x5, BpropWinograd_2x2_5x5)

# Kernel classes grouped by pass (forward, backward, weight update).
fprop_kernels = (FpropCuda, FpropDirect, FpropWinograd_2x2_3x3, FpropWinograd_4x4_3x3, FpropWinograd_2x2_5x5)
bprop_kernels = (BpropCuda, BpropDirect, BpropWinograd_2x2_3x3, BpropWinograd_4x4_3x3, BpropWinograd_2x2_5x5)
update_kernels = (UpdateCuda, UpdateDirect, UpdateWinograd_3x3_2x2, UpdateWinograd_3x3_4x4)

# Module-level backends; creating them here is an import-time side effect.
# NOTE(review): the meaning of the positional 0 passed to NervanaGPU is not
# visible in this file -- confirm against the NervanaGPU constructor.
ng = NervanaGPU(0)
nc = NervanaCPU()

# Report the name of the device behind the current CUDA context.
neon_logger.display(drv.Context.get_current().get_device().name())

# NOTE(review): presumably debug switches read further down the script;
# their consumers are not visible here.
out = 0
ones = 0

# Convolution configurations. Each tuple has 12 entries laid out as in the
# column header below, with pad and str each covering three values.
#             D,  H,  W, T, R, S,    pad,   str
conv_1x1 = ( 1, 14, 14, 1, 1, 1, 0,0,0, 1,1,1)
conv_3x3 = ( 1, 14, 14, 1, 3, 3, 0,1,1, 1,1,1)
conv_3x3p0 = ( 1, 14, 14, 1, 3, 3, 0,0,0, 1,1,1)
conv_3x3p2 = ( 1, 14, 14, 1, 3, 3, 0,2,2, 1,1,1)
conv_3x3s2 = ( 1, 14, 14, 1, 3, 3, 0,1,1, 1,2,2)
conv_1x3 = ( 1, 14, 14, 1, 1, 3, 0,0,1, 1,1,1)
conv_3x1 = ( 1, 14, 14, 1, 3, 1, 0,1,0, 1,1,1)
conv_5x5 = ( 1, 14, 14, 1, 5, 5, 0,2,2, 1,1,1)
def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
                batch_size=0, stochastic_round=False, device_id=0,
                max_devices=get_device_count(), compat_mode=None):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Any backend previously stored in NervanaObject.be is cleaned up first;
    otherwise cleanup_backend is registered to run at interpreter exit. The
    new backend is stored in NervanaObject.be and its bsz attribute is set
    to batch_size.

    Arguments:
        backend (string, optional): 'cpu', 'gpu' or 'mgpu'. None is treated
                                    as 'cpu'.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend. Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        datatype (dtype): Default tensor data type. CPU backend supports
                          np.float64, np.float32 and np.float16; GPU backend
                          supports np.float32 and np.float16.
        batch_size (int): Set the size the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer
                                               to implement stochastic
                                               rounding. If this is False
                                               rounding will be to nearest. If
                                               True will perform stochastic
                                               rounding using default bit
                                               width. If set to an integer
                                               will round to that number of
                                               bits. Only forwarded to the
                                               gpu and mgpu backends.
        device_id (numeric, optional): Set this to a numeric value which can
                                       be used to select device on which to
                                       run the process
        max_devices (int, optional): For use with multi-GPU backend only.
                                     Controls the maximum number of GPUs to
                                     run on. The default is computed once,
                                     when this function is defined.
        compat_mode (str, optional): if this is set to 'caffe' then the conv
                                     and pooling layer output sizes will match
                                     that of caffe as will the dropout layer
                                     implementation

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: 'gpu' / 'mgpu' requested but the device reports a CUDA
                      compute capability below 5.0.
        ImportError: 'mgpu' requested but the mgpu package cannot be
                     imported (an error is logged before re-raising).
        ValueError: backend is not one of the accepted names.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend only register this
        # function once, will use NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=datatype,
                        compat_mode=compat_mode)
    elif backend == 'gpu' or backend == 'mgpu':
        # check nvcc
        from neon.backends.util import check_gpu
        # Truthiness rather than `is False`: an identity test would let a
        # falsy result that is not the builtin False singleton slip through.
        if not (check_gpu.get_compute_capability(device_id) >= 5.0):
            raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                               "capability 5.0 or greater")
        if backend == 'gpu':
            from neon.backends.nervanagpu import NervanaGPU
            # init gpu
            be = NervanaGPU(rng_seed=rng_seed, default_dtype=datatype,
                            stochastic_round=stochastic_round,
                            device_id=device_id,
                            compat_mode=compat_mode)
        else:
            try:
                from mgpu.nervanamgpu import NervanaMGPU
                # init multiple GPU
                be = NervanaMGPU(rng_seed=rng_seed,
                                 default_dtype=datatype,
                                 stochastic_round=stochastic_round,
                                 num_devices=max_devices)
            except ImportError:
                # NOTE(review): the contact address in this message looks
                # like a redacted placeholder -- confirm the intended text.
                logger.error("Multi-GPU support is a premium feature "
                             "available exclusively through the Nervana cloud."
                             " Please contact [email protected] for details.")
                raise
    else:
        raise ValueError("backend must be one of ('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    NervanaObject.be = be
    be.bsz = batch_size
    return be
dets_ng = ng.array(dets) scores = dets_ng[:, 4].get() order = scores.argsort()[::-1] sorted_dets_dev = dets_ng[order, :] tic_gpu = ng.init_mark() toc_gpu = ng.init_mark() # call through backend ng.record_mark(tic_gpu) keep_ng = ng.nms(sorted_dets_dev, thre) ng.record_mark(toc_gpu) ng.synchronize_mark(toc_gpu) print("gpu NMS time (ms): {}".format(ng.get_time(tic_gpu, toc_gpu))) assert keep_ng == keep_ref if __name__ == '__main__': from neon.backends.nervanagpu import NervanaGPU from neon.backends.nervanacpu import NervanaCPU ng = NervanaGPU() nc = NervanaCPU() test_nms((ng, nc), (0.7, 300))
def gen_backend(backend='cpu', rng_seed=None, default_dtype=np.float32,
                batch_size=0, stochastic_round=False, device_id=0):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Any backend previously stored in NervanaObject.be is cleaned up first;
    otherwise cleanup_backend is registered to run at interpreter exit. The
    new backend is stored in NervanaObject.be and its bsz attribute is set
    to batch_size.

    Arguments:
        backend (string, optional): 'cpu' or 'gpu'. None is treated as 'cpu'.
                                    'mgpu' is recognized but not implemented.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend. Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        default_dtype (dtype): Default tensor data type. CPU backend supports
                               np.float64, np.float32 and np.float16; GPU
                               backend supports np.float32 and np.float16.
        batch_size (int): Set the size the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer
                                               to implement stochastic
                                               rounding. If this is False
                                               rounding will be to nearest. If
                                               True will perform stochastic
                                               rounding using default bit
                                               width. If set to an integer
                                               will round to that number of
                                               bits. Only affects the gpu
                                               backend.
        device_id (numeric, optional): Set this to a numeric value which can
                                       be used to select which device to run
                                       the process on

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: 'gpu' requested but the device reports a CUDA compute
                      capability below 5.0.
        NotImplementedError: 'mgpu' requested.
        ValueError: backend is not one of the accepted names.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated
        # clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend
        # only register this function once, will use
        # NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=default_dtype)
    elif backend == 'gpu':
        # check nvcc
        from neon.backends.util import check_gpu
        # Truthiness rather than `is False`: an identity test would let a
        # falsy result that is not the builtin False singleton slip through.
        if not (check_gpu.get_compute_capability(device_id) >= 5.0):
            raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                               "capability 5.0 or greater")
        from neon.backends.nervanagpu import NervanaGPU
        # init gpu
        be = NervanaGPU(rng_seed=rng_seed, default_dtype=default_dtype,
                        stochastic_round=stochastic_round,
                        device_id=device_id)
    elif backend == 'mgpu':
        raise NotImplementedError("mgpu will be ready soon")
    else:
        raise ValueError("backend must be one of "
                         "('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    NervanaObject.be = be
    be.bsz = batch_size
    return be
def get_backend_pair_mkl(device_id, dtype=np.float32, bench=False):
    """
    Build an (MKL backend, CPU backend) pair sharing the same default dtype.

    Arguments:
        device_id: accepted but not used by this function.
        dtype: default dtype handed to both backends.
        bench: accepted but not used by this function.

    Returns:
        tuple: (NervanaMKL instance, NervanaCPU instance)
    """
    # the MKL backend is constructed before the CPU one
    return NervanaMKL(default_dtype=dtype), NervanaCPU(default_dtype=dtype)
def test_conv_layer():
    """
    Compare convolution fprop, bprop and update between the GPU backend, the
    CPU backend and a numpy reference computed in this function, for one
    fixed layer configuration.
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C, K = 64, 64, 64
    D, H, W = 1, 5, 5
    T, R, S = 1, 3, 3
    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    conv_ng = ng.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)
    conv_nc = nc.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    # both backends must derive the same layer geometry
    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" %
          (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    # reference convolution: one matrix product per output position
    for m in range(M):
        mt = m * str_d - pad_d

        for p in range(P):
            pr = p * str_h - pad_h

            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])

                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])

                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    # the fifth tuple element (w) is carried along but not used below
    for op, ngA, ncA, cpuA, w in (
            ("fprop", ngO, ncO, cpuO, Q),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
            ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S)):
        # print() with a single argument behaves the same on Python 2 and 3
        print(op)
        assert np.allclose(ngA.get(), cpuA, rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    ng.ctx.detach()
    del ng
def get_backend_pair(device_id, dtype=np.float32, bench=False):
    """
    Build a (GPU backend, CPU backend) pair sharing the same default dtype.

    Arguments:
        device_id: device on which the GPU backend is created.
        dtype: default dtype handed to both backends.
        bench: forwarded to the GPU backend only.

    Returns:
        tuple: (NervanaGPU instance, NervanaCPU instance)
    """
    # imported here rather than at module level, as in the original
    from neon.backends.nervanagpu import NervanaGPU

    # the GPU backend is constructed before the CPU one
    return (NervanaGPU(default_dtype=dtype, bench=bench, device_id=device_id),
            NervanaCPU(default_dtype=dtype))
def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
                batch_size=0, stochastic_round=False, device_id=0,
                max_devices=get_device_count(), compat_mode=None,
                deterministic_update=False, deterministic=True):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Any backend previously stored in NervanaObject.be is cleaned up first;
    otherwise cleanup_backend is registered to run at interpreter exit. The
    new backend is stored in NervanaObject.be and its bsz attribute is set
    to batch_size.

    Arguments:
        backend (string, optional): 'cpu', 'gpu', 'mgpu' or 'argon'. None is
                                    treated as 'cpu'.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend. Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        datatype (dtype): Default tensor data type. CPU backend supports
                          np.float64, np.float32 and np.float16; GPU backend
                          supports np.float32 and np.float16.
        batch_size (int): Set the size the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer
                                               to implement stochastic
                                               rounding. If this is False
                                               rounding will be to nearest. If
                                               True will perform stochastic
                                               rounding using default bit
                                               width. If set to an integer
                                               will round to that number of
                                               bits. Only forwarded to the
                                               gpu and mgpu backends.
        device_id (numeric, optional): Set this to a numeric value which can
                                       be used to select device on which to
                                       run the process
        max_devices (int, optional): For use with multi-GPU backend only.
                                     Controls the maximum number of GPUs to
                                     run on. The default is computed once,
                                     when this function is defined.
        compat_mode (str, optional): if this is set to 'caffe' then the conv
                                     and pooling layer output sizes will match
                                     that of caffe as will the dropout layer
                                     implementation
        deterministic_update (bool, optional): deprecated; when true it
                                               forces deterministic to True
                                               and logs a warning.
        deterministic (bool, optional): if set to true, all operations will
                                        be done deterministically. Only
                                        forwarded to the gpu and mgpu
                                        backends.

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: 'gpu' / 'mgpu' requested but the device reports a CUDA
                      compute capability below 3.0.
        ImportError: 'mgpu' requested but the mgpu package cannot be
                     imported (an error is logged before re-raising).
        ValueError: backend is not one of the accepted names.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend only register this
        # function once, will use NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if deterministic_update:
        deterministic = True
        logger.warning(
            "--deterministic_update is deprecated in favor of --deterministic")

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=datatype,
                        compat_mode=compat_mode)
    elif backend == 'gpu' or backend == 'mgpu':
        # check nvcc
        from neon.backends.util import check_gpu
        # Truthiness rather than `is False`: an identity test would let a
        # falsy result that is not the builtin False singleton slip through.
        if not (check_gpu.get_compute_capability(device_id) >= 3.0):
            raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                               "capability 3.0 or greater")
        if backend == 'gpu':
            from neon.backends.nervanagpu import NervanaGPU
            # init gpu
            be = NervanaGPU(rng_seed=rng_seed, default_dtype=datatype,
                            stochastic_round=stochastic_round,
                            device_id=device_id,
                            compat_mode=compat_mode,
                            deterministic=deterministic)
        else:
            try:
                from mgpu.nervanamgpu import NervanaMGPU
                # init multiple GPU
                be = NervanaMGPU(rng_seed=rng_seed,
                                 default_dtype=datatype,
                                 stochastic_round=stochastic_round,
                                 num_devices=max_devices,
                                 compat_mode=compat_mode,
                                 deterministic=deterministic)
            except ImportError:
                # NOTE(review): the contact address in this message looks
                # like a redacted placeholder -- confirm the intended text.
                logger.error(
                    "Multi-GPU support is a premium feature "
                    "available exclusively through the Nervana cloud."
                    " Please contact [email protected] for details.")
                raise
    elif backend == 'argon':
        from argon.neon_backend.ar_backend import ArBackend
        be = ArBackend(rng_seed=rng_seed, default_dtype=datatype)
    else:
        # NOTE(review): 'argon' is accepted above but not listed here.
        raise ValueError("backend must be one of ('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    NervanaObject.be = be
    be.bsz = batch_size
    return be
def test_conv_layer(fargs_tests):
    """
    Compare convolution fprop, bprop and update between the GPU backend, the
    CPU backend and a numpy reference computed in this function.

    Arguments:
        fargs_tests: parametrized value; elements 0, 1 and 2 unpack into
                     (N, C, K), (D, H, W) and (T, R, S).
    """
    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]

    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    # identical positional arguments for both backends
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)

    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # both backends must derive the same layer geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" %
          (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    # reference convolution: one matrix product per output position
    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)
                patch = cpuI[idx, :]
                err = cpuE[:, m, p, q, :]

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
                cpuB[idx, :] += np.dot(cpuF, err)
                cpuU += np.dot(patch, err.T)

    # the fifth tuple element is carried along but not used below
    comparisons = (
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S))
    for op, gpu_out, cpu_out, expected, _ in comparisons:
        print(op)
        assert np.allclose(gpu_out.get(), expected, rtol=0, atol=1e-4)
        assert np.allclose(cpu_out.get(), expected, rtol=0, atol=1e-4)

    del ng
    del nc
def setup(self):
    """
    Create the backends and the array shape stored on this instance: a GPU
    backend with stochastic rounding disabled, a CPU backend, and a
    1024 x 1024 dims tuple.
    """
    self.gpu = NervanaGPU(stochastic_round=False)
    self.cpu = NervanaCPU()
    self.dims = (1024, 1024)
def test_pool_layer(poolargs):
    """
    Compare pooling fprop and bprop results between the GPU backend, the CPU
    backend and the numpy reference (run_numpy_pool).

    Arguments:
        poolargs: parametrized value; only element 0 (the pooling op passed
                  to pool_layer) is read.
    """
    op = poolargs[0]

    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    pool_ng = ng.pool_layer(
        dtype,
        op,
        N, C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    pool_nc = nc.pool_layer(
        dtype,
        op,
        N, C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    # both backends must derive the same layer geometry
    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors
    # (inputs are rounded through float16 before being cast to dtype)
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1)).astype(
        np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # pad the last row of cpu input for the sake of numpy: the most negative
    # finite value for max pooling, zero otherwise
    if op == "max":
        cpuI[-1, :] = np.finfo(dtype).min
    else:
        cpuI[-1, :] = 0

    # =========GPU and CPU and numpy ==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    for opA, ngA, ncA, cpuA in (
            ("fprop", ngO, ncO, cpuO),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI))):
        # print() with a single argument behaves the same on Python 2 and 3
        print(opA)
        assert np.allclose(ngA.get(), ncA.get(), rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    del ng, nc
def test_conv_layer(fargs_tests, device_id):
    """
    Compare convolution fprop, bprop and update between the GPU backend, the
    CPU backend and a numpy reference computed in this function.

    Arguments:
        fargs_tests: parametrized value; elements 0 through 4 unpack into
                     (N, C, K), (D, H, W), (T, R, S), the three padding
                     values and the three stride values.
        device_id: device on which the GPU backend is created.
    """
    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # identical positional arguments for both backends
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)

    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # both backends must derive the same layer geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" %
          (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    # reference convolution: one matrix product per output position
    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)
                patch = cpuI[idx, :]
                err = cpuE[:, m, p, q, :]

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
                cpuB[idx, :] += np.dot(cpuF, err)
                cpuU += np.dot(patch, err.T)

    # the fifth tuple element is carried along but not used below
    comparisons = (
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S))
    for op, gpu_out, cpu_out, expected, _ in comparisons:
        print(op)

        # largest absolute deviation of each backend from the reference,
        # relative to the largest reference magnitude
        cpu_err = abs(expected - cpu_out.get().astype(np.float32)).max()
        gpu_err = abs(expected - gpu_out.get().astype(np.float32)).max()
        scale = abs(expected).max()

        assert cpu_err / scale < 1e-5
        assert gpu_err / scale < 1e-5

        assert allclose_with_out(cpu_out.get(), expected, rtol=0, atol=1e-4)
        assert allclose_with_out(gpu_out.get(), expected, rtol=0, atol=1e-3)

    del ng
    del nc