import numpy as np


class TestGPU(object):

    def setup(self):
        from neon.backends.mgpu import MGPU, MGPUTensor
        # this code gets called prior to each test
        try:
            self.be = MGPU(rng_seed=0, num_dev=2)
        except AssertionError:
            # likely that only one GPU device is available
            self.be = MGPU(rng_seed=0, num_dev=1)
        self.gpt = MGPUTensor

    def reduction_test(self):
        nr = self.be.num_dev
        if nr == 1:
            # This shouldn't be supported
            return
        # create a numpy array as the test-bed
        asize = 9
        # round up to the nearest multiple of num_dev
        bsize = -(-asize // nr) * nr
        h_a = np.random.randn(asize * nr).reshape(
            (nr, asize)).astype(self.be.default_dtype)
        h_result = np.sum(h_a, axis=0, keepdims=True)

        d_a = self.be.empty((1, asize))
        u_a = self.be.empty((1, bsize))
        self.be.scatter(h_a, d_a)
        self.be.reduce(d_a, u_a)
        print(h_result)
        print(d_a.tlist[0].asnumpyarray())
        # after the reduce, every device should hold the summed result
        for i in range(nr):
            np.testing.assert_allclose(d_a.tlist[i].asnumpyarray(),
                                       h_result, atol=1e-6, rtol=0)

    def memset_test(self):
        # create a numpy array as the test-bed
        asize = 9
        h_result = np.zeros((1, asize))
        d_a = self.be.zeros((1, asize))
        for i in range(self.be.num_dev):
            np.testing.assert_allclose(d_a.tlist[i].asnumpyarray(),
                                       h_result, atol=1e-6, rtol=0)

    def frag2rep_test(self):
        nr = self.be.num_dev
        if nr == 1:
            # This shouldn't be supported
            return
        np.random.seed(0)
        # create a numpy array as the test-bed
        (rows, cols) = (24, 128)
        indim = rows * cols
        odim = indim * nr

        # h_frags has the data in the order we expect on the device
        h_frags_t = np.random.randn(odim).reshape(
            (nr * cols, rows)).astype(self.be.default_dtype)
        h_frags = h_frags_t.transpose().astype(
            self.be.default_dtype, order='C')

        d_frags = self.be.empty((rows, cols))
        d_frags_t = self.be.empty((cols, rows))
        d_reps = self.be.empty((rows, cols * nr))
        d_reps_t = self.be.empty((cols * nr, rows))

        self.be.scatter(h_frags_t, d_frags_t)
        self.be.transpose(d_frags_t, d_frags)
        np.testing.assert_allclose(d_frags.asnumpyarray(), h_frags,
                                   atol=1e-5, rtol=0)

        self.be.fragment_to_replica(d_frags_t, d_reps_t)
        self.be.transpose(d_reps_t, d_reps)
        for i in range(nr):
            np.testing.assert_allclose(d_frags.asnumpyarray(),
                                       d_reps.tlist[i].asnumpyarray(),
                                       atol=1e-5, rtol=0)
        print("Frag2Rep OK")

        d_frags_t.fill(0)
        self.be.replica_to_fragment(d_reps_t, d_frags_t)
        self.be.transpose(d_frags_t, d_frags)
        for i in range(nr):
            np.testing.assert_allclose(d_frags.asnumpyarray(),
                                       d_reps.tlist[i].asnumpyarray(),
                                       atol=1e-5, rtol=0)
        print("Rep2Frag OK")
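For readers without multiple GPUs, the scatter/reduce semantics exercised by `reduction_test` can be mimicked in plain NumPy. The following is a minimal sketch of those assumed semantics (pure NumPy, no GPUs required; the `num_dev` and `asize` values are arbitrary): scatter hands row `i` of the host array to device `i`, and reduce leaves the element-wise sum of all fragments visible on every device.

```python
import numpy as np

# NumPy model of the MGPU scatter/reduce round trip checked above.
# "Devices" here are just entries of a list; num_dev=2 is an arbitrary choice.
num_dev, asize = 2, 9
h_a = np.random.randn(num_dev, asize).astype(np.float32)

# scatter: row i of the host array becomes device i's fragment
fragments = [h_a[i].copy() for i in range(num_dev)]

# reduce: every device ends up holding the element-wise sum of all fragments
total = np.sum(h_a, axis=0, keepdims=True)
replicas = [total.copy() for _ in range(num_dev)]

for rep in replicas:
    np.testing.assert_allclose(rep, total, atol=1e-6, rtol=0)
```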
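The fragment/replica round trip in `frag2rep_test` behaves like an all-gather followed by a per-device slice. A minimal NumPy sketch of that assumed behavior (sizes are hypothetical; the column-block ownership is inferred from the test's tensor shapes):

```python
import numpy as np

# NumPy model of the fragment_to_replica / replica_to_fragment round trip.
# Sizes are arbitrary; each "device" owns one column-block of the data.
num_dev, rows, cols = 2, 4, 3
fragments = [np.random.randn(rows, cols).astype(np.float32)
             for _ in range(num_dev)]

# fragment_to_replica behaves like an all-gather: every device ends up
# holding the concatenation of all fragments.
replica = np.concatenate(fragments, axis=1)   # shape (rows, cols * num_dev)
replicas = [replica.copy() for _ in range(num_dev)]

# replica_to_fragment slices each device's own column-block back out.
recovered = [replicas[i][:, i * cols:(i + 1) * cols] for i in range(num_dev)]
for orig, back in zip(fragments, recovered):
    np.testing.assert_allclose(orig, back, atol=1e-5, rtol=0)
```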
def gen_backend(model=None, gpu=None, nrv=False, flexpoint=False,
                rng_seed=None, numerr_handling=None, half=False,
                stochastic_round=0, device_id=None):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given.  With no parameters, a single CPU core, float32
    backend is returned.

    Arguments:
        model (neon.models.model.Model): The instantiated model upon which we
                                         will utilize this backend.
        gpu (string, optional): Attempt to utilize a CUDA capable GPU if
                                installed in the system.  Defaults to None
                                which implies a CPU based backend.  If
                                'cudanet', utilize a cuda-convnet2 based
                                backend, which supports Kepler and Maxwell
                                GPUs with single precision.  If 'nervanagpu',
                                attempt to utilize the NervanaGPU Maxwell
                                backend with float16 and float32 support.
        nrv (bool, optional): If True, attempt to utilize the Nervana Engine
                              for computation (must be installed on the
                              system).  Defaults to False which implies a CPU
                              based backend.
        rng_seed (numeric, optional): Set this to a numeric value which can
                                      be used to seed the random number
                                      generator of the instantiated backend.
                                      Defaults to None, which doesn't
                                      explicitly seed (so each run will be
                                      different).
        stochastic_round (numeric, optional): Only affects the nervanagpu
                                              backend.  If 1, perform
                                              stochastic rounding.  If 0,
                                              round to nearest.
        numerr_handling (dict, optional): Dictate how numeric errors are
                                          displayed and handled.  The keys
                                          and values permissible for this
                                          dict match that seen in
                                          numpy.seterr.  If set to None (the
                                          default), behavior is equivalent
                                          to {'all': 'warn'}.
        device_id (numeric, optional): Set this to a numeric value which can
                                       be used to select which device to run
                                       the process on.

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Notes:
        * Attempts to construct a GPU instance without a CUDA capable card
          or without the cudanet or nervanagpu package installed will cause
          the program to display an error message and exit.
        * Attempts to construct a parallel instance without mpi4py installed
          will cause the program to display an error message and exit.
        * The returned backend will still need to call its par.init_model()
          at some point after the model has been linked, in order for
          parallel training to proceed.
""" logger = logging.getLogger(__name__) gpuflag = False if gpu is not None: gpu = gpu.lower() if sys.platform.startswith("linux"): gpuflag = (os.system("nvcc --version > /dev/null 2>&1") == 0) elif sys.platform.startswith("darwin"): gpuflag = ( os.system("kextstat | grep -i cuda > /dev/null 2>&1") == 0) if gpuflag and gpu == 'cudanet': try: import cudanet # noqa from neon.backends.cc2 import GPU be_name = 'Cudanet' be = GPU(rng_seed=rng_seed, device_id=device_id) except ImportError: raise RuntimeError("cudanet not found, can't run via GPU") elif gpuflag and gpu.startswith('nervanagpu'): try: import nervanagpu # noqa try: be_name = 'NervanaGPU' if gpu == 'nervanagpu': device_id = 0 if device_id is None else device_id[0] from neon.backends.gpu import GPU be = GPU(rng_seed=rng_seed, stochastic_round=stochastic_round, device_id=device_id) else: from neon.backends.mgpu import MGPU try: num_dev = int(gpu.strip('nervanagpu')) except (ValueError): raise ValueError("invalid number of GPUs" + " specified") if not device_id: device_id = range(num_dev) if len(device_id) != num_dev: raise RuntimeError( "Incorrect number of devices" " specified ", device_id, num_dev) be = MGPU(rng_seed=rng_seed, stochastic_round=stochastic_round, device_id=device_id, num_dev=num_dev) except ImportError: logger.warning("pycuda error, can't run via GPU") gpuflag = False except ImportError: logger.warning("nervanagpu not found, can't run via GPU") gpuflag = False if gpuflag is False: raise RuntimeError("Can't find CUDA capable GPU") elif nrv: nrv = False try: from umd.nrv_backend import NRVBackend nrv = True except ImportError: logger.warning("Nervana Engine system software not found") if flexpoint: logger.warning("Flexpoint(TM) backend not currently available") if nrv: be_name = 'NRV' be = NRVBackend(rng_seed=rng_seed, seterr_handling=numerr_handling, device_id=device_id) elif not gpuflag: be_name = 'CPU' be = CPU(rng_seed=rng_seed, seterr_handling=numerr_handling) logger.info("{} backend, RNG seed: {}, numerr: {}".format( be_name, rng_seed, numerr_handling)) return be