def test_post_nccl_fork():
    """Emit a one-line CI report of visible vs. physically present GPUs."""
    # device_count() honours CUDA_VISIBLE_DEVICES, list_gpus() does not.
    n_visible = get_device_count()
    n_present = len(mx.test_utils.list_gpus())
    # some push-ups to create a one line CI output
    sys.stdout.write('{} of {} gpus are visible ... '.format(n_visible, n_present))
    sys.stdout.flush()
def test_sync_batchnorm():
    """Run the batchnorm consistency check across 2 devices (skips on <2 GPUs)."""
    n_gpus = get_device_count()
    if n_gpus < 2:
        # no need to use SyncBN with 1 gpu
        sys.stderr.write('bypassing test: needs 2 or more gpus, found {} ...'.format(n_gpus))
        return
    ndev = 2
    # check with unsync version
    for _ in range(10):
        _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)),
                                num_devices=ndev,
                                cuda=True)
def test_rsp_push_pull():
    """Exercise row_sparse push/pull on 'local' and 'device' kvstores.

    Runs the whole suite twice: once with the default kvstore communication
    and once with MXNET_KVSTORE_USETREE=1 (tree-based reduce), where a plain
    dense pull of a row_sparse value is not supported and is skipped.
    """
    num_gpus = get_device_count()

    def check_rsp_push_pull(kv_type, sparse_pull, is_push_cpu=True):
        # Init a row_sparse value and push ones from two contexts so every
        # retained row accumulates to 2.
        kv = init_kv_with_str('row_sparse', kv_type)
        kv.init('e', mx.nd.ones(shape).tostype('row_sparse'))
        push_ctxs = [mx.cpu(i) if is_push_cpu else mx.gpu(i) for i in range(2)]
        kv.push('e', [mx.nd.ones(shape, ctx=context).tostype('row_sparse')
                      for context in push_ctxs])

        def check_rsp_pull(kv, count, ctxs, sparse_pull, is_same_rowid=False, use_slice=False):
            num_rows = shape[0]
            row_ids = []
            all_row_ids = np.arange(num_rows)
            vals = [mx.nd.sparse.zeros(shape=shape, ctx=ctxs[i], stype='row_sparse')
                    for i in range(count)]
            if is_same_rowid:
                # every destination pulls the identical set of row ids
                row_id = np.random.randint(num_rows, size=num_rows)
                row_ids = [mx.nd.array(row_id)] * count
            elif use_slice:
                # row ids are NDArray slices of one big id array
                total_row_ids = mx.nd.array(
                    np.random.randint(num_rows, size=count * num_rows))
                row_ids = [total_row_ids[i * num_rows:(i + 1) * num_rows]
                           for i in range(count)]
            else:
                # independent random row ids per destination
                for _ in range(count):
                    row_id = np.random.randint(num_rows, size=num_rows)
                    row_ids.append(mx.nd.array(row_id))
            row_ids_to_pull = row_ids[0] if (len(row_ids) == 1 or is_same_rowid) else row_ids
            vals_to_pull = vals[0] if len(vals) == 1 else vals
            kv.row_sparse_pull('e', out=vals_to_pull, row_ids=row_ids_to_pull)
            for val, row_id in zip(vals, row_ids):
                retained = val.asnumpy()
                excluded_row_ids = np.setdiff1d(all_row_ids, row_id.asnumpy())
                for row in range(num_rows):
                    # rows that were not requested must stay zero; requested rows are 2
                    expected_val = np.zeros_like(retained[row])
                    expected_val += 0 if row in excluded_row_ids else 2
                    assert_almost_equal(retained[row], expected_val)
            if sparse_pull is True:
                # a regular (dense) pull must materialize every row as 2
                kv.pull('e', out=vals_to_pull, ignore_sparse=False)
                for val in vals:
                    retained = val.asnumpy()
                    expected_val = np.zeros_like(retained)
                    expected_val[:] = 2
                    assert_almost_equal(retained, expected_val)

        check_rsp_pull(kv, 1, [mx.gpu(0)], sparse_pull)
        check_rsp_pull(kv, 1, [mx.cpu(0)], sparse_pull)
        num_gpu_ctxs = 2 * num_gpus
        num_cpu_ctxs = 4
        check_rsp_pull(kv, num_gpu_ctxs, [mx.gpu(i // 2) for i in range(num_gpu_ctxs)],
                       sparse_pull)
        check_rsp_pull(kv, num_gpu_ctxs, [mx.gpu(i // 2) for i in range(num_gpu_ctxs)],
                       sparse_pull, is_same_rowid=True)
        check_rsp_pull(kv, num_cpu_ctxs, [mx.cpu(i) for i in range(num_cpu_ctxs)],
                       sparse_pull)
        check_rsp_pull(kv, num_cpu_ctxs, [mx.cpu(i) for i in range(num_cpu_ctxs)],
                       sparse_pull, is_same_rowid=True)
        check_rsp_pull(kv, num_gpu_ctxs, [mx.gpu(i // 2) for i in range(num_gpu_ctxs)],
                       sparse_pull, use_slice=True)
        check_rsp_pull(kv, num_cpu_ctxs, [mx.cpu(i) for i in range(num_cpu_ctxs)],
                       sparse_pull, use_slice=True)

    envs = ["", "1"]
    key = "MXNET_KVSTORE_USETREE"
    for val in envs:
        with EnvManager(key, val):
            # BUG FIX: original used `val is "1"`, an identity comparison that
            # only works by CPython string interning (SyntaxWarning on 3.8+).
            # Compare by value instead; dense pull is unsupported in tree mode.
            sparse_pull = (val != "1")
            check_rsp_push_pull('local', sparse_pull)
            check_rsp_push_pull('device', sparse_pull)
            if num_gpus >= 2:
                check_rsp_push_pull('device', sparse_pull, is_push_cpu=False)
            else:
                sys.stdout.write('Bypassing 2-GPU test, num gpus found = ' + str(num_gpus) + ' ... ')
                sys.stdout.flush()
import sys import os import mxnet as mx import numpy as np import unittest from mxnet.cuda_utils import get_device_count curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import * shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)] keys = [1,2,3,4,5,6,7] num_gpus = get_device_count() if num_gpus > 8 : print("The machine has {} gpus. We will run the test on 8 gpus.".format(num_gpus)) print("There is a limit for all PCI-E hardware on creating number of P2P peers. The limit is 8.") num_gpus = 8; gpus = range(1, 1+num_gpus) @with_seed() def test_nccl_pushpull(): sys.stdout.write('Performing nccl test with ' + str(num_gpus) + ' gpu(s): ') sys.stdout.flush() for shape, key in zip(shapes, keys): for n_gpus in gpus: