import os
import ctypes
from time import time

import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
from scipy.stats import truncnorm

# Project-specific modules (ndarray, gpu_op, cpu_op, stream, ad) come from the
# framework under test; the exact import path depends on the repository layout.


def test_layernorm_forward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    last_dim = shape[-1]
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    bias = np.random.random((last_dim,)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_bias = ndarray.array(bias, ctx=ctx)
    arr_mean = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_var = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.layer_normalization(arr_x, arr_scale, arr_bias,
                               arr_mean, arr_var, arr_y, 0.01)
    y = arr_y.asnumpy()

    # NumPy reference: normalize over the last axis with eps = 0.01.
    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 0.01, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std
    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]
    y_ = scale.reshape(bc_shape) * normed_input + bias.reshape(bc_shape)

    np.testing.assert_allclose(np_means, arr_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(np_vars, arr_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(y_, y, atol=1e-6)
    print('Pass forward test with shape ', shape)

def test_truncated_normal(size, mean=0, std=1):
    ctx = ndarray.gpu(0)
    cuda_x = ndarray.empty(size, ctx=ctx)
    stre = stream.create_stream_handle(ctx)

    # Time 10 host-side draws (includes the host-to-device copy).
    np_st = time()
    for i in range(10):
        x = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std,
                          size=size).astype(np.float32)
        cuda_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)

    # Time 10 on-device draws with the CUDA initializer.
    cu_st = time()
    for i in range(10):
        gpu_op.truncated_normal_init(cuda_x, mean, std, 123, stre)
    stre.sync()
    cu_en = time()
    print('cuda time: ', cu_en - cu_st)

    # Compare the two sample distributions visually via histograms.
    fig, ax = plt.subplots(1, 1)
    cuda_x = cuda_x.asnumpy()
    assert cuda_x.shape == x.shape
    ax.hist(x.flatten(), histtype='stepfilled',
            alpha=0.2, bins=50, label='numpy')
    ax.hist(cuda_x.flatten(), histtype='step',
            alpha=0.2, bins=50, label='cuda')
    ax.legend(loc='best', frameon=False)
    file_name = 'truncated_normal_%f_%f.png' % (mean, std)
    plt.savefig(file_name)
    plt.close()

def test_uniform(size, lb=-1, ub=1):
    ctx = ndarray.gpu(0)
    cuda_x = ndarray.empty(size, ctx=ctx)
    stre = stream.create_stream_handle(ctx)

    np_st = time()
    for i in range(10):
        x = np.random.uniform(low=lb, high=ub, size=size).astype(np.float32)
        cuda_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)

    cu_st = time()
    for i in range(10):
        gpu_op.uniform_init(cuda_x, lb, ub, 123, stre)
    stre.sync()
    cu_en = time()
    print('cuda time: ', cu_en - cu_st)

    fig, ax = plt.subplots(1, 1)
    cuda_x = cuda_x.asnumpy()
    assert cuda_x.shape == x.shape
    ax.hist(x.flatten(), histtype='stepfilled',
            alpha=0.2, bins=50, label='numpy')
    ax.hist(cuda_x.flatten(), histtype='step',
            alpha=0.2, bins=50, label='cuda')
    ax.legend(loc='best', frameon=False)
    file_name = 'uniform_%f_%f.png' % (lb, ub)
    plt.savefig(file_name)
    plt.close()

def test_cpu_normal(size, mean=0, std=1):
    cpu_x = ndarray.empty(size, ctx=ndarray.cpu(0))

    np_st = time()
    for i in range(10):
        x = np.random.normal(loc=mean, scale=std,
                             size=size).astype(np.float32)
        cpu_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)

    cpu_st = time()
    for i in range(10):
        cpu_op.normal_init(cpu_x, mean, std, 123)
    cpu_en = time()
    print('cpu time: ', cpu_en - cpu_st)

    fig, ax = plt.subplots(1, 1)
    cpu_x = cpu_x.asnumpy()
    assert cpu_x.shape == x.shape
    ax.hist(x.flatten(), histtype='stepfilled',
            alpha=0.2, bins=50, label='numpy')
    ax.hist(cpu_x.flatten(), histtype='step',
            alpha=0.2, bins=50, label='cpu')
    ax.legend(loc='best', frameon=False)
    file_name = 'normal_%f_%f_cpu.png' % (mean, std)
    plt.savefig(file_name)
    plt.close()

def test_layernorm_backward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    last_dim = shape[-1]
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    mean = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    var = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    arr_grads = ndarray.array(grads, ctx=ctx)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_mean = ndarray.array(mean, ctx=ctx)
    arr_var = ndarray.array(var, ctx=ctx)
    grad_inarr = ndarray.empty(shape, ctx=ctx)
    grad_scale = ndarray.empty((last_dim,), ctx=ctx)
    grad_bias = ndarray.empty((last_dim,), ctx=ctx)
    gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale,
                                        grad_inarr, grad_scale, grad_bias,
                                        arr_mean, arr_var, 0.01)

    # NumPy reference gradients (eps = 0.01); shapes noted as (N, X) / (N, 1) / (X,).
    red_axis = tuple(range(grads.ndim - 1))
    np_grad_bias = grads.sum(red_axis)                              # (X,)
    std = np.sqrt(var + 0.01)                                       # (N, 1)
    x_centered = x - mean                                           # (N, X)
    x_norm = x_centered / std                                       # (N, X)
    np_grad_scale = (grads * x_norm).sum(red_axis)                  # (X,)
    last_dim = x.shape[-1]
    dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1])  # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * \
        -0.5 / (var + 0.01) / std                                   # (N, 1)
    dx_mu_1 = dx_norm / std                                         # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim                      # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2                                        # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim         # (N, 1)
    np_grad_inarr = dx_1 + dx_2                                     # (N, X)

    np.testing.assert_allclose(np_grad_bias, grad_bias.asnumpy(),
                               rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_scale, grad_scale.asnumpy(),
                               rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_inarr, grad_inarr.asnumpy(),
                               rtol=1e-4, atol=1e-4)
    print('Pass backward test with shape ', shape)

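# For reference, a sketch of the identities checked above (eps = 0.01,
# X = last_dim; sum_X is over the last axis, sum_N over all batch axes):
#   x_hat = (x - m) / sqrt(v + eps),        y = g * x_hat + b
#   db  = sum_N dy
#   dg  = sum_N dy * x_hat
#   dv  = sum_X(dy * g * (x - m)) * (-1/2) * (v + eps)^(-3/2)
#   dx1 = dy * g / sqrt(v + eps) + dv * 2 * (x - m) / X
#   dx  = dx1 - (1/X) * sum_X(dx1)
# Note that mean and var are drawn at random here rather than computed from x,
# so the test checks kernel/NumPy consistency, not a full forward-backward pass.
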
def test_sparse_matrix_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 7000, density=density,
                          format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col),
                                 shape=[500, 7000], ctx=ctx)
    mat_y = ndarray.array(y, ctx=ctx)
    mat_z = ndarray.empty((500, 100), ctx=ctx)
    gpu_op.CuSparse_Csrmm(mat_x, False, mat_y, False, mat_z)
    z = mat_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

def test_sparse_array_dense_vector_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)

    # Case 1: z = A @ y, no transpose.
    x = scipy.sparse.rand(500, 70000, density=density,
                          format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col),
                                 shape=[500, 70000], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = False
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

    # Case 2: z = A^T @ y, with the transpose flag set.
    x = scipy.sparse.rand(70000, 500, density=density,
                          format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col),
                                 shape=[70000, 500], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = True
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.transpose().dot(y), z, rtol=1e-5)

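# The parameter-server tests below read module-level globals (nitem, item_len,
# indx1, indx2) defined elsewhere in the original harness. The values here are
# illustrative assumptions so the functions are self-contained; the only hard
# constraint is that indx1 * indx2 matches the length of the rpush/rpull buffers.
nitem = 2000             # number of rows in the shared parameter table (assumed)
item_len = 128           # width of each row (assumed)
indx1, indx2 = 10, 50    # push/pull index buffers reshape to (indx1, indx2) (assumed)
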
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    assert init_type in ('constant', 'uniform', 'normal', 'truncated_normal')
    init_type_map = {'constant': 0, 'uniform': 1,
                     'normal': 2, 'truncated_normal': 3}
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    itype, ctypes.c_double(init_a), ctypes.c_double(init_b),
                    ctypes.c_ulonglong(123), ctypes.c_int(0),
                    (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
    if rank == 0:
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        # All workers must observe the same initialized tensor as rank 0.
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    elif init_type == 'constant':
        np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                   arr.asnumpy(), rtol=5e-7)
    else:
        # Random initializers cannot be checked exactly; compare the sample
        # distribution against NumPy/SciPy draws via histograms.
        if init_type == 'uniform':
            numpy_samples = np.random.uniform(
                low=init_a, high=init_b,
                size=(nitem, item_len)).astype(np.float32)
        elif init_type == 'normal':
            numpy_samples = np.random.normal(
                loc=init_a, scale=init_b,
                size=(nitem, item_len)).astype(np.float32)
        else:
            numpy_samples = truncnorm.rvs(
                -2.0, 2.0, loc=init_a, scale=init_b,
                size=(nitem, item_len)).astype(np.float32)
        fig, ax = plt.subplots(1, 1)
        ax.hist(numpy_samples.flatten(), histtype='stepfilled',
                alpha=0.2, bins=50, label='numpy')
        ax.hist(local_arr.flatten(), histtype='step',
                alpha=0.2, bins=50, label='ps')
        ax.legend(loc='best', frameon=False)
        file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a,
                                             init_b, int(sparse))
        plt.savefig(file_name)
        print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()

def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(
        nitem, item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    ctypes.c_int(0), ctypes.c_double(0.0),
                    ctypes.c_double(1.0), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(lr), ctypes.c_int(1))
    if sparse:
        # SparsePush + dense Pull: rank 0 pushes ones at local_push indices.
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        # Fused sparse-push / dense-pull.
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SDPushPull(0, push_ind.handle, push_val.handle,
                            arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        # Fused sparse-push / sparse-pull, then a standalone SparsePull.
        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle,
                            pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        np.testing.assert_allclose(
            local_arr[local_pull.astype(int)].reshape(indx1, indx2, item_len),
            pull_val.asnumpy(), rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    else:
        # Dense Push + Pull: all workers must see rank 0's array.
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        # Fused dense push-pull: pushing ones increments every entry by 1.
        if rank == 0:
            temp_push_val = ndarray.array(
                np.ones((nitem, item_len)).astype(np.float32), ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()
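
# A minimal driver, assuming single-process use: it exercises only the local
# GPU/CPU tests. test_init_ps and test_api additionally require a launched
# parameter-server job (WORKER_ID / DMLC_NUM_WORKER set, plus the shared
# rarr/rpush/rpull buffers), so they are not invoked here.
if __name__ == '__main__':
    test_layernorm_forward()
    test_layernorm_backward()
    test_sparse_matrix_multiply()
    test_sparse_array_dense_vector_multiply()
    test_truncated_normal((1000, 1000))
    test_uniform((1000, 1000))
    test_cpu_normal((1000, 1000))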