def train_hetu(args):
    """Distributed GraphSage training worker (PS communication mode).

    Loads dataset metadata from ``meta.yml``, builds a two-layer GraphSage
    network plus a linear classifier, then trains on subgraphs drawn from a
    DistributedGraphSageSampler until ``num_epoch`` passes over this worker's
    node partition have completed.  Progress is printed once per epoch.
    """
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    # Worker rank / world size come from the PS-lite environment variables.
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    # NOTE(review): the second layer's input is 2*hidden_layer_size —
    # presumably GraphSage concatenates self and neighbor features; confirm
    # against the GraphSage layer implementation.
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    # Linear classifier on top of the GraphSage embeddings.
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Zero out the loss of nodes excluded by the sampler's mask before averaging.
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)
    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask, ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Masked count of correct predictions in this sampled batch.
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # Keep all PS workers in lockstep once per batch.
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            # One "epoch" = having sampled at least as many nodes as this
            # worker's partition contains.
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
def __init__(self, stream=None, mpi_init=True):
    '''Initialize an MPI + NCCL communicator wrapper.

    Attributes (ctypes handles filled in by the MPI/NCCL calls below):
        mpicomm: the MPI communicator, to use in MPI_Bcast, MPI_Reduce,
            MPI_Scatter, etc.
        ncclcomm: the NCCL communicator, to use in ncclAllReduce ...
        nRanks: the total number of MPI threads.
        myRank: the rank in all MPI threads.
        localRank: the rank among the MPI threads on this device.
        ncclId: ncclGetUniqueId should be called once when creating a
            communicator and the Id should be distributed to all ranks in
            the communicator before calling ncclCommInitRank.
        stream: the stream for NCCL communication.

    Args:
        stream: optional pre-built stream handle; if None a new one is
            created on the GPU chosen by the local rank.
        mpi_init: when True, call MPI_Init before querying ranks.
    '''
    # ctypes storage for the opaque MPI/NCCL handles and rank info.
    self.mpicomm = c_int64(0)
    self.ncclcomm = c_int64(0)
    self.nRanks = c_int32(0)
    self.myRank = c_int32(0)
    self.localRank = c_int32(-1)  # -1 until getLocalRank() fills it in
    self.ncclId = ncclUniqueId()
    self.device_id = c_int(0)
    if mpi_init:
        self.MPI_Init()
    self.groupComm_flag = False
    # Query communicator, global rank/size, then the node-local rank.
    self.MPIGetComm()
    self.MPI_Comm_rank()
    self.MPI_Comm_size()
    self.getLocalRank()
    # One GPU per local rank.
    self.device_id.value = self.localRank.value
    if stream == None:
        self.stream = create_stream_handle(
            ndarray.gpu(self.device_id.value))
    else:
        self.stream = stream
def test_uniform(size, lb=-1, ub=1):
    """Benchmark numpy vs CUDA uniform init and save overlaid histograms."""
    device = ndarray.gpu(0)
    dev_buf = ndarray.empty(size, ctx=device)
    handle = stream.create_stream_handle(device)
    # Time ten rounds of numpy sampling (including the host->device copy).
    t0 = time()
    for _ in range(10):
        host_vals = np.random.uniform(low=lb, high=ub, size=size).astype(np.float32)
        dev_buf[:] = host_vals
    t1 = time()
    print('numpy time: ', t1 - t0)
    # Time ten rounds of the CUDA kernel with a fixed seed, then sync once.
    t2 = time()
    for _ in range(10):
        gpu_op.uniform_init(dev_buf, lb, ub, 123, handle)
    handle.sync()
    t3 = time()
    print('cuda time: ', t3 - t2)
    fig, ax = plt.subplots(1, 1)
    dev_vals = dev_buf.asnumpy()
    assert (dev_vals.shape == host_vals.shape)
    ax.hist(host_vals.flatten(), histtype='stepfilled', alpha=0.2, bins=50, label='numpy')
    ax.hist(dev_vals.flatten(), histtype='step', alpha=0.2, bins=50, label='cuda')
    ax.legend(loc='best', frameon=False)
    plt.savefig('uniform_%f_%f.png' % (lb, ub))
    plt.close()
def test_truncated_normal(size, mean=0, std=1):
    """Benchmark scipy truncated-normal vs the CUDA kernel; plot histograms."""
    device = ndarray.gpu(0)
    dev_buf = ndarray.empty(size, ctx=device)
    handle = stream.create_stream_handle(device)
    # Ten rounds of host-side sampling, truncated at +/- 2 sigma.
    t0 = time()
    for _ in range(10):
        host_vals = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std, size=size).astype(np.float32)
        dev_buf[:] = host_vals
    t1 = time()
    print('numpy time: ', t1 - t0)
    # Ten rounds of the device kernel with a fixed seed, single sync at the end.
    t2 = time()
    for _ in range(10):
        gpu_op.truncated_normal_init(dev_buf, mean, std, 123, handle)
    handle.sync()
    t3 = time()
    print('cuda time: ', t3 - t2)
    fig, ax = plt.subplots(1, 1)
    dev_vals = dev_buf.asnumpy()
    assert (dev_vals.shape == host_vals.shape)
    ax.hist(host_vals.flatten(), histtype='stepfilled', alpha=0.2, bins=50, label='numpy')
    ax.hist(dev_vals.flatten(), histtype='step', alpha=0.2, bins=50, label='cuda')
    ax.legend(loc='best', frameon=False)
    plt.savefig('truncated_normal_%f_%f.png' % (mean, std))
    plt.close()
def test_layernorm_forward(shape=(5, 3)):
    """Check the GPU layer-norm forward pass against a numpy reference (eps=0.01)."""
    ctx = ndarray.gpu(1)
    feat_dim = shape[-1]
    inp = np.random.random(shape).astype(np.float32)
    gamma = np.random.random((feat_dim,)).astype(np.float32)
    beta = np.random.random((feat_dim,)).astype(np.float32)
    d_inp = ndarray.array(inp, ctx=ctx)
    d_gamma = ndarray.array(gamma, ctx=ctx)
    d_beta = ndarray.array(beta, ctx=ctx)
    # Mean/var are computed per row over the last dimension.
    stat_shape = list(shape[:-1]) + [1]
    d_mean = ndarray.empty(stat_shape, ctx=ctx)
    d_var = ndarray.empty(stat_shape, ctx=ctx)
    d_out = ndarray.empty(shape, ctx=ctx)
    gpu_op.layer_normalization(d_inp, d_gamma, d_beta, d_mean, d_var, d_out, 0.01)
    out = d_out.asnumpy()
    # numpy reference implementation
    ref_mean = inp.mean(axis=-1, dtype=np.float32, keepdims=True)
    ref_var = inp.var(axis=-1, dtype=np.float32, keepdims=True)
    denom = np.sqrt(ref_var + 0.01, dtype=np.float32)
    normed = (inp - ref_mean) / denom
    bcast = [1] * len(inp.shape)
    bcast[-1] = inp.shape[-1]
    ref_out = gamma.reshape(bcast) * normed + bias_term(beta, bcast)
    np.testing.assert_allclose(ref_mean, d_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(ref_var, d_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(ref_out, out, atol=1e-6)
    print('Pass forward test with shape ', shape)


def bias_term(beta, bcast):
    # Broadcast the bias vector to the normalized input's rank.
    return beta.reshape(bcast)
def train_hetu(num_epoch):
    """Full-batch GCN training on module-level ``graph``; evaluate on ``graph_full``.

    Relies on module globals: ``graph``, ``graph_full``, ``use_same_init``,
    ``init_w1``/``init_b1``/``init_w2``/``init_b2``, ``num_features``,
    ``num_classes``, ``hidden_layer_size``, ``train_split``.
    Returns the per-epoch mean training losses.
    """
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    # Optionally share a fixed initialization (for cross-framework comparison,
    # presumably — confirm against the calling script).
    if use_same_init:
        gcn1 = GCN(num_features, hidden_layer_size, custom_init=(init_w1, init_b1))
        gcn2 = GCN(hidden_layer_size, num_classes, custom_init=(init_w2, init_b2))
    else:
        gcn1 = GCN(num_features, hidden_layer_size)
        gcn2 = GCN(hidden_layer_size, num_classes)
    mp_val = mp_matrix(graph, ctx, use_original_gcn_norm=True)
    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph.x, ctx=ctx),
        y_ : ndarray.array(convert_to_one_hot(graph.y, max_val=num_classes), ctx=ctx)
    }
    x = gcn1(x_)
    x = ad.relu_op(x)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    start_time = time.time()
    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph.y).sum()
        losses.append(loss_val.asnumpy().mean())
        # Reset the clock after the first iteration so the reported time
        # excludes one-time setup/compilation cost.
        if i==0:
            start_time= time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Train accuracy:", acc/len(y_predicted))
        print("Hetu time:",i, time.time()-start_time)
    print("Hetu time:", time.time()-start_time)
    # Evaluation on the full graph; accuracy measured on nodes past train_split.
    mp_val = mp_matrix(graph_full, ctx)
    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph_full.x, ctx=ctx),
    }
    executor_eval = ad.Executor([y], ctx=ctx)
    y_predicted, = executor_eval.run(feed_dict=feed_dict)
    y_predicted = y_predicted.asnumpy().argmax(axis=1)
    acc = (y_predicted == graph_full.y)[train_split:].sum()
    print("Test accuracy:", acc/len(y_predicted[train_split:]))
    return losses
def transform(result):
    """Turn a (graph, sample_mask) pair into padded training inputs.

    Uses enclosing-scope names ``node_upper_bound``, ``rank``, ``args``,
    ``padding`` and ``mp_matrix``.  Returns (padded graph, message-passing
    matrix, train mask, test mask), each mask sized to ``node_upper_bound``.
    """
    [graph, sample_mask] = result
    # NOTE(review): graph.x[:, -1] appears to be a per-node train/test flag
    # (1 = train) — confirm against the dataset preprocessing.
    train_mask = np.zeros(node_upper_bound)
    train_mask[0:graph.num_nodes] = sample_mask * graph.x[:, -1]
    test_mask = np.zeros(node_upper_bound)
    # Sampled nodes that are NOT flagged as train nodes.
    test_mask[0:graph.num_nodes] = (sample_mask - graph.x[:, -1]) * sample_mask
    # Pad the graph to a fixed node count so batch shapes stay constant.
    graph = padding(graph, node_upper_bound)
    mp_val = mp_matrix(graph, ndarray.gpu(rank % args.num_local_worker))
    return graph, mp_val, train_mask, test_mask
def test_sparse_matrix_multiply():
    """CuSparse CSR x dense matmul checked against scipy's sparse dot."""
    sparsity = 1e-3
    dev = ndarray.gpu(0)
    lhs = scipy.sparse.rand(500, 7000, density=sparsity, format='coo', dtype=np.float32)
    rhs = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
    d_lhs = ndarray.sparse_array(lhs.data, (lhs.row, lhs.col), shape=[500, 7000], ctx=dev)
    d_rhs = ndarray.array(rhs, ctx=dev)
    d_out = ndarray.empty((500, 100), ctx=dev)
    # Neither operand transposed.
    gpu_op.CuSparse_Csrmm(d_lhs, False, d_rhs, False, d_out)
    np.testing.assert_allclose(lhs.dot(rhs), d_out.asnumpy(), rtol=1e-5)
def train_hetu(args):
    """Distributed two-layer GCN training over a gRPC-backed subgraph sampler.

    Reads dataset metadata from ``meta.yml``, initializes the gRPC transport
    from ``args.ip_config``, and trains with a PS-mode executor until
    ``num_epoch`` passes over this worker's node partition are done.
    """
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    # Rank / world size from the PS-lite environment.
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    def transform(graph):
        # Runs inside the sampler: attach the message-passing matrix so the
        # training loop doesn't have to rebuild it.
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val
    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank ,transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            # Keep workers in lockstep once per batch.
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += g_sample.num_nodes
            # One "epoch" = having visited at least this worker's share of nodes.
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
def test_allreduce(comm=None):
    """Time one NCCL sum all-reduce of a (24, 24) float32 tensor.

    Returns (payload size in bytes, elapsed seconds).
    """
    shape = (24, 24)
    # 4 bytes per float32 element.
    nbytes = 4
    for dim in shape:
        nbytes *= dim
    local = comm.localRank.value
    # Each rank contributes a tensor filled with its own local rank.
    payload = ndarray.array(np.ones(shape) * local, ctx=ndarray.gpu(local))
    begin = time.time()
    comm.dlarrayNcclAllReduce(payload, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
    comm.stream.sync()
    finish = time.time()
    return nbytes, finish - begin
def test_layernorm_backward(shape=(5, 3)): ctx = ndarray.gpu(1) # shape = (5, 3) last_dim = shape[-1] grads = np.random.random(shape).astype(np.float32) x = np.random.random(shape).astype(np.float32) scale = np.random.random((last_dim,)).astype(np.float32) mean = np.random.random(list(shape[:-1])+[1]).astype(np.float32) var = np.random.random(list(shape[:-1])+[1]).astype(np.float32) arr_grads = ndarray.array(grads, ctx=ctx) arr_x = ndarray.array(x, ctx=ctx) arr_scale = ndarray.array(scale, ctx=ctx) arr_mean = ndarray.array(mean, ctx=ctx) arr_var = ndarray.array(var, ctx=ctx) grad_inarr = ndarray.empty(shape, ctx=ctx) grad_scale = ndarray.empty((last_dim,), ctx=ctx) grad_bias = ndarray.empty((last_dim,), ctx=ctx) gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale, grad_inarr, grad_scale, grad_bias, arr_mean, arr_var, 0.01) # numpy calculate phase red_axis = tuple(range(grads.ndim-1)) np_grad_bias = grads.sum(red_axis) # (X,) std = np.sqrt(var + 0.01) # (N, 1) x_centered = x - mean # (N, X) x_norm = x_centered / std # (N, X) np_grad_scale = (grads * x_norm).sum(red_axis) # (X,) last_dim = x.shape[-1] dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1]) # (N, X) dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * -0.5 / (var + 0.01) / std # (N, 1) dx_mu_1 = dx_norm / std # (N, X) dx_mu_2 = dvar * 2 * x_centered / last_dim # (N, X) dx_1 = dx_mu_1 + dx_mu_2 # (N, X) dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim # (N, 1) np_grad_inarr = dx_1 + dx_2 # (N, X) np.testing.assert_allclose(np_grad_bias, grad_bias.asnumpy(), rtol=1e-4, atol=1e-4) np.testing.assert_allclose(np_grad_scale, grad_scale.asnumpy(), rtol=1e-4, atol=1e-4) np.testing.assert_allclose(np_grad_inarr, grad_inarr.asnumpy(), rtol=1e-4, atol=1e-4) print('Pass backward test with shape ', shape)
def test_dense():
    """One SGD step through (X + 3) @ W with a CPU-resident weight variable."""
    np_w = np.random.random((5, 10)).astype(np.float32)
    np_x = np.random.random((7, 5)).astype(np.float32)
    cpu = ndarray.cpu(0)
    gpu = ndarray.gpu(0)
    X = ad.Variable(name="x")
    shifted = X + 3
    W = ad.Variable(name='w', value=np_w, ctx=cpu)
    y = ad.matmul_op(shifted, W)
    train_op = optimizer.SGDOptimizer(learning_rate=0.1).minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpu)
    pred_y, _ = executor.run(feed_dict={X: np_x}, convert_to_numpy_ret_vals=True)
    # Forward check.
    expected = np.matmul((np_x + 3), np_w)
    np.testing.assert_allclose(pred_y, expected, rtol=1e-6)
    # Weight check: grad of sum(y) wrt W is (X+3)^T @ ones.
    grad_w = np.matmul((np_x+3).T, np.ones(expected.shape).astype(np.float32))
    np.testing.assert_allclose(W.tensor_value.asnumpy(), np_w - 0.1 * grad_w, rtol=1e-10)
def test_add_lazy(shape1=(1, 4, 1), shape2=(2, 3, 4, 5), ctx=ndarray.gpu(1)):
    """Broadcast-then-add, validated against TensorFlow's implicit broadcasting."""
    lhs = np.random.random(shape1).astype(np.float32)
    rhs = np.random.random(shape2).astype(np.float32)
    ht_lhs = ad.Variable(name='x', value=lhs)
    ht_rhs = ad.Variable(name='z', value=rhs)
    ht_sum = ad.add_op(ad.broadcast_shape_op(ht_lhs, shape2), ht_rhs)
    ht_vals = [v.asnumpy() for v in ad.Executor([ht_sum], ctx=ctx).run()]
    import tensorflow as tf
    tf_sum = tf.convert_to_tensor(lhs) + tf.convert_to_tensor(rhs)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_sum])
    np.testing.assert_allclose(ht_vals[0], tf_vals[0])
    print('Passed add op test with shape ', shape1, shape2)
def test_broadcast(shape1=(3, 1), shape2=(2, 3, 4)):
    """broadcast_shape_op forward and gradient, compared against TensorFlow."""
    ctx = ndarray.gpu(1)
    src = np.random.random(shape1).astype(np.float32)
    ht_src = ad.Variable(name='x', value=src)
    ht_out = ad.broadcast_shape_op(ht_src, shape2)
    ht_grad = ad.gradients(ht_out, [ht_src])[0]
    ht_vals = [v.asnumpy() for v in ad.Executor([ht_out, ht_grad], ctx=ctx).run()]
    import tensorflow as tf
    tf_src = tf.convert_to_tensor(src)
    tf_out = tf.broadcast_to(tf_src, shape2)
    tf_grad = tf.gradients(tf_out, tf_src)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_out, tf_grad])
    np.testing.assert_allclose(ht_vals[0], tf_vals[0])
    # TF's gradient may carry extra singleton dims; compare after reshaping.
    np.testing.assert_allclose(ht_vals[1], np.reshape(tf_vals[1], ht_vals[1].shape))
    print('Passed broadcast shape op test with shape ', shape1, shape2)
def test_transpose(shape=(2, 3, 4, 5), perm=None):
    """transpose_op forward and gradient, compared against TensorFlow."""
    ctx = ndarray.gpu(1)
    src = np.random.random(shape).astype(np.float32)
    ht_src = ad.Variable(name='x', value=src)
    ht_out = ad.transpose_op(ht_src, perm)
    ht_grad = ad.gradients(ht_out, [ht_src])[0]
    # NOTE: lazy evaluation is disabled for this op test, matching the
    # original setup.
    executor = ad.Executor([ht_out, ht_grad], ctx=ctx, enable_lazy=False)
    ht_vals = [v.asnumpy() for v in executor.run()]
    import tensorflow as tf
    tf_src = tf.convert_to_tensor(src)
    tf_out = tf.transpose(tf_src, perm)
    tf_grad = tf.gradients(tf_out, tf_src)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_out, tf_grad])
    np.testing.assert_allclose(ht_vals[0], tf_vals[0])
    np.testing.assert_allclose(ht_vals[1], np.reshape(tf_vals[1], ht_vals[1].shape))
    print('Passed transpose shape op test with shape ', shape, ' and perm ', perm)
def test_slice(shape1=(7, 11, 13), shape2=(2, 3, 4), begin_pos=(0, 0, 0)):
    """slice_op forward and gradient, compared against TensorFlow."""
    ctx = ndarray.gpu(1)
    src = np.random.random(shape1).astype(np.float32)
    ht_src = ad.Variable(name='x', value=src)
    ht_out = ad.slice_op(ht_src, begin_pos, shape2)
    ht_grad = ad.gradients(ht_out, [ht_src])[0]
    # NOTE: lazy evaluation is disabled for this op test, matching the
    # original setup.
    executor = ad.Executor([ht_out, ht_grad], ctx=ctx, enable_lazy=False)
    ht_vals = [v.asnumpy() for v in executor.run()]
    import tensorflow as tf
    tf_src = tf.convert_to_tensor(src)
    tf_out = tf.slice(tf_src, begin_pos, shape2)
    tf_grad = tf.gradients(tf_out, tf_src)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_out, tf_grad])
    np.testing.assert_allclose(ht_vals[0], tf_vals[0])
    np.testing.assert_allclose(ht_vals[1], np.reshape(tf_vals[1], ht_vals[1].shape))
    print('Passed slice op test with shape ', shape1, shape2, ' and begin pos ', begin_pos)
def test_reduce_sum(shape=(2, 3, 4), axes=[2]):
    """reduce_sum_op (keepdims=False) forward and gradient vs TensorFlow."""
    ctx = ndarray.gpu(1)
    src = np.random.random(shape).astype(np.float32)
    ht_src = ad.Variable(name='x', value=src)
    ht_out = ad.reduce_sum_op(ht_src, axes, keepdims=False)
    ht_grad = ad.gradients(ht_out, [ht_src])[0]
    ht_vals = [v.asnumpy() for v in ad.Executor([ht_out, ht_grad], ctx=ctx).run()]
    import tensorflow as tf
    tf_src = tf.convert_to_tensor(src)
    tf_out = tf.reduce_sum(tf_src, axes)
    tf_grad = tf.gradients(tf_out, tf_src)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_out, tf_grad])
    # Reshape TF outputs before comparing; dims may differ by squeezed axes.
    np.testing.assert_allclose(ht_vals[0], np.reshape(tf_vals[0], ht_vals[0].shape), rtol=1e-6)
    np.testing.assert_allclose(ht_vals[1], np.reshape(tf_vals[1], ht_vals[1].shape), rtol=1e-6)
    print('Passed reduce sum op test with shape and axes ', shape, axes)
def test_p2p(comm=None, src=0, target=1):
    """Time one NCCL point-to-point transfer from rank ``src`` to ``target``.

    The rank equal to ``src`` sends a (1000, 30, 224, 224) float32 tensor to
    ``target``; every other rank posts a receive from ``src`` (this mirrors
    the original behavior, where all non-zero ranks received from rank 0).

    Args:
        comm: initialized NCCL/MPI communicator wrapper (required).
        src: global rank that sends.
        target: global rank the sender addresses.

    Returns:
        (payload size in bytes, elapsed seconds).
    """
    shape = (1000, 30, 224, 224)
    # 4 bytes per float32 element.
    size = 4
    for val in shape:
        size *= val
    print("MyRank: ", comm.myRank.value)
    arr = np.ones(shape) * comm.localRank.value
    arr = ndarray.array(arr, ctx=ndarray.gpu(comm.localRank.value))
    start = time.time()
    # BUG FIX: the original ignored the src/target parameters and hard-coded
    # ranks 0 and 1; defaults (src=0, target=1) preserve the old behavior.
    if comm.myRank.value == src:
        comm.dlarraySend(arr, ncclDataType_t.ncclFloat32, target)
    else:
        comm.dlarrayRecv(arr, ncclDataType_t.ncclFloat32, src)
    comm.stream.sync()
    end = time.time()
    secs = end - start
    # size: /Bytes ; secs: /s
    return size, secs
def test_batch_matmul(shape1=(7, 4, 6), shape2=(7, 6, 5), transA=False, transB=False):
    """batch_matmul_op (optionally transposed) and its gradients vs TensorFlow."""
    ctx = ndarray.gpu(1)
    # Swap the trailing two dims of an operand that will be fed transposed.
    if transA:
        shape1 = tuple(list(shape1)[:-2] + [shape1[-1], shape1[-2]])
    if transB:
        shape2 = tuple(list(shape2)[:-2] + [shape2[-1], shape2[-2]])
    lhs = np.random.normal(0.0, 0.2, shape1).astype(np.float32)
    rhs = np.random.normal(0.0, 0.1, shape2).astype(np.float32)
    ht_lhs = ad.Variable(name='data')
    ht_rhs = ad.Variable(name='weights')
    ht_out = ad.batch_matmul_op(ht_lhs, ht_rhs, trans_A=transA, trans_B=transB)
    ht_grads = ad.gradients(ht_out, [ht_lhs, ht_rhs])
    executor = ad.Executor(
        [ht_out] + ht_grads,
        ctx=ctx
    )
    ht_vals = [r.asnumpy() for r in executor.run(feed_dict={ht_lhs: lhs, ht_rhs: rhs})]
    import tensorflow as tf
    tf_lhs = tf.placeholder(name='data', dtype=tf.float32)
    tf_rhs = tf.placeholder(name='weights', dtype=tf.float32)
    tf_out = tf.matmul(tf_lhs, tf_rhs, transpose_a=transA, transpose_b=transB)
    tf_grads = tf.gradients(tf_out, [tf_lhs, tf_rhs])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_vals = sess.run([tf_out] + tf_grads, feed_dict={tf_lhs: lhs, tf_rhs: rhs})
    # Output, grad wrt lhs, grad wrt rhs.
    for ht_v, tf_v in zip(ht_vals, tf_vals):
        np.testing.assert_allclose(ht_v, tf_v, atol=1e-6)
    print('Pass batch matmul op test with shape ', shape1, shape2)
def test_sparse_array_dense_vector_multiply():
    """CuSparse csrmv vs scipy, in both plain and transposed modes."""
    sparsity = 1e-3
    dev = ndarray.gpu(0)
    for trans in (False, True):
        # The transposed case stores the matrix as (70000, 500) and multiplies
        # its transpose, so the product shape stays (500, 1).
        rows, cols = (70000, 500) if trans else (500, 70000)
        sp_mat = scipy.sparse.rand(rows, cols, density=sparsity, format='coo', dtype=np.float32)
        vec = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
        d_mat = ndarray.sparse_array(sp_mat.data, (sp_mat.row, sp_mat.col), shape=[rows, cols], ctx=dev)
        d_vec = ndarray.array(vec, ctx=dev)
        d_out = ndarray.empty((500, 1), ctx=dev)
        gpu_op.CuSparse_Csrmv(d_mat, trans, d_vec, d_out)
        expected = sp_mat.transpose().dot(vec) if trans else sp_mat.dot(vec)
        np.testing.assert_allclose(expected, d_out.asnumpy(), rtol=1e-5)
def test_sparse():
    """Embedding lookup + matmul: checks forward values and the sparse SGD update."""
    emb_init = np.random.random((100, 20)).astype(np.float32)
    lookup_ids = np.array(np.random.randint(100, size=(10,)))
    w_init = np.random.random((20, 30)).astype(np.float32)
    cpu = ndarray.cpu(0)
    gpu = ndarray.gpu(0)
    embedding = ad.Variable('embeddingtable', value=emb_init, ctx=cpu)
    index = ad.Variable(name="index", ctx=cpu)
    W = ad.Variable(name="w", value=w_init)
    looked_up = ad.embedding_lookup_op(embedding, index)  # (10, 20)
    y = ad.matmul_op(looked_up, W)
    train_op = optimizer.SGDOptimizer(0.1).minimize(y)
    executor = ad.Executor([y, train_op],ctx=gpu)
    out, _ = executor.run(feed_dict={index: lookup_ids.astype(np.float32)},
                          convert_to_numpy_ret_vals=True)
    expected = np.matmul(emb_init[lookup_ids], w_init)
    np.testing.assert_allclose(out, expected, rtol=1e-6)
    # Replay the sparse SGD step: grad of each looked-up row is ones @ W^T.
    row_grad = np.matmul(np.ones(expected.shape).astype(np.float32), w_init.T)
    for pos, row in enumerate(lookup_ids):
        emb_init[row] -= 0.1 * row_grad[pos]
    np.testing.assert_allclose(embedding.tensor_value.asnumpy(), emb_init, rtol=1e-6)
from hetu import ndarray
import numpy as np
# import time

# Top-level driver setup for transformer training: parse hyperparameters,
# build the data loader, construct the masked-mean loss graph, and create
# the executor.  (The training session follows after this section.)
logging.basicConfig(level=logging.INFO)
logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
print(hp)
logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)
ctx = ndarray.gpu(1)
# Graph placeholders: source tokens, two target streams, and a padding mask.
xs = ad.Variable(name='xs')
ys1 = ad.Variable(name='ys1')
ys2 = ad.Variable(name='ys2')
nonpadding = ad.Variable(name='nonpadding')
logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
# Mean loss over non-padding positions only; +1e-7 guards a zero denominator.
loss = ad.div_op(ad.reduce_sum_op(loss * nonpadding, axes=[0, 1]),
                 ad.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7)
opt = optimizer.SGDOptimizer(hp.lr)
train_op = opt.minimize(loss)
executor = ad.Executor([loss, train_op], ctx=ctx)
logging.info("# Session")
def worker(model, rank, args):
    """Hybrid (PS + NCCL) CTR-model training worker with per-epoch logging.

    Builds train/validate executors from the given model constructor, then
    runs 400 epochs, each covering roughly 1/10 of the data (the remainder is
    folded into every 10th epoch).  Metrics are appended to a per-rank log
    file under ./logs/.
    """
    def train(iterations):
        # One training pass of `iterations` batches; returns mean loss/acc/auc.
        train_loss, train_acc, train_auc = [], [], []
        for it in tqdm(range(iterations)):
            loss_val, predict_y, y_val, _ = executor.run(
                convert_to_numpy_ret_vals=True)
            if y_val.shape[1] == 1:  # for criteo case
                acc_val = np.equal(
                    y_val,
                    predict_y > 0.5).astype(np.float)
            else:
                acc_val = np.equal(
                    np.argmax(y_val, 1),
                    np.argmax(predict_y, 1)).astype(np.float)
            train_loss.append(loss_val[0])
            train_acc.append(acc_val)
            train_auc.append(metrics.roc_auc_score(y_val, predict_y))
        return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)

    def validate(iterations):
        # Evaluation pass on the validate dataloader; same metrics as train().
        test_loss, test_acc, test_auc = [], [], []
        for it in range(iterations):
            loss_val, test_y_predicted, y_test_val = val_executor.run(
                convert_to_numpy_ret_vals=True)
            if y_test_val.shape[1] == 1:  # for criteo case
                correct_prediction = np.equal(
                    y_test_val,
                    test_y_predicted > 0.5).astype(np.float)
            else:
                correct_prediction = np.equal(
                    np.argmax(y_test_val, 1),
                    np.argmax(test_y_predicted, 1)).astype(np.float)
            test_loss.append(loss_val[0])
            test_acc.append(correct_prediction)
            test_auc.append(metrics.roc_auc_score(y_test_val, test_y_predicted))
        return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

    from models.load_data import process_all_criteo_data
    dense, sparse, labels = process_all_criteo_data(return_val=args.val)
    loss, prediction, y_, train_op = model(dense, sparse, labels)
    executor = ad.Executor(
        [loss, prediction, y_, train_op], ctx=ndarray.gpu(rank),
        dataloader_name='train', stream_mode='AllStreams', comm_mode='Hybrid',
        use_sparse_pull=True, cstable_policy=args.cache, bsp=args.bsp,
        seed=123, cache_bound=args.bound)
    if args.val:
        val_executor = ad.Executor(
            [loss, prediction, y_], ctx=ndarray.gpu(rank),
            dataloader_name='validate', stream_mode='AllStreams',
            comm_mode='Hybrid', use_sparse_pull=True, inference=True,
            bsp=args.bsp)
    executor.recordLoads()
    # Log-file name encodes model, sync mode, cache policy and rank.
    raw_log_file = './logs/localhybrid_%s' % (args.model)
    if args.bsp:
        raw_log_file += '_bsp'
    else:
        raw_log_file += '_asp'
    if args.cache:
        raw_log_file += '_%s' % (args.cache)
    raw_log_file += '_%d.log' % (rank)
    print('Processing all data, log to', raw_log_file)
    log_file = open(raw_log_file, 'w')
    total_epoch = 400
    for ep in range(total_epoch):
        # print("iters: %d" % (lp * 1000))
        print("epoch %d" % ep)
        st_time = time.time()
        # Each epoch runs batch_num // 10 batches; every 10th epoch also
        # absorbs the leftover batch_num % 10 batches.
        train_loss, train_acc, train_auc = train(
            executor.batch_num // 10 + (ep % 10 == 9) * (executor.batch_num % 10))
        en_time = time.time()
        train_time = en_time - st_time
        executor.recordLoads()
        if args.val:
            # Fence workers around validation so all ranks evaluate the
            # same parameter snapshot.
            executor.ps_comm.BarrierWorker()
            val_loss, val_acc, val_auc = validate(val_executor.batch_num)
            executor.ps_comm.BarrierWorker()
            printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
                % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, train_time)
            executor.recordLoads()
        else:
            printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                % (train_loss, train_acc, train_auc, train_time)
        print(printstr)
        log_file.write(printstr + '\n')
        log_file.flush()
def train_main(args):
    """Distributed GNN training with a learned node-feature embedding table.

    Node ids (all feature columns except the last) are looked up in a large
    embedding table, reshaped into a flat feature vector, and fed through a
    two-layer GCN.  Graph batches are pre-prepared by ``prepare_data`` and
    double-buffered: the next batch is pushed into the data loaders one step
    ahead of its use.  Statistics are accumulated via the module-level
    ``stat`` object.
    """
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    # All feature columns except the last are embedded ids; the flattened
    # lookup result has one embedding per id column.
    extract_width = embedding_width * (meta["feature"] - 1)
    # Labels are produced lazily by the GNN data loader from each graph batch.
    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal(
        [meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Only mask-selected (training) nodes contribute to the optimized loss.
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # Two steps prime the double-buffered data-loader pipeline before the
    # first executor.run().
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        # Stage the next batch while the current one trains.
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        # Eval-mask accuracy vs train-mask accuracy, both on the same batch.
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        # One epoch = having covered this worker's node partition.
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        # Promote the staged batch to current for the next iteration.
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
def transform(graph):
    """Pair a sampled graph with its message-passing matrix built on GPU 0."""
    return graph, mp_matrix(graph, ndarray.gpu(0))
group_col = col_procs[rank_col] comm_row = row_groups[rank_row] comm_col = col_groups[rank_col] a = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx) comm_row.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root = group_row[1]) print("Broadcast device=%d, a:"%device_id,a.asnumpy()) b = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx) comm_col.dlarrayBroadcast(b, b, ncclDataType_t.ncclFloat32, root = group_col[1]) print("Broadcast device=%d, b:"%device_id,b.asnumpy()) comm, device_id = ad.mpi_nccl_init() device = comm.device_id.value rank = comm.localRank.value size = comm.nRanks.value ctx = ndarray.gpu(rank) a = ndarray.array(np.array([1,2,3,4,5]),ctx=ctx) test_default() test_broadcast(group = [0,2,4,5,6], root=4) test_broadcast(group = [1,4,2,7],root=4) test_allreduce(group = [1,4,2,5]) test_allreduce(group = [0,7,6,2,4]) test_allgather(group = [2,5,3,7]) test_allgather(group = [2,6,1,7,4]) test_group_broadcast()
W_val = W_val.asnumpy() loss_val = [val.asnumpy() for val in loss_val] y_groundtruth = X_val.dot(W_val) loss_groundtruth = np.mean( -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True) Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * np.ones(loss_groundtruth.shape) / 500 W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth) np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4) np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4) np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4) test_csrmm_op(ndarray.cpu(0)) test_csrmm_op(ndarray.gpu(1)) def test_csrmv_op(executor_ctx): X = ad.Variable(name="X") W = ad.Variable(name="W") Y = ad.csrmv_op(X, W) Y_ = ad.Variable(name="Y_") temp = Y + (-1) * Y_ loss = temp * temp grads = ad.gradients(loss, [W, Y]) executor = ad.Executor( [loss, grads[0], grads[1]], ctx=executor_ctx)
def worker(args):
    """Neural-MF training worker on MovieLens-25M, optionally distributed.

    Supports local, PS and Hybrid communication modes; shards the data per
    worker, trains for 7 epochs, and (with --val) evaluates HR/NDCG over
    100-item candidate lists per test user.
    """
    def validate():
        # Leave-one-out ranking evaluation: each test row contributes 100
        # candidate items; rank them by predicted score and check whether the
        # ground-truth item (the first of the 100) made the top-K list.
        hits, ndcgs = [], []
        for idx in range(testData.shape[0]):
            start_index = idx * 100
            predictions = val_executor.run(convert_to_numpy_ret_vals=True)
            map_item_score = {testItemInput[start_index + i]: predictions[0][i] for i in range(100)}
            gtItem = testItemInput[start_index]
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
            hr = getHitRatio(ranklist, gtItem)
            ndcg = getNDCG(ranklist, gtItem)
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    def get_current_shard(data):
        # Contiguous per-rank shard; the last rank absorbs the remainder.
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    device_id = 0
    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8  # up to 8 GPUs per machine
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    from movielens import getdata
    if args.all:
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'])
        trainItems = get_current_shard(trainData['item_input'])
        trainLabels = get_current_shard(trainData['labels'])
        testData = get_current_shard(testData)
        # 100 candidate items per test user.
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))
    else:
        # Truncated subset for quick runs.
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'][:1024000])
        trainItems = get_current_shard(trainData['item_input'][:1024000])
        trainLabels = get_current_shard(trainData['labels'][:1024000])
        testData = get_current_shard(testData[:1470])
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    # assert not args.all or num_users == testData.shape[0]
    batch_size = 1024
    num_negatives = 4
    topK = 10
    user_input = dl.dataloader_op([
        dl.Dataloader(trainUsers, batch_size, 'train'),
        dl.Dataloader(testUserInput, 100, 'validate'),
    ])
    item_input = dl.dataloader_op([
        dl.Dataloader(trainItems, batch_size, 'train'),
        dl.Dataloader(testItemInput, 100, 'validate'),
    ])
    y_ = dl.dataloader_op([
        dl.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'),
    ])
    loss, y, train_op = neural_mf(user_input, item_input, y_, num_users, num_items)
    executor = ad.Executor([loss, train_op], ctx=ndarray.gpu(device_id),
                           dataloader_name='train',
                           comm_mode=args.comm, cstable_policy=args.cache,
                           bsp=args.bsp, cache_bound=args.bound, seed=123)
    val_executor = ad.Executor([y], ctx=ndarray.gpu(device_id), inference=True,
                               dataloader_name='validate', comm_mode=args.comm,
                               bsp=args.bsp)
    path = 'logs/hetulog_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm])
    path += '_%d.txt' % rank if args.comm else '.txt'
    log = Logging(path=path)
    epoch = 7
    start = time.time()
    for ep in range(epoch):
        ep_st = time.time()
        log.write('epoch %d' % ep)
        train_loss = []
        for idx in tqdm(range(executor.batch_num)):
            loss_val = executor.run(convert_to_numpy_ret_vals=True)
            train_loss.append(loss_val[0])
            # if idx % 10000 == 0:
            #     hr, ndcg = validate()
            #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
            #     log.write(printstr)
        tra_loss = np.mean(train_loss)
        ep_en = time.time()
        # validate phase
        if args.val:
            hr, ndcg = validate()
            printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (tra_loss, hr, ndcg, ep_en - ep_st)
        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)
    log.write('all time: %f' % (time.time() - start))
def train_hetu(num_epoch):
    """Train a 2-layer GraphSage classifier with neighbor sampling on one GPU.

    Builds the model graph (two GraphSage layers + a linear classifier),
    then repeatedly draws sampled subgraphs from ``GraphSageSampler`` and runs
    one optimizer step per sample. An "epoch" is counted once the number of
    sampled nodes exceeds the full graph's node count; after each epoch the
    model is evaluated on the full graph.

    Relies on module-level globals: ``graph``, ``graph_full``,
    ``hidden_layer_size``, ``train_split``.

    Args:
        num_epoch: number of epochs to train before returning.
    """
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")

    # Two GraphSage layers; each layer's output is 2*hidden (concatenation of
    # self and neighbor features — TODO confirm against GraphSage definition).
    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)

    # Linear classification head.
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)

    # Mask the per-node loss so only sampled "target" nodes contribute.
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    # Renamed from `eval` (shadowed the Python builtin).
    def _evaluate():
        """Run full-graph inference and print test accuracy (nodes >= train_split)."""
        # Switch dropout to inference mode for the full-graph pass.
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)
        feed_dict = {
            gcn1.mp: mp_val,
            gcn2.mp: mp_val,
            x_: ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        # Accuracy on the held-out tail [train_split:] of the node ordering.
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"

    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Count correct predictions among masked (target) nodes only.
            acc = ((y_predicted == g_sample.y) * mask).sum()
            nnodes += batch_size
            # An "epoch" ends when we've sampled at least as many nodes as the
            # full graph contains.
            if nnodes > graph_full.num_nodes:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                _evaluate()
                start = time.time()
                if epoch >= num_epoch:
                    break
def test(args):
    """Benchmark/validate a 2-layer distributed GCN (1.5-D partitioning) with NCCL.

    Loads a partitioned adjacency matrix and features, builds a two-layer GCN
    using ``ad.distgcn_15d_op``, trains for a fixed number of epochs, and
    all-reduces accuracy/loss and timing statistics across ranks.

    Args (attributes read from ``args``):
        dataset:     one of 'Reddit', 'Proteins', 'Arch', 'Products'.
        replication: replication factor of the 1.5-D partitioning.
    """
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    # dataset -> [node_count, num_features, num_classes]
    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }
    node_count, num_features, num_classes = dataset_info[args.dataset]

    # Smaller hidden layer for low-dimensional inputs (Products: 100 features).
    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication

    # Rows of the adjacency owned by this rank under 1.5-D partitioning.
    node_Count_Self = row_num(node_count, rank // replication,
                              size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)

    executor_ctx = ndarray.gpu(device_id)

    # Single-process runs load the whole graph; multi-process runs load a part.
    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape, ctx=executor_ctx)

    # train:val:test=6:2:2
    # Our optimization on distributed GNN algorithm does NOT affect the correctness!
    # Here due to the limitation of current slice_op, data is split continuously.
    # Continuous split is unfriendly for reordered graph data where nodes are already clustered.
    # Specifically, training on some node clusters and testing on other clusters may cause poor test accuracy.
    # The better way is to split data randomly!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")

    # Xavier-uniform init for both weight matrices; fixed seed so every rank
    # starts from identical weights.
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(
        low=-bounds, high=bounds,
        size=[num_features, hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)

    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(
        low=-bounds, high=bounds,
        size=[hidden_layer_size, num_classes]).astype(np.float32)
    W2 = ad.Variable(name="W2", value=W2_val)

    y_ = ad.Variable(name="y_")

    # Two distributed GCN layers: z = A*H*W1 (then ReLU), y = A*H1*W2.
    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)

    # Continuous split of the local rows: [0, train_node) trains,
    # [test_node, end) tests (see comment above about slice_op limitation).
    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))
    y_test = ad.slice_op(y, (test_node, 0),
                         (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0),
                             (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)

    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    # The graph inputs are constant across epochs, so one feed_dict suffices.
    feed_dict = {
        A: adj_matrix,
        H: ndarray.array(input_part, ctx=executor_ctx),
        y_: ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                          ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        # NOTE: rebinds `loss`/`loss_test` from ops to scalars; safe because
        # the executor was already built from the original ops.
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            # First epoch includes warm-up cost; excluded from the average below.
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f" % (
            i, rank, epoch_time, epoch_all))

        # Local (per-rank) correct-prediction counts over the train/test slices.
        y_out_train, y_predict = y_out.asnumpy().argmax(
            axis=1)[:train_node], y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[
            test_node:]
        train_acc = ndarray.array(np.array([(y_out_train == label_train).sum()
                                            ]), ctx=executor_ctx)
        test_acc = ndarray.array(np.array([(y_predict == label_test).sum()]),
                                 ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        # Sum the per-rank counts/losses: across the column group when rows
        # are replicated, otherwise across all ranks.
        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)

        # Normalize the global sums by the global train/test partition sizes.
        test_acc = float(
            test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count -
                                              test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)
        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f"\
                %(i,train_loss, train_acc, test_loss, test_acc))

    # Average timings across ranks (first epoch excluded from the per-epoch mean).
    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results, results, ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size
    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" % (
            results[0], results[1]))