def test_allgather(group):
    comm1 = ad.new_group_comm(group)
    a = ndarray.array(np.array([rank, rank]), ctx=ctx)
    b = ndarray.array(np.zeros(2 * len(group)), ctx=ctx)
    if rank in group:
        comm1.dlarrayAllGather(a, b, ncclDataType_t.ncclFloat32)
        print("Allgather device=%d" % comm1.device_id.value, b.asnumpy())
def test_layernorm_forward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    last_dim = shape[-1]
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    bias = np.random.random((last_dim,)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_bias = ndarray.array(bias, ctx=ctx)
    arr_mean = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_var = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.layer_normalization(arr_x, arr_scale, arr_bias,
                               arr_mean, arr_var, arr_y, 0.01)
    y = arr_y.asnumpy()

    # numpy reference computation
    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 0.01, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std

    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]
    y_ = scale.reshape(bc_shape) * normed_input + bias.reshape(bc_shape)

    np.testing.assert_allclose(np_means, arr_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(np_vars, arr_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(y_, y, atol=1e-6)
    print('Pass forward test with shape ', shape)
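# Reference semantics checked above (0.01 is the epsilon argument passed to
# gpu_op.layer_normalization, not a tolerance); mean and var are taken over
# the last dimension only:
#   y = scale * (x - mean(x, axis=-1)) / sqrt(var(x, axis=-1) + eps) + bias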
def test_group_broadcast():
    row_procs = []
    for i in range(0, 8, 2):
        row_procs.append(list(range(i, i + 2)))
    col_procs = []
    for i in range(2):
        col_procs.append(list(range(i, 8, 2)))
    row_groups = []
    for i in range(len(row_procs)):
        row_groups.append(ad.new_group_comm(row_procs[i]))
    col_groups = []
    for i in range(len(col_procs)):
        col_groups.append(ad.new_group_comm(col_procs[i]))
    rank_row = rank // 2
    rank_col = rank % 2
    group_row = row_procs[rank_row]
    group_col = col_procs[rank_col]
    comm_row = row_groups[rank_row]
    comm_col = col_groups[rank_col]
    a = ndarray.array(np.array([rank, rank, rank, rank, rank]), ctx=ctx)
    comm_row.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root=group_row[1])
    print("Broadcast device=%d, a:" % device_id, a.asnumpy())
    b = ndarray.array(np.array([rank, rank, rank, rank, rank]), ctx=ctx)
    comm_col.dlarrayBroadcast(b, b, ncclDataType_t.ncclFloat32, root=group_col[1])
    print("Broadcast device=%d, b:" % device_id, b.asnumpy())
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    nitem = 2000
    item_len = 1000
    arr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)  # generate a long buffer
    push_indices = np.arange(nitem) * nrank + rank
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    worker_communicate.PushData(pointer(push_indices), nitem,
                                arr.handle, pointer(push_length))
    print("Waiting")
    worker_communicate.WaitPushData(pointer(push_indices), nitem)
    worker_communicate.BarrierWorker()
    print("OK")
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
    worker_communicate.PullData(pointer(push_indices), nitem,
                                arr2.handle, pointer(push_length))
    worker_communicate.WaitPullData(pointer(push_indices), nitem)
    assert np.all(arr.asnumpy() == arr2.asnumpy())
    print("Check Complete")
def test_broadcast(group, root):
    comm1 = ad.new_group_comm(group)
    a = ndarray.array(np.array([-1, -1, -1, -1, -1]), ctx=ctx)
    if rank == root:
        a = ndarray.array(np.array([2, 3, 4, 5, 6]), ctx=ctx)
    if rank in group:
        comm1.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root=root)
        print("Broadcast device=%d" % comm1.device_id.value, a.asnumpy())
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    if use_same_init:
        gcn1 = GCN(num_features, hidden_layer_size, custom_init=(init_w1, init_b1))
        gcn2 = GCN(hidden_layer_size, num_classes, custom_init=(init_w2, init_b2))
    else:
        gcn1 = GCN(num_features, hidden_layer_size)
        gcn2 = GCN(hidden_layer_size, num_classes)
    mp_val = mp_matrix(graph, ctx, use_original_gcn_norm=True)
    feed_dict = {
        gcn1.mp: mp_val,
        gcn2.mp: mp_val,
        x_: ndarray.array(graph.x, ctx=ctx),
        y_: ndarray.array(convert_to_one_hot(graph.y, max_val=num_classes), ctx=ctx)
    }
    x = gcn1(x_)
    x = ad.relu_op(x)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    start_time = time.time()
    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph.y).sum()
        losses.append(loss_val.asnumpy().mean())
        if i == 0:
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Train accuracy:", acc / len(y_predicted))
        print("Hetu time:", i, time.time() - start_time)
    print("Hetu time:", time.time() - start_time)

    mp_val = mp_matrix(graph_full, ctx)
    feed_dict = {
        gcn1.mp: mp_val,
        gcn2.mp: mp_val,
        x_: ndarray.array(graph_full.x, ctx=ctx),
    }
    executor_eval = ad.Executor([y], ctx=ctx)
    y_predicted, = executor_eval.run(feed_dict=feed_dict)
    y_predicted = y_predicted.asnumpy().argmax(axis=1)
    acc = (y_predicted == graph_full.y)[train_split:].sum()
    print("Test accuracy:", acc / len(y_predicted[train_split:]))
    return losses
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2 * hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2 * hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)

    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2,
                                     rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc / mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')

    def transform(graph):
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val

    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank,
                                    transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += g_sample.num_nodes
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc / len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
def test_layernorm_backward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    last_dim = shape[-1]
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    mean = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    var = np.random.random(list(shape[:-1]) + [1]).astype(np.float32)
    arr_grads = ndarray.array(grads, ctx=ctx)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_mean = ndarray.array(mean, ctx=ctx)
    arr_var = ndarray.array(var, ctx=ctx)
    grad_inarr = ndarray.empty(shape, ctx=ctx)
    grad_scale = ndarray.empty((last_dim,), ctx=ctx)
    grad_bias = ndarray.empty((last_dim,), ctx=ctx)
    gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale,
                                        grad_inarr, grad_scale, grad_bias,
                                        arr_mean, arr_var, 0.01)

    # numpy calculate phase
    red_axis = tuple(range(grads.ndim - 1))
    np_grad_bias = grads.sum(red_axis)                              # (X,)
    std = np.sqrt(var + 0.01)                                       # (N, 1)
    x_centered = x - mean                                           # (N, X)
    x_norm = x_centered / std                                       # (N, X)
    np_grad_scale = (grads * x_norm).sum(red_axis)                  # (X,)
    last_dim = x.shape[-1]
    dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1])  # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * \
        -0.5 / (var + 0.01) / std                                   # (N, 1)
    dx_mu_1 = dx_norm / std                                         # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim                      # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2                                        # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim         # (N, 1)
    np_grad_inarr = dx_1 + dx_2                                     # (N, X)

    np.testing.assert_allclose(np_grad_bias, grad_bias.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_scale, grad_scale.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_inarr, grad_inarr.asnumpy(), rtol=1e-4, atol=1e-4)
    print('Pass backward test with shape ', shape)
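# A minimal driver for the two layernorm tests above (an assumption: the
# original launch code is not shown). Exercises both kernels on 2-D and 3-D
# inputs, since the reference math only assumes normalization over the last axis.
if __name__ == "__main__":
    for shape in [(5, 3), (4, 500), (2, 3, 5)]:
        test_layernorm_forward(shape)
        test_layernorm_backward(shape)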
def test_sparse_matrix_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 7000, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 7000], ctx=ctx)
    mat_y = ndarray.array(y, ctx=ctx)
    mat_z = ndarray.empty((500, 100), ctx=ctx)
    gpu_op.CuSparse_Csrmm(mat_x, False, mat_y, False, mat_z)
    z = mat_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)
def test_sparse_array_dense_vector_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 70000, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 70000], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = False
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

    x = scipy.sparse.rand(70000, 500, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[70000, 500], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = True
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.transpose().dot(y), z, rtol=1e-5)
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    if rank > 0:
        return
    arr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)  # generate a long buffer
    push_indices = np.arange(nitem)
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), nitem,
                                        arr.handle, pointer(push_length))
    worker_communicate.WaitData(query)
    print("data_pushed")
    t = ThreadPoolExecutor(max_workers=max_thread)
    byte_count = 0
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

    def pull_data():
        query = worker_communicate.PullData(pointer(push_indices), nitem,
                                            arr2.handle, pointer(push_length))
        worker_communicate.WaitData(query)
        # print(np.all(arr.asnumpy() == arr2.asnumpy()))
        nonlocal byte_count
        byte_count += nitem * item_len * 4

    def watch():
        nonlocal byte_count
        start = time.time()
        while True:
            time.sleep(1)
            speed = byte_count / (time.time() - start)
            print("speed : {} MB/s".format(speed / 2**20))

    task_list = [None for i in range(max_thread)]
    threading.Thread(target=watch).start()
    while True:
        for i in range(max_thread):
            if task_list[i] is None or task_list[i].done():
                task_list[i] = t.submit(pull_data)
def sync_and_clear(self):
    self.count += 1
    # stat layout (inferred from the ratios printed below): [0] batch count,
    # [1] correct predictions, [2] sample count, [3] summed loss
    train_stat = ndarray.array(self.train_stat, ndarray.cpu())
    test_stat = ndarray.array(self.test_stat, ndarray.cpu())
    comm.dlarrayNcclAllReduce(train_stat, train_stat, ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum, comm.stream)
    comm.dlarrayNcclAllReduce(test_stat, test_stat, ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum, comm.stream)
    comm.stream.sync()
    train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy()
    printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format(
        self.count,
        test_stat[3] / test_stat[0], test_stat[1] / test_stat[2],
        train_stat[3] / train_stat[0], train_stat[1] / train_stat[2],
    )
    logstr = "{} {} {} {}".format(
        test_stat[3] / test_stat[0], test_stat[1] / test_stat[2],
        train_stat[3] / train_stat[0], train_stat[1] / train_stat[2],
    )
    self.time.append(time.time())
    if comm.device_id.value == 0:
        print(printstr, flush=True)
        print(logstr, file=self.file, flush=True)
        if len(self.time) > 3:
            epoch_time = np.array(self.time[1:]) - np.array(self.time[:-1])
            print("epoch time: {:.3f}+-{:.3f}".format(
                np.mean(epoch_time), np.var(epoch_time)))
    self.train_stat[:] = 0
    self.test_stat[:] = 0
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    arr = ndarray.array(np.random.rand(2, rank + 100), ctx=ctx)
    print(arr.asnumpy())
    push_indices = np.array([2 * rank + 1, 2 * rank + 2])
    if rank == 0:
        pull_indices = np.array([3])
    elif rank == 1:
        pull_indices = np.array([1])
    push_length = np.array([rank + 100, rank + 100])
    if rank == 0:
        pull_length = np.array([101])
        out_arr = ndarray.array(np.zeros(101), ctx=ctx)
    elif rank == 1:
        pull_length = np.array([100])
        out_arr = ndarray.array(np.zeros(100), ctx=ctx)
    print(out_arr.asnumpy())
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), 2,
                                        arr.handle, pointer(push_length))
    worker_communicate.WaitData(query)
    worker_communicate.BarrierWorker()
    # capture the pull request's query handle so the wait below targets it,
    # rather than re-waiting on the completed push
    query = worker_communicate.PullData(pointer(pull_indices), 1,
                                        out_arr.handle, pointer(pull_length))
    worker_communicate.WaitData(query)
    print(out_arr.asnumpy())
def test_allreduce(comm=None):
    shape = (24, 24)
    size = 4
    for val in shape:
        size *= val
    input_arr = np.ones(shape) * comm.localRank.value
    input_arr = ndarray.array(input_arr, ctx=ndarray.gpu(comm.localRank.value))
    # input_arr = ndarray.array(input_arr, ctx=ndarray.cpu())
    start = time.time()
    # in-place allreduce: the same array serves as input and output, matching
    # the (in, out) signature used elsewhere in these tests
    comm.dlarrayNcclAllReduce(input_arr, input_arr,
                              ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
    comm.stream.sync()
    end = time.time()
    secs = end - start
    return size, secs
def test_p2p(comm=None, src=0, target=1):
    shape = (1000, 30, 224, 224)
    size = 4
    for val in shape:
        size *= val
    print("MyRank: ", comm.myRank.value)
    arr = np.ones(shape) * comm.localRank.value
    arr = ndarray.array(arr, ctx=ndarray.gpu(comm.localRank.value))
    # arr = ndarray.array(arr, ctx=ndarray.cpu())
    start = time.time()
    # use the src/target parameters instead of the hard-coded ranks 0 and 1
    if comm.myRank.value == src:
        comm.dlarraySend(arr, ncclDataType_t.ncclFloat32, target)
    else:
        comm.dlarrayRecv(arr, ncclDataType_t.ncclFloat32, src)
    comm.stream.sync()
    end = time.time()
    secs = end - start
    # size is in bytes, secs in seconds
    return size, secs
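# Both benchmarks return (size_in_bytes, seconds), so bandwidth falls out
# directly. A usage sketch (an assumed driver, not part of the original file;
# ad.mpi_nccl_init() returning (comm, device_id) is taken from the other
# scripts in this section):
if __name__ == "__main__":
    comm, device_id = ad.mpi_nccl_init()
    size, secs = test_allreduce(comm)
    print("allreduce: %.2f MB/s" % (size / secs / 2**20))
    size, secs = test_p2p(comm, src=0, target=1)
    print("p2p: %.2f MB/s" % (size / secs / 2**20))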
def test_csrmv_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmv_op(X, W)
    Y_ = ad.Variable(name="Y_")
    temp = Y + (-1) * Y_
    loss = temp * temp
    grads = ad.gradients(loss, [W, Y])
    executor = ad.Executor([loss, grads[0], grads[1]], ctx=executor_ctx)

    rand = np.random.RandomState(seed=123)
    W_val = rand.normal(scale=0.1, size=[70000, ])
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = ndarray.array(W_val, ctx=executor_ctx)
    X_val = scipy.sparse.rand(500, 70000, density=1e-5, format='coo', dtype=np.float32)
    Y_val = np.random.uniform(0, 10, size=(500, )).astype(np.float32)
    loss_val = executor.run(feed_dict={X: X_val, Y_: Y_val, W: W_val})

    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]

    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = (y_groundtruth - Y_val) ** 2
    Y_grad_groundtruth = 2 * (y_groundtruth - Y_val) * np.ones(loss_groundtruth.shape)
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
def test_csrmm_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmm_op(X, W)
    Y_ = ad.Variable(name="Y_")
    loss = ad.softmaxcrossentropy_op(Y, Y_)
    loss = ad.reduce_mean_op(loss, [0])
    grads = ad.gradients(loss, [W, Y])
    executor = ad.Executor([loss, grads[0], grads[1]], ctx=executor_ctx)

    rand = np.random.RandomState(seed=123)
    W_val = rand.normal(scale=0.1, size=[70000, 2]).astype(np.float32)
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = ndarray.array(W_val, ctx=executor_ctx)
    X_val = scipy.sparse.rand(500, 70000, density=1e-5, format='coo', dtype=np.float32)
    Y_val = np.random.uniform(0, 10, size=(500, 2)).astype(np.float32)
    loss_val = executor.run(feed_dict={X: X_val, Y_: Y_val, W: W_val})

    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]

    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
        -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True)
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * \
        np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
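# softmax_func is defined elsewhere in the test file; a sketch of the usual
# numerically stable row-wise softmax it is assumed to compute:
def softmax_func(x):
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))  # subtract row max for stability
    return e / np.sum(e, axis=-1, keepdims=True)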
comm, device_id = ad.mpi_nccl_init()
device = comm.device_id.value
rank = comm.localRank.value
size = comm.nRanks.value
ctx = ndarray.gpu(rank)

a = ndarray.array(np.array([1, 2, 3, 4, 5]), ctx=ctx)
test_default()
test_broadcast(group=[0, 2, 4, 5, 6], root=4)
test_broadcast(group=[1, 4, 2, 7], root=4)
test_allreduce(group=[1, 4, 2, 5])
test_allreduce(group=[0, 7, 6, 2, 4])
test_allgather(group=[2, 5, 3, 7])
test_allgather(group=[2, 6, 1, 7, 4])
test_group_broadcast()
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2 * hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2 * hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    def eval():
        start = time.time()
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)
        feed_dict = {
            gcn1.mp: mp_val,
            gcn2.mp: mp_val,
            x_: ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc / len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"

    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            # print(time.time() - start)
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # print(i, "Train loss :", loss_val.asnumpy().mean())
            # print(i, "Train accuracy:", acc/len(y_predicted))
            nnodes += batch_size
            if nnodes > graph_full.num_nodes:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc / mask.sum())
                eval()
                start = time.time()
                if epoch >= num_epoch:
                    break
        return self.ncclCommInitRank()


def mpi_nccl_communicator(mpi_init=True):
    '''Create and return an MPI+NCCL communicator.'''
    return MPI_NCCL_Communicator(mpi_init=mpi_init)


# NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 4 python mpi_nccl_comm.py
if __name__ == "__main__":
    t = mpi_nccl_communicator()
    t.ncclInit()
    arr = np.ones(16) * t.localRank.value
    print("before: = ", arr)
    arr = ndarray.array(arr, ctx=ndarray.gpu(t.device_id.value))
    output_arr = np.zeros(16 * t.nRanks.value)
    output_arr = ndarray.array(output_arr, ctx=ndarray.gpu(t.device_id.value))
    t.dlarrayNcclAllReduce(arr, arr, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
    # t.dlarrayBroadcast(arr, ncclDataType_t.ncclFloat32, 0)
    # t.dlarrayAllGather(arr, output_arr, ncclDataType_t.ncclFloat32)
    print("after: = ", arr.asnumpy())
    t.ncclFinish()
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal(
        [meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)

    ad.worker_init()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = \
            g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
        return self.nRanks

    def DLArrayAllReduce(self, dlarray, datatype, reduceop):
        lib_mpi.dlarrayAllReduce(dlarray.handle, c_int(datatype.value),
                                 c_int(reduceop.value), ctypes.byref(self.mpicomm))

    def allReduce(self, arr):
        self.DLArrayAllReduce(arr, MPIDataType_t.MPI_Float32, MPIOp_t.MPI_SUM)

    def finish(self):
        lib_mpi.MPIFinalize()


def mpi_communicator():
    '''Create and return an MPI communicator.'''
    return MPI_Communicator()


# mpirun --allow-run-as-root -np 4 python2 mpi_comm.py
if __name__ == "__main__":
    comm = mpi_communicator()
    comm.MPI_GetComm()
    print("rank = %d" % (comm.rank().value))
    arr = np.ones([10]) * comm.rank().value
    arr = ndarray.array(arr)
    comm.allReduce(arr)
    print(arr.asnumpy())
    comm.finish()
def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    ctypes.c_int(0), ctypes.c_double(0.0), ctypes.c_double(1.0),
                    ctypes.c_ulonglong(123), ctypes.c_int(0),
                    (ctypes.c_float * 1)(lr), ctypes.c_int(1))
    if sparse:
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SDPushPull(0, push_ind.handle, push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(
                np.ones((indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle,
                            pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        np.testing.assert_allclose(
            local_arr[local_pull.astype(int)].reshape(indx1, indx2, item_len),
            pull_val.asnumpy(), rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    else:
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        if rank == 0:
            temp_push_val = ndarray.array(
                np.ones((nitem, item_len)).astype(np.float32), ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()
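# How the shared buffers might be built (an assumption; the original driver is
# not shown). test_api maps rarr/rpush/rpull with np.frombuffer, so they are
# presumably raw shared-memory buffers visible to every worker process; nitem,
# item_len, indx1 and indx2 are assumed module-level globals of the test file.
from multiprocessing import sharedctypes

def make_shared(np_arr):
    # copy a float32 numpy array into a process-shared raw buffer
    buf = sharedctypes.RawArray('f', np_arr.size)
    np.frombuffer(buf, dtype=np.float32)[:] = np_arr.flatten()
    return buf

rarr = make_shared(np.random.rand(nitem, item_len).astype(np.float32))
rpush = make_shared(np.random.randint(0, nitem, size=indx1 * indx2).astype(np.float32))
rpull = make_shared(np.random.randint(0, nitem, size=indx1 * indx2).astype(np.float32))
# each worker process would then call: test_api(rarr, rpush, rpull, sparse=True)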
def test(func_name, nitem=2000, item_len=10000, ind_len=500,
         max_thread=10, ret_ans=False):
    func_name = func_name.lower()
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    comm = ad.get_worker_communicate()
    byte_count = 0

    if func_name == 'pushnpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'pushpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.DDPushPull(name, inarr.handle, outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'sparsepushnpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_ind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_ind.astype(np.float32), ctx=ctx)
            uni_ind_len = np.unique(np_ind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (nitem + uni_ind_len) * item_len * 4
    elif func_name == 'sparsepushnsparsepull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    elif func_name == 'push':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'pull':
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'sparsepush':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_inind_len * item_len * 4
    elif func_name == 'sparsepull':
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_outind_len * item_len * 4
    elif func_name == 'sdpushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SDPushPull(name, inind.handle, inarr.handle, outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + nitem) * item_len * 4
    elif func_name == 'sspushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SSPushPull(name, inind.handle, inarr.handle,
                            outind.handle, outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    else:
        assert False

    if 'sparse' in func_name or func_name in ('sdpushpull', 'sspushpull'):
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
        sparse_init = ctypes.c_int(1)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
        sparse_init = ctypes.c_int(0)
    for i in range(max_thread):
        comm.InitTensor(i, sparse_init, arr_len, arr_wid,
                        ctypes.c_int(0), ctypes.c_double(0), ctypes.c_double(1),
                        ctypes.c_ulonglong(123), ctypes.c_int(0),
                        (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    # print("data init")
    t = ThreadPoolExecutor(max_workers=max_thread)
    if ret_ans:
        task_list = [None for i in range(max_thread)]
        for i in range(max_thread):
            task_list[i] = t.submit(func, i)
        curByte = byte_count
        start = time.time()
        cnt = 0
        while cnt < 30:
            for i in range(max_thread):
                if task_list[i].done():
                    cnt += 1
                    task_list[i] = t.submit(func, i)
        speed = (byte_count - curByte) / (time.time() - start) / 2**20
        t.shutdown()
        for i in range(max_thread):
            comm.ClearOnServer(i)
            comm.Clear(i)
        return speed
    else:
        def watch():
            start = time.time()
            while True:
                time.sleep(1)
                speed = byte_count / (time.time() - start)
                print("speed : {} MB/s".format(speed / 2**20))

        task_list = [None for i in range(max_thread)]
        threading.Thread(target=watch).start()
        while True:
            for i in range(max_thread):
                if task_list[i] is None or task_list[i].done():
                    task_list[i] = t.submit(func, i)
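# Usage sketch (assumed; the original entry point is not shown): with
# ret_ans=True the function returns the measured throughput in MB/s after 30
# completed requests across the worker threads.
if __name__ == "__main__":
    speed = test('sspushpull', max_thread=10, ret_ans=True)
    print("sspushpull: %.2f MB/s" % speed)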
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    assert init_type in ('constant', 'uniform', 'normal', 'truncated_normal')
    init_type_map = {'constant': 0, 'uniform': 1, 'normal': 2, 'truncated_normal': 3}
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    itype, ctypes.c_double(init_a), ctypes.c_double(init_b),
                    ctypes.c_ulonglong(123), ctypes.c_int(0),
                    (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
    if rank == 0:
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    else:
        if init_type == 'constant':
            np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                       arr.asnumpy(), rtol=5e-7)
        else:
            if init_type == 'uniform':
                numpy_samples = np.random.uniform(
                    low=init_a, high=init_b, size=(nitem, item_len)).astype(np.float32)
            elif init_type == 'normal':
                numpy_samples = np.random.normal(
                    loc=init_a, scale=init_b, size=(nitem, item_len)).astype(np.float32)
            else:
                numpy_samples = truncnorm.rvs(
                    -2.0, 2.0, loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            fig, ax = plt.subplots(1, 1)
            ax.hist(numpy_samples.flatten(), histtype='stepfilled',
                    alpha=0.2, bins=50, label='numpy')
            ax.hist(local_arr.flatten(), histtype='step',
                    alpha=0.2, bins=50, label='ps')
            ax.legend(loc='best', frameon=False)
            file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a, init_b, int(sparse))
            plt.savefig(file_name)
            print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()
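# Possible launch sketch (an assumption; rarr would come from a shared buffer
# as in test_api above): run each server-side initializer once so its samples
# can be compared against the numpy/scipy reference histograms.
for init_type, a, b in [('constant', 3.0, 1.0), ('uniform', -1.0, 1.0),
                        ('normal', 0.0, 1.0), ('truncated_normal', 0.0, 1.0)]:
    test_init_ps(rarr, init_type, a, b, sparse=False)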
def test_default():
    comm1 = ad.new_group_comm()
    a = ndarray.array(np.array([1, 2, 3, 4, 5]), ctx=ctx)
    comm1.dlarrayNcclAllReduce(a, a, ncclDataType_t.ncclFloat32,
                               reduceop=ncclRedOp_t.ncclSum)
    print("Default Allreduce device=%d" % comm1.device_id.value, a.asnumpy())
def test(args):
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }
    node_count, num_features, num_classes = dataset_info[args.dataset]

    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication
    node_Count_Self = row_num(node_count, rank // replication, size // replication)
    node_Count_All = node_count
    _, _, row_groups, col_groups = get_proc_groups(size, replication)
    executor_ctx = ndarray.gpu(device_id)

    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)
    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape, ctx=executor_ctx)

    # train:val:test = 6:2:2
    # Our optimization of the distributed GNN algorithm does NOT affect correctness!
    # Due to the limitation of the current slice_op, data is split continuously.
    # Continuous splits are unfriendly to reordered graph data where nodes are
    # already clustered: training on some clusters and testing on others may
    # give poor test accuracy. The better way is to split the data randomly!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[num_features, hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)
    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[hidden_layer_size, num_classes]).astype(np.float32)
    W2 = ad.Variable(name="W2", value=W2_val)
    y_ = ad.Variable(name="y_")

    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)

    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))
    y_test = ad.slice_op(y, (test_node, 0), (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0), (node_Count_Self - test_node, num_classes))
    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)

    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A: adj_matrix,
        H: ndarray.array(input_part, ctx=executor_ctx),
        y_: ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                          ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f"
              % (i, rank, epoch_time, epoch_all))

        y_out_train, y_predict = y_out.asnumpy().argmax(axis=1)[:train_node], \
            y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[test_node:]
        train_acc = ndarray.array(
            np.array([(y_out_train == label_train).sum()]), ctx=executor_ctx)
        test_acc = ndarray.array(
            np.array([(y_predict == label_test).sum()]), ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)

        test_acc = float(test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count - test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)

        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, "
                  "Test Loss: %.3f, Test Accuracy: %.3f"
                  % (i, train_loss, train_acc, test_loss, test_acc))

    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]), ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results, results, ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size

    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f"
              % (results[0], results[1]))