def rnn(x, y_):
    '''
    RNN model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print("Building RNN model...")
    # Each flat 784-dim MNIST row is consumed as 28 steps of 28 features.
    n_input = 28
    n_hidden = 128
    n_output = 10
    n_steps = 28

    w_in = init.random_normal(shape=(n_input, n_hidden), stddev=0.1,
                              name='rnn_weight1')
    b_in = init.random_normal(shape=(n_hidden, ), stddev=0.1,
                              name='rnn_bias1')
    w_rec = init.random_normal(shape=(n_hidden + n_hidden, n_hidden),
                               stddev=0.1, name='rnn_weight2')
    b_rec = init.random_normal(shape=(n_hidden, ), stddev=0.1,
                               name='rnn_bias2')
    w_out = init.random_normal(shape=(n_hidden, n_output), stddev=0.1,
                               name='rnn_weight3')
    b_out = init.random_normal(shape=(n_output, ), stddev=0.1,
                               name='rnn_bias3')

    # Scalar zero placeholder; broadcast to the batch shape at step 0.
    state = ad.Variable(value=np.zeros((1, )).astype(np.float32),
                        name='initial_state', trainable=False)

    for step in range(n_steps):
        step_x = ad.slice_op(x, (0, step * n_input), (-1, n_input))
        h = ad.matmul_op(step_x, w_in)
        h = h + ad.broadcastto_op(b_in, h)
        if step == 0:
            state = ad.broadcastto_op(state, h)
        # Concatenate input projection with previous state, then recur.
        s = ad.concat_op(h, state, axis=1)
        s = ad.matmul_op(s, w_rec)
        s = s + ad.broadcastto_op(b_rec, s)
        state = ad.relu_op(s)

    logits = ad.matmul_op(state, w_out)
    y = logits + ad.broadcastto_op(b_out, logits)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
def train_hetu(num_epoch):
    # Full-batch training of a 2-layer GCN on the module-level `graph` for
    # `num_epoch` epochs, followed by a test pass on `graph_full`.
    # Returns the list of per-epoch mean losses.
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    if use_same_init:
        # Reuse externally provided weights so runs are comparable
        # across frameworks/configurations.
        gcn1 = GCN(num_features, hidden_layer_size, custom_init=(init_w1, init_b1))
        gcn2 = GCN(hidden_layer_size, num_classes, custom_init=(init_w2, init_b2))
    else:
        gcn1 = GCN(num_features, hidden_layer_size)
        gcn2 = GCN(hidden_layer_size, num_classes)
    # Message-passing matrix (normalized adjacency) shared by both layers.
    mp_val = mp_matrix(graph, ctx, use_original_gcn_norm=True)
    feed_dict = {
        gcn1.mp: mp_val,
        gcn2.mp: mp_val,
        x_: ndarray.array(graph.x, ctx=ctx),
        y_: ndarray.array(convert_to_one_hot(graph.y, max_val=num_classes), ctx=ctx)
    }
    x = gcn1(x_)
    x = ad.relu_op(x)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    start_time = time.time()
    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph.y).sum()
        losses.append(loss_val.asnumpy().mean())
        if i == 0:
            # Restart the clock after the first epoch to exclude warm-up cost.
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Train accuracy:", acc / len(y_predicted))
        print("Hetu time:", i, time.time() - start_time)
    print("Hetu time:", time.time() - start_time)
    # Evaluation: rebuild the message-passing matrix for the full graph;
    # only nodes after `train_split` are counted as test nodes.
    mp_val = mp_matrix(graph_full, ctx)
    feed_dict = {
        gcn1.mp: mp_val,
        gcn2.mp: mp_val,
        x_: ndarray.array(graph_full.x, ctx=ctx),
    }
    executor_eval = ad.Executor([y], ctx=ctx)
    y_predicted, = executor_eval.run(feed_dict=feed_dict)
    y_predicted = y_predicted.asnumpy().argmax(axis=1)
    acc = (y_predicted == graph_full.y)[train_split:].sum()
    print("Test accuracy:", acc / len(y_predicted[train_split:]))
    return losses
def train_hetu(args):
    # Distributed (parameter-server) GraphSage training on a partitioned graph.
    # Each worker samples subgraphs from its own partition; an "epoch" is
    # counted once a worker has visited roughly all nodes of its partition.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    # Per-node loss weights; zeros mask out nodes excluded from training.
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    # Input width is doubled -- presumably GraphSage concatenates self and
    # neighbor features; verify against the GraphSage layer implementation.
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Weight each node's loss by the sample mask before averaging.
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)
    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Accuracy counted only over masked (training) nodes.
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # Keep workers in lock-step between batches.
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            if nnodes > meta["partition"]["nodes"][rank]:
                # Visited ~all nodes of this worker's partition: one epoch done.
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
def train(self, xs, ys):
    """Build the training graph: encode, decode, then label-smoothed
    cross-entropy loss over the vocabulary."""
    # forward pass
    enc_memory = self.encode(xs)
    logits = self.decode(ys[0], enc_memory, xs)
    # training target: one-hot labels with label smoothing, (N, T, vocab)
    target = ys[1]
    smoothed = label_smoothing(
        ad.one_hot_op(target, self.hp.vocab_size), self.hp.vocab_size)
    return ad.softmaxcrossentropy_op(logits, smoothed)
def alexnet(x, y_):
    '''
    AlexNet model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building AlexNet model...')
    # Flat MNIST rows -> NCHW images.
    x = ad.array_reshape_op(x, [-1, 1, 28, 28])
    # (in_channels, out_channels, name, with_relu, with_pool) per conv stage.
    conv_stages = [
        (1, 32, 'alexnet_conv1', True, True),
        (32, 64, 'alexnet_conv2', True, True),
        (64, 128, 'alexnet_conv3', True, False),
        (128, 256, 'alexnet_conv4', True, False),
        (256, 256, 'alexnet_conv5', False, True),
    ]
    for c_in, c_out, stage_name, relu, pool in conv_stages:
        x = conv_bn_relu_pool(x, c_in, c_out, stage_name,
                              with_relu=relu, with_pool=pool)
    # Flatten the 256 x 3 x 3 feature map for the classifier head.
    x = ad.array_reshape_op(x, (-1, 256 * 3 * 3))
    x = fc(x, (256 * 3 * 3, 1024), name='alexnet_fc1', with_relu=True)
    x = fc(x, (1024, 512), name='alexnet_fc2', with_relu=True)
    y = fc(x, (512, 10), name='alexnet_fc3', with_relu=False)
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def train_hetu(args):
    # Distributed (PS-mode, grpc backend) training of a 2-layer GCN on a
    # partitioned graph; each worker samples subgraphs from its partition.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')

    def transform(graph):
        # Attach the message-passing matrix on the sampler side so it is
        # built off the training thread.
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val

    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank,
                                    transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            # Keep workers in lock-step between sampled batches.
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += g_sample.num_nodes
            if nnodes > meta["partition"]["nodes"][rank]:
                # Visited ~all nodes of this worker's partition: one epoch done.
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
def wdl_adult(X_deep, X_wide, y_):
    """Wide & Deep model for the Adult dataset.

    Parameters:
        X_deep: list of 12 nodes; the first 8 are categorical ids routed
            through embedding tables, the last 4 are numeric features
            concatenated directly.
        X_wide: node carrying the wide (sparse/cross) features.
        y_: node carrying one-hot labels.
    Return:
        (loss, prediction, y_, train_op)
    """
    lr = 5 / 128
    dim_wide = 809
    dim_deep = 68

    W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    # ---- deep part: embed 8 categorical columns, append 4 numeric ones ----
    deep_parts = []
    for col in range(8):
        table = init.random_normal([50, 8], stddev=0.1,
                                   name="Embedding_deep_" + str(col))
        looked_up = ad.embedding_lookup_op(table, X_deep[col])
        deep_parts.append(ad.array_reshape_op(looked_up, (-1, 8)))
    for col in range(4):
        deep_parts.append(ad.array_reshape_op(X_deep[col + 8], (-1, 1)))
    deep_in = deep_parts[0]
    for part in deep_parts[1:]:
        deep_in = ad.concat_op(deep_in, part, 1)

    # Two-layer MLP over the concatenated deep features (dropout disabled).
    h1 = ad.matmul_op(deep_in, W1)
    h1 = ad.relu_op(h1 + ad.broadcastto_op(b1, h1))
    h2 = ad.matmul_op(h1, W2)
    dmodel = ad.relu_op(h2 + ad.broadcastto_op(b2, h2))

    # ---- wide part: concat wide features with deep output, linear head ----
    joint = ad.concat_op(X_wide, dmodel, 1)
    prediction = ad.matmul_op(joint, W)

    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(prediction, y_), [0])
    train_op = optimizer.SGDOptimizer(learning_rate=lr).minimize(loss)
    return loss, prediction, y_, train_op
def mlp(x, y_):
    '''
    MLP model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print("Building MLP model...")
    # (shape, name, with_relu) per fully-connected layer.
    layer_specs = [
        ((784, 256), 'mlp_fc1', True),
        ((256, 256), 'mlp_fc2', True),
        ((256, 10), 'mlp_fc3', False),
    ]
    out = x
    for shape, layer_name, relu in layer_specs:
        out = fc(out, shape, layer_name, with_relu=relu)
    y = out
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def cnn_3_layers(x, y_):
    '''
    3-layer-CNN model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building 3-layer-CNN model...')
    # Flat MNIST rows -> NCHW images.
    feat = ad.array_reshape_op(x, [-1, 1, 28, 28])
    # Two conv + ReLU + average-pool stages, then a linear classifier.
    feat = conv_relu_avg(feat, (32, 1, 5, 5))
    feat = conv_relu_avg(feat, (64, 32, 5, 5))
    y = fc(feat, (7 * 7 * 64, 10))
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def logreg(x, y_):
    '''
    Logistic Regression model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print("Build logistic regression model...")
    # Single affine layer with zero-initialized parameters.
    W = init.zeros((784, 10), name='logreg_weight')
    b = init.zeros((10, ), name='logreg_bias')
    logits = ad.matmul_op(x, W)
    y = logits + ad.broadcastto_op(b, logits)
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def vgg(x, y_, num_layers):
    '''
    VGG model, for CIFAR10 dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 16 or 19
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    assert num_layers in (16, 19), 'VGG model should have 16 or 19 layers!'
    # VGG-16 uses 3-conv blocks for stages 3-5; VGG-19 uses 4-conv blocks.
    if num_layers == 16:
        print('Building VGG-16 model...')
        deep_block = vgg_3block
    else:
        print('Building VGG-19 model...')
        deep_block = vgg_4block
    x = vgg_2block(x, 3, 64, 'vgg_block1')
    x = vgg_2block(x, 64, 128, 'vgg_block2')
    x = deep_block(x, 128, 256, 'vgg_block3')
    x = deep_block(x, 256, 512, 'vgg_block4')
    x = deep_block(x, 512, 512, 'vgg_block5')
    # Classifier head.
    x = ad.array_reshape_op(x, (-1, 512))
    x = vgg_fc(x, 512, 4096, 'vgg_fc1')
    x = vgg_fc(x, 4096, 4096, 'vgg_fc2')
    y = vgg_fc(x, 4096, 10, 'vgg_fc3')
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def lenet(x, y_):
    '''
    LeNet model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print('Building LeNet model...')
    # Flat MNIST rows -> NCHW images, then two conv/pool stages.
    feat = ad.array_reshape_op(x, (-1, 1, 28, 28))
    feat = conv_pool(feat, 1, 6, name='lenet_conv1')
    feat = conv_pool(feat, 6, 16, name='lenet_conv2')
    # Flatten and run the fully-connected head.
    feat = ad.array_reshape_op(feat, (-1, 7 * 7 * 16))
    hidden_specs = (((7 * 7 * 16, 120), 'lenet_fc1'), ((120, 84), 'lenet_fc2'))
    for shape, layer_name in hidden_specs:
        feat = fc(feat, shape, name=layer_name, with_relu=True)
    y = fc(feat, (84, 10), name='lenet_fc3', with_relu=False)
    loss = ad.reduce_mean_op(ad.softmaxcrossentropy_op(y, y_), [0])
    return loss, y
def test_csrmm_op(executor_ctx):
    # Check csrmm (sparse @ dense matmul) forward value and gradients against
    # a NumPy/SciPy reference, through a softmax cross-entropy head.
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmm_op(X, W)
    Y_ = ad.Variable(name="Y_")
    loss = ad.softmaxcrossentropy_op(Y, Y_)
    loss = ad.reduce_mean_op(loss, [0])
    grads = ad.gradients(loss, [W, Y])
    executor = ad.Executor(
        [loss, grads[0], grads[1]], ctx=executor_ctx)
    rand = np.random.RandomState(seed=123)
    W_val = rand.normal(scale=0.1, size=[70000, 2]).astype(np.float32)
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = ndarray.array(W_val, ctx=executor_ctx)
    # Very sparse 500 x 70000 input in COO format.
    # NOTE(review): scipy.sparse.rand and np.random.uniform below use the
    # global (unseeded) RNG, so the data differs between runs.
    X_val = scipy.sparse.rand(500, 70000, density=1e-5, format='coo', dtype=np.float32)
    Y_val = np.random.uniform(0, 10, size=(500, 2)).astype(np.float32)
    loss_val = executor.run(feed_dict={X: X_val, Y_: Y_val, W: W_val})
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    # Reference computation: y = X @ W, mean softmax cross-entropy over 500 rows.
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
        -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True)
    # d(loss)/d(y) = (softmax(y) - labels) / batch_size
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)
    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
def wdl_adult(whatever):
    """Wide & Deep model for the Adult dataset, fed from dataloaders.

    Same network as the pure-graph wdl_adult variant, but the inputs come
    from train/validate dataloader ops instead of being passed in.
    `whatever` is unused; kept for interface compatibility.

    Return:
        (loss, prediction, y_, train_op)
    """
    batch_size = 128
    lr = 5
    dim_wide = 809
    # Scale the LR by batch size -- presumably the loss follows a summed
    # convention upstream; TODO confirm against the optimizer semantics.
    lr_ = lr / batch_size
    dim_deep = 68
    from .load_data import load_adult_data
    x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()

    W = init.random_normal([dim_wide+20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    X_wide = dl.dataloader_op([
        [x_train_wide, batch_size, 'train'],
        [x_test_wide, batch_size, 'validate'],
    ])
    y_ = dl.dataloader_op([
        [y_train, batch_size, 'train'],
        [y_test, batch_size, 'validate'],
    ])

    # ---- deep part: 8 categorical columns through embeddings ----
    Embedding = []
    X_deep = []
    X_deep_input = None
    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, i], batch_size, 'train'],
            [x_test_deep[:, i], batch_size, 'validate'],
        ]))
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)
    # ---- deep part: 4 numeric columns concatenated directly ----
    for i in range(4):
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, 8+i], batch_size, 'train'],
            [x_test_deep[:, 8+i], batch_size, 'validate'],
        ]))
        # BUGFIX: reshape with -1 instead of the hard-coded batch_size so a
        # final partial batch does not break; this also matches the sibling
        # wdl_adult implementation, which uses (-1, 1).
        now = ad.array_reshape_op(X_deep[i + 8], (-1, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    # Two-layer MLP over the deep features (dropout currently disabled).
    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # ---- wide part: concat wide features with deep output, linear head ----
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)
    prediction = wmodel

    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr_)
    train_op = opt.minimize(loss)
    return loss, prediction, y_, train_op
def resnet(x, y_, num_layers=18):
    '''
    ResNet model, for CIFAR10 dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, C, H, W)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
        num_layers: 18 or 34
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    base_size = 16
    # Stem: 3x3 conv followed by BN + ReLU.
    x = conv2d(x, 3, base_size, stride=1, padding=1,
               name='resnet_initial_conv')
    x = batch_norm_with_relu(x, base_size, 'resnet_initial_bn')
    if num_layers == 18:
        print("Building ResNet-18 model...")
        stage_blocks = (2, 2, 2, 2)
    elif num_layers == 34:
        print("Building ResNet-34 model...")
        stage_blocks = (3, 4, 6, 3)
    else:
        assert False, "Number of layers should be 18 or 34 !"
    stage_widths = (base_size, base_size, 2 * base_size, 4 * base_size)
    stage_names = ('resnet_block1', 'resnet_block2',
                   'resnet_block3', 'resnet_block4')
    for idx, (width, blocks, stage_name) in enumerate(
            zip(stage_widths, stage_blocks, stage_names)):
        x = resnet_block(x, width, num_blocks=blocks,
                         is_first=(idx == 0), name=stage_name)
    x = batch_norm_with_relu(x, 8 * base_size, 'resnet_final_bn')
    x = ad.array_reshape_op(x, (-1, 128 * base_size))
    y = fc(x, (128 * base_size, 10), name='resnet_final_fc')
    # here we don't use cudnn for softmax crossentropy to avoid overflows
    loss = ad.softmaxcrossentropy_op(y, y_, use_cudnn=False)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
def train_main(args):
    # Distributed GNN training with learned node embeddings and a
    # double-buffered data pipeline: while the executor runs on the current
    # sampled graph, the next one is already being staged via
    # dl.GNNDataLoaderOp.step().
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    # One embedding vector per sparse feature column (last column excluded).
    extract_width = embedding_width * (meta["feature"] - 1)
    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    # Feature columns except the last are embedding indices.
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()), ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Only mask-selected (training) nodes contribute to the optimized loss.
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)
    # Pre-sample enough subgraphs to cover this worker's partition.
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # Step twice to fill both slots of the double-buffered loader.
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        # Stage the next graph while the current one is being trained on.
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        # Accuracy on held-out (eval-mask) and training (mask) nodes.
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(), np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(), np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            # Visited ~all nodes of this worker's partition: one epoch done.
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        # Rotate the buffers: next sample becomes current.
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
def train_hetu(num_epoch):
    # GraphSage training driven by a random-walk sampler, with a periodic
    # full-graph evaluation (dropout switched to eval mode) every 100 steps.
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    # Input width is doubled -- presumably GraphSage concatenates self and
    # neighbor features; verify against the GraphSage layer implementation.
    gcn2 = GraphSage(2 * hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    # Linear classification head on top of the sampled-node representations.
    W = initializers.xavier_uniform(shape=(2 * hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes, ))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    def eval():
        # Full-graph test pass; only nodes after `train_split` count as test.
        start = time.time()
        # Globally toggle dropout off for deterministic inference.
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)
        feed_dict = {
            gcn1.mp: mp_val,
            gcn2.mp: mp_val,
            x_: ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc / len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"

    with RandomWalkSampler(graph, 4000, 2, transformer=transform, num_sample_thread=3) as sampler:
        for i in range(num_epoch):
            start = time.time()
            # Sampler's transformer already attaches the message-passing matrix.
            g_sample, mp_val = sampler.sample()
            #mp_val = mp_matrix(g_sample, ctx)
            #print(time.time() - start)
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            print(i, "Train loss :", loss_val.asnumpy().mean())
            print(i, "Train accuracy:", acc / len(y_predicted))
            if (i + 1) % 100 == 0:
                eval()
            print(time.time() - start)
def train_hetu(num_epoch):
    """Full-batch training of a 2-layer PCGCN on `graph` split into 4 parts.

    Builds a block message-passing matrix between every pair of partitions
    (sparse arrays; a dense fallback exists for diagonal blocks), trains for
    `num_epoch` epochs, and returns the per-epoch losses.
    """
    ctx = ndarray.gpu(0)
    feed_dict = {}
    nparts = 4
    graph.add_self_loop()
    norm = graph.gcn_norm(True)
    graphs, edge_list, reindexed_edges = graph.part_graph(nparts)
    # Node features/labels reordered to follow the partition order.
    x_val = np.concatenate(list(map(lambda g: g.x, graphs)))
    y_concat = np.concatenate(list(map(lambda g: g.y, graphs)))
    y_val = convert_to_one_hot(
        y_concat, max_val=graph.num_classes)  # shape=(n, num_classes)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    feed_dict[x_] = ndarray.array(x_val, ctx=ctx)
    feed_dict[y_] = ndarray.array(y_val, ctx=ctx)
    gcn1 = PCGCN(graph.num_features, 16, npart=nparts)
    gcn2 = PCGCN(16, graph.num_classes, npart=nparts)
    # mp_val[i][j] carries messages from partition i into partition j.
    mp_val = [[None for j in range(nparts)] for i in range(nparts)]
    use_sparse = [True for g in graphs]
    for i in range(nparts):
        for j in range(nparts):
            if i == j:
                edges = graphs[i].edge_index
            else:
                edges = pick_edges(reindexed_edges, edge_list[i][j])
            if i == j and use_sparse[i] == False:
                # Dense fallback for a diagonal block (never taken while
                # use_sparse is all-True; kept for experimentation).
                mp_val[i][j] = sparse.csr_matrix(
                    (norm[edge_list[i][j]], (edges[1], edges[0])),
                    shape=(graphs[j].num_nodes, graphs[i].num_nodes)).toarray()
            else:
                mp_val[i][j] = ndarray.sparse_array(
                    values=norm[edge_list[i][j]],
                    indices=(edges[1], edges[0]),
                    shape=(graphs[j].num_nodes, graphs[i].num_nodes),
                    ctx=ctx)
            feed_dict[gcn1.mp[i][j]] = mp_val[i][j]
            feed_dict[gcn2.mp[i][j]] = mp_val[i][j]
    subgraph_size = list(map(lambda g: g.num_nodes, graphs))
    x = gcn1(x_, subgraph_size=subgraph_size, use_sparse=use_sparse)
    x = ad.relu_op(x)
    y = gcn2(x, subgraph_size=subgraph_size, use_sparse=use_sparse)
    # y_train = ad.slice_op(y, (0, 0), (train_split, graph.num_classes))
    # loss = ad.softmaxcrossentropy_op(y_train, y_)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    losses = []
    # BUGFIX: initialize before the loop so num_epoch == 0 cannot leave
    # start_time undefined at the final print.
    start_time = time.time()
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == y_concat).sum()
        losses.append(loss_val.asnumpy()[0])
        if i == 0:
            # Restart the clock after the first epoch to exclude warm-up cost.
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Val accuracy:", acc / len(y_predicted))
    # BUGFIX: average over the epochs actually timed instead of the
    # hard-coded 199, which silently assumed num_epoch == 200.
    print("Hetu time:", (time.time() - start_time) / max(num_epoch - 1, 1))
    return losses
def lstm(x, y_):
    '''
    LSTM model, for MNIST dataset.
    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print("Building LSTM model...")
    # Each flat 784-dim MNIST row is consumed as 28 steps of 28 features.
    diminput = 28
    dimhidden = 128
    dimoutput = 10
    nsteps = 28
    # Gate parameters: *_w applies to the step input, *_u to the previous
    # hidden state, *_b is the bias.
    forget_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1,
                                       name="lstm_forget_gate_w")
    forget_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1,
                                       name="lstm_forget_gate_u")
    forget_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1,
                                       name="lstm_forget_gate_b")
    input_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1,
                                      name="lstm_input_gate_w")
    input_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1,
                                      name="lstm_input_gate_u")
    input_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1,
                                      name="lstm_input_gate_b")
    output_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1,
                                       name="lstm_output_gate_w")
    output_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1,
                                       name="lstm_output_gate_u")
    output_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1,
                                       name="lstm_output_gate_b")
    tanh_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1,
                                name="lstm_tanh_w")
    tanh_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1,
                                name="lstm_tanh_u")
    tanh_b = init.random_normal(shape=(dimhidden, ), stddev=0.1,
                                name="lstm_tanh_b")
    out_weights = init.random_normal(shape=(dimhidden, dimoutput), stddev=0.1,
                                     name="lstm_out_weight")
    out_bias = init.random_normal(shape=(dimoutput, ), stddev=0.1,
                                  name="lstm_out_bias")
    # Scalar zero placeholder for the initial cell/hidden state; broadcast
    # to the batch shape lazily at the first step.
    initial_state = ad.Variable(value=np.zeros((1, )).astype(np.float32),
                                name='initial_state', trainable=False)
    for i in range(nsteps):
        cur_x = ad.slice_op(x, (0, i * diminput), (-1, diminput))
        # forget gate
        if i == 0:
            # First step: materialize zero c/h states with the batch shape
            # of the input projection, then reuse that projection.
            temp = ad.matmul_op(cur_x, forget_gate_w)
            last_c_state = ad.broadcastto_op(initial_state, temp)
            last_h_state = ad.broadcastto_op(initial_state, temp)
            cur_forget = ad.matmul_op(last_h_state, forget_gate_u) + temp
        else:
            cur_forget = ad.matmul_op(last_h_state, forget_gate_u) + ad.matmul_op(
                cur_x, forget_gate_w)
        cur_forget = cur_forget + ad.broadcastto_op(forget_gate_b, cur_forget)
        cur_forget = ad.sigmoid_op(cur_forget)
        # input gate
        cur_input = ad.matmul_op(last_h_state, input_gate_u) + ad.matmul_op(
            cur_x, input_gate_w)
        cur_input = cur_input + ad.broadcastto_op(input_gate_b, cur_input)
        cur_input = ad.sigmoid_op(cur_input)
        # output gate
        cur_output = ad.matmul_op(last_h_state, output_gate_u) + ad.matmul_op(
            cur_x, output_gate_w)
        cur_output = cur_output + ad.broadcastto_op(output_gate_b, cur_output)
        cur_output = ad.sigmoid_op(cur_output)
        # candidate cell value (tanh branch)
        cur_tanh = ad.matmul_op(last_h_state, tanh_u) + ad.matmul_op(
            cur_x, tanh_w)
        cur_tanh = cur_tanh + ad.broadcastto_op(tanh_b, cur_tanh)
        cur_tanh = ad.tanh_op(cur_tanh)
        # Standard LSTM cell update: c = f*c + i*candidate; h = tanh(c)*o.
        last_c_state = ad.mul_op(last_c_state, cur_forget) + ad.mul_op(
            cur_input, cur_tanh)
        last_h_state = ad.tanh_op(last_c_state) * cur_output
    # Classify from the final hidden state.
    x = ad.matmul_op(last_h_state, out_weights)
    y = x + ad.broadcastto_op(out_bias, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y
def train_hetu(num_epoch):
    # GraphSage training driven by a neighbor sampler; an "epoch" is counted
    # once roughly all nodes of the full graph have been sampled, at which
    # point a full-graph evaluation is run.
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    # Per-node loss weights; zeros mask out nodes excluded from training.
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    # Input width is doubled -- presumably GraphSage concatenates self and
    # neighbor features; verify against the GraphSage layer implementation.
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Weight each node's loss by the sample mask (no mean-reduction here).
    loss = ad.mul_op(loss, mask_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    def eval():
        # Full-graph test pass; only nodes after `train_split` count as test.
        start = time.time()
        # Globally toggle dropout off for deterministic inference.
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)
        feed_dict = {
            gcn1.mp: mp_val,
            gcn2.mp: mp_val,
            x_: ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"

    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            #print(time.time() - start)
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Accuracy counted only over masked (training) nodes.
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # print(i, "Train loss :", loss_val.asnumpy().mean())
            # print(i, "Train accuracy:", acc/len(y_predicted))
            nnodes += batch_size
            if nnodes > graph_full.num_nodes:
                # Sampled ~as many nodes as the full graph: one epoch done.
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                eval()
                start = time.time()
                if epoch >= num_epoch:
                    break
def test(args):
    '''Train and evaluate a 2-layer distributed GCN (1.5D partitioning).

    Initializes MPI+NCCL, loads a partition of the chosen dataset for this
    rank, builds the distributed GCN graph, trains for 100 epochs, and
    all-reduces loss/accuracy across ranks so rank 0 can report global
    metrics and average epoch timings.

    Parameters:
        args: parsed CLI arguments; uses `args.dataset` (one of 'Reddit',
            'Proteins', 'Arch', 'Products') and `args.replication`.
    '''
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value
    # dataset -> [node_count, num_features, num_classes]
    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }
    node_count, num_features, num_classes = dataset_info[args.dataset]
    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64
    replication = args.replication
    # Number of nodes owned by this rank under the 1.5D row partitioning.
    node_Count_Self = row_num(
        node_count, rank // replication, size // replication)
    node_Count_All = node_count
    _, _, row_groups, col_groups = get_proc_groups(size, replication)
    executor_ctx = ndarray.gpu(device_id)
    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)
    adj_matrix = ndarray.sparse_array(data_part,
                                      (row_part, col_part),
                                      shape=adj_part.shape,
                                      ctx=executor_ctx)
    # train:val:test=6:2:2
    # Our optimization on distributed GNN algorithm does NOT affect the correctness!
    # Here due to the limitation of current slice_op, data is split continuously.
    # Continuous split is unfriendly for reordered graph data where nodes are already clustered.
    # Specifically, training on some node clusters and testing on other clusters may cause poor test accuracy.
    # The better way is to split data randomly!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)
    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")
    # Xavier-uniform init for both weight matrices. NOTE(review): the seed
    # is reset to 123 before each draw, so W1 and W2 start from the same
    # random stream — presumably intentional for reproducibility across
    # ranks; confirm it is not an accidental duplicate.
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[num_features, hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)
    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[hidden_layer_size, num_classes]).astype(np.float32)
    W2 = ad.Variable(name="W2", value=W2_val)
    y_ = ad.Variable(name="y_")
    # Two distributed GCN layers (1.5D algorithm) with ReLU in between.
    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)
    # Continuous split of this rank's rows: [0, train_node) for training,
    # [test_node, end) for testing (middle 20% is validation, unused here).
    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))
    y_test = ad.slice_op(
        y, (test_node, 0), (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(
        y_, (test_node, 0), (node_Count_Self - test_node, num_classes))
    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)
    feed_dict = {
        A: adj_matrix,
        H: ndarray.array(input_part, ctx=executor_ctx),
        y_: ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                          ctx=executor_ctx),
    }
    epoch_num = 100
    # epoch_all accumulates total time; epoch_0 (first-epoch warmup) is
    # later excluded from the average.
    epoch_all, epoch_0 = 0, 0
    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        # NOTE: `loss`/`loss_test` below shadow the graph nodes defined
        # above with numpy scalars; the nodes are no longer needed here.
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time
        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f"
              % (i, rank, epoch_time, epoch_all))
        # Local (per-rank) predictions for the train and test slices.
        y_out_train, y_predict = y_out.asnumpy().argmax(
            axis=1)[:train_node], y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[
            test_node:]
        # Wrap scalar counts in GPU arrays so they can be NCCL all-reduced.
        train_acc = ndarray.array(np.array([(y_out_train == label_train).sum()
                                            ]), ctx=executor_ctx)
        test_acc = ndarray.array(np.array([(y_predict == label_test).sum()]),
                                 ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)
        # Sum metrics across ranks. With replication, reduce within this
        # rank's column group; otherwise reduce over the world communicator.
        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
        # Normalize global sums by the global train (60%) / test (20%)
        # node counts.
        test_acc = float(
            test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count -
                                              test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)
        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f"\
                % (i, train_loss, train_acc, test_loss, test_acc))
    # Average epoch time excludes the first (warmup) epoch; timings are
    # averaged across ranks via one final all-reduce.
    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results, results, ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size
    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" %
              (results[0], results[1]))