def dc_criteo(dense_input, sparse_input, y_):
    # Deep Crossing model on the Criteo CTR dataset.
    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001

    Embedding = init.random_normal(
        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # dc_model: stack of residual units over the concatenated features
    x = ad.concat_op(sparse_input, dense_input, axis=1)
    input_dim = 26 * embedding_size + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x, input_dim, hidden_dim, num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1], stddev=0.1, name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
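# build_residual_layers is not defined in this file. Below is a minimal
# sketch assuming the standard Deep Crossing residual unit (two FC layers
# with a skip connection, ReLU after the addition); the helper body and the
# weight names are illustrative, not the repository's own implementation.
def build_residual_layers(x, input_dim, hidden_dim, num_layers=3):
    for i in range(num_layers):
        # two-layer bottleneck: input_dim -> hidden_dim -> input_dim
        W_a = init.random_normal([input_dim, hidden_dim], stddev=0.1,
                                 name="residual_W_a_%d" % i)
        W_b = init.random_normal([hidden_dim, input_dim], stddev=0.1,
                                 name="residual_W_b_%d" % i)
        fc_a = ad.relu_op(ad.matmul_op(x, W_a))
        fc_b = ad.matmul_op(fc_a, W_b)
        # skip connection followed by ReLU, as in Deep Crossing
        x = ad.relu_op(x + fc_b)
    return x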
def neural_mf(user_input, item_input, y_, num_users, num_items):
    batch_size = 256
    embed_dim = 8
    layers = [64, 32, 16, 8]
    learning_rate = 0.01

    # One fused table per side: the first embed_dim columns feed the MF
    # branch, the remaining layers[0] // 2 columns feed the MLP branch.
    User_Embedding = init.random_normal(
        (num_users, embed_dim + layers[0] // 2), stddev=0.01, name="user_embed", ctx=ndarray.cpu(0))
    Item_Embedding = init.random_normal(
        (num_items, embed_dim + layers[0] // 2), stddev=0.01, name="item_embed", ctx=ndarray.cpu(0))
    # MLP_User_Embedding = init.random_normal((num_users, layers[0] // 2), stddev=0.01, name="mlp_user_embed", ctx=ndarray.cpu(0))
    # MLP_Item_Embedding = init.random_normal((num_items, layers[0] // 2), stddev=0.01, name="mlp_item_embed", ctx=ndarray.cpu(0))

    user_latent = ad.embedding_lookup_op(User_Embedding, user_input, ctx=ndarray.cpu(0))
    item_latent = ad.embedding_lookup_op(Item_Embedding, item_input, ctx=ndarray.cpu(0))

    mf_user_latent = ad.slice_op(user_latent, (0, 0), (-1, embed_dim))
    mlp_user_latent = ad.slice_op(user_latent, (0, embed_dim), (-1, -1))
    mf_item_latent = ad.slice_op(item_latent, (0, 0), (-1, embed_dim))
    mlp_item_latent = ad.slice_op(item_latent, (0, embed_dim), (-1, -1))
    # mf_user_latent = ad.embedding_lookup_op(MF_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mf_item_latent = ad.embedding_lookup_op(MF_Item_Embedding, item_input, ctx=ndarray.cpu(0))
    # mlp_user_latent = ad.embedding_lookup_op(MLP_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mlp_item_latent = ad.embedding_lookup_op(MLP_Item_Embedding, item_input, ctx=ndarray.cpu(0))

    W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1')
    W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2')
    W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3')
    W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4')

    # GMF branch: element-wise product of the latent vectors
    mf_vector = ad.mul_op(mf_user_latent, mf_item_latent)
    # MLP branch: concatenated latents through a three-layer ReLU tower
    mlp_vector = ad.concat_op(mlp_user_latent, mlp_item_latent, axis=1)
    fc1 = ad.matmul_op(mlp_vector, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    fc3 = ad.matmul_op(relu2, W3)
    relu3 = ad.relu_op(fc3)

    concat_vector = ad.concat_op(mf_vector, relu3, axis=1)
    y = ad.matmul_op(concat_vector, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    # opt = optimizer.AdamOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, train_op
def dfm_criteo(dense_input, sparse_input, y_):
    # DeepFM model on the Criteo CTR dataset.
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # FM first-order term
    Embedding1 = init.random_normal(
        [feature_dimension, 1], stddev=0.01, name="fst_order_embedding", ctx=ndarray.cpu(0))
    FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    sparse_1dim_input = ad.embedding_lookup_op(Embedding1, sparse_input, ctx=ndarray.cpu(0))
    fm_dense_part = ad.matmul_op(dense_input, FM_W)
    fm_sparse_part = ad.reduce_sum_op(sparse_1dim_input, axes=1)
    # first-order output
    y1 = fm_dense_part + fm_sparse_part

    # FM second-order term: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2)
    Embedding2 = init.random_normal(
        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_2dim_input = ad.embedding_lookup_op(Embedding2, sparse_input, ctx=ndarray.cpu(0))
    sparse_2dim_sum = ad.reduce_sum_op(sparse_2dim_input, axes=1)
    sparse_2dim_sum_square = ad.mul_op(sparse_2dim_sum, sparse_2dim_sum)

    sparse_2dim_square = ad.mul_op(sparse_2dim_input, sparse_2dim_input)
    sparse_2dim_square_sum = ad.reduce_sum_op(sparse_2dim_square, axes=1)
    sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
    sparse_2dim_half = sparse_2dim * 0.5
    # second-order output
    y2 = ad.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

    # DNN component on the flattened second-order embeddings
    flatten = ad.array_reshape_op(sparse_2dim_input, (-1, 26 * embedding_size))
    W1 = init.random_normal([26 * embedding_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = y1 + y2
    y = y4 + y3
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
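# The second-order block above relies on the factorization-machine identity
# sum_{i<j} v_i * v_j = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), applied
# element-wise per embedding dimension before the final reduce_sum. A small
# standalone NumPy check of that identity (illustrative only, not part of
# the model graph):
def _check_fm_identity():
    import numpy as np
    v = np.random.random((26, 8))  # 26 field embeddings of width 8
    pairwise = sum(v[i] * v[j] for i in range(26) for j in range(i + 1, 26))
    trick = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0))
    np.testing.assert_allclose(pairwise, trick, rtol=1e-6)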
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    # GraphSage concatenates self and neighbor features, so each layer's
    # output width is twice its hidden size.
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2 * hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2 * hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)

    # mask out nodes that do not belong to the training set
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)

    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)
    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                mask_: ndarray.array(mask, ctx=ctx),
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            # count an epoch once this worker has seen its partition's nodes
            nnodes += batch_size
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc / mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
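# mp_matrix is not defined in these snippets. Below is a minimal NumPy
# sketch of the propagation matrix it is assumed to build: the symmetrically
# normalized adjacency with self-loops, D^-1/2 (A + I) D^-1/2. It is dense
# here for clarity (the real helper presumably returns a sparse ndarray),
# and the (2, num_edges) edge_index layout is an assumption as well.
import numpy as np

def mp_matrix_sketch(graph, ctx):
    n = graph.num_nodes
    adj = np.zeros((n, n), dtype=np.float32)
    src, dst = graph.edge_index         # assumed (2, num_edges) layout
    adj[dst, src] = 1.0
    adj += np.eye(n, dtype=np.float32)  # add self-loops
    d_inv_sqrt = 1.0 / np.sqrt(adj.sum(axis=1))
    norm_adj = adj * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
    return ndarray.array(norm_adj, ctx=ctx)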
def wdl_criteo(dense, sparse, labels):
    # Wide & Deep model on the Criteo CTR dataset.
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # A tuple means (train, validate) splits; otherwise train-only.
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'], [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'], [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'], [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal(
        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input, ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # DNN (deep) part on the 13 dense features
    flatten = dense_input
    W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size, 1], stddev=0.01, name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    # wide path: the sparse embeddings bypass the MLP and go straight
    # into the final linear layer alongside the deep output
    y4 = ad.concat_op(sparse_input, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
def wdl_adult(X_deep, X_wide, y_):
    lr = 5 / 128
    dim_wide = 809
    dim_deep = 68

    W = init.random_normal([dim_wide + 20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    # deep
    Embedding = []
    X_deep_input = None
    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)
    for i in range(4):
        now = ad.array_reshape_op(X_deep[i + 8], (-1, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr)
    train_op = opt.minimize(loss)
    return loss, prediction, y_, train_op
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)

    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')

    def transform(graph):
        # build the message-passing matrix once per sampled subgraph
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val

    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank,
                                    transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp: mp_val,
                gcn2.mp: mp_val,
                x_: ndarray.array(g_sample.x, ctx=ctx),
                y_: ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            # count an epoch once this worker has seen its partition's nodes
            nnodes += g_sample.num_nodes
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc / len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
def dc_criteo(dense, sparse, labels):
    # Deep Crossing on Criteo, with dataloader-driven inputs.
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 8
    learning_rate = 0.001

    # A tuple means (train, validate) splits; otherwise train-only.
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'], [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'], [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'], [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal(
        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding")
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input)
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # dc_model: stack of residual units over the concatenated features
    x = ad.concat_op(sparse_input, dense_input, axis=1)
    input_dim = 26 * embedding_size + 13
    hidden_dim = input_dim
    residual_out = build_residual_layers(x, input_dim, hidden_dim, num_layers=5)

    W4 = init.random_normal([26 * embedding_size + 13, 1], stddev=0.1, name="W4")
    y = ad.matmul_op(residual_out, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
def dcn_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.003

    Embedding = init.random_normal(
        [feature_dimension, embedding_size], stddev=0.01, name="snd_order_embedding", ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input, ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    # Cross Network
    cross_output = build_cross_layer(x, num_layers=3)

    # DNN
    flatten = x
    W1 = init.random_normal([26 * embedding_size + 13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size + 13, 1], stddev=0.01, name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(cross_output, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
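# build_cross_layer is not defined in this file. Below is a minimal sketch
# following the DCN cross-layer formula x_{l+1} = x_0 * (x_l . w_l) + b_l + x_l;
# the helper body and weight names are illustrative, and it assumes
# broadcastto_op can expand a (batch, 1) column across the feature axis.
def build_cross_layer(x0, num_layers=3):
    input_dim = 26 * 128 + 13  # matches the concat width above
    xl = x0
    for i in range(num_layers):
        w = init.random_normal([input_dim, 1], stddev=0.01, name="cross_w_%d" % i)
        b = init.random_normal([input_dim], stddev=0.01, name="cross_b_%d" % i)
        xlw = ad.matmul_op(xl, w)  # (batch, 1): one scalar per example
        cross = ad.mul_op(x0, ad.broadcastto_op(xlw, x0))
        xl = cross + xl + ad.broadcastto_op(b, cross)
    return xl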
def test_dense():
    npw = np.random.random((5, 10)).astype(np.float32)
    npx = np.random.random((7, 5)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    X = ad.Variable(name="x")
    mid = X + 3
    W = ad.Variable(name='w', value=npw, ctx=cpuctx)
    y = ad.matmul_op(mid, W)
    opt = optimizer.SGDOptimizer(learning_rate=0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)

    pred_y, _ = executor.run(feed_dict={X: npx}, convert_to_numpy_ret_vals=True)
    # forward check: y = (x + 3) @ w
    nppred_y = np.matmul((npx + 3), npw)
    np.testing.assert_allclose(pred_y, nppred_y, rtol=1e-6)
    # backward check: with an all-ones output gradient,
    # dW = (x + 3)^T @ ones, so w' = w - lr * dW
    new_npw = npw - 0.1 * np.matmul((npx + 3).T, np.ones(nppred_y.shape).astype(np.float32))
    np.testing.assert_allclose(W.tensor_value.asnumpy(), new_npw, rtol=1e-10)
def test_sparse():
    npemb = np.random.random((100, 20)).astype(np.float32)
    npind = np.array(np.random.randint(100, size=(10,)))
    npw = np.random.random((20, 30)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    embedding = ad.Variable('embeddingtable', value=npemb, ctx=cpuctx)
    index = ad.Variable(name="index", ctx=cpuctx)
    W = ad.Variable(name="w", value=npw)
    y = ad.embedding_lookup_op(embedding, index)  # (10, 20)
    y = ad.matmul_op(y, W)
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)

    out, _ = executor.run(feed_dict={index: npind.astype(np.float32)},
                          convert_to_numpy_ret_vals=True)
    # forward check: gathered rows times w
    np_out = np.matmul(npemb[npind], npw)
    np.testing.assert_allclose(out, np_out, rtol=1e-6)
    # backward check: only the looked-up rows receive gradient updates
    tmp_grad = np.matmul(np.ones(np_out.shape).astype(np.float32), npw.T)
    for i, localid in enumerate(npind):
        npemb[localid] -= 0.1 * tmp_grad[i]
    np.testing.assert_allclose(embedding.tensor_value.asnumpy(), npemb, rtol=1e-6)
def wdl_adult(whatever):
    batch_size = 128
    lr = 5
    dim_wide = 809
    lr_ = lr / batch_size
    dim_deep = 68

    from .load_data import load_adult_data
    x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = load_adult_data()

    W = init.random_normal([dim_wide + 20, 2], stddev=0.1, name="W")
    W1 = init.random_normal([dim_deep, 50], stddev=0.1, name="W1")
    b1 = init.random_normal([50], stddev=0.1, name="b1")
    W2 = init.random_normal([50, 20], stddev=0.1, name="W2")
    b2 = init.random_normal([20], stddev=0.1, name="b2")

    X_wide = dl.dataloader_op([
        [x_train_wide, batch_size, 'train'],
        [x_test_wide, batch_size, 'validate'],
    ])
    y_ = dl.dataloader_op([
        [y_train, batch_size, 'train'],
        [y_test, batch_size, 'validate'],
    ])

    # deep part: 8 categorical columns, each with its own embedding table
    Embedding = []
    X_deep = []
    X_deep_input = None
    for i in range(8):
        Embedding_name = "Embedding_deep_" + str(i)
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, i], batch_size, 'train'],
            [x_test_deep[:, i], batch_size, 'validate'],
        ]))
        Embedding.append(init.random_normal([50, 8], stddev=0.1, name=Embedding_name))
        now = ad.embedding_lookup_op(Embedding[i], X_deep[i])
        now = ad.array_reshape_op(now, (-1, 8))
        if X_deep_input is None:
            X_deep_input = now
        else:
            X_deep_input = ad.concat_op(X_deep_input, now, 1)
    # 4 continuous columns appended directly
    for i in range(4):
        X_deep.append(dl.dataloader_op([
            [x_train_deep[:, 8 + i], batch_size, 'train'],
            [x_test_deep[:, 8 + i], batch_size, 'validate'],
        ]))
        now = ad.array_reshape_op(X_deep[i + 8], (batch_size, 1))
        X_deep_input = ad.concat_op(X_deep_input, now, 1)

    mat1 = ad.matmul_op(X_deep_input, W1)
    add1 = mat1 + ad.broadcastto_op(b1, mat1)
    relu1 = ad.relu_op(add1)
    dropout1 = relu1  # ad.dropout_op(relu1, 0.5)
    mat2 = ad.matmul_op(dropout1, W2)
    add2 = mat2 + ad.broadcastto_op(b2, mat2)
    relu2 = ad.relu_op(add2)
    dropout2 = relu2  # ad.dropout_op(relu2, 0.5)
    dmodel = dropout2

    # wide part
    wmodel = ad.concat_op(X_wide, dmodel, 1)
    wmodel = ad.matmul_op(wmodel, W)

    prediction = wmodel
    loss = ad.softmaxcrossentropy_op(prediction, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr_)
    train_op = opt.minimize(loss)
    return loss, prediction, y_, train_op
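# A sketch of how a builder like wdl_adult is consumed, based only on the
# Executor API visible in the tests above (ad.Executor + run). Since every
# input here is a dataloader op, it is assumed that run() can be called
# without a feed_dict; treat this as illustrative, not the framework's
# documented training loop.
def _train_wdl_adult_sketch(num_steps=1000):
    loss, prediction, y_, train_op = wdl_adult(None)  # the argument is unused
    executor = ad.Executor([loss, prediction, y_, train_op], ctx=ndarray.gpu(0))
    for step in range(num_steps):
        loss_val, pred_val, label_val, _ = executor.run()  # dataloaders supply each batch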
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])

    # all but the last feature column are sparse indices that go through
    # a learnable embedding table
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()), ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)

    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)

    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # step twice to fill the two-deep dataloader pipeline
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        # prefetch the next sampled graph while training on the current one
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)

        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)
ctx = ndarray.gpu(1)

xs = ad.Variable(name='xs')
ys1 = ad.Variable(name='ys1')
ys2 = ad.Variable(name='ys2')
nonpadding = ad.Variable(name='nonpadding')

logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
# average the token-level loss over non-padding positions only
loss = ad.div_op(ad.reduce_sum_op(loss * nonpadding, axes=[0, 1]),
                 ad.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7)

opt = optimizer.SGDOptimizer(hp.lr)
train_op = opt.minimize(loss)
executor = ad.Executor([loss, train_op], ctx=ctx)

logging.info("# Session")
for ep in range(hp.num_epochs):
    dataloader.make_epoch_data(hp.batch_size)
    for i in tqdm(range(dataloader.batch_num)):
        xs_val, ys_val = dataloader.get_batch()
        # st = time.time()
        xs_val = xs_val[0]
        # teacher forcing: decoder input is the target shifted by one
        ys1_val = ys_val[0][:, :-1]
        ys2_val = ys_val[0][:, 1:]
        nonpadding_val = np.not_equal(ys2_val, dataloader.get_pad()).astype(np.float32)
        # feed the batch and take one optimization step
        loss_val, _ = executor.run(feed_dict={xs: xs_val, ys1: ys1_val,
                                              ys2: ys2_val, nonpadding: nonpadding_val})