def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True,
                        help="model to be tested")
    parser.add_argument("--config", type=str, default="../config/local_w8.yml",
                        help="configuration for ps")
    parser.add_argument("--val", action="store_true",
                        help="whether to use validation")
    parser.add_argument("--cache", default=None, help="cache policy")
    parser.add_argument("--bsp", action="store_true",
                        help="whether to use bsp instead of asp")
    parser.add_argument("--bound", default=100, type=int, help="cache bound")
    args = parser.parse_args()
    config = args.config

    import models
    model = getattr(models, args.model)  # look up the model class by name; safer than eval()
    with open(config) as f:
        settings = yaml.load(f, Loader=yaml.FullLoader)

    comm, device_id = ad.mpi_nccl_init()
    print('Model:', args.model, '; rank:', device_id)
    # Export this worker's settings block (e.g. settings['w0'] for rank 0)
    # as environment variables before starting the worker.
    value = settings['w' + str(device_id)]
    for k, v in value.items():
        os.environ[k] = str(v)

    worker(model, device_id, args)
    ad.mpi_nccl_finish(comm)
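# NOTE (illustrative sketch, not the shipped config): main() expects the YAML
# file passed via --config to hold one dictionary per worker, keyed 'w0',
# 'w1', ..., whose entries it exports as environment variables. The keys below
# are assumptions modeled on the DMLC variables the workers actually read
# (worker() uses DMLC_NUM_WORKER); the exact set depends on the PS launcher.
def example_ps_settings(num_workers=8):
    # Build the same per-worker mapping '../config/local_w8.yml' is expected
    # to provide, one 'wN' block per worker on an 8-GPU node.
    return {'w%d' % i: {'DMLC_ROLE': 'worker',
                        'DMLC_NUM_WORKER': num_workers}
            for i in range(num_workers)}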
def worker(args):
    def validate():
        # Leave-one-out evaluation: each test user has 1 positive item followed
        # by 99 sampled negatives, i.e. 100 candidate items per user.
        hits, ndcgs = [], []
        for idx in range(testData.shape[0]):
            start_index = idx * 100
            predictions = val_executor.run(convert_to_numpy_ret_vals=True)
            map_item_score = {testItemInput[start_index + i]: predictions[0][i]
                              for i in range(100)}
            gtItem = testItemInput[start_index]
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
            hr = getHitRatio(ranklist, gtItem)
            ndcg = getNDCG(ranklist, gtItem)
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    def get_current_shard(data):
        # Evenly shard the rows across workers; the last rank takes the remainder.
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    device_id = 0
    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8

    from movielens import getdata
    if args.all:
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'])
        trainItems = get_current_shard(trainData['item_input'])
        trainLabels = get_current_shard(trainData['labels'])
        testData = get_current_shard(testData)
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))
    else:
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'][:1024000])
        trainItems = get_current_shard(trainData['item_input'][:1024000])
        trainLabels = get_current_shard(trainData['labels'][:1024000])
        testData = get_current_shard(testData[:1470])
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))

    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    # assert not args.all or num_users == testData.shape[0]
    batch_size = 1024
    num_negatives = 4
    topK = 10

    user_input = dl.dataloader_op([
        dl.Dataloader(trainUsers, batch_size, 'train'),
        dl.Dataloader(testUserInput, 100, 'validate'),
    ])
    item_input = dl.dataloader_op([
        dl.Dataloader(trainItems, batch_size, 'train'),
        dl.Dataloader(testItemInput, 100, 'validate'),
    ])
    y_ = dl.dataloader_op([
        dl.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'),
    ])

    loss, y, train_op = neural_mf(user_input, item_input, y_, num_users, num_items)

    executor = ad.Executor([loss, train_op], ctx=ndarray.gpu(device_id),
                           dataloader_name='train', comm_mode=args.comm,
                           cstable_policy=args.cache, bsp=args.bsp,
                           cache_bound=args.bound, seed=123)
    val_executor = ad.Executor([y], ctx=ndarray.gpu(device_id), inference=True,
                               dataloader_name='validate', comm_mode=args.comm,
                               bsp=args.bsp)

    path = 'logs/hetulog_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm])
    path += ('_%d.txt' % rank) if args.comm else '.txt'
    log = Logging(path=path)

    epoch = 7
    start = time.time()
    for ep in range(epoch):
        ep_st = time.time()
        log.write('epoch %d' % ep)
        train_loss = []
        for idx in tqdm(range(executor.batch_num)):
            loss_val = executor.run(convert_to_numpy_ret_vals=True)
            train_loss.append(loss_val[0])
            # if idx % 10000 == 0:
            #     hr, ndcg = validate()
            #     printstr = "HR: %.4f, NDCG: %.4f" % (hr, ndcg)
            #     log.write(printstr)
        tra_loss = np.mean(train_loss)
        ep_en = time.time()

        # validate phase
        if args.val:
            hr, ndcg = validate()
            printstr = "train_loss: %.4f, HR: %.4f, NDCG: %.4f, train_time: %.4f" % (
                tra_loss, hr, ndcg, ep_en - ep_st)
        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)
    log.write('all time: %f' % (time.time() - start))
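# NOTE (sketch): getHitRatio and getNDCG are referenced by validate() but not
# defined in this section. A minimal version consistent with the standard
# leave-one-out NCF evaluation protocol (one relevant item per user, so the
# ideal DCG is 1 and NDCG reduces to 1/log2(rank + 2)):
import math

def getHitRatio(ranklist, gtItem):
    # 1 if the ground-truth item made the top-K list, else 0.
    return 1 if gtItem in ranklist else 0

def getNDCG(ranklist, gtItem):
    # Single relevant item: DCG = 1/log2(pos + 2), and NDCG equals DCG
    # because IDCG = 1. Returns 0 if the item missed the top-K list.
    for i, item in enumerate(ranklist):
        if item == gtItem:
            return math.log(2) / math.log(i + 2)
    return 0.0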
def test(args):
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    # dataset -> [node_count, num_features, num_classes]
    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47],
    }
    node_count, num_features, num_classes = dataset_info[args.dataset]

    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication
    node_Count_Self = row_num(node_count, rank // replication, size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)

    executor_ctx = ndarray.gpu(device_id)

    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape, ctx=executor_ctx)

    # train:val:test = 6:2:2
    # Our optimization of the distributed GNN algorithm does NOT affect correctness!
    # Due to the limitation of the current slice_op, the data is split contiguously.
    # A contiguous split is unfriendly to reordered graph data where nodes are
    # already clustered: training on some node clusters and testing on others
    # may yield poor test accuracy. The better way is to split the data randomly!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")

    # Xavier/Glorot uniform initialization: bound = sqrt(6 / (fan_in + fan_out)).
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[num_features, hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)

    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[hidden_layer_size, num_classes]).astype(np.float32)
    W2 = ad.Variable(name="W2", value=W2_val)

    y_ = ad.Variable(name="y_")

    # Two-layer GCN built from the 1.5D distributed graph-convolution op.
    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)

    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))
    y_test = ad.slice_op(y, (test_node, 0), (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0), (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A: adj_matrix,
        H: ndarray.array(input_part, ctx=executor_ctx),
        y_: ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                          ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f" % (
            i, rank, epoch_time, epoch_all))

        y_out_train, y_predict = (y_out.asnumpy().argmax(axis=1)[:train_node],
                                  y_out.asnumpy().argmax(axis=1)[test_node:])
        label_train, label_test = label_part[:train_node], label_part[test_node:]
        train_acc = ndarray.array(
            np.array([(y_out_train == label_train).sum()]), ctx=executor_ctx)
        test_acc = ndarray.array(
            np.array([(y_predict == label_test).sum()]), ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        if replication > 1:
            # With replication, reduce the per-partition counts within this
            # rank's column group rather than over the whole communicator.
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)

        test_acc = float(test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count - test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)
        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, "
                  "Test Loss: %.3f, Test Accuracy: %.3f" % (
                      i, train_loss, train_acc, test_loss, test_acc))

    # Exclude the first (warm-up) epoch from the average epoch time.
    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]), ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results, results, ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size
    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" % (
            results[0], results[1]))
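# NOTE (sketch): the comment in test() points out that a random split would be
# preferable to the contiguous slice that the current slice_op forces. One way
# to do it, assuming plain NumPy index arrays (illustration only; the shipped
# pipeline would also need a gather-style op in place of slice_op):
def random_split(num_nodes, train_frac=0.6, test_frac=0.2, seed=123):
    rng = np.random.RandomState(seed)
    perm = rng.permutation(num_nodes)            # shuffle node ids once
    n_train = int(train_frac * num_nodes)
    n_test = int(test_frac * num_nodes)
    train_ids = perm[:n_train]                   # 60% train
    test_ids = perm[-n_test:]                    # 20% test
    val_ids = perm[n_train:num_nodes - n_test]   # remaining 20% validation
    return train_ids, val_ids, test_ids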
        # (tail of the sampling training loop)
        train_state.sync_and_clear()
        if epoch >= num_epoch:
            break
        # Swap in the prefetched next-iteration buffers.
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt


def signal_handler(signum, frame):
    print("SIGINT signal caught, stop Training")
    exit(0)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("config")
    parser.add_argument("--path", "-p", required=True)
    parser.add_argument("--num_epoch", default=300, type=int)
    parser.add_argument("--hidden_size", default=128, type=int)
    parser.add_argument("--learning_rate", default=1, type=float)
    parser.add_argument("--batch_size", default=128, type=int)
    parser.add_argument("--cache", default="LFUOpt", type=str)
    args = parser.parse_args()

    comm, device_id = ad.mpi_nccl_init()
    file_path = args.config
    with open(file_path) as f:
        settings = yaml.load(f, Loader=yaml.FullLoader)
    for k, v in settings['shared'].items():
        os.environ[k] = str(v)
    os.environ["DMLC_ROLE"] = "worker"

    # Install a Ctrl-C handler so interrupted runs exit cleanly.
    signal.signal(signal.SIGINT, signal_handler)
    train_main(args)
    ad.mpi_nccl_finish(comm)
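# NOTE (sketch): the *_nxt swap at the top of this block suggests a
# double-buffered sampling pipeline: the next graph sample is prepared while
# the current one trains. A generic version with hypothetical sample() and
# train_step() stand-ins (the real sampling and executor calls are not shown
# in this section; a real pipeline would prefetch on a background thread):
def pipelined_training(sample, train_step, num_steps):
    current = sample()
    for _ in range(num_steps):
        nxt = sample()         # prefetch the next subgraph/minibatch
        train_step(current)    # train on the current one
        current = nxt          # swap buffers, mirroring the *_nxt swap above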
def worker(args):
    def train(iterations, auc_enabled=True, tqdm_enabled=False):
        localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
        train_loss = []
        train_acc = []
        if auc_enabled:
            train_auc = []
        for it in localiter:
            loss_val, predict_y, y_val, _ = executor.run(
                convert_to_numpy_ret_vals=True)
            if y_val.shape[1] == 1:  # for criteo case
                # np.float was removed in NumPy 1.24+; the builtin float keeps
                # the original (float64) semantics.
                acc_val = np.equal(y_val, predict_y > 0.5).astype(float)
            else:
                acc_val = np.equal(np.argmax(y_val, 1),
                                   np.argmax(predict_y, 1)).astype(float)
            train_loss.append(loss_val[0])
            train_acc.append(acc_val)
            if auc_enabled:
                train_auc.append(metrics.roc_auc_score(y_val, predict_y))
        if auc_enabled:
            return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)
        else:
            return np.mean(train_loss), np.mean(train_acc)

    def validate(iterations, tqdm_enabled=False):
        localiter = tqdm(range(iterations)) if tqdm_enabled else range(iterations)
        test_loss = []
        test_acc = []
        test_auc = []
        for it in localiter:
            loss_val, test_y_predicted, y_test_val = val_executor.run(
                convert_to_numpy_ret_vals=True)
            if y_test_val.shape[1] == 1:  # for criteo case
                correct_prediction = np.equal(
                    y_test_val, test_y_predicted > 0.5).astype(float)
            else:
                correct_prediction = np.equal(
                    np.argmax(y_test_val, 1),
                    np.argmax(test_y_predicted, 1)).astype(float)
            test_loss.append(loss_val[0])
            test_acc.append(correct_prediction)
            test_auc.append(metrics.roc_auc_score(y_test_val, test_y_predicted))
        return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

    def get_current_shard(data):
        # Evenly shard the rows across workers; the last rank takes the remainder.
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    batch_size = 128
    dataset = args.dataset
    model = args.model
    device_id = 0

    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8

    if dataset == 'criteo':
        # define models for criteo
        if args.all:
            from models.load_data import process_all_criteo_data
            dense, sparse, labels = process_all_criteo_data(return_val=args.val)
        elif args.val:
            from models.load_data import process_head_criteo_data
            dense, sparse, labels = process_head_criteo_data(return_val=True)
        else:
            from models.load_data import process_sampled_criteo_data
            dense, sparse, labels = process_sampled_criteo_data()
        if isinstance(dense, tuple):
            # A tuple means (train, validate) splits were returned.
            dense_input = dl.dataloader_op(
                [[get_current_shard(dense[0]), batch_size, 'train'],
                 [get_current_shard(dense[1]), batch_size, 'validate']])
            sparse_input = dl.dataloader_op(
                [[get_current_shard(sparse[0]), batch_size, 'train'],
                 [get_current_shard(sparse[1]), batch_size, 'validate']])
            y_ = dl.dataloader_op(
                [[get_current_shard(labels[0]), batch_size, 'train'],
                 [get_current_shard(labels[1]), batch_size, 'validate']])
        else:
            dense_input = dl.dataloader_op(
                [[get_current_shard(dense), batch_size, 'train']])
            sparse_input = dl.dataloader_op(
                [[get_current_shard(sparse), batch_size, 'train']])
            y_ = dl.dataloader_op(
                [[get_current_shard(labels), batch_size, 'train']])
    elif dataset == 'adult':
        from models.load_data import load_adult_data
        x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = \
            load_adult_data()
        dense_input = [
            dl.dataloader_op([
                [get_current_shard(x_train_deep[:, i]), batch_size, 'train'],
                [get_current_shard(x_test_deep[:, i]), batch_size, 'validate'],
            ]) for i in range(12)
        ]
        sparse_input = dl.dataloader_op([
            [get_current_shard(x_train_wide), batch_size, 'train'],
            [get_current_shard(x_test_wide), batch_size, 'validate'],
        ])
        y_ = dl.dataloader_op([
            [get_current_shard(y_train), batch_size, 'train'],
            [get_current_shard(y_test), batch_size, 'validate'],
        ])
    else:
        raise NotImplementedError
    print("Data loaded.")

    loss, prediction, y_, train_op = model(dense_input, sparse_input, y_)

    executor = ad.Executor([loss, prediction, y_, train_op],
                           ctx=ndarray.gpu(device_id),
                           dataloader_name='train', stream_mode='AllStreams',
                           comm_mode=args.comm, cstable_policy=args.cache,
                           bsp=args.bsp, cache_bound=args.bound,
                           seed=123, log_path='./logs/')
    if args.val:
        print('Validation enabled...')
        val_executor = ad.Executor([loss, prediction, y_],
                                   ctx=ndarray.gpu(device_id),
                                   dataloader_name='validate',
                                   stream_mode='AllStreams', inference=True,
                                   comm_mode=args.comm)

    if args.all and dataset == 'criteo':
        print('Processing all data...')
        file_path = '%s_%s' % ({
            None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'
        }[args.comm], args.raw_model)
        file_path += ('%d.log' % rank) if args.comm else '.log'
        file_path = os.path.join(
            os.path.split(os.path.abspath(__file__))[0], 'logs', file_path)
        log_file = open(file_path, 'w')
        total_epoch = 11
        for ep in range(total_epoch):
            print("ep: %d" % ep)
            ep_st = time.time()
            # Consume one tenth of the batches per "epoch"; every tenth epoch
            # also takes the remainder so all batches are covered.
            train_loss, train_acc, train_auc = train(
                executor.batch_num // 10 +
                (ep % 10 == 9) * (executor.batch_num % 10),
                tqdm_enabled=True)
            ep_en = time.time()
            if args.val:
                val_loss, val_acc, val_auc = validate(val_executor.batch_num)
                printstr = ("train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, "
                            "test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, "
                            "train_time: %.4f") % (
                    train_loss, train_acc, train_auc,
                    val_loss, val_acc, val_auc, ep_en - ep_st)
            else:
                printstr = ("train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, "
                            "train_time: %.4f") % (
                    train_loss, train_acc, train_auc, ep_en - ep_st)
            print(printstr)
            log_file.write(printstr + '\n')
            log_file.flush()
    else:
        total_epoch = 50
        for ep in range(total_epoch):
            if ep == 5:
                # Start the overall timer after a few warm-up epochs.
                start = time.time()
            print("epoch %d" % ep)
            ep_st = time.time()
            train_loss, train_acc = train(executor.batch_num, auc_enabled=False)
            ep_en = time.time()
            if args.val:
                val_loss, val_acc, val_auc = validate(val_executor.batch_num)
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f, "
                      "test_loss: %.4f, test_acc: %.4f, test_auc: %.4f" % (
                          train_loss, train_acc, ep_en - ep_st,
                          val_loss, val_acc, val_auc))
            else:
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" % (
                    train_loss, train_acc, ep_en - ep_st))
        print('all time:', time.time() - start)

    if args.comm == 'Hybrid':
        ad.mpi_nccl_finish(comm)
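# NOTE (illustration): get_current_shard, used by both workers above, gives
# each rank a contiguous slice of the rows, with the last rank absorbing the
# remainder. A toy demo with 10 rows and 3 workers:
def shard_demo():
    data = np.arange(10)
    nrank = 3
    for rank in range(nrank):
        part_size = data.shape[0] // nrank  # 3 rows per worker
        start = part_size * rank
        end = start + part_size if rank != nrank - 1 else data.shape[0]
        print(rank, data[start:end])
    # rank 0 -> [0 1 2], rank 1 -> [3 4 5], rank 2 -> [6 7 8 9]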