def main(setting):
    # Export the launcher-provided settings (DMLC_ROLE, addresses, counts)
    # into the environment before touching the PS runtime.
    for key, value in setting.items():
        os.environ[key] = str(value)
    # Servers and the scheduler only bring the runtime up and tear it down;
    # the worker runs the actual payload in between.
    if os.environ['DMLC_ROLE'] == "server":
        ad.server_init()
        ad.server_finish()
    elif os.environ['DMLC_ROLE'] == "worker":
        ad.worker_init()
        worker()
        ad.worker_finish()
    elif os.environ['DMLC_ROLE'] == "scheduler":
        ad.scheduler_init()
        ad.scheduler_finish()
    else:
        raise ValueError("Unknown role", os.environ['DMLC_ROLE'])
def start_process(settings, args):
    # Export the launcher-provided settings into the environment, then
    # dispatch on the PS-lite role assigned to this process.
    for key, value in settings.items():
        os.environ[key] = str(value)
    if os.environ['DMLC_ROLE'] == "server":
        ad.server_init()
        ad.server_finish()
    elif os.environ['DMLC_ROLE'] == "worker":
        ad.worker_init()
        test()
        ad.worker_finish()
    elif os.environ['DMLC_ROLE'] == "scheduler":
        ad.scheduler_init()
        ad.scheduler_finish()
    else:
        raise ValueError("Unknown role", os.environ['DMLC_ROLE'])
def start_process(settings, args):
    # Export the launcher-provided settings into the environment, then
    # dispatch on the PS-lite role assigned to this process.
    for key, value in settings.items():
        os.environ[key] = str(value)
    if os.environ['DMLC_ROLE'] == "server":
        ad.server_init()
        ad.server_finish()
    elif os.environ['DMLC_ROLE'] == "worker":
        ad.worker_init()
        test(args.func)
        # test_dense_n_draw(range(100000, 1000000, 100000), 'pushpull')
        # test_sparse_n_draw(range(100, 600, 100), range(1000, 6000, 1000), 'sparsepush')
        ad.worker_finish()
    elif os.environ['DMLC_ROLE'] == "scheduler":
        ad.scheduler_init()
        ad.scheduler_finish()
    else:
        raise ValueError("Unknown role", os.environ['DMLC_ROLE'])
def start_process(settings, args, arr=None, push_arr=None, pull_arr=None):
    # Export the launcher-provided settings into the environment, then
    # dispatch on the PS-lite role assigned to this process.
    for key, value in settings.items():
        os.environ[key] = str(value)
    if os.environ['DMLC_ROLE'] == "server":
        ad.server_init()
        ad.server_finish()
    elif os.environ['DMLC_ROLE'] == "worker":
        ad.worker_init()
        # First pass: default mode.
        test_api(arr, push_arr, pull_arr)
        test_init_ps(arr, 'constant', 1234.567)
        test_init_ps(arr, 'uniform', -0.5, 0.4)
        test_init_ps(arr, 'normal', 5.6, 2.0)
        test_init_ps(arr, 'truncated_normal', -2.3, 1.4)
        # Second pass: same tests with the trailing flag set; its meaning is
        # defined by test_api/test_init_ps elsewhere in the file.
        test_api(arr, push_arr, pull_arr, True)
        test_init_ps(arr, 'constant', 1234.567, True)
        test_init_ps(arr, 'uniform', -0.5, 0.4, True)
        test_init_ps(arr, 'normal', 5.6, 2.0, True)
        test_init_ps(arr, 'truncated_normal', -2.3, 1.4, True)
        ad.worker_finish()
    elif os.environ['DMLC_ROLE'] == "scheduler":
        ad.scheduler_init()
        ad.scheduler_finish()
    else:
        raise ValueError("Unknown role", os.environ['DMLC_ROLE'])
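# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file): the role-dispatch
# functions above are normally spawned once per PS-lite role, each process
# receiving its own settings dict. The port, process counts, and `launch_all`
# name below are assumptions; only the DMLC_* variable names are taken from
# the code above and the standard DMLC launcher convention.
# ---------------------------------------------------------------------------
import multiprocessing


def launch_all(args, num_workers=2, num_servers=1):
    base = {
        'DMLC_PS_ROOT_URI': '127.0.0.1',  # scheduler address (assumed local run)
        'DMLC_PS_ROOT_PORT': 13100,       # scheduler port (arbitrary choice)
        'DMLC_NUM_WORKER': num_workers,
        'DMLC_NUM_SERVER': num_servers,
    }
    roles = ['scheduler'] + ['server'] * num_servers + ['worker'] * num_workers
    procs = []
    for role in roles:
        # Each role gets its own process and its own DMLC_ROLE setting.
        settings = dict(base, DMLC_ROLE=role)
        p = multiprocessing.Process(target=start_process, args=(settings, args))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()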
def train_main(args):
    # Load dataset/partition metadata produced by the preprocessing step.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    # Labels arrive through the GNN data loader as one-hot vectors on CPU.
    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    # All feature columns except the last are sparse indices into the
    # learned embedding table.
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal(
        [meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)

    # Two-layer GCN with a masked softmax cross-entropy loss.
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)

    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # Step twice to fill both slots of the double-buffered loader
    # (the current graph and the prefetched next one).
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        # Prefetch the next sampled subgraph while training on the current one.
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)

        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        # `stat` is a module-level accumulator for accuracy/loss statistics.
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()

        # An epoch ends once this worker has seen all nodes in its partition.
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = \
            g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
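# ---------------------------------------------------------------------------
# Hypothetical helper sketch (assumption): `stat` in train_main is some
# accumulator with update/update_train/print methods. The class below is a
# minimal stand-in consistent with those call sites, not the repository's
# actual implementation, which may track different quantities.
# ---------------------------------------------------------------------------
class TrainStat:
    def __init__(self):
        self.acc = self.cnt = self.loss = 0.0
        self.train_acc = self.train_cnt = self.train_loss = 0.0

    def update(self, correct, count, mean_loss):
        # Accumulate evaluation accuracy/loss, weighted by node count.
        self.acc += correct
        self.cnt += count
        self.loss += mean_loss * count

    def update_train(self, correct, count, mean_loss):
        # Accumulate training accuracy/loss, weighted by node count.
        self.train_acc += correct
        self.train_cnt += count
        self.train_loss += mean_loss * count

    def print(self, epoch):
        print("epoch {}: train_acc={:.4f} train_loss={:.4f} "
              "eval_acc={:.4f} eval_loss={:.4f}".format(
                  epoch,
                  self.train_acc / max(self.train_cnt, 1),
                  self.train_loss / max(self.train_cnt, 1),
                  self.acc / max(self.cnt, 1),
                  self.loss / max(self.cnt, 1)))
        self.__init__()  # reset the counters for the next epoch


stat = TrainStat()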