def dfm_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # FM first-order part: a scalar weight per sparse feature plus a linear
    # layer over the 13 dense features.
    Embedding1 = init.random_normal([feature_dimension, 1], stddev=0.01,
                                    name="fst_order_embedding",
                                    ctx=ndarray.cpu(0))
    FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    sparse_1dim_input = ad.embedding_lookup_op(Embedding1, sparse_input,
                                               ctx=ndarray.cpu(0))
    fm_dense_part = ad.matmul_op(dense_input, FM_W)
    fm_sparse_part = ad.reduce_sum_op(sparse_1dim_input, axes=1)
    # first-order output
    y1 = fm_dense_part + fm_sparse_part

    # FM second-order part: 0.5 * ((sum v)^2 - sum v^2) per embedding dim.
    Embedding2 = init.random_normal([feature_dimension, embedding_size],
                                    stddev=0.01, name="snd_order_embedding",
                                    ctx=ndarray.cpu(0))
    sparse_2dim_input = ad.embedding_lookup_op(Embedding2, sparse_input,
                                               ctx=ndarray.cpu(0))
    sparse_2dim_sum = ad.reduce_sum_op(sparse_2dim_input, axes=1)
    sparse_2dim_sum_square = ad.mul_op(sparse_2dim_sum, sparse_2dim_sum)
    sparse_2dim_square = ad.mul_op(sparse_2dim_input, sparse_2dim_input)
    sparse_2dim_square_sum = ad.reduce_sum_op(sparse_2dim_square, axes=1)
    sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
    sparse_2dim_half = sparse_2dim * 0.5
    # second-order output
    y2 = ad.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

    # DNN part on the flattened second-order embeddings (26 sparse fields).
    flatten = ad.array_reshape_op(sparse_2dim_input, (-1, 26 * embedding_size))
    W1 = init.random_normal([26 * embedding_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")
    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    # Combine FM and DNN outputs into the final prediction.
    y4 = y1 + y2
    y = y4 + y3
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op
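# The second-order FM term above relies on the identity
#   sum_{i<j} <v_i, v_j> = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2),
# computed per embedding dimension. A minimal NumPy check of that identity
# (a standalone sketch, not part of the model graph):
import numpy as np

v = np.random.rand(26, 128)  # 26 field embeddings of size 128
pairwise = sum(v[i] @ v[j] for i in range(26) for j in range(i + 1, 26))
trick = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0)).sum()
assert np.allclose(pairwise, trick)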
def neural_mf(user_input, item_input, y_, num_users, num_items):
    batch_size = 256
    embed_dim = 8
    layers = [64, 32, 16, 8]
    learning_rate = 0.01

    # A single table holds both the MF and the MLP embeddings; the first
    # embed_dim columns are the MF part, the remaining layers[0] // 2 the MLP part.
    User_Embedding = init.random_normal(
        (num_users, embed_dim + layers[0] // 2), stddev=0.01,
        name="user_embed", ctx=ndarray.cpu(0))
    Item_Embedding = init.random_normal(
        (num_items, embed_dim + layers[0] // 2), stddev=0.01,
        name="item_embed", ctx=ndarray.cpu(0))
    # MLP_User_Embedding = init.random_normal((num_users, layers[0] // 2), stddev=0.01, name="mlp_user_embed", ctx=ndarray.cpu(0))
    # MLP_Item_Embedding = init.random_normal((num_items, layers[0] // 2), stddev=0.01, name="mlp_item_embed", ctx=ndarray.cpu(0))

    user_latent = ad.embedding_lookup_op(User_Embedding, user_input,
                                         ctx=ndarray.cpu(0))
    item_latent = ad.embedding_lookup_op(Item_Embedding, item_input,
                                         ctx=ndarray.cpu(0))
    mf_user_latent = ad.slice_op(user_latent, (0, 0), (-1, embed_dim))
    mlp_user_latent = ad.slice_op(user_latent, (0, embed_dim), (-1, -1))
    mf_item_latent = ad.slice_op(item_latent, (0, 0), (-1, embed_dim))
    mlp_item_latent = ad.slice_op(item_latent, (0, embed_dim), (-1, -1))
    # mf_user_latent = ad.embedding_lookup_op(MF_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mf_item_latent = ad.embedding_lookup_op(MF_Item_Embedding, item_input, ctx=ndarray.cpu(0))
    # mlp_user_latent = ad.embedding_lookup_op(MLP_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mlp_item_latent = ad.embedding_lookup_op(MLP_Item_Embedding, item_input, ctx=ndarray.cpu(0))

    W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1')
    W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2')
    W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3')
    W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4')

    # GMF branch: element-wise product of user and item factors.
    mf_vector = ad.mul_op(mf_user_latent, mf_item_latent)
    # MLP branch: concatenated latents through a 3-layer tower.
    mlp_vector = ad.concat_op(mlp_user_latent, mlp_item_latent, axis=1)
    fc1 = ad.matmul_op(mlp_vector, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    fc3 = ad.matmul_op(relu2, W3)
    relu3 = ad.relu_op(fc3)

    concat_vector = ad.concat_op(mf_vector, relu3, axis=1)
    y = ad.matmul_op(concat_vector, W4)
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    # opt = optimizer.AdamOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, train_op
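# neural_mf fuses the GMF and MLP embeddings into one table of width
# embed_dim + layers[0] // 2 and recovers the two parts with slice_op, which
# saves one embedding lookup per input. A NumPy sketch of the same split
# (table size and ids here are illustrative assumptions):
import numpy as np

embed_dim, mlp_dim = 8, 32                         # 32 == layers[0] // 2
table = np.random.rand(1000, embed_dim + mlp_dim)  # fused user table
ids = np.random.randint(1000, size=(256,))
latent = table[ids]                                # one lookup feeds both branches
mf_part, mlp_part = latent[:, :embed_dim], latent[:, embed_dim:]
assert mf_part.shape == (256, 8) and mlp_part.shape == (256, 32)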
def wdl_criteo(dense, sparse, labels):
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # When (train, validate) file pairs are given, build two-split dataloaders;
    # otherwise train-only.
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01, name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))

    # DNN over the 13 dense features.
    flatten = dense_input
    W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size, 1], stddev=0.01,
                            name="W4")
    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    # Concatenate the flattened sparse embeddings with the DNN output.
    y4 = ad.concat_op(sparse_input, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op
def test_cpu_truncated_normal(size, mean=0, std=1):
    cpu_x = ndarray.empty(size, ctx=ndarray.cpu(0))
    np_st = time()
    for i in range(10):
        x = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std,
                          size=size).astype(np.float32)
        cpu_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cpu_st = time()
    for i in range(10):
        cpu_op.truncated_normal_init(cpu_x, mean, std, 123)
    cpu_en = time()
    print('cpu time: ', cpu_en - cpu_st)
    fig, ax = plt.subplots(1, 1)
    cpu_x = cpu_x.asnumpy()
    assert cpu_x.shape == x.shape
    ax.hist(x.flatten(), histtype='stepfilled', alpha=0.2, bins=50,
            label='numpy')
    ax.hist(cpu_x.flatten(), histtype='step', alpha=0.2, bins=50, label='cpu')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'truncated_normal_%f_%f.png' % (mean, std)
    plt.savefig(file_name)
    plt.close()
def test_cpu_uniform(size, lb=-1, ub=1):
    cpu_x = ndarray.empty(size, ctx=ndarray.cpu(0))
    np_st = time()
    for i in range(10):
        x = np.random.uniform(low=lb, high=ub, size=size).astype(np.float32)
        cpu_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cpu_st = time()
    for i in range(10):
        cpu_op.uniform_init(cpu_x, lb, ub, 123)
    cpu_en = time()
    print('cpu time: ', cpu_en - cpu_st)
    fig, ax = plt.subplots(1, 1)
    cpu_x = cpu_x.asnumpy()
    assert cpu_x.shape == x.shape
    ax.hist(x.flatten(), histtype='stepfilled', alpha=0.2, bins=50,
            label='numpy')
    ax.hist(cpu_x.flatten(), histtype='step', alpha=0.2, bins=50, label='cpu')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'uniform_%f_%f_cpu.png' % (lb, ub)
    plt.savefig(file_name)
    plt.close()
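# The histogram overlays above are only a visual check. A programmatic
# alternative is a two-sample Kolmogorov-Smirnov test from scipy; this
# sketch compares two NumPy draws and is not part of the tests above:
import numpy as np
from scipy.stats import ks_2samp

a = np.random.uniform(-1.0, 1.0, size=100000)
b = np.random.uniform(-1.0, 1.0, size=100000)
stat, pvalue = ks_2samp(a, b)  # large p-value: samples look identically distributed
print('KS statistic %.4f, p-value %.4f' % (stat, pvalue))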
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    nitem = 2000
    item_len = 1000
    arr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)  # generate a long buffer
    # Each worker pushes a disjoint, strided set of indices.
    push_indices = np.arange(nitem) * nrank + rank
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    worker_communicate.PushData(pointer(push_indices), nitem, arr.handle,
                                pointer(push_length))
    print("Waiting")
    worker_communicate.WaitPushData(pointer(push_indices), nitem)
    worker_communicate.BarrierWorker()
    print("OK")
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
    worker_communicate.PullData(pointer(push_indices), nitem, arr2.handle,
                                pointer(push_length))
    worker_communicate.WaitPullData(pointer(push_indices), nitem)
    assert np.all(arr.asnumpy() == arr2.asnumpy())
    print("Check Complete")
def dcn_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.003

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01, name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding, sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
    x = ad.concat_op(sparse_input, dense_input, axis=1)

    # Cross network: three stacked cross layers over the full feature vector.
    cross_output = build_cross_layer(x, num_layers=3)

    # DNN branch on the same input.
    flatten = x
    W1 = init.random_normal([26 * embedding_size + 13, 256], stddev=0.01,
                            name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")
    W4 = init.random_normal([256 + 26 * embedding_size + 13, 1], stddev=0.01,
                            name="W4")
    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    # Combine the cross-network output with the DNN output.
    y4 = ad.concat_op(cross_output, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op
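# build_cross_layer is defined elsewhere; in DCN each cross layer computes
# x_{l+1} = x_0 * (x_l . w_l) + b_l + x_l, preserving the input width.
# A NumPy sketch of one such layer under that assumption (w and b are
# hypothetical parameters, not the model's):
import numpy as np

def cross_layer(x0, xl, w, b):
    # x0, xl: (batch, dim); w, b: (dim,). Output keeps shape (batch, dim).
    return x0 * (xl @ w)[:, None] + b + xl

dim = 26 * 128 + 13  # sparse embeddings concatenated with dense input
x0 = np.random.rand(128, dim)
out = cross_layer(x0, x0, np.random.rand(dim), np.zeros(dim))
assert out.shape == x0.shape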
def sync_and_clear(self):
    self.count += 1
    train_stat = ndarray.array(self.train_stat, ndarray.cpu())
    test_stat = ndarray.array(self.test_stat, ndarray.cpu())
    # Sum the per-worker statistics across all ranks.
    comm.dlarrayNcclAllReduce(train_stat, train_stat,
                              ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum, comm.stream)
    comm.dlarrayNcclAllReduce(test_stat, test_stat,
                              ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum, comm.stream)
    comm.stream.sync()
    train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy()
    printstr = ("epoch {}: test loss: {:.3f} test acc: {:.3f} "
                "train loss: {:.3f} train acc: {:.3f}").format(
        self.count,
        test_stat[3] / test_stat[0], test_stat[1] / test_stat[2],
        train_stat[3] / train_stat[0], train_stat[1] / train_stat[2],
    )
    logstr = "{} {} {} {}".format(
        test_stat[3] / test_stat[0], test_stat[1] / test_stat[2],
        train_stat[3] / train_stat[0], train_stat[1] / train_stat[2],
    )
    self.time.append(time.time())
    if comm.device_id.value == 0:
        print(printstr, flush=True)
        print(logstr, file=self.file, flush=True)
        if len(self.time) > 3:
            epoch_time = np.array(self.time[1:]) - np.array(self.time[:-1])
            # mean +- std of per-epoch wall time
            print("epoch time: {:.3f}+-{:.3f}".format(
                np.mean(epoch_time), np.std(epoch_time)))
    self.train_stat[:] = 0
    self.test_stat[:] = 0
def test_dense():
    npw = np.random.random((5, 10)).astype(np.float32)
    npx = np.random.random((7, 5)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    X = ad.Variable(name="x")
    mid = X + 3
    W = ad.Variable(name='w', value=npw, ctx=cpuctx)
    y = ad.matmul_op(mid, W)
    opt = optimizer.SGDOptimizer(learning_rate=0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)
    pred_y, _ = executor.run(feed_dict={X: npx},
                             convert_to_numpy_ret_vals=True)

    # Forward check: y = (X + 3) @ W.
    nppred_y = np.matmul((npx + 3), npw)
    np.testing.assert_allclose(pred_y, nppred_y, rtol=1e-6)
    # Backward check: minimize(y) seeds the output gradient with ones,
    # so dW = (X + 3)^T @ ones and SGD applies W -= lr * dW.
    new_npw = npw - 0.1 * np.matmul(
        (npx + 3).T, np.ones(nppred_y.shape).astype(np.float32))
    np.testing.assert_allclose(W.tensor_value.asnumpy(), new_npw, rtol=1e-10)
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    if rank > 0:
        return
    arr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)  # generate a long buffer
    push_indices = np.arange(nitem)
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), nitem,
                                        arr.handle, pointer(push_length))
    worker_communicate.WaitData(query)
    print("data_pushed")
    t = ThreadPoolExecutor(max_workers=max_thread)
    byte_count = 0
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

    def pull_data():
        nonlocal byte_count
        query = worker_communicate.PullData(pointer(push_indices), nitem,
                                            arr2.handle, pointer(push_length))
        worker_communicate.WaitData(query)
        # print(np.all(arr.asnumpy() == arr2.asnumpy()))
        byte_count += nitem * item_len * 4

    def watch():
        nonlocal byte_count
        start = time.time()
        while True:
            time.sleep(1)
            speed = byte_count / (time.time() - start)
            print("speed : {} MB/s".format(speed / 2 ** 20))

    task_list = [None for i in range(max_thread)]
    threading.Thread(target=watch).start()
    while True:
        for i in range(max_thread):
            if task_list[i] is None or task_list[i].done():
                task_list[i] = t.submit(pull_data)
def test_sparse():
    npemb = np.random.random((100, 20)).astype(np.float32)
    npind = np.array(np.random.randint(100, size=(10,)))
    npw = np.random.random((20, 30)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    embedding = ad.Variable('embeddingtable', value=npemb, ctx=cpuctx)
    index = ad.Variable(name="index", ctx=cpuctx)
    W = ad.Variable(name="w", value=npw)
    y = ad.embedding_lookup_op(embedding, index)  # (10, 20)
    y = ad.matmul_op(y, W)
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)
    out, _ = executor.run(feed_dict={index: npind.astype(np.float32)},
                          convert_to_numpy_ret_vals=True)

    np_out = np.matmul(npemb[npind], npw)
    np.testing.assert_allclose(out, np_out, rtol=1e-6)
    # Only the looked-up rows of the embedding table receive gradient updates.
    tmp_grad = np.matmul(np.ones(np_out.shape).astype(np.float32), npw.T)
    for i, localid in enumerate(npind):
        npemb[localid] -= 0.1 * tmp_grad[i]
    np.testing.assert_allclose(embedding.tensor_value.asnumpy(), npemb,
                               rtol=1e-6)
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    arr = ndarray.array(np.random.rand(2, rank + 100), ctx=ctx)
    print(arr.asnumpy())
    push_indices = np.array([2 * rank + 1, 2 * rank + 2])
    if rank == 0:
        pull_indices = np.array([3])
    elif rank == 1:
        pull_indices = np.array([1])
    push_length = np.array([rank + 100, rank + 100])
    if rank == 0:
        pull_length = np.array([101])
        out_arr = ndarray.array(np.zeros(101), ctx=ctx)
    elif rank == 1:
        pull_length = np.array([100])
        out_arr = ndarray.array(np.zeros(100), ctx=ctx)
    print(out_arr.asnumpy())
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), 2, arr.handle,
                                        pointer(push_length))
    worker_communicate.WaitData(query)
    worker_communicate.BarrierWorker()
    # Wait on the pull query, not the stale push query.
    query = worker_communicate.PullData(pointer(pull_indices), 1,
                                        out_arr.handle, pointer(pull_length))
    worker_communicate.WaitData(query)
    print(out_arr.asnumpy())
def test(func_name, nitem=2000, item_len=10000, ind_len=500, max_thread=10,
         ret_ans=False):
    func_name = func_name.lower()
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    comm = ad.get_worker_communicate()
    byte_count = 0

    # Each variant below issues one PS communication pattern and counts the
    # payload bytes it moved (float32, hence the factor of 4).
    if func_name == 'pushnpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            comm.Push(name, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'pushpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            comm.DDPushPull(name, inarr.handle, outarr.handle, None)
            comm.Wait(name)
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'sparsepushnpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_ind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            inind = ndarray.array(np_ind.astype(np.float32), ctx=ctx)
            uni_ind_len = np.unique(np_ind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            byte_count += (nitem + uni_ind_len) * item_len * 4
    elif func_name == 'sparsepushnsparsepull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    elif func_name == 'push':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            comm.Push(name, inarr.handle, None)
            comm.Wait(name)
            byte_count += nitem * item_len * 4
    elif func_name == 'pull':
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            byte_count += nitem * item_len * 4
    elif func_name == 'sparsepush':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Wait(name)
            byte_count += uni_inind_len * item_len * 4
    elif func_name == 'sparsepull':
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            byte_count += uni_outind_len * item_len * 4
    elif func_name == 'sdpushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SDPushPull(name, inind.handle, inarr.handle, outarr.handle,
                            None)
            comm.Wait(name)
            byte_count += (uni_inind_len + nitem) * item_len * 4
    elif func_name == 'sspushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            nonlocal byte_count
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len,))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SSPushPull(name, inind.handle, inarr.handle,
                            outind.handle, outarr.handle, None)
            comm.Wait(name)
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    else:
        assert False

    # Sparse variants register (nitem, item_len) tables; dense variants a
    # flat buffer of nitem * item_len entries.
    if 'sparse' in func_name or func_name in ('sdpushpull', 'sspushpull'):
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
        sparse_init = ctypes.c_int(1)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
        sparse_init = ctypes.c_int(0)
    for i in range(max_thread):
        comm.InitTensor(i, sparse_init, arr_len, arr_wid, ctypes.c_int(0),
                        ctypes.c_double(0), ctypes.c_double(1),
                        ctypes.c_ulonglong(123), ctypes.c_int(0),
                        (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    # print("data init")
    t = ThreadPoolExecutor(max_workers=max_thread)
    if ret_ans:
        # Measure throughput over 30 completed tasks, then clean up.
        task_list = [None for i in range(max_thread)]
        for i in range(max_thread):
            task_list[i] = t.submit(func, i)
        curByte = byte_count
        start = time.time()
        cnt = 0
        while cnt < 30:
            for i in range(max_thread):
                if task_list[i].done():
                    cnt += 1
                    task_list[i] = t.submit(func, i)
        speed = (byte_count - curByte) / (time.time() - start) / 2 ** 20
        t.shutdown()
        for i in range(max_thread):
            comm.ClearOnServer(i)
            comm.Clear(i)
        return speed
    else:
        # Run forever, printing the running throughput once per second.
        def watch():
            start = time.time()
            while True:
                time.sleep(1)
                speed = byte_count / (time.time() - start)
                print("speed : {} MB/s".format(speed / 2 ** 20))

        task_list = [None for i in range(max_thread)]
        threading.Thread(target=watch).start()
        while True:
            for i in range(max_thread):
                if task_list[i] is None or task_list[i].done():
                    task_list[i] = t.submit(func, i)
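# A plausible driver for the benchmark above, assuming the usual PS launcher
# has already set WORKER_ID and DMLC_NUM_WORKER (hypothetical usage sketch):
if __name__ == '__main__':
    speed = test('pushpull', nitem=2000, item_len=10000, max_thread=10,
                 ret_ans=True)
    print('pushpull speed: {:.2f} MB/s'.format(speed))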
import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()
import tf2onnx
import argparse
import six.moves.cPickle as pickle
import gzip
import os
import pdb
import ctypes
import time

batch_size = 128
# ctx = ndarray.gpu(0)
ctx = ndarray.cpu(0)


def load_mnist_data(dataset):
    """Load the MNIST dataset.

    Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    """
    # Download the MNIST dataset if it is not present.
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(os.path.split(__file__)[0], dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path  # continuation as in the cited logistic_sgd.py
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    assert init_type in ('constant', 'uniform', 'normal', 'truncated_normal')
    init_type_map = {'constant': 0, 'uniform': 1, 'normal': 2,
                     'truncated_normal': 3}
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    itype, ctypes.c_double(init_a), ctypes.c_double(init_b),
                    ctypes.c_ulonglong(123), ctypes.c_int(0),
                    (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
    if rank == 0:
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    else:
        if init_type == 'constant':
            np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                       arr.asnumpy(), rtol=5e-7)
        else:
            if init_type == 'uniform':
                numpy_samples = np.random.uniform(
                    low=init_a, high=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            elif init_type == 'normal':
                numpy_samples = np.random.normal(
                    loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            else:
                numpy_samples = truncnorm.rvs(
                    -2.0, 2.0, loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            fig, ax = plt.subplots(1, 1)
            ax.hist(numpy_samples.flatten(), histtype='stepfilled', alpha=0.2,
                    bins=50, label='numpy')
            ax.hist(local_arr.flatten(), histtype='step', alpha=0.2,
                    bins=50, label='ps')
            ax.legend(loc='best', frameon=False)
            # ax2.legend(loc='best', frameon=False)
            file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a, init_b,
                                                 int(sparse))
            plt.savefig(file_name)
            print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()
def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(
        nitem, item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    ctypes.c_int(0), ctypes.c_double(0.0),
                    ctypes.c_double(1.0), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(lr),
                    ctypes.c_int(1))
    if sparse:
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2),
                                     ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2),
                                     ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SDPushPull(0, push_ind.handle, push_val.handle, arr.handle,
                            None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2),
                                     ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32), ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle,
                            pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        np.testing.assert_allclose(
            local_arr[local_pull.astype(int)].reshape(indx1, indx2, item_len),
            pull_val.asnumpy(), rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    else:
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        if rank == 0:
            temp_push_val = ndarray.array(np.ones(
                (nitem, item_len)).astype(np.float32), ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    if rank == 0:
        comm.ClearOnServer(0)
        comm.Clear(0)
    comm.BarrierWorker()
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal(
        [meta["idx_max"], embedding_width], stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)

    ad.worker_init()
    distributed.ps_init(rank, nrank)
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    # Prime the double-buffered loader with the first sample.
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS',
                           use_sparse_pull=False, cstable_policy=args.cache)
    while True:
        # Stage the next sample while training on the current one.
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())
        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = (g_sample_nxt, mp_val_nxt,
                                             mask_nxt, mask_eval_nxt)
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
        -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1),
        keepdims=True)
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * \
        np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)
    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)


test_csrmm_op(ndarray.cpu(0))
test_csrmm_op(ndarray.gpu(1))


def test_csrmv_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmv_op(X, W)
    Y_ = ad.Variable(name="Y_")
    temp = Y + (-1) * Y_
    loss = temp * temp
    grads = ad.gradients(loss, [W, Y])
    executor = ad.Executor([loss, grads[0], grads[1]], ctx=executor_ctx)
    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_z = tf.convert_to_tensor(z)
    tf_y = tf_x + tf_z
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y])
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    print('Passed add op test with shape ', shape)


test_add()
test_add((7, 9))
test_add((4, 5, 6, 7, 8))
test_add(ctx=ndarray.cpu(0))
test_add((7, 9), ctx=ndarray.cpu(0))
test_add((4, 5, 6, 7, 8), ctx=ndarray.cpu(0))


def test_add_broadcast(shape1=(2, 3, 4, 5), shape2=(1, 4, 1),
                       ctx=ndarray.gpu(1)):
    x = np.random.random(shape1).astype(np.float32)
    z = np.random.random(shape2).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_z = ad.Variable(name='z', value=z)
    ath_y = ad.add_op(ath_x, ath_z)
    executor = ad.Executor([ath_y], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]
    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)