Example 1
def test_allgather(group):
    comm1 = ad.new_group_comm(group)
    a = ndarray.array(np.array([rank,rank]),ctx=ctx)
    b = ndarray.array(np.zeros(2*len(group)),ctx=ctx)
    if rank in group:
        comm1.dlarrayAllGather(a, b, ncclDataType_t.ncclFloat32)
    print("Allgather device=%d"%comm1.device_id.value,b.asnumpy())
Example 2
def test_layernorm_forward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    # shape = (5, 3)
    last_dim = shape[-1]
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    bias = np.random.random((last_dim,)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_bias = ndarray.array(bias, ctx=ctx)
    arr_mean = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_var = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_y = ndarray.empty((shape), ctx=ctx)
    gpu_op.layer_normalization(arr_x, arr_scale, arr_bias, arr_mean, arr_var, arr_y, 0.01)

    y = arr_y.asnumpy()

    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 0.01, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std

    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]

    y_ = scale.reshape(bc_shape) * normed_input + \
        bias.reshape(bc_shape)
    
    np.testing.assert_allclose(np_means, arr_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(np_vars, arr_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(y_, y, atol=1e-6)
    print('Pass forward test with shape ', shape)
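
The forward test can be driven directly; a minimal sketch, assuming the same module-level imports used above (numpy as np, ndarray, gpu_op) and a GPU available at index 1:

if __name__ == "__main__":
    # layer normalization is taken over the last axis, so any 2-D shape works here
    test_layernorm_forward(shape=(5, 3))
    test_layernorm_forward(shape=(64, 256))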
Example 3
def test_group_broadcast():
    row_procs = []
    for i in range(0,8,2):
        row_procs.append(list(range(i,i+2)))

    col_procs = []
    for i in range(2):
        col_procs.append(list(range(i,8,2)))

    row_groups = []
    for i in range(len(row_procs)):
        row_groups.append(ad.new_group_comm(row_procs[i]))

    col_groups = []
    for i in range(len(col_procs)):
        col_groups.append(ad.new_group_comm(col_procs[i]))

    rank_row = rank//2
    rank_col = rank%2
    group_row = row_procs[rank_row]
    group_col = col_procs[rank_col]
    comm_row = row_groups[rank_row]
    comm_col = col_groups[rank_col]
    
    a = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx)
    comm_row.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root = group_row[1])
    print("Broadcast device=%d, a:"%device_id,a.asnumpy()) 
    
    b = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx)  
    comm_col.dlarrayBroadcast(b, b, ncclDataType_t.ncclFloat32, root = group_col[1])
    print("Broadcast device=%d, b:"%device_id,b.asnumpy())   
Example 4
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    nitem = 2000
    item_len = 1000
    arr = ndarray.array(np.random.rand(nitem, item_len),
                        ctx=ctx)  # generate a long buffer

    push_indices = np.arange(nitem) * nrank + rank
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    worker_communicate.PushData(pointer(push_indices), nitem, arr.handle,
                                pointer(push_length))
    print("Waiting")
    worker_communicate.WaitPushData(pointer(push_indices), nitem)
    worker_communicate.BarrierWorker()
    print("OK")
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
    worker_communicate.PullData(pointer(push_indices), nitem, arr2.handle,
                                pointer(push_length))
    worker_communicate.WaitPullData(pointer(push_indices), nitem)
    assert np.all(arr.asnumpy() == arr2.asnumpy())
    print("Check Complete")
Example 5
def test_broadcast(group, root):
    comm1 = ad.new_group_comm(group)
    a = ndarray.array(np.array([-1,-1,-1,-1,-1]),ctx=ctx)
    if rank == root:
        a = ndarray.array(np.array([2,3,4,5,6]),ctx=ctx)
    if rank in group:
        comm1.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root = root)
    print("Broadcast device=%d"%comm1.device_id.value,a.asnumpy())  
Example 6
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")

    if use_same_init:
        gcn1 = GCN(num_features, hidden_layer_size, custom_init=(init_w1, init_b1))
        gcn2 = GCN(hidden_layer_size, num_classes, custom_init=(init_w2, init_b2))
    else:
        gcn1 = GCN(num_features, hidden_layer_size)
        gcn2 = GCN(hidden_layer_size, num_classes)

    mp_val = mp_matrix(graph, ctx, use_original_gcn_norm=True)
    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph.x, ctx=ctx),
        y_ : ndarray.array(convert_to_one_hot(graph.y, max_val=num_classes), ctx=ctx)
    }

    x = gcn1(x_)
    x = ad.relu_op(x)
    y = gcn2(x)

    loss = ad.softmaxcrossentropy_op(y, y_)

    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    start_time = time.time()
    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)

        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph.y).sum()
        losses.append(loss_val.asnumpy().mean())
        if i == 0:
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Train accuracy:", acc/len(y_predicted))
        print("Hetu time:",i, time.time()-start_time)
    print("Hetu time:", time.time()-start_time)

    mp_val = mp_matrix(graph_full, ctx)

    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph_full.x, ctx=ctx),
    }
    executor_eval = ad.Executor([y], ctx=ctx)
    y_predicted, = executor_eval.run(feed_dict=feed_dict)
    y_predicted = y_predicted.asnumpy().argmax(axis=1)
    acc = (y_predicted == graph_full.y)[train_split:].sum()
    print("Test accuracy:", acc/len(y_predicted[train_split:]))
    return losses
Example 7
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)

    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask, ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
Example 8
    def func(name):
        np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
        np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
        inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
        outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
        uni_inind_len = np.unique(np_inind).size
        uni_outind_len = np.unique(np_outind).size
        comm.SparsePush(name, inind.handle, inarr.handle, None)
        comm.SparsePull(name, outind.handle, outarr.handle)
        comm.Wait(name)
        nonlocal byte_count
        byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
Example 9
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')

    def transform(graph):
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val
    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank ,transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += g_sample.num_nodes
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
Example 10
def test_layernorm_backward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    # shape = (5, 3)
    last_dim = shape[-1]
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    mean = np.random.random(list(shape[:-1])+[1]).astype(np.float32)
    var = np.random.random(list(shape[:-1])+[1]).astype(np.float32)

    arr_grads = ndarray.array(grads, ctx=ctx)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_mean = ndarray.array(mean, ctx=ctx)
    arr_var = ndarray.array(var, ctx=ctx)

    grad_inarr = ndarray.empty(shape, ctx=ctx)
    grad_scale = ndarray.empty((last_dim,), ctx=ctx)
    grad_bias = ndarray.empty((last_dim,), ctx=ctx)
    gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale,
        grad_inarr, grad_scale, grad_bias, arr_mean, arr_var, 0.01)

    # numpy calculate phase
    red_axis = tuple(range(grads.ndim-1))
    np_grad_bias = grads.sum(red_axis) # (X,)
    
    std = np.sqrt(var + 0.01) # (N, 1)
    x_centered = x - mean # (N, X)
    x_norm = x_centered / std # (N, X)
    np_grad_scale = (grads * x_norm).sum(red_axis) # (X,)

    last_dim = x.shape[-1]
    dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1]) # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * -0.5 / (var + 0.01) / std # (N, 1)
    dx_mu_1 = dx_norm / std # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2 # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim # (N, 1)
    np_grad_inarr = dx_1 + dx_2 # (N, X)
    
    np.testing.assert_allclose(np_grad_bias, grad_bias.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_scale, grad_scale.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_inarr, grad_inarr.asnumpy(), rtol=1e-4, atol=1e-4)
    print('Pass backward test with shape ', shape)
Example 11
def test_sparse_matrix_multiply():
	density = 1e-3
	ctx = ndarray.gpu(0)
	x = scipy.sparse.rand(500, 7000,density=density,format='coo',dtype=np.float32)
	y = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
	mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape = [500, 7000], ctx=ctx)
	mat_y = ndarray.array(y, ctx=ctx)
	mat_z = ndarray.empty((500, 100), ctx=ctx)
	gpu_op.CuSparse_Csrmm(mat_x, False, mat_y, False, mat_z)
	z = mat_z.asnumpy()
	np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)
Example 12
def test_sparse_array_dense_vector_multiply():
	density = 1e-3
	ctx = ndarray.gpu(0)
	x = scipy.sparse.rand(500, 70000, density=density,format='coo',dtype=np.float32)
	y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
	mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape = [500, 70000], ctx=ctx)
	arr_y = ndarray.array(y, ctx=ctx)
	arr_z = ndarray.empty((500, 1), ctx=ctx)
	trans = False
	gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
	z = arr_z.asnumpy()
	np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

	
	x = scipy.sparse.rand(70000, 500, density=density,format='coo',dtype=np.float32)
	y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
	mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape = [70000, 500], ctx=ctx)
	arr_y = ndarray.array(y, ctx=ctx)
	arr_z = ndarray.empty((500, 1), ctx=ctx)
	trans = True
	gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
	z = arr_z.asnumpy()
	np.testing.assert_allclose(x.transpose().dot(y), z, rtol=1e-5)
Example 13
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    if rank > 0:
        return
    arr = ndarray.array(np.random.rand(nitem, item_len),ctx = ctx) # generate a long buffer

    push_indices = np.arange(nitem)
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), nitem, arr.handle, pointer(push_length))
    worker_communicate.WaitData(query)
    print("data_pushed")
    t = ThreadPoolExecutor(max_workers=max_thread)
    byte_count = 0
    arr2 = ndarray.array(np.random.rand(nitem, item_len),ctx = ctx)
    def pull_data():
        query = worker_communicate.PullData(pointer(push_indices), nitem, arr2.handle, pointer(push_length))
        worker_communicate.WaitData(query)
        # print( np.all(arr.asnumpy() == arr2.asnumpy()) )
        nonlocal byte_count
        byte_count += nitem * item_len * 4
    def watch():
        nonlocal byte_count
        start = time.time()
        while True:
            time.sleep(1)
            speed = byte_count / (time.time() - start)
            print("speed : {} MB/s".format(speed / 2**20))
    task_list = [None for i in range(max_thread)]
    threading.Thread(target=watch).start()
    while True:
        for i in range(max_thread):
            if task_list[i] is None or task_list[i].done():
                task_list[i] = t.submit(pull_data)
Example 14
    def sync_and_clear(self):
        self.count += 1
        train_stat = ndarray.array(self.train_stat, ndarray.cpu())
        test_stat = ndarray.array(self.test_stat, ndarray.cpu())
        comm.dlarrayNcclAllReduce(train_stat, train_stat,
                                  ncclDataType_t.ncclFloat32,
                                  ncclRedOp_t.ncclSum, comm.stream)
        comm.dlarrayNcclAllReduce(test_stat, test_stat,
                                  ncclDataType_t.ncclFloat32,
                                  ncclRedOp_t.ncclSum, comm.stream)
        comm.stream.sync()
        train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy()
        printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format(
            self.count,
            test_stat[3] / test_stat[0],
            test_stat[1] / test_stat[2],
            train_stat[3] / train_stat[0],
            train_stat[1] / train_stat[2],
        )
        logstr = "{} {} {} {}".format(
            test_stat[3] / test_stat[0],
            test_stat[1] / test_stat[2],
            train_stat[3] / train_stat[0],
            train_stat[1] / train_stat[2],
        )
        self.time.append(time.time())
        if comm.device_id.value == 0:
            print(printstr, flush=True)
            print(logstr, file=self.file, flush=True)
            if len(self.time) > 3:
                epoch_time = np.array(self.time[1:]) - np.array(self.time[:-1])
                print("epoch time: {:.3f}+-{:.3f}".format(
                    np.mean(epoch_time), np.var(epoch_time)))

        self.train_stat[:] = 0
        self.test_stat[:] = 0
Example 15
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    arr = ndarray.array(np.random.rand(2,rank+100),ctx = ctx)
    print(arr.asnumpy())

    push_indices = np.array([2*rank+1,2*rank+2])

    if rank == 0:
        pull_indices = np.array([3])
    elif rank == 1:
        pull_indices = np.array([1])

    push_length = np.array([rank+100,rank+100])


    if rank == 0:
        pull_length = np.array([101])
        out_arr = ndarray.array(np.zeros(101),ctx = ctx)
    elif rank == 1:
        pull_length = np.array([100])
        out_arr = ndarray.array(np.zeros(100),ctx = ctx)

    print(out_arr.asnumpy())

    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), 2, arr.handle, pointer(push_length))

    worker_communicate.WaitData(query)

    worker_communicate.BarrierWorker()
    query = worker_communicate.PullData(pointer(pull_indices), 1, out_arr.handle, pointer(pull_length))
    worker_communicate.WaitData(query)

    print(out_arr.asnumpy())
Example 16
    def eval():
        start = time.time()
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)

        feed_dict = {
            gcn1.mp : mp_val,
            gcn2.mp : mp_val,
            x_ : ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"
Example 17
def test_allreduce(comm=None):
    shape = (24, 24)
    size = 4
    for val in shape:
        size *= val
    input_arr = np.ones(shape) * comm.localRank.value
    input_arr = ndarray.array(input_arr, ctx=ndarray.gpu(comm.localRank.value))
    # input_arr = ndarray.array(input_arr, ctx = ndarray.cpu())

    start = time.time()
    comm.dlarrayNcclAllReduce(input_arr, input_arr, ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum)
    comm.stream.sync()
    end = time.time()

    secs = end - start

    return size, secs
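
The returned pair (buffer size in bytes, elapsed seconds) can be turned into an effective bandwidth figure; a minimal sketch, assuming comm was created with ad.mpi_nccl_init() as in the driver code shown later:

comm, device_id = ad.mpi_nccl_init()
size, secs = test_allreduce(comm)
# size is in bytes and secs in seconds, so this reports MB/s
print("allreduce bandwidth: %.2f MB/s" % (size / secs / 2**20))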
Example 18
def test_p2p(comm=None, src=0, target=1):
    shape = (1000, 30, 224, 224)
    size = 4
    for val in shape:
        size *= val
    print("MyRank: ", comm.myRank.value)
    arr = np.ones(shape) * comm.localRank.value
    arr = ndarray.array(arr, ctx=ndarray.gpu(comm.localRank.value))
    # arr = ndarray.array(arr, ctx = ndarray.cpu())
    start = time.time()
    if comm.myRank.value == 0:
        comm.dlarraySend(arr, ncclDataType_t.ncclFloat32, 1)
    else:
        comm.dlarrayRecv(arr, ncclDataType_t.ncclFloat32, 0)
    comm.stream.sync()
    end = time.time()

    secs = end - start
    # size is in bytes; secs is the elapsed time in seconds
    return size, secs
Example 19
def test_csrmv_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmv_op(X, W)
    Y_ = ad.Variable(name="Y_")
    temp = Y + (-1) * Y_
    loss = temp * temp

    grads = ad.gradients(loss, [W, Y])
    
    executor = ad.Executor(
        [loss, grads[0], grads[1]], ctx=executor_ctx)
    
    rand = np.random.RandomState(seed=123)

    W_val = rand.normal(scale=0.1, size=[70000, ])
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = ndarray.array(W_val, ctx=executor_ctx)
    
    X_val = scipy.sparse.rand(500, 70000, density=1e-5,format='coo',dtype=np.float32)
    Y_val = np.random.uniform(0, 10, size=(500, )).astype(np.float32) 
    
    loss_val = executor.run(feed_dict={X: X_val, Y_: Y_val, W: W_val})
    
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = (y_groundtruth - Y_val) ** 2
    Y_grad_groundtruth = 2 * (y_groundtruth - Y_val) * np.ones(loss_groundtruth.shape)
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)
    

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
Example 20
def test_csrmm_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmm_op(X, W)
    Y_ = ad.Variable(name="Y_")
    loss = ad.softmaxcrossentropy_op(Y, Y_)
    loss = ad.reduce_mean_op(loss, [0])
    grads = ad.gradients(loss, [W, Y])
    
    executor = ad.Executor(
        [loss, grads[0], grads[1]], ctx=executor_ctx)
    
    rand = np.random.RandomState(seed=123)

    W_val = rand.normal(scale=0.1, size=[70000, 2]).astype(np.float32)
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = ndarray.array(W_val, ctx=executor_ctx)
    
    X_val = scipy.sparse.rand(500, 70000, density=1e-5,format='coo',dtype=np.float32)
    Y_val = np.random.uniform(0, 10, size=(500, 2)).astype(np.float32) 
    
    loss_val = executor.run(feed_dict={X: X_val, Y_: Y_val, W: W_val})
    
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
                -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True)
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
Example 21
    group_col = col_procs[rank_col]
    comm_row = row_groups[rank_row]
    comm_col = col_groups[rank_col]
    
    a = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx)
    comm_row.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root = group_row[1])
    print("Broadcast device=%d, a:"%device_id,a.asnumpy()) 
    
    b = ndarray.array(np.array([rank,rank,rank,rank,rank]),ctx=ctx)  
    comm_col.dlarrayBroadcast(b, b, ncclDataType_t.ncclFloat32, root = group_col[1])
    print("Broadcast device=%d, b:"%device_id,b.asnumpy())   

comm, device_id = ad.mpi_nccl_init()
device = comm.device_id.value
rank = comm.localRank.value
size = comm.nRanks.value
ctx = ndarray.gpu(rank)
a = ndarray.array(np.array([1,2,3,4,5]),ctx=ctx)

test_default()

test_broadcast(group = [0,2,4,5,6], root=4)
test_broadcast(group = [1,4,2,7],root=4)
test_allreduce(group = [1,4,2,5])
test_allreduce(group = [0,7,6,2,4])
test_allgather(group = [2,5,3,7])
test_allgather(group = [2,6,1,7,4])

test_group_broadcast()

Example 22
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")

    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)

    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    def eval():
        start = time.time()
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)

        feed_dict = {
            gcn1.mp : mp_val,
            gcn2.mp : mp_val,
            x_ : ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"
    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            #print(time.time() - start)
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask,ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # print(i, "Train loss :", loss_val.asnumpy().mean())
            # print(i, "Train accuracy:", acc/len(y_predicted))
            nnodes += batch_size
            if nnodes > graph_full.num_nodes:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                eval()
                start = time.time()
                if epoch >= num_epoch:
                    break
Example 23
            return
        self.ncclCommInitRank()


def mpi_nccl_communicator(mpi_init=True):
    '''
    Create and return an MPI_NCCL_Communicator instance.
    '''
    return MPI_NCCL_Communicator(mpi_init=mpi_init)


# NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 4 python mpi_nccl_comm.py
if __name__ == "__main__":
    t = mpi_nccl_communicator()
    t.ncclInit()

    arr = np.ones(16) * t.localRank.value
    print("before: = ", arr)
    arr = ndarray.array(arr, ctx=ndarray.gpu(t.device_id.value))
    output_arr = np.zeros(16 * t.nRanks.value)

    output_arr = ndarray.array(output_arr, ctx=ndarray.gpu(t.device_id.value))
    t.dlarrayNcclAllReduce(arr, arr, ncclDataType_t.ncclFloat32,
                           ncclRedOp_t.ncclSum)
    # t.dlarrayBroadcast(arr, ncclDataType_t.ncclFloat32, 0)
    # t.dlarrayAllGather(arr, output_arr, ncclDataType_t.ncclFloat32)

    print("after: = ", arr.asnumpy())

    t.ncclFinish()
Example 24
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width],
                                           stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op],
                           ctx=ctx,
                           comm_mode='PS',
                           use_sparse_pull=False,
                           cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)

        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
Example 25
        return self.nRanks

    def DLArrayAllReduce(self, dlarray, datatype, reduceop):
        lib_mpi.dlarrayAllReduce(dlarray.handle, c_int(datatype.value),
                                 c_int(reduceop.value),
                                 ctypes.byref(self.mpicomm))

    def allReduce(self, arr):
        self.DLArrayAllReduce(arr, MPIDataType_t.MPI_Float32, MPIOp_t.MPI_SUM)

    def finish(self):
        lib_mpi.MPIFinalize()


def mpi_communicator():
    '''
    Create and return an MPI_Communicator instance.
    '''
    return MPI_Communicator()


# mpirun --allow-run-as-root -np 4 python2 mpi_comm.py
if __name__ == "__main__":
    comm = mpi_communicator()
    comm.MPI_GetComm()
    print("rank = %d" % (comm.rank().value))
    arr = np.ones([10]) * comm.rank().value
    arr = ndarray.array(arr)
    comm.allReduce(arr)
    print(arr.asnumpy())
    comm.finish()
Example 26
def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem,
                                                              item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid, ctypes.c_int(0), ctypes.c_double(0.0), ctypes.c_double(1.0), ctypes.c_ulonglong(123),\
        ctypes.c_int(0), (ctypes.c_float * 1)(lr), ctypes.c_int(1))
    if sparse:
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SDPushPull(0, push_ind.handle, push_val.handle, arr.handle,
                            None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle, \
                        pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr[local_pull.astype(int)].reshape(
            indx1, indx2, item_len),
                                   pull_val.asnumpy(),
                                   rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

    else:
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        if rank == 0:
            temp_push_val = ndarray.array(np.ones(
                (nitem, item_len)).astype(np.float32),
                                          ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()
Example 27
def test(func_name,
         nitem=2000,
         item_len=10000,
         ind_len=500,
         max_thread=10,
         ret_ans=False):
    func_name = func_name.lower()
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])

    comm = ad.get_worker_communicate()
    byte_count = 0
    if func_name == 'pushnpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'pushpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.DDPushPull(name, inarr.handle, outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'sparsepushnpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_ind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_ind.astype(np.float32), ctx=ctx)
            uni_ind_len = np.unique(np_ind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (nitem + uni_ind_len) * item_len * 4
    elif func_name == 'sparsepushnsparsepull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    elif func_name == 'push':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'pull':
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'sparsepush':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_inind_len * item_len * 4
    elif func_name == 'sparsepull':
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_outind_len * item_len * 4
    elif func_name == 'sdpushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SDPushPull(name, inind.handle, inarr.handle, outarr.handle,
                            None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + nitem) * item_len * 4
    elif func_name == 'sspushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SSPushPull(name, inind.handle, inarr.handle, outind.handle,
                            outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    else:
        assert False
    if 'sparse' in func_name or func_name in ('sdpushpull', 'sspushpull'):
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
        sparse_init = ctypes.c_int(1)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
        sparse_init = ctypes.c_int(0)
    for i in range(max_thread):
        comm.InitTensor(i, sparse_init, arr_len, arr_wid, ctypes.c_int(0), ctypes.c_double(0), ctypes.c_double(1), ctypes.c_ulonglong(123),\
            ctypes.c_int(0), (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    # print("data init")
    t = ThreadPoolExecutor(max_workers=max_thread)
    if ret_ans:
        task_list = [None for i in range(max_thread)]
        for i in range(max_thread):
            task_list[i] = t.submit(func, i)
        curByte = byte_count
        start = time.time()
        cnt = 0
        while cnt < 30:
            for i in range(max_thread):
                if task_list[i].done():
                    cnt += 1
                    task_list[i] = t.submit(func, i)
        speed = (byte_count - curByte) / (time.time() - start) / 2**20
        t.shutdown()
        for i in range(max_thread):
            comm.ClearOnServer(i)
            comm.Clear(i)
        return speed
    else:

        def watch():
            start = time.time()
            while True:
                time.sleep(1)
                speed = byte_count / (time.time() - start)
                print("speed : {} MB/s".format(speed / 2**20))

        task_list = [None for i in range(max_thread)]
        threading.Thread(target=watch).start()
        while True:
            for i in range(max_thread):
                if task_list[i] is None or task_list[i].done():
                    task_list[i] = t.submit(func, i)
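
With ret_ans=True the benchmark returns a single throughput number instead of printing indefinitely; a hypothetical driver, assuming the PS servers and workers are already launched and WORKER_ID / DMLC_NUM_WORKER are set as required by the function above:

for name in ('push', 'pull', 'pushpull', 'sparsepush', 'sparsepull', 'sdpushpull', 'sspushpull'):
    speed = test(name, nitem=2000, item_len=10000, ind_len=500, ret_ans=True)
    print('%-12s %.2f MB/s' % (name, speed))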
Example 28
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    assert init_type in ('constant', 'uniform', 'normal', 'truncated_normal')
    init_type_map = {
        'constant': 0,
        'uniform': 1,
        'normal': 2,
        'truncated_normal': 3
    }
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len,
                    arr_wid, itype, ctypes.c_double(init_a),
                    ctypes.c_double(init_b), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(0.1),
                    ctypes.c_int(1))

    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
    if rank == 0:
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    else:
        if init_type == 'constant':
            np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                       arr.asnumpy(),
                                       rtol=5e-7)
        else:
            if init_type == 'uniform':
                numpy_samples = np.random.uniform(
                    low=init_a, high=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            elif init_type == 'normal':
                numpy_samples = np.random.normal(
                    loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            else:
                numpy_samples = truncnorm.rvs(-2.0,
                                              2.0,
                                              loc=init_a,
                                              scale=init_b,
                                              size=(nitem, item_len)).astype(
                                                  np.float32)
            fig, ax = plt.subplots(1, 1)
            ax.hist(numpy_samples.flatten(),
                    histtype='stepfilled',
                    alpha=0.2,
                    bins=50,
                    label='numpy')
            ax.hist(local_arr.flatten(),
                    histtype='step',
                    alpha=0.2,
                    bins=50,
                    label='ps')
            ax.legend(loc='best', frameon=False)
            # ax2.legend(loc='best', frameon=False)
            file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a, init_b,
                                                 int(sparse))
            plt.savefig(file_name)
            print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()
Example 29
def test_default():
    comm1 = ad.new_group_comm()
    a = ndarray.array(np.array([1,2,3,4,5]),ctx=ctx)
    comm1.dlarrayNcclAllReduce(a, a, ncclDataType_t.ncclFloat32, reduceop=ncclRedOp_t.ncclSum)
    print("Default Allreduce device=%d"%comm1.device_id.value,a.asnumpy())
Example 30
def test(args):
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }

    node_count, num_features, num_classes = dataset_info[args.dataset]

    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication

    node_Count_Self = row_num(node_count, rank // replication,
                              size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)

    executor_ctx = ndarray.gpu(device_id)

    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape,
                                      ctx=executor_ctx)

    # train:val:test = 6:2:2
    # Our optimization of the distributed GNN algorithm does NOT affect correctness!
    # Here, due to the limitation of the current slice_op, the data is split contiguously.
    # A contiguous split is unfriendly to reordered graph data where nodes are already clustered:
    # training on some node clusters and testing on others may give poor test accuracy.
    # A better approach would be to split the data randomly.
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)
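    # A random split (a sketch only, not part of the original run): permuting node indices
    # before slicing would avoid the cluster bias described in the comment above, e.g.
    #     perm = np.random.permutation(node_Count_Self)
    #     train_idx, test_idx = perm[:train_node], perm[test_node:]
    # but slice_op only supports contiguous ranges, so the contiguous split is kept here.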

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[num_features,
                                     hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)
    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[hidden_layer_size,
                                     num_classes]).astype(np.float32)

    W2 = ad.Variable(name="W2", value=W2_val)
    y_ = ad.Variable(name="y_")

    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)

    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))

    y_test = ad.slice_op(y, (test_node, 0),
                         (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0),
                             (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)

    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A:
        adj_matrix,
        H:
        ndarray.array(input_part, ctx=executor_ctx),
        y_:
        ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                      ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f" %
              (i, rank, epoch_time, epoch_all))

        y_out_train, y_predict = y_out.asnumpy().argmax(
            axis=1)[:train_node], y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[
            test_node:]
        train_acc = ndarray.array(np.array([(y_out_train == label_train).sum()
                                            ]),
                                  ctx=executor_ctx)
        test_acc = ndarray.array(np.array([(y_predict == label_test).sum()]),
                                 ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)

        test_acc = float(
            test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count -
                                              test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)

        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f"\
            %(i,train_loss, train_acc, test_loss, test_acc))

    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results,
                              results,
                              ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size

    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" %
              (results[0], results[1]))