Example #1
0
def test_layernorm_forward(shape=(5, 3)):
    """Check ``gpu_op.layer_normalization`` against a NumPy reference.

    Random float32 input, scale and bias are drawn with NumPy and wrapped
    as arrays on the ``ndarray.gpu(1)`` context. The op is called with
    epsilon 0.01, and its mean, variance and output buffers are compared
    with a NumPy computation over the last axis (``atol=1e-6``).

    Args:
        shape: Shape of the input; its last entry sizes scale and bias.
    """
    eps = 0.01
    device = ndarray.gpu(1)
    feature_dim = shape[-1]
    leading_dims = list(shape[:-1])

    # Host-side inputs; draw order is x, scale, bias.
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((feature_dim,)).astype(np.float32)
    bias = np.random.random((feature_dim,)).astype(np.float32)

    # Arrays on the target context: inputs first, then output buffers.
    dev_x = ndarray.array(x, ctx=device)
    dev_scale = ndarray.array(scale, ctx=device)
    dev_bias = ndarray.array(bias, ctx=device)
    dev_mean = ndarray.empty(leading_dims + [1], ctx=device)
    dev_var = ndarray.empty(leading_dims + [1], ctx=device)
    dev_out = ndarray.empty((shape), ctx=device)
    gpu_op.layer_normalization(
        dev_x, dev_scale, dev_bias, dev_mean, dev_var, dev_out, eps)

    op_out = dev_out.asnumpy()

    # NumPy reference: normalise over the last axis, then scale and shift.
    ref_mean = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    ref_var = x.var(axis=-1, dtype=np.float32, keepdims=True)
    ref_std = np.sqrt(ref_var + eps, dtype=np.float32)
    normalized = (x - ref_mean) / ref_std

    # Shape (1, ..., 1, last_dim) so scale/bias broadcast over leading axes.
    bcast_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
    ref_out = scale.reshape(bcast_shape) * normalized + \
        bias.reshape(bcast_shape)

    np.testing.assert_allclose(ref_mean, dev_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(ref_var, dev_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(ref_out, op_out, atol=1e-6)
    print('Pass forward test with shape ', shape)
Example #2
0
def test_truncated_normal(size, mean=0, std=1):
    """Time and plot ``gpu_op.truncated_normal_init`` against SciPy.

    Ten rounds of ``truncnorm.rvs`` (bounds -2.0 and 2.0) copied into the
    device buffer are timed, then ten rounds of the GPU initialiser
    followed by a stream sync. Histograms of the last SciPy sample and the
    final device contents are saved to
    ``truncated_normal_<mean>_<std>.png``.

    Args:
        size: Shape of the buffer to fill.
        mean: ``loc`` for SciPy / second argument of the GPU op.
        std: ``scale`` for SciPy / third argument of the GPU op.
    """
    device = ndarray.gpu(0)
    device_buf = ndarray.empty(size, ctx=device)
    handle = stream.create_stream_handle(device)

    # Host path: sample on the CPU and copy into the device buffer.
    host_start = time()
    for _ in range(10):
        host_samples = truncnorm.rvs(
            -2.0, 2.0, loc=mean, scale=std, size=size).astype(np.float32)
        device_buf[:] = host_samples
    host_stop = time()
    print('numpy time: ', host_stop - host_start)

    # Device path: 123 is passed as the fourth argument (presumably the
    # RNG seed -- confirm against the op's signature).
    dev_start = time()
    for _ in range(10):
        gpu_op.truncated_normal_init(device_buf, mean, std, 123, handle)
    handle.sync()
    dev_stop = time()
    print('cuda time: ', dev_stop - dev_start)

    _, axes = plt.subplots(1, 1)
    device_samples = device_buf.asnumpy()
    assert device_samples.shape == host_samples.shape
    for samples, hist_style, tag in (
            (host_samples, 'stepfilled', 'numpy'),
            (device_samples, 'step', 'cuda')):
        axes.hist(samples.flatten(),
                  histtype=hist_style,
                  alpha=0.2,
                  bins=50,
                  label=tag)
    axes.legend(loc='best', frameon=False)
    out_path = 'truncated_normal_%f_%f.png' % (mean, std)
    plt.savefig(out_path)
    plt.close()
Example #3
0
def test_uniform(size, lb=-1, ub=1):
    """Time and plot ``gpu_op.uniform_init`` against ``np.random.uniform``.

    Ten rounds of NumPy sampling copied into the device buffer are timed,
    then ten rounds of the GPU initialiser followed by a stream sync.
    Histograms of the last NumPy sample and the final device contents are
    saved to ``uniform_<lb>_<ub>.png``.

    Args:
        size: Shape of the buffer to fill.
        lb: ``low`` for NumPy / second argument of the GPU op.
        ub: ``high`` for NumPy / third argument of the GPU op.
    """
    device = ndarray.gpu(0)
    device_buf = ndarray.empty(size, ctx=device)
    handle = stream.create_stream_handle(device)

    # Host path: sample on the CPU and copy into the device buffer.
    host_start = time()
    for _ in range(10):
        host_samples = np.random.uniform(
            low=lb, high=ub, size=size).astype(np.float32)
        device_buf[:] = host_samples
    host_stop = time()
    print('numpy time: ', host_stop - host_start)

    # Device path: 123 is passed as the fourth argument (presumably the
    # RNG seed -- confirm against the op's signature).
    dev_start = time()
    for _ in range(10):
        gpu_op.uniform_init(device_buf, lb, ub, 123, handle)
    handle.sync()
    dev_stop = time()
    print('cuda time: ', dev_stop - dev_start)

    _, axes = plt.subplots(1, 1)
    device_samples = device_buf.asnumpy()
    assert device_samples.shape == host_samples.shape
    for samples, hist_style, tag in (
            (host_samples, 'stepfilled', 'numpy'),
            (device_samples, 'step', 'cuda')):
        axes.hist(samples.flatten(),
                  histtype=hist_style,
                  alpha=0.2,
                  bins=50,
                  label=tag)
    axes.legend(loc='best', frameon=False)
    out_path = 'uniform_%f_%f.png' % (lb, ub)
    plt.savefig(out_path)
    plt.close()
Example #4
0
def test_cpu_normal(size, mean=0, std=1):
    """Time and plot ``cpu_op.normal_init`` against ``np.random.normal``.

    Ten rounds of NumPy sampling copied into the ``ndarray.cpu(0)`` buffer
    are timed, then ten rounds of the CPU initialiser. Histograms of the
    last NumPy sample and the final buffer contents are saved to
    ``normal_<mean>_<std>_cpu.png``.

    Args:
        size: Shape of the buffer to fill.
        mean: ``loc`` for NumPy / second argument of the CPU op.
        std: ``scale`` for NumPy / third argument of the CPU op.
    """
    target_buf = ndarray.empty(size, ctx=ndarray.cpu(0))

    # Reference path: sample with NumPy and copy into the buffer.
    ref_start = time()
    for _ in range(10):
        ref_samples = np.random.normal(
            loc=mean, scale=std, size=size).astype(np.float32)
        target_buf[:] = ref_samples
    ref_stop = time()
    print('numpy time: ', ref_stop - ref_start)

    # Op path: 123 is passed as the fourth argument (presumably the RNG
    # seed -- confirm against the op's signature).
    op_start = time()
    for _ in range(10):
        cpu_op.normal_init(target_buf, mean, std, 123)
    op_stop = time()
    print('cpu time: ', op_stop - op_start)

    _, axes = plt.subplots(1, 1)
    op_samples = target_buf.asnumpy()
    assert op_samples.shape == ref_samples.shape
    for samples, hist_style, tag in (
            (ref_samples, 'stepfilled', 'numpy'),
            (op_samples, 'step', 'cpu')):
        axes.hist(samples.flatten(),
                  histtype=hist_style,
                  alpha=0.2,
                  bins=50,
                  label=tag)
    axes.legend(loc='best', frameon=False)
    out_path = 'normal_%f_%f_cpu.png' % (mean, std)
    plt.savefig(out_path)
    plt.close()
Example #5
0
def test_layernorm_backward(shape=(5, 3)):
    """Check ``gpu_op.layer_normalization_gradient`` against NumPy.

    Random float32 upstream gradients, input, scale, mean and variance are
    drawn with NumPy and wrapped as arrays on the ``ndarray.gpu(1)``
    context. The op is called with epsilon 0.01 and its three outputs
    (input, scale and bias gradients) are compared with a NumPy
    computation (``rtol=1e-4, atol=1e-4``). Mean and variance are random
    values, not statistics of ``x``; the reference uses the same values.

    Args:
        shape: Shape of the input and upstream gradient; its last entry
            sizes the scale and the scale/bias gradients.
    """
    eps = 0.01
    device = ndarray.gpu(1)
    feature_dim = shape[-1]
    stat_shape = list(shape[:-1]) + [1]

    # Host-side inputs; draw order is grads, x, scale, mean, var.
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((feature_dim,)).astype(np.float32)
    mean = np.random.random(stat_shape).astype(np.float32)
    var = np.random.random(stat_shape).astype(np.float32)

    dev_grads = ndarray.array(grads, ctx=device)
    dev_x = ndarray.array(x, ctx=device)
    dev_scale = ndarray.array(scale, ctx=device)
    dev_mean = ndarray.array(mean, ctx=device)
    dev_var = ndarray.array(var, ctx=device)

    # Output buffers filled by the op.
    dev_dx = ndarray.empty(shape, ctx=device)
    dev_dscale = ndarray.empty((feature_dim,), ctx=device)
    dev_dbias = ndarray.empty((feature_dim,), ctx=device)
    gpu_op.layer_normalization_gradient(
        dev_grads, dev_x, dev_scale,
        dev_dx, dev_dscale, dev_dbias, dev_mean, dev_var, eps)

    # ---- NumPy reference ----
    # Parameter gradients reduce over every axis except the last.
    lead_axes = tuple(range(grads.ndim - 1))
    ref_dbias = grads.sum(lead_axes)  # (X,)

    std = np.sqrt(var + eps)  # (N, 1)
    centered = x - mean  # (N, X)
    normalized = centered / std  # (N, X)
    ref_dscale = (grads * normalized).sum(lead_axes)  # (X,)

    width = x.shape[-1]
    d_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1])  # (N, X)
    # Gradient flowing through the variance term.
    d_var = (d_norm * centered).sum(axis=-1, keepdims=True) \
        * -0.5 / (var + eps) / std  # (N, 1)
    via_std = d_norm / std  # (N, X)
    via_var = d_var * 2 * centered / width  # (N, X)
    direct = via_std + via_var  # (N, X)
    # Gradient flowing through the mean term.
    via_mean = -1 * direct.sum(axis=-1, keepdims=True) / width  # (N, 1)
    ref_dx = direct + via_mean  # (N, X)

    tolerances = dict(rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(ref_dbias, dev_dbias.asnumpy(), **tolerances)
    np.testing.assert_allclose(ref_dscale, dev_dscale.asnumpy(), **tolerances)
    np.testing.assert_allclose(ref_dx, dev_dx.asnumpy(), **tolerances)
    print('Pass backward test with shape ', shape)
Example #6
0
def test_sparse_matrix_multiply():
    """Check ``gpu_op.CuSparse_Csrmm`` against SciPy's sparse ``dot``.

    A random 500x7000 COO matrix (density 1e-3) is multiplied by a random
    dense 7000x100 matrix on the ``ndarray.gpu(0)`` context, and the
    result is compared with ``x.dot(y)`` at ``rtol=1e-5``.
    """
    n_rows, n_inner, n_cols = 500, 7000, 100
    fill_ratio = 1e-3
    device = ndarray.gpu(0)

    sparse_lhs = scipy.sparse.rand(
        n_rows, n_inner, density=fill_ratio, format='coo', dtype=np.float32)
    dense_rhs = np.random.uniform(
        0, 10, size=(n_inner, n_cols)).astype(np.float32)

    dev_lhs = ndarray.sparse_array(
        sparse_lhs.data, (sparse_lhs.row, sparse_lhs.col),
        shape=[n_rows, n_inner], ctx=device)
    dev_rhs = ndarray.array(dense_rhs, ctx=device)
    dev_out = ndarray.empty((n_rows, n_cols), ctx=device)

    # Both False flags are passed through unchanged (presumably transpose
    # switches for the two operands -- confirm against the op).
    gpu_op.CuSparse_Csrmm(dev_lhs, False, dev_rhs, False, dev_out)
    product = dev_out.asnumpy()
    np.testing.assert_allclose(sparse_lhs.dot(dense_rhs), product, rtol=1e-5)
Example #7
0
def test_sparse_array_dense_vector_multiply():
    """Check ``gpu_op.CuSparse_Csrmv`` with and without the transpose flag.

    Two cases run in order on the ``ndarray.gpu(0)`` context, each with a
    random COO matrix (density 1e-3) and a random 70000x1 dense vector:

    1. 500x70000 matrix, flag ``False``, compared with ``x.dot(y)``.
    2. 70000x500 matrix, flag ``True``, compared with
       ``x.transpose().dot(y)``.

    Both comparisons use ``rtol=1e-5``.
    """
    fill_ratio = 1e-3
    device = ndarray.gpu(0)
    vec_len, out_len = 70000, 500

    cases = (
        ((out_len, vec_len), False),
        ((vec_len, out_len), True),
    )
    for (n_rows, n_cols), transposed in cases:
        sparse_mat = scipy.sparse.rand(
            n_rows, n_cols, density=fill_ratio, format='coo',
            dtype=np.float32)
        dense_vec = np.random.uniform(
            0, 10, size=(vec_len, 1)).astype(np.float32)

        dev_mat = ndarray.sparse_array(
            sparse_mat.data, (sparse_mat.row, sparse_mat.col),
            shape=[n_rows, n_cols], ctx=device)
        dev_vec = ndarray.array(dense_vec, ctx=device)
        dev_out = ndarray.empty((out_len, 1), ctx=device)

        gpu_op.CuSparse_Csrmv(dev_mat, transposed, dev_vec, dev_out)
        product = dev_out.asnumpy()

        if transposed:
            expected = sparse_mat.transpose().dot(dense_vec)
        else:
            expected = sparse_mat.dot(dense_vec)
        np.testing.assert_allclose(expected, product, rtol=1e-5)
Example #8
0
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    """Check tensor initialisation through the worker communicator.

    Every worker asks the communicator to initialise tensor 0 and pulls
    it. Rank 0 writes the pulled values into ``rarr``; after a barrier the
    other ranks assert that their own pulled values match that buffer
    (``rarr`` is presumably shared between worker processes -- confirm
    with the caller). Rank 0 additionally checks the values themselves:
    exactly for ``'constant'``, otherwise by saving a histogram of the
    pulled values next to NumPy/SciPy samples for visual inspection.

    Relies on module-level ``nitem`` and ``item_len`` for the tensor shape
    and on the ``WORKER_ID`` / ``DMLC_NUM_WORKER`` environment variables.

    Args:
        rarr: Buffer viewed (without copying) as float32 of shape
            ``(nitem, item_len)``; rank 0 overwrites it in place.
        init_type: One of ``'constant'``, ``'uniform'``, ``'normal'``,
            ``'truncated_normal'``.
        init_a: First initialiser parameter: the constant value, the
            uniform lower bound, or the (truncated) normal mean.
        init_b: Second initialiser parameter: the uniform upper bound or
            the (truncated) normal standard deviation.
        sparse: If true the tensor is registered with length ``nitem`` and
            width ``item_len``; otherwise with length ``nitem * item_len``
            and width 1.

    Raises:
        AssertionError: If ``init_type`` is not supported or a comparison
            fails.
    """
    init_type_map = {
        'constant': 0,
        'uniform': 1,
        'normal': 2,
        'truncated_normal': 3
    }
    assert init_type in init_type_map
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    # No copy: writes to local_arr go straight into the rarr buffer.
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    # NOTE(review): 123 is presumably the RNG seed and the trailing
    # arguments optimiser settings -- confirm against InitTensor.
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len,
                    arr_wid, itype, ctypes.c_double(init_a),
                    ctypes.c_double(init_b), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(0.1),
                    ctypes.c_int(1))

    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
    if rank == 0:
        # Publish rank 0's view before the barrier so others can compare.
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    else:
        if init_type == 'constant':
            np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                       arr.asnumpy(),
                                       rtol=5e-7)
        else:
            # Random initialisers cannot be compared element-wise, so draw
            # reference samples and plot both distributions.
            if init_type == 'uniform':
                numpy_samples = np.random.uniform(
                    low=init_a, high=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            elif init_type == 'normal':
                numpy_samples = np.random.normal(
                    loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            else:
                numpy_samples = truncnorm.rvs(-2.0,
                                              2.0,
                                              loc=init_a,
                                              scale=init_b,
                                              size=(nitem, item_len)).astype(
                                                  np.float32)
            fig, ax = plt.subplots(1, 1)
            ax.hist(numpy_samples.flatten(),
                    histtype='stepfilled',
                    alpha=0.2,
                    bins=50,
                    label='numpy')
            ax.hist(local_arr.flatten(),
                    histtype='step',
                    alpha=0.2,
                    bins=50,
                    label='ps')
            ax.legend(loc='best', frameon=False)
            file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a, init_b,
                                                 int(sparse))
            plt.savefig(file_name)
            # Close the figure so repeated calls (one per init type) do not
            # accumulate open figures.
            plt.close(fig)
            print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()
Example #9
0
def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    """Exercise the worker communicator's push/pull calls on tensor 0.

    The function reads its rank from ``WORKER_ID`` and the worker count
    from ``DMLC_NUM_WORKER``, registers tensor 0 via ``InitTensor`` and
    then runs either the sparse or the dense sequence below. In every
    phase only rank 0 pushes; all ranks then compare what they pull with
    an expectation computed locally from private copies of the buffers.

    Sparse sequence (``sparse=True``), three phases:
      1. ``SparsePush`` by rank 0, then ``Pull`` by everyone.
      2. ``SDPushPull`` by rank 0, ``Pull`` by the other ranks.
      3. ``SSPushPull`` by rank 0, ``SparsePull`` by the other ranks.

    Dense sequence (``sparse=False``), two phases:
      1. ``Push`` by rank 0, then ``Pull`` by everyone.
      2. ``DDPushPull`` by rank 0, ``Pull`` by the other ranks.

    Relies on module-level ``nitem``, ``item_len``, ``indx1`` and
    ``indx2``, which are not defined in this function.

    Args:
        rarr: Buffer read as float32 and copied into an array of shape
            ``(nitem, item_len)``.
        rpush: Buffer read as float32 and copied; its values are used as
            row indices to push to (sparse sequence only).
        rpull: Buffer read as float32 and copied; its values are used as
            row indices to pull from (sparse phase 3 only).
        sparse: Selects the sparse sequence and registers the tensor with
            length ``nitem`` and width ``item_len`` instead of length
            ``nitem * item_len`` and width 1.
        lr: Passed to ``InitTensor`` as a one-element ``c_float`` array
            (presumably the server-side learning rate -- TODO confirm).

    Raises:
        AssertionError: If any pulled value differs from the expectation.
    """
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    # .copy() gives each worker private arrays, so the in-place updates to
    # local_arr below never touch the caller's buffers.
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem,
                                                              item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    # Init type 0 with parameters 0.0 / 1.0; test_init_ps maps 'constant'
    # to 0, so this presumably zero-initialises the tensor -- TODO confirm.
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid, ctypes.c_int(0), ctypes.c_double(0.0), ctypes.c_double(1.0), ctypes.c_ulonglong(123),\
        ctypes.c_int(0), (ctypes.c_float * 1)(lr), ctypes.c_int(1))
    if sparse:
        # Phase 1 expectation: start from zeros and add 1 to a row for each
        # time its index appears in local_push.
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            # Push a block of ones for every index in local_push.
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        # Other ranks wait here until rank 0's push has completed.
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        # Phase 2 expectation: the same increments applied a second time.
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            # Rank 0 receives the result in arr from this call; the other
            # ranks fetch it with a plain Pull after the barrier.
            comm.SDPushPull(0, push_ind.handle, push_val.handle, arr.handle,
                            None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        # Phase 3 expectation: the increments applied a third time; only
        # the rows listed in local_pull are compared.
        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle, \
                        pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        # Gather the expected rows in pull order and match pull_val's shape.
        np.testing.assert_allclose(local_arr[local_pull.astype(int)].reshape(
            indx1, indx2, item_len),
                                   pull_val.asnumpy(),
                                   rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

    else:
        # Dense phase 1: rank 0 pushes its copy of local_arr; every rank
        # then expects to pull exactly local_arr.
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        # Dense phase 2: rank 0 pushes an all-ones array and receives the
        # result in arr; the expected value is local_arr + 1.
        if rank == 0:
            temp_push_val = ndarray.array(np.ones(
                (nitem, item_len)).astype(np.float32),
                                          ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    # Cleanup: rank 0 also clears the server side; all ranks clear locally
    # and synchronise before returning.
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()