Exemple #1
0
def test_slicing(fargs_tests):
    """
    Check that slicing and row assignment agree between the GPU and CPU backends.

    Arguments:
        fargs_tests (tuple): (dims, dtype) pair; dims is passed as the size of
                             the random test array and dtype is its element type.
    """
    dims, dtype = fargs_tests

    gpu = NervanaGPU(default_dtype=dtype)
    cpu = NervanaCPU(default_dtype=dtype)

    host_data = np.random.uniform(-1, 1, dims).astype(dtype)
    dev_tensor = gpu.array(host_data, dtype=dtype)
    cpu_tensor = cpu.array(host_data, dtype=dtype)

    # (GPU index, CPU index) pairs; np.s_ builds exactly the object that the
    # equivalent subscript expression would hand to __getitem__.
    index_pairs = (
        (np.s_[0], np.s_[0]),
        (np.s_[-1], np.s_[-1]),
        (np.s_[0, :], np.s_[0, :]),
        (np.s_[0:], np.s_[0:]),
        (np.s_[:-1], np.s_[:-1]),
        (np.s_[:, 0], np.s_[:, 0]),
        (np.s_[:, 0:1], np.s_[:, 0:1]),
        # NOTE(review): the two sides use different row indices (-1 vs -1:);
        # kept as in the original -- confirm this is intentional.
        (np.s_[-1, 0:], np.s_[-1:, 0:]),
    )
    for gpu_index, cpu_index in index_pairs:
        assert_tensors_allclose(dev_tensor[gpu_index], cpu_tensor[cpu_index],
                                rtol=0, atol=1e-3)

    # in-place assignment to the first row must also match
    dev_tensor[0] = 0
    cpu_tensor[0] = 0

    assert_tensors_allclose(dev_tensor, cpu_tensor, rtol=0, atol=1e-3)

    del gpu
Exemple #2
0
def test_pool_layer(poolargs):
    """
    Compare pooling fprop/bprop between the GPU backend, the CPU backend and
    a numpy reference implementation.

    Arguments:
        poolargs (sequence): test parameters; only the first element, the
                             pooling op name (e.g. "max"), is used here.
    """
    op = poolargs[0]

    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    pool_ng = ng.pool_layer(dtype, op, N, C, D, H, W, J, T, R, S, padding_j,
                            padding_d, padding_h, padding_w, strides_j,
                            strides_d, strides_h, strides_w)

    pool_nc = nc.pool_layer(dtype, op, N, C, D, H, W, J, T, R, S, padding_j,
                            padding_d, padding_h, padding_w, strides_j,
                            strides_d, strides_h, strides_w)

    # both backends must agree on the layer geometry before comparing data
    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors; the input goes through
    # float16 first so its values are exactly representable in half precision
    cpuI = np.random.uniform(0.0, 1.0,
                             sliceable(dimI,
                                       1)).astype(np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # fill the extra last row of the cpu input with a neutral value for the
    # numpy reference: the smallest float for max pooling, zero otherwise
    if op == "max":
        cpuI[-1, :] = np.finfo(dtype).min
    else:
        cpuI[-1, :] = 0

    # =========GPU and CPU and numpy ==========
    # the backends receive the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    for opA, ngA, ncA, cpuA in (("fprop", ngO, ncO, cpuO),
                                ("bprop", ngB, ncB.reshape(dimI),
                                 cpuB[:-1, :].reshape(dimI))):

        # print() with a single argument behaves the same on Python 2 and 3
        print(opA)
        assert np.allclose(ngA.get(), ncA.get(), rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    del ng, nc
Exemple #3
0
def test_pooling(device_id):
    """
    Drive lrn_helper with LRN layers built on both the GPU and CPU backends.

    Arguments:
        device_id: device identifier handed to the GPU backend constructor.
    """
    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)
    nc = NervanaCPU()

    layer_args = {'dtype': np.float32, 'N': 122, 'C': 16,
                  'D': 1, 'H': 32, 'W': 32, 'J': 5}

    # build the GPU layer first, then the CPU one (lrn_layer returns a pool layer)
    gpu_layer = ng.lrn_layer(**layer_args)
    cpu_layer = nc.lrn_layer(**layer_args)

    lrn_helper(ones=0, cpu=1, ng=ng, nc=nc,
               alpha=1.0,  # not supported in CPU
               ascale=1.2,
               beta=0.0,  # not supported in CPU
               bpower=0.5,
               layer_g=gpu_layer,
               layer_c=cpu_layer,
               **layer_args)
def test_edge_cases(device_id):
    """
    Test several edge cases related to min/max bin, and rounding.

    Also test backend dump_hist_data functionality.

    Arguments:
        device_id: identifier of the GPU device the GPU backend is created on.

    Raises:
        RuntimeError: if the selected device reports a CUDA compute capability
                      below 3.0.
    """
    # Probe the device the backend will actually run on (not always device 0),
    # and use a truthiness test so a non-builtin bool cannot slip past the guard.
    if not (check_gpu.get_compute_capability(device_id) >= 3.0):
        raise RuntimeError("Device does not have CUDA compute capability 3.0 or greater")
    ng = NervanaGPU(device_id=device_id)
    nc = NervanaCPU()
    # edge case test
    np_ref = dict()
    inputs = [
        ("edges", np.array([2 ** -48, 2 ** 15], dtype=np.float32)),
        ("rounding", np.array([2 ** 5, 63.99998856, 2 ** 6, 2 ** -3, 2 ** -4,
                               0.11262291, 92.22483826], dtype=np.float32)),
        ("fp16 rounding", np.array([45.21875], dtype=np.float16))
    ]
    for tag, inp in inputs:
        np_ref[tag] = ref_hist(inp)
        for be in [ng, nc]:
            be_inp = be.array(inp)
            be_hist = be_inp.hist(tag)
            assert tensors_allclose(np_ref[tag], be_hist), tag + str(be)

    # dump_hist_data test: histograms recorded above must be retrievable by tag
    for be in [ng, nc]:
        be_hist_data, be_hist_map = be.dump_hist_data()
        for tag, _ in inputs:
            be_data = be_hist_data[be_hist_map[tag]]
            assert tensors_allclose(np_ref[tag], be_data), tag + str(be)

    del ng
    del nc
Exemple #5
0
def test_hist(nbin_offset_dim_dtype_inp):
    """
    Check the GPU and CPU backend hist implementations against the reference
    implementation (ref_hist).

    Parameterized test case; each parameter is a tuple of the form
    ((nbins, offset), dim, dtype, (name, inp_gen)).
    """
    bin_cfg, dim, dtype, inp_spec = nbin_offset_dim_dtype_inp
    nbins, offset = bin_cfg
    name, inp_gen = inp_spec

    capable = check_gpu.get_compute_capability(0) >= 3.0
    if capable is False:
        raise RuntimeError(
            "Device does not have CUDA compute capability 3.0 or greater")

    ng = NervanaGPU(hist_bins=nbins, hist_offset=offset)
    nc = NervanaCPU(hist_bins=nbins, hist_offset=offset)

    # reference histogram computed on the host data
    host_inp = inp_gen(dim).astype(dtype)
    expected = ref_hist(host_inp, nbins=nbins, offset=offset)

    for be in (ng, nc):
        actual = be.array(host_inp, dtype=dtype).hist(name)
        assert_tensors_allclose(expected, actual)

    del ng
    del nc
Exemple #6
0
def test_batched_dot():
    """
    Run run_batched_dot on the GPU backend, the CPU backend and numpy, and
    check both backends against the numpy results.
    """
    formatters = {'int': lambda x: "%2d" % x, 'float': lambda x: "%2.0f" % x}
    np.set_printoptions(threshold=8192 * 4, linewidth=600,
                        formatter=formatters)

    ng = NervanaGPU(stochastic_round=False, bench=1)
    nc = NervanaCPU()

    dtype = np.float32  # np.float16 or np.float32

    X = 100   # Batch Size
    N = 32    # Minibatch Size
    C = 1536  # Input  Features
    K = 768   # Output Features

    cpuI, cpuE, cpuW = setup_test_data(X, N, C, K, dtype)

    ngO, ngB, ngU = run_batched_dot(ng, cpuI, cpuE, cpuW, X, dtype)
    ncO, ncB, ncU = run_batched_dot(nc, cpuI, cpuE, cpuW, X, dtype)
    npO, npB, npU = run_batched_dot(np, cpuI, cpuE, cpuW, X, dtype)

    # numpy is the reference; check the GPU results first, then the CPU ones
    reference = (npO, npB, npU)
    for backend_results in ((ngO, ngB, ngU), (ncO, ncB, ncU)):
        for expected, actual in zip(reference, backend_results):
            assert_tensors_allclose(expected, actual, rtol=0, atol=1e-3)

    ng.ctx.detach()
    del ng
Exemple #7
0
def test_cpu():
    """
    Time 10000 runs of the reference NMS (py_cpu_nms) on a fixed set of five
    boxes with freshly randomised, descending scores, and print the elapsed
    wall-clock time in microseconds.
    """
    import datetime

    from neon.backends.nervanacpu import NervanaCPU

    # The CPU backend is still instantiated although its NMS call is currently
    # disabled; construction is kept so the test exercises backend creation.
    nc = NervanaCPU()  # noqa: F841

    score = 4      # column index of the score within each detection row
    box_count = 5  # number of detection rows
    thre = 0.1     # threshold handed to py_cpu_nms

    time_old = datetime.datetime.now()
    for _ in range(10000):
        dets = np.asarray(
            [[1, 2, 3, 4, 0.6], [1, 3, 3, 4, 0.7], [1, 1, 4, 4, 0.9], [2, 1, 4, 4, 0.85], [1, 1, 3, 4, 0.85]],
            dtype=np.float32)
        # NOTE(review): printing inside the timed loop inflates the measurement
        print(dets)
        # overwrite the score column with random scores sorted high to low
        dets[:, score] = np.sort(np.random.random((box_count,)))[::-1]

        # call reference nms
        py_cpu_nms(dets, thre)

    elapsed = datetime.datetime.now() - time_old
    # timedelta.microseconds holds only the sub-second remainder; combine all
    # fields to report the full elapsed time in microseconds.
    elapsed_us = (elapsed.days * 86400 + elapsed.seconds) * 10 ** 6 + elapsed.microseconds
    print('requests cpu', elapsed_us)
Exemple #8
0
def compare_helper(op, inA, inB, dtype):
    """
    Evaluate `op` on inA/inB with numpy and with the backends, and assert
    that the backend results match the numpy reference.

    The numpy reference is computed in float32; for integer dtypes it is
    rounded and clipped to the representable range before being cast to
    `dtype`. The GPU backend is only exercised for float32 and float16.

    Arguments:
        op: operation selector forwarded to math_helper.
        inA: first operand forwarded to math_helper.
        inB: second operand forwarded to math_helper.
        dtype: element type the backends compute in.

    Raises:
        AssertionError: if a backend result differs from the numpy reference
                        by more than atol=1e-5.
    """
    numpy_result = math_helper(np, op, inA, inB, dtype=np.float32)

    if np.dtype(dtype).kind in ('i', 'u'):
        # emulate an integer conversion: round, then clip to the dtype range
        limits = np.iinfo(dtype)
        numpy_result = np.around(numpy_result)
        numpy_result = numpy_result.clip(limits.min, limits.max)
    numpy_result = numpy_result.astype(dtype)

    if dtype in (np.float32, np.float16):
        gpu = NervanaGPU(default_dtype=dtype)
        nervanaGPU_result = math_helper(gpu, op, inA, inB, dtype=dtype).get()
        # np.allclose only returns a bool; without assert the check is a no-op
        assert np.allclose(numpy_result, nervanaGPU_result, rtol=0, atol=1e-5)

    cpu = NervanaCPU(default_dtype=dtype)
    nervanaCPU_result = math_helper(cpu, op, inA, inB, dtype=dtype).get()
    assert np.allclose(numpy_result, nervanaCPU_result, rtol=0, atol=1e-5)
Exemple #9
0
def test_hist(nbin_offset_dim_dtype_inp):
    """
    Check the GPU and CPU backend hist implementations against the reference
    implementation (ref_hist).

    Parameterized test case; each parameter is a tuple of the form
    ((nbins, offset), dim, dtype, (name, inp_gen)).
    """
    bin_cfg, dim, dtype, inp_spec = nbin_offset_dim_dtype_inp
    nbins, offset = bin_cfg
    name, inp_gen = inp_spec

    ng = NervanaGPU(hist_bins=nbins, hist_offset=offset)
    nc = NervanaCPU(hist_bins=nbins, hist_offset=offset)

    # reference histogram computed on the host data
    host_inp = inp_gen(dim).astype(dtype)
    expected = ref_hist(host_inp, nbins=nbins, offset=offset)

    for be in (ng, nc):
        actual = be.array(host_inp, dtype=dtype).hist(name)
        assert_tensors_allclose(expected, actual)
Exemple #10
0
def test_copy_transpose(shape_dtype_inp):
    """
    Check backend copy_transpose against np.transpose for axes=None and for
    every non-identity permutation of the axes.

    Parameterized test case; each parameter is a tuple of the form
    (shape, dtype, (name, inp_gen)).
    """
    shape, dtype, inp_spec = shape_dtype_inp
    name, inp_gen = inp_spec

    ng = NervanaGPU(default_dtype=dtype)
    nc = NervanaCPU(default_dtype=dtype)
    host_inp = inp_gen(shape).astype(dtype)
    ndims = len(shape)

    # None first, then all permutations except the identity ordering
    identity = tuple(range(ndims))
    axes = [None]
    axes.extend(perm for perm in itt.permutations(range(ndims), ndims)
                if perm != identity)

    for be in (ng, nc):
        for ax in axes:
            be_inp = be.array(host_inp, dtype=dtype)
            expected = np.transpose(host_inp, axes=ax)
            be_out = be.zeros(expected.shape)
            be.copy_transpose(be_inp, be_out, axes=ax)
            assert_tensors_allclose(expected, be_out)

    del ng
    del nc
Exemple #11
0
def test_edge_cases():
    """
    Exercise histogram edge cases (min/max bin, rounding) on both backends.

    The histograms recorded along the way are then read back through
    dump_hist_data and checked against the same references.
    """
    ng = NervanaGPU()
    nc = NervanaCPU()
    backends = (ng, nc)

    # edge case test
    cases = [
        ("edges",
         np.array([2**-48, 2**15], dtype=np.float32)),
        ("rounding",
         np.array([2**5, 63.99998856, 2**6, 2**-3, 2**-4,
                   0.11262291, 92.22483826], dtype=np.float32)),
        ("fp16 rounding",
         np.array([45.21875], dtype=np.float16)),
    ]

    expected = {}
    for tag, values in cases:
        expected[tag] = ref_hist(values)
        for be in backends:
            measured = be.array(values).hist(tag)
            assert_tensors_allclose(expected[tag], measured,
                                    err_msg=tag + str(be))

    # dump_hist_data test
    for be in backends:
        hist_data, hist_map = be.dump_hist_data()

        for tag, values in cases:
            dumped = hist_data[hist_map[tag]]
            assert_tensors_allclose(expected[tag], dumped,
                                    err_msg=tag + str(be))
Exemple #12
0
from neon.backends.convolution import (_ceil_div,
    FpropCuda,   BpropCuda,   UpdateCuda,
    FpropDirect, BpropDirect, UpdateDirect)

from neon.backends.winograd_conv import (
    FpropWinograd_2x2_3x3, BpropWinograd_2x2_3x3, UpdateWinograd_3x3_2x2,
    FpropWinograd_4x4_3x3, BpropWinograd_4x4_3x3, UpdateWinograd_3x3_4x4,
    FpropWinograd_2x2_5x5, BpropWinograd_2x2_5x5)

# Kernel classes grouped by pass: forward (fprop), backward (bprop) and
# weight update. There is no 2x2_5x5 Winograd entry among the update kernels.
fprop_kernels  = (FpropCuda,  FpropDirect,  FpropWinograd_2x2_3x3,  FpropWinograd_4x4_3x3, FpropWinograd_2x2_5x5)
bprop_kernels  = (BpropCuda,  BpropDirect,  BpropWinograd_2x2_3x3,  BpropWinograd_4x4_3x3, BpropWinograd_2x2_5x5)
update_kernels = (UpdateCuda, UpdateDirect, UpdateWinograd_3x3_2x2, UpdateWinograd_3x3_4x4)

# Backends are created at module level, so this runs at import time.
# NOTE(review): NervanaGPU receives 0 as its first positional argument --
# confirm which constructor parameter that binds to.
ng = NervanaGPU(0)
nc = NervanaCPU()

# Report the name of the device behind the current driver context.
neon_logger.display(drv.Context.get_current().get_device().name())

# NOTE(review): `out` and `ones` look like flags used further down the
# script; their meaning is not visible here -- confirm against usage.
out =  0
ones = 0

# Convolution geometries. Each tuple holds 12 values laid out as in the header
# below: D, H, W, T, R, S, then three padding values and three stride values
# (presumably in d, h, w order -- TODO confirm).
#                D,   H,   W,  T, R, S,    pad,   str
conv_1x1     = ( 1,  14,  14,  1, 1, 1,  0,0,0, 1,1,1)
conv_3x3     = ( 1,  14,  14,  1, 3, 3,  0,1,1, 1,1,1)
conv_3x3p0   = ( 1,  14,  14,  1, 3, 3,  0,0,0, 1,1,1)
conv_3x3p2   = ( 1,  14,  14,  1, 3, 3,  0,2,2, 1,1,1)
conv_3x3s2   = ( 1,  14,  14,  1, 3, 3,  0,1,1, 1,2,2)
conv_1x3     = ( 1,  14,  14,  1, 1, 3,  0,0,1, 1,1,1)
conv_3x1     = ( 1,  14,  14,  1, 3, 1,  0,1,0, 1,1,1)
conv_5x5     = ( 1,  14,  14,  1, 5, 5,  0,2,2, 1,1,1)
Exemple #13
0
def gen_backend(backend='cpu', rng_seed=None, datatype=np.float32,
                batch_size=0, stochastic_round=False, device_id=0,
                max_devices=get_device_count(), compat_mode=None):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Arguments:
        backend (string, optional): 'cpu', 'gpu' or 'mgpu'. None is treated as 'cpu'.
        rng_seed (numeric, optional): Set this to a numeric value which can be used to seed the
                                      random number generator of the instantiated backend.
                                      Defaults to None, which doesn't explicitly seed (so each run
                                      will be different)
        datatype (dtype): Default tensor data type. CPU backend supports np.float64, np.float32 and
                          np.float16; GPU backend supports np.float32 and np.float16.
        batch_size (int): Set the size of the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer to implement
                                               stochastic rounding. If this is False rounding will
                                               be to nearest. If True will perform stochastic
                                               rounding using default bit width. If set to an
                                               integer will round to that number of bits.
                                               Only affects the gpu backend.
        device_id (numeric, optional): Set this to a numeric value which can be used to select
                                       device on which to run the process
        max_devices (int, optional): For use with multi-GPU backend only.
                                      Controls the maximum number of GPUs to run
                                      on.
        compat_mode (str, optional): if this is set to 'caffe' then the conv and pooling
                                     layer output sizes will match that of caffe as will
                                     the dropout layer implementation

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: for 'gpu'/'mgpu' when the selected device reports a CUDA compute
                      capability below 5.0.
        ImportError: for 'mgpu' when the multi-GPU package cannot be imported.
        ValueError: when `backend` is not one of the supported names.

    Notes:
        * Attempts to construct a GPU instance without a CUDA capable card or without nervanagpu
          package installed will cause the program to display an error message and exit.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend only register this function once, will use
        # NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=datatype, compat_mode=compat_mode)
    elif backend == 'gpu' or backend == 'mgpu':
        # check the compute capability of the requested device; a truthiness
        # test (rather than `is False`) also rejects non-builtin bool results
        from neon.backends.util import check_gpu
        if not (check_gpu.get_compute_capability(device_id) >= 5.0):
            raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                               "capability 5.0 or greater")
        if backend == 'gpu':
            from neon.backends.nervanagpu import NervanaGPU
            # init gpu
            be = NervanaGPU(rng_seed=rng_seed, default_dtype=datatype,
                            stochastic_round=stochastic_round,
                            device_id=device_id,
                            compat_mode=compat_mode)
        else:
            try:
                from mgpu.nervanamgpu import NervanaMGPU
                # init multiple GPU
                be = NervanaMGPU(rng_seed=rng_seed,
                                 default_dtype=datatype,
                                 stochastic_round=stochastic_round,
                                 num_devices=max_devices)
            except ImportError:
                # NOTE(review): the contact address in this message appears to
                # have been mangled ("[email protected]"); restore the real one.
                logger.error("Multi-GPU support is a premium feature "
                             "available exclusively through the Nervana cloud."
                             " Please contact [email protected] for details.")
                raise
    else:
        raise ValueError("backend must be one of ('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    # publish the backend globally and record the batch size on it
    NervanaObject.be = be
    be.bsz = batch_size
    return be
Exemple #14
0
    dets_ng = ng.array(dets)
    scores = dets_ng[:, 4].get()
    order = scores.argsort()[::-1]

    sorted_dets_dev = dets_ng[order, :]

    tic_gpu = ng.init_mark()
    toc_gpu = ng.init_mark()

    # call through backend
    ng.record_mark(tic_gpu)

    keep_ng = ng.nms(sorted_dets_dev, thre)

    ng.record_mark(toc_gpu)
    ng.synchronize_mark(toc_gpu)
    print("gpu NMS time (ms): {}".format(ng.get_time(tic_gpu, toc_gpu)))

    assert keep_ng == keep_ref


# Script entry point: build one GPU and one CPU backend and run the NMS test
# once by hand.
if __name__ == '__main__':

    from neon.backends.nervanagpu import NervanaGPU
    from neon.backends.nervanacpu import NervanaCPU

    ng = NervanaGPU()
    nc = NervanaCPU()

    # presumably (backend pair), (threshold, number of boxes) -- TODO confirm
    # against the test_nms signature
    test_nms((ng, nc), (0.7, 300))
Exemple #15
0
def gen_backend(backend='cpu', rng_seed=None, default_dtype=np.float32,
                batch_size=0, stochastic_round=False, device_id=0):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Arguments:
        backend (string, optional): 'cpu' or 'gpu'. None is treated as 'cpu';
                                    'mgpu' is recognised but not implemented.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend.  Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        default_dtype (dtype): Default tensor data type. CPU backend supports
                               np.float64, np.float32 and np.float16; GPU
                               backend supports np.float32 and np.float16.
        batch_size (int): Set the size of the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer
                                               to implement stochastic rounding.
                                               If this is False rounding will
                                               be to nearest.
                                               If True will perform stochastic
                                               rounding using default bit width.
                                               If set to an integer will round
                                               to that number of bits.
                                               Only affects the gpu backend.
        device_id (numeric, optional): Set this to a numeric value which can be
                                       used to select which device to run the
                                       process on

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: for 'gpu' when the selected device reports a CUDA
                      compute capability below 5.0.
        NotImplementedError: for 'mgpu'.
        ValueError: when `backend` is not one of the recognised names.

    Notes:
        * Attempts to construct a GPU instance without a CUDA capable card or
          without nervanagpu package installed will cause the
          program to display an error message and exit.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated
        # clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend
        # only register this function once, will use
        # NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=default_dtype)
    elif backend == 'gpu':
        # check the compute capability of the requested device; a truthiness
        # test (rather than `is False`) also rejects non-builtin bool results
        from neon.backends.util import check_gpu
        if not (check_gpu.get_compute_capability(device_id) >= 5.0):
            raise RuntimeError("Device " + str(device_id) + " does not have CUDA compute " +
                               "capability 5.0 or greater")
        from neon.backends.nervanagpu import NervanaGPU
        # init gpu
        be = NervanaGPU(rng_seed=rng_seed, default_dtype=default_dtype,
                        stochastic_round=stochastic_round, device_id=device_id)
    elif backend == 'mgpu':
        raise NotImplementedError("mgpu will be ready soon")
    else:
        raise ValueError("backend must be one of "
                         "('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    # publish the backend globally and record the batch size on it
    NervanaObject.be = be
    be.bsz = batch_size
    return be
Exemple #16
0
def get_backend_pair_mkl(device_id, dtype=np.float32, bench=False):
    """
    Build an (MKL backend, CPU backend) pair sharing the same default dtype.

    Arguments:
        device_id: accepted but not used by this function.
        dtype (dtype, optional): default dtype handed to both backends.
        bench (bool, optional): accepted but not used by this function.

    Returns:
        tuple: (NervanaMKL instance, NervanaCPU instance)
    """
    mkl_backend = NervanaMKL(default_dtype=dtype)
    cpu_backend = NervanaCPU(default_dtype=dtype)
    return mkl_backend, cpu_backend
Exemple #17
0
from neon.backends.convolution import (_ceil_div,
    FpropCuda,   BpropCuda,   UpdateCuda,
    FpropDirect, BpropDirect, UpdateDirect)

from neon.backends.winograd_conv import (
    FpropWinograd_2x2_3x3, BpropWinograd_2x2_3x3, UpdateWinograd_3x3_2x2,
    FpropWinograd_4x4_3x3, BpropWinograd_4x4_3x3, UpdateWinograd_3x3_4x4,
    FpropWinograd_2x2_5x5, BpropWinograd_2x2_5x5)

# Kernel classes grouped by pass: forward (fprop), backward (bprop) and
# weight update. There is no 2x2_5x5 Winograd entry among the update kernels.
fprop_kernels  = (FpropCuda,  FpropDirect,  FpropWinograd_2x2_3x3,  FpropWinograd_4x4_3x3, FpropWinograd_2x2_5x5)
bprop_kernels  = (BpropCuda,  BpropDirect,  BpropWinograd_2x2_3x3,  BpropWinograd_4x4_3x3, BpropWinograd_2x2_5x5)
update_kernels = (UpdateCuda, UpdateDirect, UpdateWinograd_3x3_2x2, UpdateWinograd_3x3_4x4)

# Backends are created at module level, so this runs at import time.
# NOTE(review): NervanaGPU receives 0 as its first positional argument --
# confirm which constructor parameter that binds to.
ng = NervanaGPU(0)
nc = NervanaCPU()

# Report the name of the device behind the current driver context.
neon_logger.display(drv.Context.get_current().get_device().name())

# NOTE(review): `out` and `ones` look like flags used further down the
# script; their meaning is not visible here -- confirm against usage.
out =  0
ones = 0

# Convolution geometries. Each tuple holds 12 values laid out as in the header
# below: D, H, W, T, R, S, then three padding values and three stride values
# (presumably in d, h, w order -- TODO confirm).
#                D,   H,   W,  T, R, S,    pad,   str
conv_1x1     = ( 1,  14,  14,  1, 1, 1,  0,0,0, 1,1,1)
conv_3x3     = ( 1,  14,  14,  1, 3, 3,  0,1,1, 1,1,1)
conv_3x3p0   = ( 1,  14,  14,  1, 3, 3,  0,0,0, 1,1,1)
conv_3x3p2   = ( 1,  14,  14,  1, 3, 3,  0,2,2, 1,1,1)
conv_3x3s2   = ( 1,  14,  14,  1, 3, 3,  0,1,1, 1,2,2)
conv_1x3     = ( 1,  14,  14,  1, 1, 3,  0,0,1, 1,1,1)
conv_3x1     = ( 1,  14,  14,  1, 3, 1,  0,1,0, 1,1,1)
conv_5x5     = ( 1,  14,  14,  1, 5, 5,  0,2,2, 1,1,1)
Exemple #18
0
def test_conv_layer():
    """
    Compare convolution fprop, bprop and update between the GPU backend, the
    CPU backend and a numpy reference built from explicit dot products.

    Also prints the wall-clock time each backend took.
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C, K = 64, 64, 64
    D, H, W = 1, 5, 5
    T, R, S = 1, 3, 3
    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    conv_ng = ng.conv_layer(dtype, N, C, K, D, H, W, T, R, S, padding_d,
                            padding_h, padding_w, strides_d, strides_h,
                            strides_w)

    conv_nc = nc.conv_layer(dtype, N, C, K, D, H, W, T, R, S, padding_d,
                            padding_h, padding_w, strides_d, strides_h,
                            strides_w)

    # both backends must agree on the layer geometry before comparing data
    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    # the backends receive the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" %
          (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d

        for p in range(P):
            pr = p * str_h - pad_h

            for q in range(Q):
                qs = q * str_w - pad_w

                # input rows covered by the filter at this output position
                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])

                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])

                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    comparisons = (
        ("fprop", ngO, ncO, cpuO),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI)),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF)),
    )
    for op, ngA, ncA, cpuA in comparisons:
        # print() with a single argument behaves the same on Python 2 and 3
        print(op)
        assert np.allclose(ngA.get(), cpuA, rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    ng.ctx.detach()
    del ng
Exemple #19
0
def get_backend_pair(device_id, dtype=np.float32, bench=False):
    """
    Build a (GPU backend, CPU backend) pair sharing the same default dtype.

    Arguments:
        device_id: device identifier handed to the GPU backend.
        dtype (dtype, optional): default dtype handed to both backends.
        bench (bool, optional): bench flag handed to the GPU backend only.

    Returns:
        tuple: (NervanaGPU instance, NervanaCPU instance)
    """
    # imported lazily so the GPU module is only loaded when a pair is requested
    from neon.backends.nervanagpu import NervanaGPU

    gpu_backend = NervanaGPU(default_dtype=dtype, bench=bench,
                             device_id=device_id)
    cpu_backend = NervanaCPU(default_dtype=dtype)
    return gpu_backend, cpu_backend
Exemple #20
0
def gen_backend(backend='cpu',
                rng_seed=None,
                datatype=np.float32,
                batch_size=0,
                stochastic_round=False,
                device_id=0,
                max_devices=get_device_count(),
                compat_mode=None,
                deterministic_update=False,
                deterministic=True):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    Arguments:
        backend (string, optional): 'cpu', 'gpu', 'mgpu' or 'argon'. None is treated as 'cpu'.
        rng_seed (numeric, optional): Set this to a numeric value which can be used to seed the
                                      random number generator of the instantiated backend.
                                      Defaults to None, which doesn't explicitly seed (so each run
                                      will be different)
        datatype (dtype): Default tensor data type. CPU backend supports np.float64, np.float32 and
                          np.float16; GPU backend supports np.float32 and np.float16.
        batch_size (int): Set the size of the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer to implement
                                               stochastic rounding. If this is False rounding will
                                               be to nearest. If True will perform stochastic
                                               rounding using default bit width. If set to an
                                               integer will round to that number of bits.
                                               Only affects the gpu backend.
        device_id (numeric, optional): Set this to a numeric value which can be used to select
                                       device on which to run the process
        max_devices (int, optional): For use with multi-GPU backend only.
                                      Controls the maximum number of GPUs to run
                                      on.
        compat_mode (str, optional): if this is set to 'caffe' then the conv and pooling
                                     layer output sizes will match that of caffe as will
                                     the dropout layer implementation
        deterministic_update (bool, optional): deprecated; when true it forces
                                               deterministic to True and logs a warning.
        deterministic (bool, optional): if set to true, all operations will be done deterministically.

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: for 'gpu'/'mgpu' when the selected device reports a CUDA compute
                      capability below 3.0.
        ImportError: for 'mgpu' when the multi-GPU package cannot be imported.
        ValueError: when `backend` is not one of the supported names.

    Notes:
        * Attempts to construct a GPU instance without a CUDA capable card or without nervanagpu
          package installed will cause the program to display an error message and exit.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend only register this function once, will use
        # NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if deterministic_update:
        # the deprecated flag is honoured by switching on the new one
        deterministic = True
        logger.warning(
            "--deterministic_update is deprecated in favor of --deterministic")

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed,
                        default_dtype=datatype,
                        compat_mode=compat_mode)
    elif backend == 'gpu' or backend == 'mgpu':
        # check the compute capability of the requested device; a truthiness
        # test (rather than `is False`) also rejects non-builtin bool results
        from neon.backends.util import check_gpu
        if not (check_gpu.get_compute_capability(device_id) >= 3.0):
            raise RuntimeError("Device " + str(device_id) +
                               " does not have CUDA compute " +
                               "capability 3.0 or greater")
        if backend == 'gpu':
            from neon.backends.nervanagpu import NervanaGPU
            # init gpu
            be = NervanaGPU(rng_seed=rng_seed,
                            default_dtype=datatype,
                            stochastic_round=stochastic_round,
                            device_id=device_id,
                            compat_mode=compat_mode,
                            deterministic=deterministic)
        else:
            try:
                from mgpu.nervanamgpu import NervanaMGPU
                # init multiple GPU
                be = NervanaMGPU(rng_seed=rng_seed,
                                 default_dtype=datatype,
                                 stochastic_round=stochastic_round,
                                 num_devices=max_devices,
                                 compat_mode=compat_mode,
                                 deterministic=deterministic)
            except ImportError:
                # NOTE(review): the contact address in this message appears to
                # have been mangled ("[email protected]"); restore the real one.
                logger.error(
                    "Multi-GPU support is a premium feature "
                    "available exclusively through the Nervana cloud."
                    " Please contact [email protected] for details.")
                raise
    elif backend == 'argon':
        from argon.neon_backend.ar_backend import ArBackend
        be = ArBackend(rng_seed=rng_seed, default_dtype=datatype)
    else:
        # list every backend name this function accepts, including 'argon'
        raise ValueError(
            "backend must be one of ('cpu', 'gpu', 'mgpu', 'argon')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    # publish the backend globally and record the batch size on it
    NervanaObject.be = be
    be.bsz = batch_size
    return be
def test_conv_layer(fargs_tests):
    """
    Compare convolution results from the GPU and CPU backends against a
    reference assembled from plain numpy dot products.

    Arguments:
        fargs_tests: indexable test parameters; item 0 unpacks to (N, C, K),
                     item 1 to (D, H, W) and item 2 to (T, R, S).
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]

    # padding and strides are fixed here rather than taken from the fixture
    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    # both backends are handed exactly the same layer description
    layer_args = (dtype,
                  N, C, K,
                  D, H, W,
                  T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)

    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # the two layer objects have to agree on their geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # host-side random inputs, drawn in the order I, F, E
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # the last row of the input is zeroed as padding for the numpy reference
    cpuI[-1, :] = 0.0

    # ======= GPU and CPU backends =======
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    gpu_t0 = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    gpu_elapsed = default_timer() - gpu_t0

    cpu_t0 = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    cpu_elapsed = default_timer() - cpu_t0

    print("gputime: %s, cputime %s" % (gpu_elapsed, cpu_elapsed))

    # ======= numpy reference =======
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    # np.ndindex walks (m, p, q) in the same order as three nested range loops
    for m, p, q in np.ndindex(M, P, Q):
        # NOTE(review): pixel_indices presumably returns the rows of cpuI
        # covered by the filter at this output position - confirm in helper
        idx = pixel_indices(conv_nc,
                            m * str_d - pad_d,
                            p * str_h - pad_h,
                            q * str_w - pad_w)

        patch = cpuI[idx, :]
        err = cpuE[:, m, p, q, :]

        cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
        cpuB[idx, :] += np.dot(cpuF, err)
        cpuU += np.dot(patch, err.T)

    # the last entry of each tuple is carried along but not used by the checks
    checks = [
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S),
    ]

    for op, ngA, ncA, cpuA, _ in checks:
        print(op)
        assert np.allclose(ngA.get(), cpuA, rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-4)

    del ng, nc
Exemple #22
0
def gen_backend(backend='cpu',
                rng_seed=None,
                default_dtype=np.float32,
                batch_size=0,
                stochastic_round=False,
                device_id=0):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given. With no parameters, a single CPU core, float32
    backend is returned.

    The new backend is also stored in NervanaObject.be and its bsz attribute
    is set to batch_size. If a backend was already stored there,
    cleanup_backend() is called before the new one is built.

    Arguments:
        backend (string, optional): 'cpu' or 'gpu'. None is treated as 'cpu'.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend.  Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        default_dtype (dtype): Default tensor data type. CPU backend supports
                               np.float64, np.float32 and np.float16; GPU
                               backend supports np.float32 and np.float16.
        batch_size (int): Size of the data batches.
        stochastic_round (int/bool, optional): Set this to True or an integer
                                               to implement stochastic
                                               rounding.
                                               If this is False rounding will
                                               be to nearest.
                                               If True will perform stochastic
                                               rounding using default bit width.
                                               If set to an integer will round
                                               to that number of bits.
                                               Only affects the gpu backend.
        device_id (numeric, optional): Set this to a numeric value which can be
                                       used to select which device to run the
                                       process on

    Returns:
        Backend: newly constructed backend instance of the specified type.

    Raises:
        RuntimeError: backend is 'gpu' and the selected device reports a CUDA
                      compute capability below 5.0.
        NotImplementedError: backend is 'mgpu'.
        ValueError: backend is any other unrecognised value.

    Notes:
        * The backend modules are imported lazily inside the matching branch
          and the imports are not guarded, so a missing GPU package surfaces
          as an ImportError from this function.
    """
    logger = logging.getLogger(__name__)

    if NervanaObject.be is not None:
        # backend was already generated
        # clean it up first
        cleanup_backend()
    else:
        # at exit from python force cleanup of backend
        # only register this function once, will use
        # NervanaObject.be instead of a global
        atexit.register(cleanup_backend)

    if backend == 'cpu' or backend is None:
        from neon.backends.nervanacpu import NervanaCPU
        be = NervanaCPU(rng_seed=rng_seed, default_dtype=default_dtype)
    elif backend == 'gpu':
        # check the device before importing the GPU backend
        from neon.backends.util import check_gpu
        capable = check_gpu.get_compute_capability(device_id) >= 5.0
        # Test truthiness instead of `is False`: an identity check would let
        # a falsy non-bool result (e.g. numpy.bool_) slip through unnoticed.
        if not capable:
            raise RuntimeError("Device " + str(device_id) +
                               " does not have CUDA compute "
                               "capability 5.0 or greater")
        from neon.backends.nervanagpu import NervanaGPU
        # init gpu
        be = NervanaGPU(rng_seed=rng_seed,
                        default_dtype=default_dtype,
                        stochastic_round=stochastic_round,
                        device_id=device_id)
    elif backend == 'mgpu':
        raise NotImplementedError("mgpu will be ready soon")
    else:
        raise ValueError("backend must be one of ('cpu', 'gpu', 'mgpu')")

    logger.info("Backend: {}, RNG seed: {}".format(backend, rng_seed))

    # publish the backend for all NervanaObject users and record batch size
    NervanaObject.be = be
    be.bsz = batch_size
    return be
Exemple #23
0
    def setup(self):
        """
        Store a 1024 x 1024 shape on the instance and create one GPU backend
        (stochastic rounding disabled) and one CPU backend.
        """
        self.dims = (1024, 1024)

        # GPU backend is constructed before the CPU one
        self.gpu = NervanaGPU(stochastic_round=False)
        self.cpu = NervanaCPU()
def test_pool_layer(poolargs):
    """
    Compare pooling results from the GPU backend, the CPU backend and the
    numpy reference produced by run_numpy_pool.

    Arguments:
        poolargs: indexable test parameters; item 0 is the pooling op that is
                  passed to both backends (the value "max" selects a different
                  padding value for the reference input).
    """
    op = poolargs[0]

    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    pool_ng = ng.pool_layer(
        dtype,
        op,
        N,
        C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    pool_nc = nc.pool_layer(
        dtype,
        op,
        N,
        C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    # both backends must agree on the layer's input/output dimensions
    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors
    # (inputs are rounded through float16 before being cast back to dtype)
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1)).astype(
        np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # pad the last row of cpu input for the sake of numpy
    # NOTE(review): the most negative float is presumably used for "max" so
    # the padding row can never be picked as a maximum - confirm
    if op == "max":
        cpuI[-1, :] = np.finfo(dtype).min
    else:
        cpuI[-1, :] = 0

    # =========GPU and CPU and numpy ==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    for opA, ngA, ncA, cpuA in (
            ("fprop", ngO, ncO, cpuO),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI))):

        # print() call form: valid on Python 3 and prints the same on Python 2
        print(opA)
        assert np.allclose(ngA.get(), ncA.get(), rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    del ng, nc
def test_conv_layer(fargs_tests, device_id):
    """
    Compare convolution results from the GPU and CPU backends against a
    reference assembled from plain numpy dot products.

    Arguments:
        fargs_tests: indexable test parameters; items 0-4 unpack to
                     (N, C, K), (D, H, W), (T, R, S), the three paddings and
                     the three strides, in that order.
        device_id: forwarded to the NervanaGPU constructor.
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # both backends are handed exactly the same layer description
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)

    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # the two layer objects have to agree on their geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # host-side random inputs, drawn in the order I, F, E
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # the last row of the input is zeroed as padding for the numpy reference
    cpuI[-1, :] = 0.0

    # ======= GPU and CPU backends =======
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    gpu_t0 = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    gpu_elapsed = default_timer() - gpu_t0

    cpu_t0 = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    cpu_elapsed = default_timer() - cpu_t0

    print("gputime: %s, cputime %s" % (gpu_elapsed, cpu_elapsed))

    # ======= numpy reference =======
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        origin_d = m * str_d - pad_d

        for p in range(P):
            origin_h = p * str_h - pad_h

            for q in range(Q):
                origin_w = q * str_w - pad_w

                # NOTE(review): pixel_indices presumably returns the rows of
                # cpuI covered by the filter at this output position - confirm
                rows = pixel_indices(conv_nc, origin_d, origin_h, origin_w)

                patch = cpuI[rows, :]
                err = cpuE[:, m, p, q, :]

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
                cpuB[rows, :] += np.dot(cpuF, err)
                cpuU += np.dot(patch, err.T)

    # the last entry of each tuple is carried along but not used by the checks
    checks = [
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S),
    ]

    for op, ngA, ncA, cpuA, _ in checks:
        print(op)

        nc_host = ncA.get().astype(np.float32)
        ng_host = ngA.get().astype(np.float32)

        # largest absolute error relative to the largest reference magnitude
        ref_scale = abs(cpuA).max()
        nc_rel_err = abs(cpuA - nc_host).max() / ref_scale
        ng_rel_err = abs(cpuA - ng_host).max() / ref_scale

        assert nc_rel_err < 1e-5
        assert ng_rel_err < 1e-5
        assert allclose_with_out(ncA.get(), cpuA, rtol=0, atol=1e-4)
        assert allclose_with_out(ngA.get(), cpuA, rtol=0, atol=1e-3)

    del ng, nc