def test_conv_layer():
    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True)
    nc = NervanaCPU()

    N, C, K = 64, 64, 64
    D, H, W = 1, 5, 5
    T, R, S = 1, 3, 3
    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    conv_ng = ng.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)
    conv_nc = nc.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" % (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ
    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])
                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])
                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    for op, ngA, ncA, cpuA, w in (
            ("fprop", ngO, ncO, cpuO, Q),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
            ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S)):
        print(op)
        assert np.allclose(ngA.get(), cpuA, rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-5)

    ng.ctx.detach()
    del ng
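
# --- Sketch of the helpers the test above relies on (not shown in this excerpt). ---
# These are assumptions consistent with how the test uses them: slicable(dimI, 1)
# must yield a 2-D shape with one extra pad row (zeroed above), and pixel_indices
# must return flat row indices into that array for the C*T*R*S taps feeding one
# output pixel, with out-of-range taps pointing at the zero pad row.
from functools import reduce
from operator import mul


def slicable(dim, pad=0):
    # Collapse all but the last dimension into one, optionally adding `pad`
    # extra rows, so the CPU reference can gather pixels with a flat index list.
    dim0 = reduce(mul, dim[:-1]) + pad
    return (dim0, dim[-1])


def pixel_indices(conv, mt, pr, qs):
    # Flat row indices of the input pixels under the filter window anchored at
    # (mt, pr, qs); taps outside the volume map to the appended zero row.
    T, R, S = conv.TRS
    D, H, W = conv.DHW
    C = conv.C
    HW = H * W
    DHW = D * H * W
    imax = C * DHW  # index of the zero pad row appended by slicable(dimI, 1)

    idx = []
    for c in range(C):
        ci = c * DHW
        for t in range(T):
            z = mt + t
            zi = ci + z * HW
            zb = 0 <= z < D
            for r in range(R):
                y = pr + r
                yi = zi + y * W
                yb = zb and 0 <= y < H
                for s in range(S):
                    x = qs + s
                    idx.append(yi + x if yb and 0 <= x < W else imax)
    return idx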
        dict(alpha=2.0, beta=3.0),
        dict(),
    ]

    for config in configs:
        kernelClass, N, C, K, determ, compound, override, convs = config
        for conv in convs:
            D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w = conv
            ng.deterministic = determ

            layer = nc.conv_layer(np.float64, N, C, K, D, H, W, T, R, S,
                                  pad_d, pad_h, pad_w, str_d, str_h, str_w)
            (M, P, Q) = layer.MPQ

            if kernelClass in (FpropCuda, BpropCuda, UpdateCuda):
                dtypes = (np.float32,)
            else:
                dtypes = (np.float32, np.float16)

            for dtype in dtypes:
                ng.scratch_buffer_reset()
                if override is None:
                    kernel = kernelClass(ng, np.dtype(dtype),
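
# --- Hypothetical illustration (not from this excerpt) of one `configs` entry. ---
# The loop above unpacks each entry as
#     kernelClass, N, C, K, determ, compound, override, convs = config
# so an entry could look like the tuple below. The specific numbers, and the
# meanings guessed in the comments, are placeholders rather than values taken
# from the original test suite.
example_config = (
    FpropCuda,       # kernel class under test
    32, 32, 32,      # N, C, K
    True,            # determ: run with ng.deterministic enabled
    False,           # compound: presumably exercise compound (alpha/beta) paths
    None,            # override: alternate kernel kwargs, or None for defaults
    [
        # D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w
        (1, 14, 14, 1, 3, 3, 0, 1, 1, 1, 1, 1),
    ],
)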
def test_conv_layer(fargs_tests):
    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]

    padding_d, padding_h, padding_w = 0, 1, 1
    strides_d, strides_h, strides_w = 1, 1, 1

    conv_ng = ng.conv_layer(
        dtype, N, C, K, D, H, W, T, R, S,
        padding_d, padding_h, padding_w,
        strides_d, strides_h, strides_w)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(
        dtype, N, C, K, D, H, W, T, R, S,
        padding_d, padding_h, padding_w,
        strides_d, strides_h, strides_w)

    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" % (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ
    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])
                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])
                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    for op, ngA, ncA, cpuA, w in (
            ("fprop", ngO, ncO, cpuO, Q),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
            ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S)):
        print(op)
        assert np.allclose(ngA.get(), cpuA, rtol=0, atol=1e-4)
        assert np.allclose(ncA.get(), cpuA, rtol=0, atol=1e-4)

    del ng
    del nc
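
# --- Minimal sketch of a conftest fixture that could drive the test above. ---
# This version of test_conv_layer only indexes fargs_tests as
# [(N, C, K), (D, H, W), (T, R, S)]; the parameter values below are
# placeholders for illustration, not the project's actual test sweep.
import pytest


@pytest.fixture(params=[
    [(64, 64, 64), (1, 5, 5), (1, 3, 3)],
    [(32, 32, 32), (1, 7, 7), (1, 3, 3)],
])
def fargs_tests(request):
    return request.param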
def test_conv_layer(fargs_tests, device_id):
    dtype = np.float32
    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    conv_ng = ng.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" % (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ
    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])
                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])
                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    for op, ngA, ncA, cpuA, w in (
            ("fprop", ngO, ncO, cpuO, Q),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
            ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S)):
        print(op)

        ncAnp = ncA.get().astype(np.float32)
        ngAnp = ngA.get().astype(np.float32)
        ncdif = cpuA - ncAnp
        ngdif = cpuA - ngAnp
        maxval = abs(cpuA).max()
        ncmaxdif = abs(ncdif).max()
        ngmaxdif = abs(ngdif).max()
        ncRatio = ncmaxdif / maxval
        ngRatio = ngmaxdif / maxval

        assert ncRatio < 1e-5
        assert ngRatio < 1e-5
        assert allclose_with_out(ncA.get(), cpuA, rtol=0, atol=1e-4)
        assert allclose_with_out(ngA.get(), cpuA, rtol=0, atol=1e-3)

    del ng
    del nc
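
# --- Sketch of the allclose_with_out helper used above (assumed behavior). ---
# The test only requires that it behave like np.allclose while surfacing some
# diagnostics; this sketch prints the largest absolute difference when the
# comparison fails, then returns the boolean result.
import numpy as np


def allclose_with_out(x, y, rtol=1e-5, atol=1e-8):
    ok = np.allclose(x, y, rtol=rtol, atol=atol)
    if not ok:
        print("max abs diff: %s (rtol=%s, atol=%s)" %
              (np.max(np.abs(x - y)), rtol, atol))
    return ok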