def isaacPool(ctx, stream, shapes, layouts):
    """Benchmark an ISAAC pooling kernel; return its throughput in TFLOPS.

    shapes is (dtype, Npix, Nfilt): a 1-D pooling problem over Npix output
    pixels with an Nfilt-wide window; every other tensor dimension is 1.
    layouts is forwarded verbatim to the kernel-template constructor.
    """
    # NOTE(review): a later definition of isaacPool appears further down in
    # this file and shadows this one at import time — confirm which is live.
    dtype, Npix, Nfilt = shapes
    N, K, M, P, Q = 1, 1, 1, 1, Npix
    T, R, S = 1, 1, Nfilt
    dtype = sc.dtype(dtype)
    pad_d, pad_h, pad_w = 0, 0, 0
    stride_d, stride_h, stride_w = 1, 1, 1
    # Input spatial extents, derived by inverting the usual output-size
    # formula (out = (in + 2*pad - window)/stride + 1).
    D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1
    H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1
    W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1
    # Generate, compile and load the pooling kernel.
    # NOTE(review): dtype was already converted above; sc.dtype() is applied
    # a second time here — presumably idempotent, verify against the sc API.
    generator = sc.templates.Pool(sc.dtype(dtype), K, D, H, W, N, M, P, Q,
                                  T, R, S, pad_d, pad_h, pad_w,
                                  stride_d, stride_h, stride_w, *layouts)
    src = generator.dump(ctx.device, "pool_fprop")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "pool_fprop")
    with lock:
        # Device buffers: output then input.
        O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, K * D * H * W * N * sc.size_of(dtype))
        # Time one enqueue+sync round and convert op count to TFLOPS.
        elapsed = benchmark(
            lambda: (generator.enqueue(kernel, stream, I, O),
                     stream.synchronize()),
            ctx.device, 1e-2)
        return M * P * Q * N * K * T * R * S / elapsed * 1e-12
def isaacGemm(ctx, stream, shapes, layouts):
    """Benchmark an ISAAC GEMM kernel; return its throughput in TFLOPS.

    shapes is (dtype, AT, BT, M, N, K) where AT/BT select the transposition
    of A and B. layouts is forwarded verbatim to the template constructor.
    """
    # NOTE(review): a later definition of isaacGemm appears further down in
    # this file and shadows this one at import time — confirm which is live.
    offa, offb, offc = 0, 0, 0
    dtype, AT, BT, M, N, K = shapes
    dtype = sc.dtype(dtype)
    AT, BT = sc.templates.op(AT), sc.templates.op(BT)
    # Column-major leading dimensions: a non-transposed operand is stored
    # with its first logical dimension leading.
    ldc = M
    lda = M if AT == sc.templates.OP_N else K
    ldb = K if BT == sc.templates.OP_N else N
    # Generate, compile and load the GEMM kernel.
    generator = sc.templates.GEMM(dtype, AT, BT, M, N, K,
                                  offa, lda, offb, ldb, offc, ldc, *layouts)
    src = generator.dump(ctx.device, "gemm")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "gemm")
    with lock:
        # Device buffers for C = alpha*A*B + beta*C.
        C = sc.driver.Buffer(ctx, M * N * sc.size_of(dtype))
        A = sc.driver.Buffer(ctx, M * K * sc.size_of(dtype))
        B = sc.driver.Buffer(ctx, K * N * sc.size_of(dtype))
        alpha = sc.Scalar(1., dtype)
        beta = sc.Scalar(0., dtype)
        # Time one enqueue+sync round; 2*M*N*K flops per GEMM.
        elapsed = benchmark(
            lambda: (generator.enqueue(kernel, stream, alpha, A, B, beta, C),
                     stream.synchronize()),
            ctx.device, 1e-2)
        return 2 * M * N * K / elapsed * 1e-12
def isaacConv(ctx, stream, shapes, layouts):
    """Benchmark an ISAAC forward-convolution kernel; return TFLOPS.

    shapes is (dtype, Npix, K, C, Nfilt): a 1-D convolution over Npix output
    pixels, C input channels, K output channels and an Nfilt-wide filter.
    layouts is forwarded verbatim to the template constructor.
    """
    dtype, Npix, K, C, Nfilt = shapes
    N, M, P, Q = 1, 1, 1, Npix
    T, R, S = 1, 1, Nfilt
    # NOTE(review): dtype code 8 selects float64, anything else float32 —
    # matches the convention used by the sibling isaac* helpers here.
    dtype = sc.float64 if dtype == 8 else sc.float32
    pad_d, pad_h, pad_w = 0, 0, 0
    stride_d, stride_h, stride_w = 1, 1, 1
    upsample_d, upsample_h, upsample_w = 1, 1, 1
    # Input spatial extents, derived by inverting the usual output-size
    # formula (out = (in + 2*pad - filter)/stride + 1).
    D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1
    H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1
    W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1
    # Generate, compile and load the fprop kernel (linear activation,
    # no residual connection); argument order is fixed by the sc API.
    generator = sc.templates.Conv(dtype, dtype, C, D, H, W, N, K, M, P, Q,
                                  T, R, S, pad_d, pad_h, pad_w,
                                  stride_d, stride_h, stride_w,
                                  upsample_d, upsample_h, upsample_w,
                                  sc.templates.LINEAR, 1,
                                  sc.templates.NO_RESIDUAL, 0,
                                  1, 1, 1, 1, 1, 1, *layouts)
    src = generator.dump(ctx.device, "conv_fprop")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "conv_fprop")
    with lock:
        # Device buffers: output, input, filters.
        O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, C * D * H * W * N * sc.size_of(dtype))
        F = sc.driver.Buffer(ctx, C * T * R * S * K * sc.size_of(dtype))
        alpha = sc.Scalar(1., dtype)
        beta = sc.Scalar(0., dtype)
        # Time one enqueue+sync round; 2 flops per MAC.
        elapsed = benchmark(
            lambda: (generator.enqueue(kernel, stream, I, F, O, None,
                                       1., 1., 1., [1.], 1., None),
                     stream.synchronize()),
            ctx.device, 1e-2)
        return 2 * M * P * Q * N * K * C * T * R * S / elapsed * 1e-12
def cudaGemm(ctx, stream, dtype, AT, BT, M, N, K):
    """Benchmark cuBLAS GEMM for the given problem; return TFLOPS.

    AT/BT == 1 means the operand is NOT transposed ('N'), anything else
    selects 'T'. Buffers are left uninitialized: only timing matters.
    """
    # Column-major leading dimensions.
    ldc = M
    lda = M if AT == 1 else K
    ldb = K if BT == 1 else N
    dtype = sc.dtype(dtype)
    elt = sc.size_of(dtype)
    C = sc.driver.Buffer(ctx, M * N * elt)
    A = sc.driver.Buffer(ctx, M * K * elt)
    B = sc.driver.Buffer(ctx, K * N * elt)
    alpha = sc.Scalar(1., dtype)
    beta = sc.Scalar(0., dtype)
    # Hoist the op-character selection out of the timed lambda.
    opA = 'N' if AT == 1 else 'T'
    opB = 'N' if BT == 1 else 'T'
    elapsed = benchmark(
        lambda: (sc.driver.cublasGemm(dtype, ctx, stream, opA, opB, M, N, K,
                                      alpha, A, lda, B, ldb, beta, C, ldc),
                 stream.synchronize()),
        ctx.device, 1e-2)
    # 2*M*N*K flops per GEMM.
    return 2 * M * N * K / elapsed * 1e-12
def cudaConv(ctx, stream, dtype, N, K, P, Q, C, R, S):
    """Benchmark a cuDNN forward convolution; return TFLOPS.

    (N, K, P, Q, C, R, S) follow the usual conv naming: batch, output
    channels, output height/width, input channels, filter height/width.
    Buffers are left uninitialized: only timing matters.
    """
    pad_h, pad_w = 0, 0
    stride_h, stride_w = 1, 1
    # Input extents, derived by inverting the output-size formula with
    # unit stride.
    H = P * stride_h + R - 1 - 2 * pad_h
    W = Q * stride_w + S - 1 - 2 * pad_w
    dtype = sc.dtype(dtype)
    elt = sc.size_of(dtype)
    O = sc.driver.Buffer(ctx, K * P * Q * N * elt)
    I = sc.driver.Buffer(ctx, C * H * W * N * elt)
    F = sc.driver.Buffer(ctx, C * R * S * K * elt)
    alpha = sc.Scalar(1., dtype)
    beta = sc.Scalar(0., dtype)
    elapsed = benchmark(
        lambda: (sc.driver.cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q,
                                     C, R, S, pad_h, pad_w,
                                     stride_h, stride_w,
                                     alpha, I, F, beta, O),
                 stream.synchronize()),
        ctx.device, 1e-2)
    # 2 flops per MAC.
    return 2 * P * Q * K * N * C * R * S / elapsed * 1e-12
def isaacPool(ctx, stream, shapes, layouts):
    """Benchmark an ISAAC max-pooling kernel; return TFLOPS.

    shapes is (dtype, Npix, Nfilt): a 1-D pooling problem over Npix output
    pixels with an Nfilt-wide window; every other tensor dimension is 1.
    layouts is forwarded verbatim to the template constructor.
    """
    dtype, Npix, Nfilt = shapes
    N, K, M, P, Q = 1, 1, 1, 1, Npix
    T, R, S = 1, 1, Nfilt
    # NOTE(review): dtype code 8 selects float64, anything else float32 —
    # matches the convention used by the sibling isaac* helpers here.
    dtype = sc.float64 if dtype == 8 else sc.float32
    pad_d, pad_h, pad_w = 0, 0, 0
    stride_d, stride_h, stride_w = 1, 1, 1
    # Input spatial extents, derived by inverting the usual output-size
    # formula (out = (in + 2*pad - window)/stride + 1).
    D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1
    H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1
    W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1
    # Generate, compile and load the max-pooling fprop kernel.
    generator = sc.templates.Pool(dtype, dtype, sc.templates.MAX_POOL,
                                  K, D, H, W, N, M, P, Q, T, R, S,
                                  pad_d, pad_h, pad_w,
                                  stride_d, stride_h, stride_w, *layouts)
    src = generator.dump(ctx.device, "pool_fprop")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "pool_fprop")
    with lock:
        # Device buffers: output then input.
        O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype))
        I = sc.driver.Buffer(ctx, K * D * H * W * N * sc.size_of(dtype))
        # Time one enqueue+sync round and convert op count to TFLOPS.
        elapsed = benchmark(
            lambda: (generator.enqueue(kernel, stream, I, O, 1., 1.),
                     stream.synchronize()),
            ctx.device, 1e-2)
        return M * P * Q * N * K * T * R * S / elapsed * 1e-12
def isaacGemm(ctx, stream, shapes, layouts):
    """Benchmark an ISAAC GEMM kernel; return TFLOPS.

    shapes is (dtype, AT, BT, M, N, K) where AT/BT select the transposition
    of A and B. layouts is forwarded verbatim to the template constructor.
    """
    offa, offb, offc = 0, 0, 0
    dtype, AT, BT, M, N, K = shapes
    # NOTE(review): dtype code 8 selects float64, anything else float32 —
    # matches the convention used by the sibling isaac* helpers here.
    dtype = sc.float64 if dtype == 8 else sc.float32
    AT, BT = sc.templates.op(AT), sc.templates.op(BT)
    # Column-major leading dimensions: a non-transposed operand is stored
    # with its first logical dimension leading.
    ldc = M
    lda = M if AT == sc.templates.OP_N else K
    ldb = K if BT == sc.templates.OP_N else N
    # Generate, compile and load the GEMM kernel.
    generator = sc.templates.GEMM(dtype, dtype, AT, BT, M, N, K,
                                  offa, lda, offb, ldb, offc, ldc, *layouts)
    src = generator.dump(ctx.device, "gemm")
    module = sc.driver.Module(ctx, src)
    kernel = sc.driver.Kernel(module, "gemm")
    with lock:
        # Device buffers for C = alpha*A*B + beta*C.
        C = sc.driver.Buffer(ctx, M * N * sc.size_of(dtype))
        A = sc.driver.Buffer(ctx, M * K * sc.size_of(dtype))
        B = sc.driver.Buffer(ctx, K * N * sc.size_of(dtype))
        alpha = sc.Scalar(1., dtype)
        beta = sc.Scalar(0., dtype)
        # Time one enqueue+sync round; 2*M*N*K flops per GEMM.
        elapsed = benchmark(
            lambda: (generator.enqueue(kernel, stream, alpha, A, B, beta, C,
                                       1., 1., 1., None),
                     stream.synchronize()),
            ctx.device, 1e-2)
        return 2 * M * N * K / elapsed * 1e-12