def isaacGemm(ctx, stream, shapes, layouts): # Shapes offa, offb, offc = 0, 0, 0 dtype, AT, BT, M, N, K = shapes dtype = sc.dtype(dtype) AT, BT = sc.templates.op(AT), sc.templates.op(BT) ldc = M lda = M if AT == sc.templates.OP_N else K ldb = K if BT == sc.templates.OP_N else N # Kernel generator = sc.templates.GEMM(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, *layouts) src = generator.dump(ctx.device, "gemm") module = sc.driver.Module(ctx, src) kernel = sc.driver.Kernel(module, "gemm") with lock: # BuffeNfilt C = sc.driver.Buffer(ctx, M * N * sc.size_of(dtype)) A = sc.driver.Buffer(ctx, M * K * sc.size_of(dtype)) B = sc.driver.Buffer(ctx, K * N * sc.size_of(dtype)) alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype) # Result ts = benchmark( lambda: (generator.enqueue(kernel, stream, alpha, A, B, beta, C), stream.synchronize()), ctx.device, 1e-2) tflops = 2 * M * N * K / ts * 1e-12 return tflops
def isaacConv(ctx, stream, shapes, layouts): # Shapes dtype, Npix, K, C, Nfilt = shapes N, M, P, Q = 1, 1, 1, Npix T, R, S = 1, 1, Nfilt dtype = sc.dtype(dtype) pad_d, pad_h, pad_w, stride_d, stride_h, stride_w = 0, 0, 0, 1, 1, 1 D = M * stride_d + T - 1 - 2 * pad_d - stride_d + 1 H = P * stride_h + R - 1 - 2 * pad_h - stride_h + 1 W = Q * stride_w + S - 1 - 2 * pad_w - stride_w + 1 # Kernel generator = sc.templates.Conv(sc.dtype(dtype), C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, *layouts) src = generator.dump(ctx.device, "conv_fprop") module = sc.driver.Module(ctx, src) kernel = sc.driver.Kernel(module, "conv_fprop") with lock: # Buffers O = sc.driver.Buffer(ctx, K * M * P * Q * N * sc.size_of(dtype)) I = sc.driver.Buffer(ctx, C * D * H * W * N * sc.size_of(dtype)) F = sc.driver.Buffer(ctx, C * T * R * S * K * sc.size_of(dtype)) alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype) # Result time = benchmark( lambda: (generator.enqueue(kernel, stream, alpha, I, F, beta, O), stream.synchronize()), ctx.device, 1e-2) tflops = 2 * M * P * Q * N * K * C * T * R * S / time * 1e-12 return tflops
def cudaGemm(ctx, stream, dtype, AT, BT, M, N, K): ldc = M lda = M if AT==1 else K ldb = K if BT==1 else N dtype = sc.dtype(dtype) C = sc.driver.Buffer(ctx, M*N*sc.size_of(dtype)) A = sc.driver.Buffer(ctx, M*K*sc.size_of(dtype)) B = sc.driver.Buffer(ctx, K*N*sc.size_of(dtype)) alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype) time = benchmark(lambda: (sc.driver.cublasGemm(dtype, ctx, stream, 'N' if AT==1 else 'T', 'N' if BT==1 else 'T', M, N, K, alpha, A, lda, B, ldb, beta, C, ldc), stream.synchronize()), ctx.device, 1e-2) tflops = 2*M*N*K/time*1e-12 return tflops
def cudaConv(ctx, stream, dtype, N, K, P, Q, C, R, S): pad_h, pad_w, stride_h, stride_w = 0, 0, 1, 1 H = P*stride_h + R - 1 - 2*pad_h W = Q*stride_w + S - 1 - 2*pad_w dtype = sc.dtype(dtype) O = sc.driver.Buffer(ctx, K*P*Q*N*sc.size_of(dtype)) I = sc.driver.Buffer(ctx, C*H*W*N*sc.size_of(dtype)) F = sc.driver.Buffer(ctx, C*R*S*K*sc.size_of(dtype)) alpha, beta = sc.Scalar(1., dtype), sc.Scalar(0., dtype) time = benchmark(lambda: (sc.driver.cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O), stream.synchronize()), ctx.device, 1e-2) tflops = 2*P*Q*K*N*C*R*S/time*1e-12 return tflops