def test_conv_backward_pass_options(self):
    lang = CONV_TRAIN
    N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(
        lang, training=True, name="convolution", backward="convolution_grad",
        constants={"sh": sH, "sw": sW})
    I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
    # W now names the weight tensor; the width dimension above has already
    # been consumed by torch.randn, so the rebinding is harmless.
    W = Parameter(torch.randn(O, C, kH, kW).cuda())
    out = convolution(I, W, options=[tc.Options("conv"), tc.Options("group_conv")])
    out.sum().backward()
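# A possible extra assertion for the test above (not in the original): with
# 1x1 kernels and unit stride the TC convolution reduces to a plain conv2d,
# so the forward output can be cross-checked; the tolerance is an assumption.
def check_conv_forward(out, I, W):
    import torch.nn.functional as F
    ref = F.conv2d(I, W)
    assert (out.data - ref.data).abs().max() < 1e-4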
def test_mv(R, C):
    mat = Variable(torch.randn(R, C).cuda())
    vector = Variable(torch.randn(C).cuda())
    out_tc = mv(mat, vector, options=tc.Options("mlp"))
    out_pt = torch.mv(mat, vector)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist(), abs=1e-4)
def test_train_matmul(self):
    LANG = """
    def matmul(float(M,N) A, float(N,K) B) -> (output) {
        output(i, j) +=! A(i, kk) * B(kk, j)
    }
    def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) O_grad) -> (A_grad, B_grad) {
        A_grad(i, j) +=! O_grad(i, kk) * B(j, kk)
        B_grad(i, j) +=! O_grad(kk, j) * A(kk, i)
    }
    """
    matmul = tc.define(LANG, name="matmul", training=True, backward="matmul_grad")
    mat1 = Parameter(torch.randn(3, 4).cuda())
    mat2 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
    out = matmul(mat1, mat2, options=[tc.Options("mlp"), tc.Options("mlp")])
    out.sum().backward()
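# A minimal sketch (not part of the original test) that sanity-checks the
# hand-written matmul_grad against PyTorch autograd; shapes and the 1e-4
# tolerance are assumptions. For O = A @ B with dO = ones: dA = dO @ B^T and
# dB = A^T @ dO, which is exactly the A_grad/B_grad reduction in LANG above.
def check_matmul_grads():
    a = Variable(torch.randn(3, 4).cuda(), requires_grad=True)
    b = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
    torch.matmul(a, b).sum().backward()
    dO = torch.ones(3, 5).cuda()
    assert (a.grad.data - torch.matmul(dO, b.data.t())).abs().max() < 1e-4
    assert (b.grad.data - torch.matmul(a.data.t(), dO)).abs().max() < 1e-4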
def test_absolute(self):
    LANG = """
    def abs(float(M, N) A) -> (O1) {
        O1(m, n) = fabs(A(m, n))
    }
    """
    absolute = tc.define(LANG, name="abs")
    A = -1 * torch.randn(3, 4).cuda()
    out = absolute(A, options=tc.Options("pointwise"))
def test_matmul(M, N, K):
    mat1 = Variable(torch.randn(M, N).cuda())
    mat2 = Variable(torch.randn(N, K).cuda())
    matmul.autotune(mat1, mat2, cache=True, options=tc.Options("mlp"),
                    **tc.autotuner_settings)
    out_tc = matmul(mat1, mat2)
    out_pt = torch.matmul(mat1, mat2)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist(), abs=1e-4)
def tmm(M, K, N, **compare_kwargs):
    global A, B, tc_tmm
    print('tmm(M={}, K={}, N={})'.format(M, K, N))
    A = Variable(torch.Tensor(M, K).cuda().normal_())
    B = Variable(torch.Tensor(N, K).cuda().normal_())
    tc_tmm = tc.define('''
    def tmm(float(M, K) A, float(N, K) B) -> (C) {
        C(m, n) +=! A(m, kk) * B(n, kk)
    }''', name='tmm')
    tc_tmm.autotune(A, B, options=tc.Options('mlp'), **autotune_kwargs)
    compare('tc_tmm(A, B)', 'torch.mm(A, B.t())', **compare_kwargs)
def tbmm(B, M, K, N, **compare_kwargs):
    global X, Y, tc_tbmm
    print('tbmm(B={}, M={}, K={}, N={})'.format(B, M, K, N))
    X = Variable(torch.Tensor(B, N, M).cuda().normal_())
    Y = Variable(torch.Tensor(B, K, M).cuda().normal_())
    tc_tbmm = tc.define('''
    def tbmm(float(B, N, M) X, float(B, K, M) Y) -> (Z) {
        Z(b, n, k) +=! X(b, n, m) * Y(b, k, m)
    }''', name='tbmm')
    tc_tbmm.autotune(X, Y, options=tc.Options('mlp'), **autotune_kwargs)
    compare('tc_tbmm(X, Y)', 'torch.bmm(X, Y.transpose(1, 2))', **compare_kwargs)
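# The compare() helper used by tmm/tbmm above is not shown in this snippet.
# A plausible minimal sketch (its signature and tolerances are assumptions):
# evaluate both expression strings against the module-level globals the
# benchmarks set up, then check the results agree elementwise.
def compare(tc_expr, pt_expr, rtol=1e-4, atol=1e-5):
    tc_out = eval(tc_expr)   # e.g. 'tc_tmm(A, B)'
    pt_out = eval(pt_expr)   # e.g. 'torch.mm(A, B.t())'
    diff = (tc_out.data - pt_out.data).abs().max()
    print('max abs diff: {}'.format(diff))
    assert diff < atol + rtol * pt_out.data.abs().max()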
def autotune():
    input, running_mean, running_std, weight, bias, params = generate_data()
    input = input.transpose(0, 1).contiguous().view(input.shape[1], -1)
    grad_output = input.clone()
    options = tc.Options("mlp")
    tuner_kwargs = dict(options=options, generations=1, pop_size=10,
                        crossover_rate=80, number_elites=1, threads=20)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_mean_std,
                              input, params, **tuner_kwargs)
    batchMean, batchStd = BatchReNorm2dTCFunction.calc_mean_std(input, params)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_r_d,
                              batchStd, batchMean, running_mean, running_std,
                              params, **tuner_kwargs)
    r, d = BatchReNorm2dTCFunction.calc_r_d(batchStd, batchMean, running_mean,
                                            running_std, params)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_O,
                              input, weight, bias, batchStd, batchMean, r, d,
                              **tuner_kwargs)
    O = BatchReNorm2dTCFunction.calc_O(input, weight, bias, batchStd,
                                       batchMean, r, d)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_running_mean_std,
                              batchStd, batchMean, running_mean, running_std,
                              params, **tuner_kwargs)
    rMeanOut, rStdOut = BatchReNorm2dTCFunction.calc_running_mean_std(
        batchStd, batchMean, running_mean, running_std, params)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_xHat_grad,
                              weight, grad_output, **tuner_kwargs)
    xHat_grad = BatchReNorm2dTCFunction.calc_xHat_grad(weight, grad_output)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_mean_std_grad,
                              input, batchMean, batchStd, r, xHat_grad,
                              **tuner_kwargs)
    batchMean_grad, batchStd_grad = BatchReNorm2dTCFunction.calc_mean_std_grad(
        input, batchMean, batchStd, r, xHat_grad)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_xHat,
                              input, batchMean, batchStd, r, d, **tuner_kwargs)
    xHat = BatchReNorm2dTCFunction.calc_xHat(input, batchMean, batchStd, r, d)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_weight_bias_grad,
                              grad_output, xHat, **tuner_kwargs)
    weight_grad, bias_grad = BatchReNorm2dTCFunction.calc_weight_bias_grad(
        grad_output, xHat)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_I_grad,
                              input, batchMean, batchStd, r, xHat_grad,
                              batchMean_grad, batchStd_grad, **tuner_kwargs)
    I_grad = BatchReNorm2dTCFunction.calc_I_grad(input, batchMean, batchStd, r,
                                                 xHat_grad, batchMean_grad,
                                                 batchStd_grad)
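# autotune_with_named_cache() is referenced above but not defined in this
# snippet. A minimal sketch, assuming the intent is simply to give each TC
# function its own tuning-cache file (the /tmp naming scheme is invented):
_cache_ids = {}

def autotune_with_named_cache(tc_fn, *inputs, **tuner_kwargs):
    # Key each TC function object to a stable, unique cache file.
    cache_id = _cache_ids.setdefault(id(tc_fn), len(_cache_ids))
    cache_file = '/tmp/tc_cache_{}'.format(cache_id)
    return tc_fn.autotune(*inputs, cache=cache_file, **tuner_kwargs)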
def test_layernorm(self):
    # NOTE: take note of use of {{ }} below for handling TC with scalars
    lang = """
    def layernorm(float(T, B, C) I) -> (O, mean, centered, var) {{
        mean(t, b) +=! I(t, b, c) / C
        centered(t, b, c) = I(t, b, c) - mean(t, b)
        var(t, b) +=! centered(t, b, c) * centered(t, b, c)
        var(t, b) = (var(t, b) + {eps}) / C
        O(t, b, c) = centered(t, b, c) * rsqrt(var(t, b))
    }}
    """
    # Note: normalization multiplies by rsqrt(var), i.e. divides by sqrt(var);
    # dividing by rsqrt(var), as some TC examples do, would multiply by the
    # standard deviation instead.
    layernorm = tc.define(lang, name="layernorm", constants={"eps": 1e-5})
    inp = torch.randn(7, 32, 64).cuda()
    options = layernorm.autotune(inp, options=tc.Options("mlp"),
                                 **tc.autotuner_settings)
    out = layernorm(inp, options=options)
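# A quick cross-check one could append (not in the original test): the TC
# definition above normalizes over the last dimension, so O (the first output,
# assuming TC returns outputs in declaration order) should match the same
# computation done manually in PyTorch; the 1e-3 tolerance is an assumption.
def check_layernorm(O, inp, eps=1e-5):
    C = inp.size(2)
    mean = inp.mean(dim=2, keepdim=True)
    centered = inp - mean
    # Mirrors the TC definition: var = (sum(centered^2) + eps) / C.
    var = (centered.pow(2).sum(dim=2, keepdim=True) + eps) / C
    ref = centered * var.rsqrt()
    assert (O - ref).abs().max() < 1e-3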
def gconv(N, G, F, C, W, H, KH, KW, **compare_kwargs):
    global tc_I, tc_W1, tc_gconv, nn_I, nn_gconv
    print('gconv(N={}, G={}, F={}, C={}, W={}, H={}, KH={}, KW={})'.format(
        N, G, F, C, W, H, KH, KW))
    tc_I = Variable(torch.Tensor(N, G, C, H, W).cuda().normal_())
    nn_I = tc_I.view(N, G * C, H, W)
    nn_gconv = nn.Conv2d(G * C, G * F, (KH, KW), groups=G, bias=False).cuda()
    tc_W1 = nn_gconv.weight.view(G, F, C, KH, KW)
    # The weight shape is (G, F, C, KH, KW), matching the view above.
    tc_gconv = tc.define('''
    def gconv(float(N, G, C, H, W) I, float(G, F, C, KH, KW) W1) -> (O) {
        O(n, g, o, h, w) +=! I(n, g, i, h + kh, w + kw) * W1(g, o, i, kh, kw)
    }''', name='gconv')
    tc_gconv.autotune(tc_I, tc_W1, options=tc.Options('group_conv'),
                      **autotune_kwargs)
    compare('tc_gconv(tc_I, tc_W1)', 'nn_gconv(nn_I)', **compare_kwargs)
def build_model_tc(self):
    import tensor_comprehensions as tc
    lang = """
    def convolution(float(N,CI,H,W) I, float(CO,CI,KH,KW) W1) -> (O) {
        O(n, co, h, w) +=! I(n, ci, h + kh, w + kw) * W1(co, ci, kh, kw)
    }
    """
    convolution = tc.define(lang, name="convolution")
    inp, kern = self.get_dataset()
    if self.params.backend_opts['tc_autotune']:
        convolution.autotune(
            inp, kern, cache=self.get_tc_cache(), options=tc.Options("conv"),
            generations=self.params.backend_opts["tc_at_generations"],
            pop_size=self.params.backend_opts["tc_at_population"],
            number_elites=1, threads=8)
    return convolution
def build_model_tc(self):
    actmap = {"relu": "fmax(OUT(n, i, j), 0)"}
    import tensor_comprehensions as tc
    # NOTE: the closing "}" is deliberately left off the string so an optional
    # activation statement can be appended first.
    lang = """
    def matmul(float(N, I, K) IN, float(K, J) W, float(J) B) -> (OUT) {
        OUT(n, i, j) +=! IN(n, i, k) * W(k, j)
        OUT(n, i, j) = OUT(n, i, j) + B(j)
    """
    if self.activation:
        lang += "OUT(n, i, j) = {}\n".format(actmap[self.activation])
    lang += "}"
    inp, wgt, bias = self.get_dataset()
    matmul = tc.define(lang, name="matmul")
    matmul.autotune(
        inp, wgt, bias, cache=self.get_tc_cache(), options=tc.Options("mlp"),
        generations=self.params.backend_opts['tc_at_generations'],
        pop_size=self.params.backend_opts['tc_at_population'],
        number_elites=1, threads=8)
    return matmul
def test_maxpool(B, C, H, W):
    tensor = Variable(torch.randn(B, C, H, W).cuda())
    out_tc = maxpool(tensor, options=tc.Options("conv"))
    out_pt = F.max_pool2d(tensor, kernel_size=2, stride=1)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist())
def test_abs(M, N):
    mat = Variable(torch.randn(M, N).cuda())
    out_tc = abs(mat, options=tc.Options("pointwise"))
    out_pt = torch.abs(mat)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist())
import tensor_comprehensions as tc
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter

matmul = tc.define(tc.database['matmul']['lang'], name='matmul')
mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
out = matmul(mat1, mat2, options=tc.Options("mlp"))
print(out)

CONV_LANG = """
def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
    O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
}}
def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) O_grad)
-> (I_grad, W1_grad) {{
    I_grad(n, c, h, w) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
    W1_grad(m, c, kh, kw) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
}}
"""
N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
convolution = tc.define(CONV_LANG, training=True, name="convolution",
                        backward="convolution_grad",
                        constants={"sh": sH, "sw": sW})
I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
# W is rebound from the width dimension to the weight tensor here.
W = Parameter(torch.randn(O, C, kH, kW).cuda())
out = convolution(I, W, options=[tc.Options("conv"), tc.Options("group_conv")])
out[0].sum().backward()

lang = """
def matmul(float(M,N) A, float(N,K) B) -> (output) {
    output(i, j) +=! A(i, kk) * B(kk, j)
}
"""
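# The lang string above defines only the forward matmul; a plausible
# continuation (mirroring the matmul_grad definition from test_train_matmul
# earlier in this section, so the grad TC itself is not an invention here)
# wires it up for training:
lang += """
def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) O_grad) -> (A_grad, B_grad) {
    A_grad(i, j) +=! O_grad(i, kk) * B(j, kk)
    B_grad(i, j) +=! O_grad(kk, j) * A(kk, i)
}
"""
matmul = tc.define(lang, training=True, name="matmul", backward="matmul_grad")
mat1 = Parameter(torch.randn(3, 4).cuda())
mat2 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
out = matmul(mat1, mat2, options=[tc.Options("mlp"), tc.Options("mlp")])
out.sum().backward()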
    # Tail of the timing helper test(options, n), whose body precedes this
    # fragment; it returns a list of per-run times.
    return times

def autotune(cache_file='tc_cache'):
    print("Starting autotune")
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    best_opts = sgemm.autotune(A, B, cache=cache_file, generations=25,
                               pop_size=50, crossover_rate=70,
                               number_elites=5, gpus="1,2,3")
    print("Done autotune")
    print(sorted(test(best_opts, 20))[10])
    return best_opts

def load_cache(cache_file='tc_cache'):
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    # Couldn't find a reasonable way to load cache:
    return sgemm.autotune(A, B, cache=cache_file, generations=0)

# autotune()
print("naive:", sorted(test(tc.Options("naive"), 10))[5])
print("autotuned:", sorted(test(load_cache(), 100))[50])
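# The full test() helper is not shown above (only its trailing "return times"
# survives). A minimal sketch of what it plausibly does, assuming sgemm takes
# (M, K) x (K, N) inputs and CUDA events are used for timing in milliseconds:
def test(options, n):
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    times = []
    for _ in range(n):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        sgemm(A, B, options=options)
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))
    return times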