def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
    matmul = tc.define(MATMUL_LANG, name="matmul")
    matmul_grad = tc.define(MATMUL_GRAD_LANG, name="matmul_grad")

    mapping_options = matmul.autotune(
        (n, k), (k, m),
        generations=3,
        threads=32,
        pop_size=2,
        tuner_min_launch_total_threads=1,
    )
    grad_mapping_options = matmul_grad.autotune(
        (n, k), (k, m), (n, m),
        generations=1,
        threads=32,
        pop_size=2,
        tuner_min_launch_total_threads=1,
    )

    X = np.random.rand(m, k).astype(np.float32)
    W = np.random.rand(k, n).astype(np.float32)

    def ref(X, W):
        return [np.dot(X, W)]

    op = core.CreateOperator(
        "TcOp", ["X", "Y"], "out",
        tc_def=MATMUL_LANG,
        tc_name="matmul",
        tc_grad_def=MATMUL_GRAD_LANG,
        tc_grad_name="matmul_grad",
        inputs_used_by_gradient=[0, 1],
        output_gradients_used_by_gradient=[0],
        inputs_to_compute_gradients_of=[0, 1],
        mapping_options=mapping_options.serialize(),
        grad_mapping_options=grad_mapping_options.serialize(),
    )

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, W],
        reference=ref,
    )

    for i in range(2):
        self.assertGradientChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            outputs_to_check=i,
            outputs_with_grads=[0],
        )
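# MATMUL_LANG and MATMUL_GRAD_LANG are referenced above but their definitions
# are not part of this excerpt. A minimal sketch of what such TC strings could
# look like follows; this is a hypothetical illustration, and the definitions
# actually used by the tests may differ.
MATMUL_LANG_SKETCH = """
def matmul(float(M, K) X, float(K, N) W) -> (O) {
    O(m, n) +=! X(m, r_k) * W(r_k, n)
}
"""

MATMUL_GRAD_LANG_SKETCH = """
def matmul_grad(float(M, K) X, float(K, N) W, float(M, N) d_O) -> (d_X, d_W) {
    d_X(m, k) +=! d_O(m, r_n) * W(k, r_n)
    d_W(k, n) +=! X(r_m, k) * d_O(r_m, n)
}
"""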
def test_tc_autotune_reinforce(self):
    with tempfile.NamedTemporaryFile() as cache_file:
        group_normalization = """
        def moments(float(N, K) I) -> (mean, var) {
            # var = E(x^2) - mean^2.
            mean(n) +=! I(n, r_k)
            var(n)  +=! I(n, r_k) * I(n, r_k)
            mean(n) = mean(n) / (K)
            var(n)  = var(n) / (K) - mean(n) * mean(n)
        }

        def group_normalization(
            float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta,
            float(N, G) mean, float(N, G) var) -> (O)
        {
            O(n, g, d, h, w) = gamma(g, d)
                * ( I(n, g, d, h, w) - mean(n, g) )
                * rsqrt( var(n, g) + 1e-5 )
                + beta(g, d)
        }
        """

        N, G, D, H, W = 32, 32, 4, 56, 56
        I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                          torch.randn(G, D, device='cuda').fill_(1.0),
                          torch.randn(G, D, device='cuda').zero_())

        T = tc.define(
            group_normalization,
            tc.make_autotuned_options_factory(
                starting_options='naive',
                tuner_config=tuner_config,
                cache_filename=cache_file.name,
                store_to_cache=True))
        # First occurrence triggers tuning
        mean, var = T.moments(I.view((N * G, -1)))
        out = T.group_normalization(
            I, gamma, beta, mean.view((N, G)), var.view((N, G)))

        # Create a new TC object to retrigger tuning
        T = tc.define(
            group_normalization,
            tc.make_autotuned_options_factory(
                tuner_config=tuner_config,
                cache_filename=cache_file.name,
                load_from_cache=True,
                store_to_cache=True))
        mean, var = T.moments(I.view((N * G, -1)))
        out = T.group_normalization(
            I, gamma, beta, mean.view((N, G)), var.view((N, G)))

        from torch.nn.modules.normalization import GroupNorm
        GN = GroupNorm(G, G * D).cuda()
        ref = GN.forward(I.view((N, G * D, H, W)))

        tc.assert_almost_equal(
            ref, out.view((N, G * D, H, W)), I, operations=D * H * W)
def test_multiple_tc(self):
    lang = MATMUL_KRU1_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
    out = matmul(mat1, mat2)

    KRU3_1 = tc.define(lang, name="KRU3_1")
    W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
    out = KRU3_1(W2, X)
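# MATMUL_KRU1_LANG is not shown in this excerpt. Judging from the shapes used
# above (a 32x16 W2 contracted against the second dimension of a 256x16x16x16
# input), a plausible sketch is the following; it is hypothetical and the real
# definition may differ.
MATMUL_KRU1_LANG_SKETCH = """
def matmul(float(M, N) A, float(N, K) B) -> (O) {
    O(m, k) +=! A(m, r_n) * B(r_n, k)
}
def KRU3_1(float(D2, N2) W2, float(B, N2, N1, N0) X) -> (XW2) {
    XW2(b, d2, n1, n0) +=! X(b, r_n2, n1, n0) * W2(d2, r_n2)
}
"""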
def test_multiple_tc(self):
    lang = MATMUL_ABS_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
    out = matmul(mat1, mat2)

    # Avoid shadowing the built-in abs()
    absolute = tc.define(lang, name="abs")
    A = torch.randn(3, 4).cuda()
    out = absolute(A)
def test_autotuner_multiple_tc(self):
    lang = MATMUL_KRU1_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(72, 26).cuda(), torch.randn(26, 72).cuda()
    matmul.autotune(mat1, mat2, cache=True, **tc.autotuner_default_options)
    out = matmul(mat1, mat2)

    KRU3_1 = tc.define(lang, name="KRU3_1")
    W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
    KRU3_1.autotune(W2, X, cache=True, **tc.autotuner_default_options)
    out = KRU3_1(W2, X)
def test_autotuner_multiple_tc(self):
    lang = MATMUL_ABS_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
    matmul.autotune(mat1, mat2, cache=True, **tc.autotuner_settings)
    out = matmul(mat1, mat2)

    absolute = tc.define(lang, name="abs")
    A = torch.randn(100, 400).cuda()
    absolute.autotune(A, cache=True, **tc.autotuner_settings)
    out = absolute(A)
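# MATMUL_ABS_LANG combines a matmul and an abs definition; the string itself is
# not part of this excerpt. A minimal sketch, modeled on the standalone matmul
# and abs TCs elsewhere in these tests (hypothetical; the actual string may
# differ):
MATMUL_ABS_LANG_SKETCH = """
def matmul(float(M, N) A, float(N, K) B) -> (O) {
    O(m, k) +=! A(m, r_n) * B(r_n, k)
}
def abs(float(M, N) A) -> (O1) {
    O1(m, n) = fabs(A(m, n))
}
"""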
def __init__(self, I, C, K, groups=1, padding=0, bias=False,
             from_cache=False, cache_file='tc_group3d.pt', tuner_config=None):
    '''
    Module providing grouped 3d convolution using tensor comprehensions

    :param I: Number of input channels
    :type I: int
    :param C: Number of output channels
    :type C: int
    :param K: Kernel size
    :type K: tuple or int
    :param groups: Number of groups
    :type groups: int
    :param padding: Amount of input padding
    :type padding: tuple or int
    :param bias: Not implemented
    :type bias: bool
    :param from_cache: If True, load from the specified cache file; if False, perform autotuning
    :type from_cache: bool
    :param cache_file: Path and name of cache file
    :type cache_file: string
    :param tuner_config: Tuner config object to use for auto-tuning
    :type tuner_config: tensor_comprehensions.TunerConfig
    '''
    import torch.nn.functional as F
    super().__init__()

    K = self.int_to_tuple(K)
    padding = self.int_to_tuple(padding)
    group_convolution = self.tc_string()

    if not from_cache:
        if tuner_config is None:
            tuner_config = tc.TunerConfig().generations(25).pop_size(100).number_elites(15)
        conv_option = tc.tclib.MappingOptions('naive').tile([1, 1]) \
            .mapToThreads([4, 16, 4]).mapToBlocks([256, 256]).unroll(1)
        TC = tc.define(group_convolution, tc.make_autotuned_options_factory(
            starting_options=conv_option,
            tuner_config=tuner_config,
            cache_filename=cache_file,
            store_to_cache=True,
            load_from_cache=False))
    else:
        TC = tc.define(group_convolution,
                       tc.make_load_from_cache_options_factory(cache_file))

    self.convolution_grouped = tc.make_autograd(TC.group_convolution,
                                                TC.convolution_grad)
    # Integer division keeps the weight tensor sizes integral.
    self.W = torch.nn.Parameter(
        torch.rand(groups, C // groups, I // groups, K[0], K[1], K[2]))
    self.pad = F.pad
    self.groups = groups
    self.padding = padding
    self.K = K
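# The module above obtains its TC text from self.tc_string(), which is defined
# elsewhere and must provide both group_convolution and convolution_grad. A
# hypothetical sketch of the forward definition only (the real string, its
# tensor layout, and its backward definition may differ):
GROUP_CONV3D_FORWARD_SKETCH = """
def group_convolution(float(N, G, C, D, H, W) I, float(G, F, C, KD, KH, KW) W1)
-> (O)
{
    O(n, g, f, d, h, w) +=!
        I(n, g, r_c, d + r_kd, h + r_kh, w + r_kw) * W1(g, f, r_c, r_kd, r_kh, r_kw)
}
"""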
def indexed_matmul_2_tc(x, y, I, tune=False):
    if not has_tensor_comprehensions():
        return indexed_matmul_2(x, y, I)

    lang = """
    def indexed_matmul_2_tc(float(B,N,F) X, float(B,M,O,K) Y, int32(B,M,O) I) -> (output) {
        output(b, m, f, k) +=! Y(b, m, o, k) * X(b, I(b,m,o), f)
    }
    """
    b, m, _, k = y.shape
    o = I.shape[2]
    n, f = x.shape[1:]
    cachefile = "tc_kernels/b{}_m{}_o{}_k{}_f{}.tc".format(b, m, o, k, f)

    op = tc.define(lang, name="indexed_matmul_2_tc")
    if tune:
        # Copy the default settings so the shared dict is not mutated.
        tune_opt = dict(tc.autotuner_settings, cache=cachefile)
        op.autotune(x, y, I.int(), **tune_opt)
    out = op(x, y, I.int(), cache=cachefile,
             options=tc.mapping_options.Options("naive"))
    if out is None:
        out = op(x, y, I.int(), options=tc.mapping_options.Options("naive"))
    return out
def test_batchnorm(self):
    # NOTE: take note of use of {{ }} below for handling TC with scalars
    lang = """
    def batchnorm(float(N,C,H,W) I, float(C) rMeanIn, float(C) rVarIn)
    -> (O, rMeanOut, rVarOut, mean, centered, variance, expectedVariance, normalizedOut)
    {{
        mean(c) +=! I(nn, c, hh, ww)
        mean(c) = mean(c) / (N * H * W)
        rMeanOut(c) = (1 - {momentum}) * rMeanIn(c) + {momentum} * mean(c)
        centered(n, c, h, w) = I(n, c, h, w) - rMeanOut(c)
        variance(n, c, h, w) = centered(n, c, h, w) * centered(n, c, h, w)
        expectedVariance(c) +=! (variance(n, c, h, w) + {eps}) / (N * H * W)
        rVarOut(c) = rsqrt(
            (1 - {momentum}) * rVarIn(c) + {momentum} * expectedVariance(c))
        O(n, c, h, w) = centered(n, c, h, w) * rVarOut(c)
        normalizedOut(n, c, h, w) = O(n, c, h, w)
    }}
    """
    batchnorm = tc.define(lang, name="batchnorm",
                          constants={"momentum": 0.5, "eps": 1e-5})
    inp = torch.randn(32, 4, 56, 56).cuda()
    running_mean, running_var = torch.randn(4).cuda(), torch.randn(4).cuda()
    out = batchnorm(inp, running_mean, running_var)
def test_train_convolution_strided(self):
    # NOTE: take note of use of {{ }} below for handling TC with scalars
    LANG = """
    def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
        O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
    }}
    def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) d_O)
    -> (d_I, d_W1)
    {{
        d_I(n, c, h, w) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
        d_W1(m, c, kh, kw) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
    }}
    """
    # NOTE: TC doesn't support padding yet, see
    # https://github.com/facebookresearch/TensorComprehensions/issues/11
    # For this reason we use kernel=1 for now (only because we also want to do
    # the backward pass). If kernel != 1 we would get inconsistent values of
    # H, W in the backward TC.
    N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(LANG, training=True, name="convolution",
                            backward="convolution_grad",
                            constants={"sh": sH, "sw": sW})
    I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
    W = Parameter(torch.randn(O, C, kH, kW).cuda())
    out = convolution(I, W)
    out[0].sum().backward()
def test_group_norm_fused(self):
    group_normalization = """
    def group_normalization(
        float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta)
    -> (Sum, SumSq, O)
    {
        Sum(n, g) +=! I(n, g, r_d, r_h, r_w)
        SumSq(n, g) +=! I(n, g, r_d, r_h, r_w) * I(n, g, r_d, r_h, r_w)
        O(n, g, d, h, w) = gamma(g, d)
            * ( I(n, g, d, h, w) - Sum(n, g) / (D * H * W))
            * rsqrt( (SumSq(n, g) - Sum(n, g) * Sum(n, g) / (D * H * W))
                     / (D * H * W)
                     + 1e-5)
            + beta(g, d)
    }
    """
    N, G, D, H, W = 32, 32, 4, 56, 56
    T = tc.define(
        group_normalization,
        tc.make_autotuned_options_factory(starting_options='naive',
                                          tuner_config=tuner_config))
    I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                      torch.randn(G, D, device='cuda').fill_(1.0),
                      torch.randn(G, D, device='cuda').zero_())
    Sum, SumSq, O = T.group_normalization(I, gamma, beta)

    from torch.nn.modules.normalization import GroupNorm
    GN = GroupNorm(G, G * D).cuda()
    ref = GN.forward(I.view((N, G * D, H, W)))

    tc.assert_almost_equal(
        ref, O.view((N, G * D, H, W)), I, operations=D * H * W)
def test_train_convolution_reorder(self):
    LANG = """
    def convolution(float(N, C, H, W) I, float(M, C, KH, KW) W1, float(M) B)
    -> (tmp, O)
    {
        tmp(n, m, h, w) +=! I(n, c, h + kh, w + kw) * W1(m, c, kh, kw)
        O(n, m, h, w) = tmp(n, m, h, w) + B(m)
    }
    def convolution_grad(float(N, C, H, W) I, float(M, C, KH, KW) W1, float(M) B,
                         float(N, M, H, W) O_grad)
    -> (I_grad, W1_grad, B_grad)
    {
        I_grad(n, c, h, w) +=! O_grad(n, m, h - kh, w - kw) * W1(m, c, kh, kw)
        W1_grad(m, c, kh, kw) +=! O_grad(n, m, h - kh, w - kw) * I(n, c, h, w)
        B_grad(m) +=! O_grad(n, m, h, w)
    }
    """
    # The forward TC produces two outputs; the first is a temporary that is not
    # needed for the gradient, so we reorder grad_outputs to pass only the
    # gradient of the real output to the backward TC.
    def reorder():
        def reorder_function(grad_outputs):
            return [grad_outputs[1]]
        return reorder_function

    N, C, H, W, M, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(LANG, training=True, name="convolution",
                            backward="convolution_grad")
    I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
    W = Parameter(torch.randn(M, C, kH, kW).cuda())
    B = Parameter(torch.randn(M).cuda())
    out = convolution(I, W, B, reorder_function=reorder())
    out[0].sum().backward()
def test_conv_with_backward_2kernels(self):
    conv = """
    def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(M) Bias) -> (O) {
        O(n, m, h, w) +=! I(n, r_c, h + r_kh, w + r_kw) * W1(m, r_c, r_kh, r_kw)
        O(n, m, h, w) = O(n, m, h, w) + Bias(m)
    }
    def convolution_igrad(float(M,C,KH,KW) W1, float(N,M,H,W) d_O) -> (d_I) {
        d_I(n, c, h, w) +=! d_O( n, r_m, h - r_kh, w - r_kw) * W1(r_m, c, r_kh, r_kw)
    }
    def convolution_wgrad(float(N,C,H,W) I, float(N,M,H,W) d_O) -> (d_W1) {
        d_W1(m, c, kh, kw) +=! d_O(r_n, m, r_h - kh, r_w - kw) * I(r_n, c, r_h, r_w)
    }
    def convolution_biasgrad(float(M) Bias) -> (d_Bias) {
        # TODO: Bias incorrect + check
        d_Bias(m) = Bias(m)
    }
    """
    N, C, H, W, O, kH, kW = 32, 4, 56, 56, 16, 1, 1
    T = tc.define(
        conv,
        tc.make_autotuned_options_factory(starting_options='naive',
                                          tuner_config=tuner_config))
    I = torch.randn(N, C, H, W, device='cuda', requires_grad=True)

    # Reference
    from torch.nn.modules.conv import Conv2d
    Conv = Conv2d(C, O, 1, stride=1).cuda()
    ref = Conv.forward(I)

    W = Conv.weight.clone()
    Bias = Conv.bias.clone()

    def convolution_backward(I, W, Bias, d_O):
        d_I = T.convolution_igrad(W, d_O)
        d_W1 = T.convolution_wgrad(I, d_O)
        d_Bias = T.convolution_biasgrad(Bias)
        return (d_I, d_W1, d_Bias)

    convolution_function = tc.make_autograd(T.convolution, convolution_backward)

    # First occurrence triggers tuning
    out = convolution_function(I, W, Bias)
    out.sum().backward()

    # Subsequent occurrences do not
    out = convolution_function(I, W, Bias)
    out.sum().backward()

    tc.assert_almost_equal(ref, out, I, operations=C * kH * kW)
def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
    matmul = tc.define(MATMUL_LANG, name="matmul")

    mapping_options = matmul.autotune(
        (n, k), (k, m),
        generations=1,
        threads=32,
        pop_size=2,
        tuner_min_launch_total_threads=1,
    )

    X = np.random.rand(m, k).astype(np.float32)
    W = np.random.rand(k, n).astype(np.float32)

    def ref(X, W):
        return [np.dot(X, W)]

    op = core.CreateOperator(
        "TcOp", ["X", "Y"], "out",
        tcDef=MATMUL_LANG,
        tcName="matmul",
        mappingOptions=mapping_options.serialize(),
    )

    self.assertReferenceChecks(
        device_option=gc,
        op=op,
        inputs=[X, W],
        reference=ref,
    )
def test_autotuner_cachefile_first(self):
    # use argparse if input from command line
    cache_file = "{}/matmul_100_400_500".format(PATH_PREFIX)
    lang = MATMUL_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
    matmul.autotune(mat1, mat2, cache=cache_file, **tc.autotuner_settings)
def test_autotuner_no_cache_explicit_set(self):
    lang = MATMUL_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
    options = matmul.autotune(mat1, mat2, cache=False, **tc.autotuner_settings)
def test_conv_train_autotune_cache_no_options_seed(self):
    lang = CONV_TRAIN
    N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(lang, training=True, name="convolution",
                            backward="convolution_grad",
                            constants={"sh": sH, "sw": sW})
    I, W = torch.randn(N, C, H, W).cuda(), torch.randn(O, C, kH, kW).cuda()
    convolution.autotune(I, W, cache=True, **tc.autotuner_settings)
    # on the second call, autotuning will be seeded from previous best options
    convolution.autotune(I, W, cache=True, **tc.autotuner_settings)
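# CONV_TRAIN is referenced by several tests but not defined in this excerpt.
# Based on the strided training convolution TC shown earlier (with {sh}/{sw}
# substituted via constants), it presumably looks like the sketch below; the
# exact string may differ.
CONV_TRAIN_SKETCH = """
def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
    O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
}}
def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) d_O)
-> (d_I, d_W1)
{{
    d_I(n, c, h, w) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
    d_W1(m, c, kh, kw) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
}}
"""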
def test_matmul_variable_reuse_outputs(self):
    lang = MATMUL_LANG
    matmul = tc.define(lang, name="matmul")
    mat1 = Variable(torch.randn(3, 4).cuda(), requires_grad=True)
    mat2 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
    out = matmul(mat1, mat2)

    mat3 = Variable(torch.randn(3, 4).cuda(), requires_grad=True)
    mat4 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
    matmul(mat3, mat4, outputs=out)
def test_conv_backward_pass_options(self):
    lang = CONV_TRAIN
    N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(lang, training=True, name="convolution",
                            backward="convolution_grad",
                            constants={"sh": sH, "sw": sW})
    I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
    W = Parameter(torch.randn(O, C, kH, kW).cuda())
    out = convolution(I, W,
                      options=[tc.Options("conv"), tc.Options("group_conv")])
    out.sum().backward()
def test_kru_train_autotune_no_cache_no_options(self):
    lang = KRU3_1_TRAINING
    KRU3_1 = tc.define(lang, training=True, name="KRU3_1",
                       backward="KRU3_1_GRAD")
    W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
    options = KRU3_1.autotune(W2, X, **tc.autotuner_default_options)
def test_copy(self):
    LANG = """
    def copy(float(M, N) I) -> (O) {
        O(i, j) = I(i, j)
    }
    """
    copy = tc.define(LANG, name="copy")
    inp = torch.randn(32, 32).cuda()
    out = copy(inp)
def test_tanh(self):
    LANG = """
    def Tanh(float(M) I) -> (O) {
        O(m) = tanh(I(m))
    }
    """
    Tanh = tc.define(LANG, name="Tanh")
    inp = torch.randn(32).cuda()
    out = Tanh(inp)
def test_sigmoid(self):
    LANG = """
    def sigmoid(float(N, C, H, W) I) -> (O) {
        O(n, c, h, w) = 1 / (1 + exp(-I(n, c, h, w)))
    }
    """
    sigmoid = tc.define(LANG, name="sigmoid")
    inp = torch.randn(32, 3, 128, 128).cuda()
    out = sigmoid(inp)
def test_different_input_sizes(self):
    lang = MATMUL_LANG
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
    out1 = matmul(mat1, mat2)

    # if the input sizes are different, re-compilation will happen
    mat3, mat4 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
    out2 = matmul(mat3, mat4)
def test_absolute(self):
    LANG = """
    def abs(float(M, N) A) -> (O1) {
        O1(m, n) = fabs(A(m, n))
    }
    """
    absolute = tc.define(LANG, name="abs")
    A = -1 * torch.randn(3, 4).cuda()
    out = absolute(A, options=tc.Options("pointwise"))
def test_relu(self):
    LANG = """
    def relu(float(B,M) I) -> (O1) {
        O1(b, m) = fmax(I(b, m), 0)
    }
    """
    relu = tc.define(LANG, name="relu")
    inp = torch.randn(100, 128).cuda()
    out = relu(inp)
def test_conv_train_autotune_to_cache_file_seed(self):
    lang = CONV_TRAIN
    cache_file = "{}/CONV_32_4_56_56_16_1_1_1_1".format(PATH_PREFIX)
    N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
    convolution = tc.define(lang, training=True, name="convolution",
                            backward="convolution_grad",
                            constants={"sh": sH, "sw": sW})
    I, W = torch.randn(N, C, H, W).cuda(), torch.randn(O, C, kH, kW).cuda()
    convolution.autotune(I, W, cache=cache_file, **tc.autotuner_settings)
    # the second call should be seeded from the previous call
    convolution.autotune(I, W, cache=cache_file, **tc.autotuner_settings)
def test_cast(self):
    LANG = """
    def cast(float(M,N) A) -> (int32(M,N) O1) {{
        O1(m, n) = int32(A(m, n) + {constant})
    }}
    """
    cast = tc.define(LANG, name="cast", constants={"constant": 0.3})
    A = torch.randn(32, 16).cuda()
    out = cast(A)
def test_debug_init(self):
    lang = """
    def matmul(float(M,N) A, float(N,K) B) -> (output) {
        output(i, j) +=! A(i, kk) * B(kk, j)
    }
    """
    matmul = tc.define(lang, name="matmul")
    mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
    out = matmul(mat1, mat2)
def test_autotuner_tuple_size_cache_to_default(self):
    lang = MATMUL_LANG
    matmul = tc.define(lang, name="matmul")
    matmul.autotune((3, 4), (4, 5), cache=True,
                    **tc.small_sizes_autotuner_settings)
    matmul.autotune((100, 400), (400, 500), cache=True,
                    **tc.autotuner_settings)
def autotune(cache_file='tc_cache'):
    print("Starting autotune")
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    best_opts = sgemm.autotune(A, B, cache=cache_file, generations=25,
                               pop_size=50, crossover_rate=70,
                               number_elites=5, gpus="1,2,3")
    print("Done autotune")
    print(sorted(test(best_opts, 20))[10])
    return best_opts
def test(options, n):
    # runs the given options n times and returns the running times
    torch.cuda.synchronize()
    torch.cuda.synchronize()
    sgemm = tc.define(lang, name="sgemm")
    times = []
    A_, B_, C_ = torch.randn(M, K), torch.randn(K, N), torch.randn(M, N)
    print("")
    for i in range(n + 1):
        print("\033[Frunning test: {}/{}".format(i, n))
        A, B, C = A_.clone(), B_.clone(), C_.clone()
        t1 = time.perf_counter()
        A_cuda, B_cuda, C_cuda = A.cuda(), B.cuda(), C.cuda()
        torch.cuda.synchronize()
        sgemm(A_cuda, B_cuda, outputs=C_cuda, options=options)
        torch.cuda.synchronize()
        C_res = C_cuda.cpu()
        torch.cuda.synchronize()
        if i > 0:  # The first run is warmup
            times.append(time.perf_counter() - t1)
    return times
def load_cache(cache_file='tc_cache'):
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    # Couldn't find a reasonable way to load the cache: run the autotuner for
    # zero generations so it just returns the cached best options.
    return sgemm.autotune(A, B, cache=cache_file, generations=0)
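# The sgemm benchmark helpers above assume module-level definitions of `lang`,
# `M`, `N` and `K` that are not part of this excerpt. A minimal sketch of
# plausible values (hypothetical; the real sizes and TC text are not shown):
M, K, N = 1024, 1024, 1024

lang = """
def sgemm(float(M, K) A, float(K, N) B) -> (C) {
    C(m, n) +=! A(m, r_k) * B(r_k, n)
}
"""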