Example #1
    def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
        matmul = tc.define(MATMUL_LANG, name="matmul")
        matmul_grad = tc.define(MATMUL_GRAD_LANG, name="matmul_grad")

        mapping_options = matmul.autotune(
            (n, k),
            (k, m),
            generations=3,
            threads=32,
            pop_size=2,
            tuner_min_launch_total_threads=1,
        )

        grad_mapping_options = matmul_grad.autotune(
            (n, k),
            (k, m),
            (n, m),
            generations=1,
            threads=32,
            pop_size=2,
            tuner_min_launch_total_threads=1,
        )

        X = np.random.rand(m, k).astype(np.float32)
        W = np.random.rand(k, n).astype(np.float32)

        def ref(X, W):
            return [np.dot(X, W)]

        op = core.CreateOperator(
            "TcOp",
            ["X", "Y"],
            "out",
            tc_def=MATMUL_LANG,
            tc_name="matmul",
            tc_grad_def=MATMUL_GRAD_LANG,
            tc_grad_name="matmul_grad",
            inputs_used_by_gradient=[0, 1],
            output_gradients_used_by_gradient=[0],
            inputs_to_compute_gradients_of=[0, 1],
            mapping_options=mapping_options.serialize(),
            grad_mapping_options=grad_mapping_options.serialize(),
        )

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            reference=ref,
        )

        for i in range(2):
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=[X, W],
                outputs_to_check=i,
                outputs_with_grads=[0],
            )
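MATMUL_LANG and MATMUL_GRAD_LANG are module-level constants that this page does not show; a plausible sketch, modeled on the matmul TC used in the later examples, is given below. The exact upstream definitions may differ.

MATMUL_LANG = """
def matmul(float(M, N) A, float(N, K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
"""

MATMUL_GRAD_LANG = """
def matmul_grad(float(M, N) A, float(N, K) B, float(M, K) d_C) -> (d_A, d_B) {
    # d_A = d_C * B^T, d_B = A^T * d_C
    d_A(m, n) +=! d_C(m, r_k) * B(n, r_k)
    d_B(n, k) +=! d_C(r_m, k) * A(r_m, n)
}
"""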
Example #2
    def test_tc_autotune_reinforce(self):
        with tempfile.NamedTemporaryFile() as cache_file:
            group_normalization = """
            def moments(float(N, K) I) -> (mean, var) {
                # var = E(x^2) - mean^2.
                mean(n) +=! I(n, r_k)
                 var(n) +=! I(n, r_k) * I(n, r_k)
                mean(n)  = mean(n) / (K)
                 var(n)  =  var(n) / (K) - mean(n) * mean(n)
            }

            def group_normalization(
                float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta,
                float(N, G) mean, float(N, G) var) -> (O)
            {
                O(n, g, d, h, w) = gamma(g, d)
                    * ( I(n, g, d, h, w) - mean(n, g) )
                    * rsqrt( var(n, g) + 1e-5 )
                    + beta(g, d)
            }
            """

            N, G, D, H, W = 32, 32, 4, 56, 56
            I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                              torch.randn(G, D, device='cuda').fill_(1.0),
                              torch.randn(G, D, device='cuda').zero_())

            T = tc.define(
                group_normalization,
                tc.make_autotuned_options_factory(
                    starting_options='naive',
                    tuner_config=tuner_config,
                    cache_filename=cache_file.name,
                    store_to_cache=True))
            # First occurrence triggers tuning
            mean, var = T.moments(I.view((N * G, -1)))
            out = T.group_normalization(I, gamma, beta, mean.view((N, G)),
                                        var.view((N, G)))

            # Create a new TC object to retrigger tuning
            T = tc.define(
                group_normalization,
                tc.make_autotuned_options_factory(
                    tuner_config=tuner_config,
                    cache_filename=cache_file.name,
                    load_from_cache=True,
                    store_to_cache=True))
            mean, var = T.moments(I.view((N * G, -1)))
            out = T.group_normalization(I, gamma, beta, mean.view((N, G)),
                                        var.view((N, G)))

            from torch.nn.modules.normalization import GroupNorm
            GN = GroupNorm(G, G * D).cuda()
            ref = GN.forward(I.view((N, G * D, H, W)))

            tc.assert_almost_equal(ref,
                                   out.view((N, G * D, H, W)),
                                   I,
                                   operations=D * H * W)
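This and several later examples reference a module-level tuner_config. A minimal sketch using the chainable tc.TunerConfig API seen elsewhere on this page is shown below; the particular values are assumptions chosen to keep test-time tuning short.

import tensor_comprehensions as tc

# Assumed module-level tuner configuration; small values keep autotuning quick in tests.
tuner_config = tc.TunerConfig().generations(3).pop_size(10).number_elites(1)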
Example #3
    def test_multiple_tc(self):
        lang = MATMUL_KRU1_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        out = matmul(mat1, mat2)

        KRU3_1 = tc.define(lang, name="KRU3_1")
        W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
        out = KRU3_1(W2, X)
Example #4
    def test_multiple_tc(self):
        lang = MATMUL_ABS_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        out = matmul(mat1, mat2)

        abs = tc.define(lang, name="abs")
        A = torch.randn(3, 4).cuda()
        out = abs(A)
Example #5
    def test_autotuner_multiple_tc(self):
        lang = MATMUL_KRU1_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(72, 26).cuda(), torch.randn(26, 72).cuda()
        matmul.autotune(mat1, mat2, cache=True, **tc.autotuner_default_options)
        out = matmul(mat1, mat2)

        KRU3_1 = tc.define(lang, name="KRU3_1")
        W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
        KRU3_1.autotune(W2, X, cache=True, **tc.autotuner_default_options)
        out = KRU3_1(W2, X)
Example #6
    def test_autotuner_multiple_tc(self):
        lang = MATMUL_ABS_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        matmul.autotune(mat1, mat2, cache=True, **tc.autotuner_settings)
        out = matmul(mat1, mat2)

        absolute = tc.define(lang, name="abs")
        A = torch.randn(100, 400).cuda()
        absolute.autotune(A, cache=True, **tc.autotuner_settings)
        out = absolute(A)
Example #7
    def __init__(self, I, C, K, groups=1, padding=0, bias=False,
                 from_cache=False, cache_file='tc_group3d.pt', tuner_config=None):
        '''
        Module providing grouped 3D convolution using Tensor Comprehensions.

        :param I: Number of input channels
        :type I: int
        :param C: Number of output channels
        :type C: int
        :param K: Kernel size
        :type K: tuple or int
        :param groups: Number of groups
        :type groups: int
        :param from_cache: If True, load tuned options from the specified cache file; if False, perform autotuning
        :type from_cache: bool
        :param cache_file: Path and name of cache file
        :type cache_file: string
        :param padding: Amount of input padding
        :type padding: tuple or int
        :param bias: Not implemented
        :type bias: bool
        :param tuner_config: Tuner config object to use for auto-tuning
        :type tuner_config: tensor_comprehensions.TunerConfig
        '''
        import torch.nn.functional as F
        super().__init__()

        K = self.int_to_tuple(K)
        padding = self.int_to_tuple(padding)

        group_convolution = self.tc_string()
        if not from_cache:
            if tuner_config is None:
                tuner_config = tc.TunerConfig().generations(25).pop_size(100).number_elites(15)
            conv_option = (tc.tclib.MappingOptions('naive')
                           .tile([1, 1])
                           .mapToThreads([4, 16, 4])
                           .mapToBlocks([256, 256])
                           .unroll(1))
            TC = tc.define(group_convolution, tc.make_autotuned_options_factory(
                    starting_options=conv_option,
                    tuner_config=tuner_config,
                    cache_filename=cache_file,
                    store_to_cache=True,
                    load_from_cache=False
                    ))
        else:
            TC = tc.define(group_convolution, tc.make_load_from_cache_options_factory(cache_file))

        self.convolution_grouped = tc.make_autograd(TC.group_convolution, TC.convolution_grad)
        # Integer division keeps the per-group channel counts as ints under Python 3.
        self.W = torch.nn.Parameter(
            torch.rand(groups, C // groups, I // groups, K[0], K[1], K[2]))
        self.pad = F.pad
        self.groups = groups
        self.padding = padding
        self.K = K
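A hypothetical usage sketch for this module follows. The class name GroupConv3d and its forward() are assumptions, since only __init__ is shown above; forward() is presumed to pad the input and apply self.convolution_grouped with self.W.

import torch

# Hypothetical usage; the class name and forward() are not shown above.
conv = GroupConv3d(I=32, C=64, K=3, groups=8, padding=1,
                   cache_file='tc_group3d.pt', from_cache=False)
x = torch.randn(2, 32, 8, 8, 8, device='cuda')   # (N, I, D, H, W) input volume
y = conv(x)                                       # grouped 3D convolution via TC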
Example #8
def indexed_matmul_2_tc(x, y, I, tune=False):
    if not has_tensor_comprehensions():
        return indexed_matmul_2(x, y, I)

    lang = """
    def indexed_matmul_2_tc(float(B,N,F) X, float(B,M,O,K) Y, int32(B,M,O) I) -> (output) {
        output(b, m, f, k) +=! Y(b, m, o, k) * X(b, I(b,m,o), f)
    }
    """
    b, m, _, k = y.shape
    o = I.shape[2]
    n, f = x.shape[1:]
    cachefile = "tc_kernels/b{}_m{}_o{}_k{}_f{}.tc".format(b, m, o, k, f)
    op = tc.define(lang, name="indexed_matmul_2_tc")
    if tune:
        tune_opt = tc.autotuner_settings
        tune_opt["cache"] = cachefile
        op.autotune(x, y, I.int(), **tune_opt)

    out = op(x,
             y,
             I.int(),
             cache=cachefile,
             options=tc.mapping_options.Options("naive"))
    if out is None:
        out = op(x, y, I.int(), options=tc.mapping_options.Options("naive"))
    return out
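A minimal call sketch for indexed_matmul_2_tc follows; the shapes below are only illustrative.

import torch

# Illustrative shapes: x is (B, N, F), y is (B, M, O, K), I holds indices into N.
x = torch.randn(8, 1024, 64, device='cuda')
y = torch.randn(8, 512, 16, 4, device='cuda')
I = torch.randint(0, 1024, (8, 512, 16), device='cuda')
out = indexed_matmul_2_tc(x, y, I)   # -> (B, M, F, K) = (8, 512, 64, 4)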
Example #9
 def test_batchnorm(self):
     # NOTE: the {{ }} braces below are needed because the TC string is formatted with scalar constants
     lang = """
     def batchnorm(float(N,C,H,W) I, float(C) rMeanIn, float(C) rVarIn)
     -> (O, rMeanOut, rVarOut, mean, centered, variance, expectedVariance, normalizedOut)
     {{
        mean(c) +=! I(nn, c, hh, ww)
        mean(c)  = mean(c) / (N * H * W)
        rMeanOut(c) = (1 - {momentum}) * rMeanIn(c) + {momentum} * mean(c)
        centered(n, c, h, w) = I(n, c, h, w) - rMeanOut(c)
        variance(n, c, h, w) = centered(n, c, h, w) * centered(n, c, h, w)
        expectedVariance(c) +=! (variance(n, c, h, w) + {eps}) / (N * H * W)
        rVarOut(c) = rsqrt(
          (1 - {momentum}) * rVarIn(c) + {momentum} * expectedVariance(c))
        O(n, c, h, w) = centered(n, c, h, w) * rVarOut(c)
        normalizedOut(n, c, h, w) = O(n, c, h, w)
     }}
     """
     batchnorm = tc.define(lang,
                           name="batchnorm",
                           constants={
                               "momentum": 0.5,
                               "eps": 1e-5
                           })
     inp = torch.randn(32, 4, 56, 56).cuda()
     running_mean, running_var = torch.randn(4).cuda(), torch.randn(
         4).cuda()
     out = batchnorm(inp, running_mean, running_var)
Example #10
    def test_train_convolution_strided(self):
        # NOTE: the {{ }} braces below are needed because the TC string is formatted with scalar constants
        LANG = """
        def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
            O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
        }}
        def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) d_O)
        -> (d_I, d_W1)
        {{
            d_I(n, c, h, w) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
            d_W1(m, c, kh, kw) +=! d_O(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
        }}
        """

        # NOTE: TC doesn't support padding yet
        # see https://github.com/facebookresearch/TensorComprehensions/issues/11
        # For that reason we use kernel=1 for now (only because we also want to
        # run the backward pass). If kernel != 1, the values of H and W in the
        # backward TC would be inconsistent.
        N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
        convolution = tc.define(LANG,
                                training=True,
                                name="convolution",
                                backward="convolution_grad",
                                constants={"sh": sH, "sw": sW})
        I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
        W = Parameter(torch.randn(O, C, kH, kW).cuda())
        out = convolution(I, W)
        out[0].sum().backward()
Example #11
    def test_group_norm_fused(self):
        group_normalization = """
            def group_normalization(
                float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta)
            -> (Sum, SumSq, O)
            {
                Sum(n, g) +=! I(n, g, r_d, r_h, r_w)
              SumSq(n, g) +=! I(n, g, r_d, r_h, r_w) * I(n, g, r_d, r_h, r_w)
                O(n, g, d, h, w) =  gamma(g, d)
                    * ( I(n, g, d, h, w) - Sum(n, g) / (D * H * W))
                    * rsqrt( (SumSq(n, g) - Sum(n, g) * Sum(n, g) / (D * H * W))
                           / (D * H * W)
                           + 1e-5)
                    + beta(g, d)
            }
        """

        N, G, D, H, W = 32, 32, 4, 56, 56
        T = tc.define(
            group_normalization,
            tc.make_autotuned_options_factory(starting_options='naive',
                                              tuner_config=tuner_config))
        I, gamma, beta = (torch.randn(N, G, D, H, W, device='cuda'),
                          torch.randn(G, D, device='cuda').fill_(1.0),
                          torch.randn(G, D, device='cuda').zero_())
        Sum, SumSq, O = T.group_normalization(I, gamma, beta)

        from torch.nn.modules.normalization import GroupNorm
        GN = GroupNorm(G, G * D).cuda()
        ref = GN.forward(I.view((N, G * D, H, W)))

        tc.assert_almost_equal(ref,
                               O.view((N, G * D, H, W)),
                               I,
                               operations=D * H * W)
Example #12
    def test_train_convolution_reorder(self):
        LANG = """
        def convolution(float(N, C, H, W) I, float(M, C, KH, KW) W1, float(M) B) -> (tmp, O) {
          tmp(n, m, h, w) +=! I(n, c, h + kh, w + kw) * W1(m, c, kh, kw)
          O(n, m, h, w) = tmp(n, m, h, w) + B(m)
        }
        def convolution_grad(float(N, C, H, W) I, float(M, C, KH, KW) W1, float(M) B, float(N, M, H, W) O_grad)
        -> (I_grad, W1_grad, B_grad) {
          I_grad(n, c, h, w) +=! O_grad(n, m, h - kh, w - kw) * W1(m, c, kh, kw)
          W1_grad(m, c, kh, kw) +=! O_grad(n, m,  h - kh, w - kw) * I(n, c, h, w)
          B_grad(m) +=! O_grad(n, m, h, w)
        }
        """

        # The forward TC produces two outputs; the first (tmp) is only an
        # intermediate, so we reorder grad_outputs to pass along only the
        # gradient of the real output O.
        def reorder():
            def reorder_function(grad_outputs):
                return [grad_outputs[1]]

            return reorder_function

        N, C, H, W, M, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
        convolution = tc.define(LANG,
                                training=True,
                                name="convolution",
                                backward="convolution_grad")
        I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
        W = Parameter(torch.randn(M, C, kH, kW).cuda())
        B = Parameter(torch.randn(M).cuda())
        out = convolution(I, W, B, reorder_function=reorder())
        out[0].sum().backward()
Example #13
    def test_conv_with_backward_2kernels(self):
        conv = """
        def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(M) Bias)
        -> (O)
        {
            O(n, m, h, w) +=!
                I(n, r_c, h + r_kh, w + r_kw) * W1(m, r_c, r_kh, r_kw)
            O(n, m, h, w)  = O(n, m, h, w) + Bias(m)
        }
        def convolution_igrad(float(M,C,KH,KW) W1, float(N,M,H,W) d_O)
            -> (d_I)
        {
            d_I(n, c, h, w) +=!
                d_O(  n, r_m, h - r_kh, w - r_kw) * W1(r_m, c, r_kh, r_kw)
        }
        def convolution_wgrad(float(N,C,H,W) I, float(N,M,H,W) d_O) -> (d_W1)
        {
            d_W1(m, c, kh, kw) +=!
                d_O(r_n,   m, r_h - kh, r_w - kw) *  I(r_n, c,  r_h,  r_w)
        }
        def convolution_biasgrad(float(M) Bias) -> (d_Bias)
        {
            # TODO: Bias incorrect + check
            d_Bias(m) = Bias(m)
        }
        """

        N, C, H, W, O, kH, kW = 32, 4, 56, 56, 16, 1, 1
        T = tc.define(
            conv,
            tc.make_autotuned_options_factory(starting_options='naive',
                                              tuner_config=tuner_config))
        I = torch.randn(N, C, H, W, device='cuda', requires_grad=True)

        # Reference
        from torch.nn.modules.conv import Conv2d
        Conv = Conv2d(C, O, 1, stride=1).cuda()
        ref = Conv.forward(I)

        W = Conv.weight.clone()
        Bias = Conv.bias.clone()

        def convolution_backward(I, W, Bias, d_O):
            d_I = T.convolution_igrad(W, d_O)
            d_W1 = T.convolution_wgrad(I, d_O)
            d_Bias = T.convolution_biasgrad(Bias)
            return (d_I, d_W1, d_Bias)

        convolution_function = tc.make_autograd(T.convolution,
                                                convolution_backward)

        # First occurrence triggers tuning
        out = convolution_function(I, W, Bias)
        out.sum().backward()

        # Subsequent occurrences do not
        out = convolution_function(I, W, Bias)
        out.sum().backward()

        tc.assert_almost_equal(ref, out, I, operations=C * kH * kW)
Example #14
    def test_matmul_tune_and_run(self, n, m, k, seed, gc, dc):
        matmul = tc.define(MATMUL_LANG, name="matmul")

        mapping_options = matmul.autotune(
            (n, k), (k, m),
            generations=1,
            threads=32,
            pop_size=2,
            tuner_min_launch_total_threads=1,
        )

        X = np.random.rand(m, k).astype(np.float32)
        W = np.random.rand(k, n).astype(np.float32)

        def ref(X, W):
            return [np.dot(X, W)]

        op = core.CreateOperator(
            "TcOp", ["X", "Y"], "out",
            tcDef=MATMUL_LANG,
            tcName="matmul",
            mappingOptions=mapping_options.serialize(),
        )

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[X, W],
            reference=ref,
        )
Example #15
 def test_autotuner_cachefile_first(self):
     cache_file = "{}/matmul_100_400_500".format(
         PATH_PREFIX)  # use argparse if input from command line
     lang = MATMUL_LANG
     matmul = tc.define(lang, name="matmul")
     mat1, mat2 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
     matmul.autotune(mat1, mat2, cache=cache_file, **tc.autotuner_settings)
Example #16
 def test_autotuner_no_cache_explicit_set(self):
     lang = MATMUL_LANG
     matmul = tc.define(lang, name="matmul")
     mat1, mat2 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
     options = matmul.autotune(mat1,
                               mat2,
                               cache=False,
                               **tc.autotuner_settings)
Example #17
 def test_conv_train_autotune_cache_no_options_seed(self):
     lang = CONV_TRAIN
     N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
     convolution = tc.define(lang,
                             training=True,
                             name="convolution",
                             backward="convolution_grad",
                             constants={"sh": sH, "sw": sW})
     I, W = torch.randn(N, C, H, W).cuda(), torch.randn(O, C, kH, kW).cuda()
     convolution.autotune(I, W, cache=True, **tc.autotuner_settings)
     # on the second call, autotuning will be seeded from previous best options
     convolution.autotune(I, W, cache=True, **tc.autotuner_settings)
Example #18
    def test_matmul_variable_reuse_outputs(self):
        lang = MATMUL_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = Variable(torch.randn(3, 4).cuda(), requires_grad=True), Variable(torch.randn(4, 5).cuda(), requires_grad=True)
        out = matmul(mat1, mat2)

        mat3, mat4 = Variable(torch.randn(3, 4).cuda(), requires_grad=True), Variable(torch.randn(4, 5).cuda(), requires_grad=True)
        matmul(mat3, mat4, outputs=out)
Example #19
 def test_conv_backward_pass_options(self):
     lang = CONV_TRAIN
     N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
     convolution = tc.define(lang,
                             training=True,
                             name="convolution",
                             backward="convolution_grad",
                             constants={"sh": sH, "sw": sW})
     I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
     W = Parameter(torch.randn(O, C, kH, kW).cuda())
     out = convolution(I, W, options=[tc.Options("conv"), tc.Options("group_conv")])
     out.sum().backward()
Example #20
 def test_kru_train_autotune_no_cache_no_options(self):
     lang = KRU3_1_TRAINING
     KRU3_1 = tc.define(lang,
                        training=True,
                        name="KRU3_1",
                        backward="KRU3_1_GRAD")
     W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
     options = KRU3_1.autotune(W2, X, **tc.autotuner_default_options)
Example #21
 def test_copy(self):
     LANG = """
     def copy(float(M, N) I) -> (O) {
       O(i, j) = I(i, j)
     }
     """
     copy = tc.define(LANG, name="copy")
     inp = torch.randn(32, 32).cuda()
     out = copy(inp)
Example #22
 def test_tanh(self):
     LANG = """
     def Tanh(float(M) I) -> (O) {
       O(m) = tanh(I(m))
     }
     """
     Tanh = tc.define(LANG, name="Tanh")
     inp = torch.randn(32).cuda()
     out = Tanh(inp)
Example #23
 def test_sigmoid(self):
     LANG = """
     def sigmoid(float(N, C, H, W) I) -> (O) {
       O(n, c, h, w) = 1 / (1 + exp(-I(n, c, h, w)))
     }
     """
     sigmoid = tc.define(LANG, name="sigmoid")
     inp = torch.randn(32, 3, 128, 128).cuda()
     out = sigmoid(inp)
Example #24
    def test_different_input_sizes(self):
        lang = MATMUL_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        out1 = matmul(mat1, mat2)

        # if the input sizes are different, re-compilation will happen
        mat3, mat4 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
        out2 = matmul(mat3, mat4)
Example #25
 def test_absolute(self):
     LANG = """
     def abs(float(M, N) A) -> (O1) {
       O1(m, n) = fabs(A(m, n))
     }
     """
     absolute = tc.define(LANG, name="abs")
     A = -1 * torch.randn(3, 4).cuda()
     out = absolute(A, options=tc.Options("pointwise"))
Example #26
 def test_relu(self):
     LANG = """
     def relu(float(B,M) I) -> (O1){
       O1(b, m) = fmax(I(b, m), 0)
     }
     """
     relu = tc.define(LANG, name="relu")
     inp = torch.randn(100, 128).cuda()
     out = relu(inp)
Example #27
 def test_conv_train_autotune_to_cache_file_seed(self):
     lang = CONV_TRAIN
     cache_file = "{}/CONV_32_4_56_56_16_1_1_1_1".format(PATH_PREFIX)
     N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
     convolution = tc.define(lang,
                             training=True,
                             name="convolution",
                             backward="convolution_grad",
                             constants={"sh": sH, "sw": sW})
     I, W = torch.randn(N, C, H, W).cuda(), torch.randn(O, C, kH, kW).cuda()
     convolution.autotune(I, W, cache=cache_file, **tc.autotuner_settings)
     # the second call should be seeded from the previous call
     convolution.autotune(I, W, cache=cache_file, **tc.autotuner_settings)
Example #28
 def test_cast(self):
     LANG = """
     def cast(float(M,N) A) -> (int32(M,N) O1) {{
         O1(m, n) = int32(A(m, n) + {constant})
     }}
     """
     cast = tc.define(LANG, name="cast", constants={"constant": 0.3})
     A = torch.randn(32, 16).cuda()
     out = cast(A)
Example #29
 def test_debug_init(self):
     lang = """
     def matmul(float(M,N) A, float(N,K) B) -> (output) {
       output(i, j) +=! A(i, kk) * B(kk, j)
     }
     """
     matmul = tc.define(lang, name="matmul")
     mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
     out = matmul(mat1, mat2)
Example #30
 def test_autotuner_tuple_size_cache_to_default(self):
     lang = MATMUL_LANG
     matmul = tc.define(lang, name="matmul")
     matmul.autotune((3, 4), (4, 5),
                     cache=True,
                     **tc.small_sizes_autotuner_settings)
     matmul.autotune((100, 400), (400, 500),
                     cache=True,
                     **tc.autotuner_settings)
Example #31
def autotune(cache_file='tc_cache'):
    print("Starting autotune")
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    best_opts = sgemm.autotune(A, B,
            cache=cache_file,
            generations=25,
            pop_size=50,
            crossover_rate=70,
            number_elites=5,
            gpus="1,2,3")
    print("Done autotune")
    print(sorted(test(best_opts, 20))[10])
    return best_opts
Example #32
def test(options, n):
    # run the given options n times (plus one warmup run) and return the running times
    torch.cuda.synchronize()
    torch.cuda.synchronize()
    sgemm = tc.define(lang, name="sgemm")
    times = []
    A_, B_, C_ = torch.randn(M, K), torch.randn(K, N), torch.randn(M, N)
    print("")
    for i in range(n + 1):
        print("\033[Frunning test: {}/{}".format(i, n))
        A, B, C = A_.clone(), B_.clone(), C_.clone()
        t1 = time.perf_counter()
        A_cuda, B_cuda, C_cuda = A.cuda(), B.cuda(), C.cuda()
        torch.cuda.synchronize()
        sgemm(A_cuda, B_cuda, outputs=C_cuda, options=options)
        torch.cuda.synchronize()
        C_res = C_cuda.cpu()
        torch.cuda.synchronize()
        if i > 0: # The first run is warmup
            times.append(time.perf_counter() - t1)
    return times
Example #33
def load_cache(cache_file='tc_cache'):
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    # Couldn't find a reasonable way to load cache:
    return sgemm.autotune(A, B, cache=cache_file, generations=0)
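The three helpers above (autotune, test, load_cache) assume module-level lang, M, K, and N. A minimal sketch of those definitions is shown below; the sizes and the exact TC body are assumptions.

import time

import torch
import tensor_comprehensions as tc

M, K, N = 1024, 1024, 1024   # assumed problem sizes
lang = """
def sgemm(float(M, K) A, float(K, N) B) -> (C) {
    C(m, n) +=! A(m, r_k) * B(r_k, n)
}
"""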