def get_options_from_kwargs_and_tuner_cache(name, cache_file, options_cache,
                                            *inputs, **kwargs):
    options = None
    if "options" in kwargs and kwargs["options"] is not None:
        options = kwargs["options"]
        assert "type" in kwargs, "tuning layer type not specified: forward/backward"
        # if separate options were passed for forward/backward, pick the one
        # matching the requested pass; otherwise the same options are used for both
        if isinstance(options, list) and len(options) == 2:
            options = options[0] if kwargs["type"] == "forward" else options[1]
    elif cache_file and isinstance(kwargs.get("cache"), str):
        options = get_options_from_cache_file(name, *inputs, **kwargs)
    elif options_cache and options_cache.get(kwargs.get("type")) is not None:
        options = options_cache[kwargs["type"]]
        logger.info(
            "Kernel was previously tuned; seeding the current tuning with those mapping options"
        )

    if options is None:
        options = Options("naive")
        logger.warning(
            "Using 'naive' mapping options for autotuning. See help(your_layer.autotune) for how to set mapping options."
        )
    if not isinstance(options, Options):
        options = Options(options)
    return options
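
# Hedged usage sketch (not part of the original module): it illustrates the
# fallback order implemented above -- an explicit kwargs["options"] wins, then
# the tuner cache file, then the in-memory options_cache, and finally
# Options("naive"). `some_inputs` and the literal option values are
# illustrative placeholders, not names from the original source.
def _sketch_options_fallback(some_inputs):
    opts = get_options_from_kwargs_and_tuner_cache(
        "matmul", "", {"forward": None}, *some_inputs,
        options=[Options("mlp"), Options("naive")], type="backward")
    # a [forward, backward] pair plus type="backward" selects the second
    # entry, so opts is Options("naive") here
    return opts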


def get_options_from_kwargs(name, *inputs, **kwargs):
    # options may be a [forward, backward] pair (when training)
    # or a single Options object (forward only)
    options = None
    if "options" in kwargs and kwargs["options"] is not None:
        options = kwargs["options"]
        assert "type" in kwargs, "layer type not specified: forward/backward"
        if isinstance(options, list) and len(options) == 2:
            options = options[0] if kwargs["type"] == "forward" else options[1]
        elif "training" in kwargs and kwargs["training"] and kwargs[
                "type"] == "backward":
            logger.warning(
                'Same mapping options will be used to run backward layer, please pass backward mapping options for better performance.'
            )
    elif "cache" in kwargs and kwargs["cache"]:
        options = get_options_from_cache_file(name, *inputs, **kwargs)
    elif "options_cache" in kwargs and kwargs["options_cache"]:
        options_cache = kwargs["options_cache"]
        assert kwargs.get("type") is not None, \
            "layer type not specified: forward/backward"
        options = options_cache[kwargs["type"]]
        logger.info("Tuned kernel options found; using those options")

    if options is None:
        options = Options("naive")
        logger.warning(
            "No mapping options passed; 'naive' mapping options will be used "
            "and will likely perform poorly. See help(your_layer.__call__) "
            "for how to set mapping options."
        )
    if not isinstance(options, Options):
        options = Options(options)
    return options
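
# Hedged usage sketch (not part of the original module): with a single
# Options object and training=True, the helper reuses the forward options
# for the backward pass and emits the warning above. `some_inputs` is an
# illustrative placeholder for real input tensors.
def _sketch_single_options_training(some_inputs):
    opts = get_options_from_kwargs(
        "matmul", *some_inputs,
        options=Options("mlp"), type="backward", training=True)
    # opts is the same Options("mlp") that the forward pass uses
    return opts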


    def test_kru_backward_pass_options(self):
        lang = KRU3_1_TRAINING
        forward_options = Options("naive")
        backward_options = Options("naive")
        KRU3_1 = tc.define(lang,
                           training=True,
                           name="KRU3_1",
                           backward="KRU3_1_GRAD")
        X = Variable(torch.randn(256, 16, 16, 16).cuda(), requires_grad=True)
        W2 = Parameter(torch.randn(32, 16).cuda())
        out = KRU3_1(W2, X, options=[forward_options, backward_options])
        out[0].sum().backward()

    def test_kru_train_autotune_manual_options_both(self):
        lang = KRU3_1_TRAINING
        forward_options, backward_options = Options("mlp"), Options("mlp")
        KRU3_1 = tc.define(lang,
                           training=True,
                           name="KRU3_1",
                           backward="KRU3_1_GRAD")
        W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
        options = KRU3_1.autotune(W2,
                                  X,
                                  options=[forward_options, backward_options],
                                  cache=True,
                                  **tc.autotuner_default_options)

    def tune_and_store(self, tc_name, inputs, mapping_options, cache_file=""):
        # coerce plain option specifications into an Options object before tuning
        options = mapping_options
        if not isinstance(options, Options):
            options = Options(options)
        try:
            # seed the autotuner with the given options; fall back to the
            # seed options if tuning fails
            best_options = self.autotuner.tune(cache_file, tc_name, inputs,
                                               options, [options])
            return best_options
        except RuntimeError:
            return options

    def test_autotuner_start_options_and_run_kernel(self):
        lang = MATMUL_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(100, 400).cuda(), torch.randn(400, 500).cuda()
        options = Options("mlp")
        best_options = matmul.autotune(mat1,
                                       mat2,
                                       cache=True,
                                       options=options,
                                       **tc.autotuner_default_options)
        out = matmul(mat1, mat2, options=best_options)

    def tune_and_store(self, tc_name, inputs, mapping_options, cache_file=""):
        options = mapping_options
        if not isinstance(options, Options):
            options = Options(options)
        try:
            best_options = self.autotuner.tune(cache_file, tc_name, inputs,
                                               options, [options])
            return best_options
        except Exception as e:
            # log the tuning failure and fall back to the seed options
            logger.error('Raised exception: {}'.format(e))
            return options

    def test_manual_options(self):
        lang = MATMUL_LANG
        matmul = tc.define(lang, name="matmul")
        mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
        options = Options("naive")
        out = matmul(mat1, mat2, options=options)

    def test_manual_options(self):
        lang = MATMUL_KRU1_LANG
        KRU3_1 = tc.define(lang, name="KRU3_1")
        W2, X = torch.randn(32, 16).cuda(), torch.randn(256, 16, 16, 16).cuda()
        options = Options("naive")
        out = KRU3_1(W2, X, options=options)

    def test_KRU3(self):
        # define TC
        lang = """
        def KRU3_1(float(D2, N2) W2, float(M, N0, N1, N2) X) -> (XW2) {
           XW2(m, n0, n1, d2)   +=! X(m, n0, n1, n2_red) * W2(d2, n2_red)
        }
        def KRU3_2(float(D1, N1) W1, float(M, N0, N1, D2) XW2) -> (XW2W1) {
           XW2W1(m, n0, d1, d2) +=! XW2(m, n0, n1_red, d2) * W1(d1, n1_red)
        }
        def KRU3_3(float(D0, N0) W0, float(M, N0, D1, D2) XW2W1) -> (Y) {
           Y(m, d0, d1, d2)     +=! XW2W1(m, n0_red, d1, d2) * W0(d0, n0_red)
        }
        """

        # create input tensors
        M, D0, D1, D2, N0, N1, N2, max_factors = 256, 32, 32, 32, 16, 16, 16, 3
        W0 = torch.randn(D0, N0).cuda()
        W1 = torch.randn(D1, N1).cuda()
        W2 = torch.randn(D2, N2).cuda()
        X = torch.randn(M, N0, N1, N2).cuda()

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(True)
        options.usePrivateMemory(True)
        options.tile([4, 1, 1, 8, 16])
        options.mapToBlocks([64, 16, 16])
        options.mapToThreads([8, 4, 8])
        options.unroll(128)

        # create TC compilation unit object and define the TC language
        cu = TcCompilationUnit()
        cu.define(lang)

        print("Running KRU3_1")
        inputs1 = [W2, X]
        outputs1 = cu.compile_and_run("KRU3_1", inputs1, options=options)

        print("Running KRU3_2")
        XW2 = outputs1[0]
        inputs2 = [W1, XW2]
        outputs2 = cu.compile_and_run("KRU3_2", inputs2, options=options)

        print("Running KRU3_3")
        XW2W1 = outputs2[0]
        inputs3 = [W0, XW2W1]
        outputs3 = cu.compile_and_run("KRU3_3", inputs3, options=options)
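
        # Hedged sanity check (an addition, not in the original test; assumes
        # a PyTorch version that provides torch.einsum): replay the three
        # contractions with einsum and compare against the final TC output.
        expected = torch.einsum('ck,mijk->mijc', W2, X)
        expected = torch.einsum('bj,mijc->mibc', W1, expected)
        expected = torch.einsum('ai,mibc->mabc', W0, expected)
        max_diff = (outputs3[0] - expected).abs().max()
        print("Max abs deviation from the einsum reference:", max_diff)
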
    def test_batchmatmul(self):
        # define TC
        lang = """
        def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
          Z(b, n, k) +=! X(b, n, mm) * Y(b, mm, k)
        }
        """

        # create input tensors
        B, K, M, N = 500, 26, 72, 26
        X = torch.randn(B, N, M).cuda()
        Y = torch.randn(B, M, K).cuda()
        inputs = [X, Y]

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(True)
        options.usePrivateMemory(True)
        options.unrollCopyShared(True)
        options.outerScheduleFusionStrategy("Preserve3Coincident")
        options.fixParametersBeforeScheduling(True)
        options.tile([1])
        options.tileImperfectlyNested(False)
        options.mapToBlocks([72, 16, 1])
        options.mapToThreads([7, 26])
        options.unroll(128)

        # run with TC, get the outputs and check against reference implementation
        outputs = self.check(lang, "batch_matmul", options, inputs)
    def test_tmm(self):
        # define TC
        lang = """
        def tmm(float(M,K) A, float(N,K) B) -> (C) {
          C(m, n) +=! A(m, kk) * B(n, kk)
        }
        """

        # create input tensors
        M, N, K = 128, 256, 32
        A = torch.randn(M, K).cuda()
        B = torch.randn(N, K).cuda()
        inputs = [A, B]

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(True)
        options.usePrivateMemory(True)
        options.unrollCopyShared(False)
        options.outerScheduleFusionStrategy("Preserve3Coincident")
        options.fixParametersBeforeScheduling(False)
        options.tile([4, 32])
        options.tileImperfectlyNested(False)
        options.mapToBlocks([64, 128])
        options.mapToThreads([1, 32])
        options.unroll(4)

        # run with TC, get the outputs and check against reference implementation
        outputs = self.check(lang, "tmm", options, inputs)
        expected = torch.mm(A, torch.transpose(B, 0, 1))
        diff = outputs[0] - expected
        self.assert_almost_equal(diff, inputs, M * N, 3e-7)

    def test_options(self):
        print('\nCreating mapping_options')
        options = Options("naive")
        options.useSharedMemory(True)        # promote reused data to shared memory
        options.usePrivateMemory(False)      # no promotion to private (register) memory
        options.unrollCopyShared(False)      # do not unroll shared-memory copies
        options.mapToBlocks([256, 8])        # CUDA grid sizes
        options.mapToThreads([4, 16, 4])     # CUDA block sizes
        options.tile([2, 8, 64, 128])        # loop tile sizes
        options.unroll(128)                  # maximum unroll factor
        options.tileImperfectlyNested(False)
        options.fixParametersBeforeScheduling(False)
        options.scheduleFusionStrategy("Max")
        options.outerScheduleFusionStrategy("Preserve3Coincident")
        print('Mapping options created successfully')

    def test_mlp(self):
        # define TC
        lang = """
        def mlp3(float(B,N) I, float(O,N) W2, float(O) B2, float(P,O) W3, float(P) B3, float(Q,P) W4, float(Q) B4) -> (O2, O3, O4) {
            O2(b, o) +=! I(b, n) * W2(o, n)
            O2(b, o) = O2(b, o) + B2(o)
            O2(b, o) = fmax(O2(b, o), 0)
            O3(b, p) +=! O2(b, o) * W3(p, o)
            O3(b, p) = O3(b, p) + B3(p)
            O3(b, p) = fmax(O3(b, p), 0)
            O4(b, q) +=! O3(b, p) * W4(q, p)
            O4(b, q) = O4(b, q) + B4(q)
            O4(b, q) = fmax(O4(b, q), 0)
        }
        """

        # create input tensors
        B, N, O, P, Q = 128, 128, 64, 32, 2
        I = torch.randn(B, N).cuda()
        W2 = torch.randn(O, N).cuda()
        B2 = torch.randn(O).cuda()
        W3 = torch.randn(P, O).cuda()
        B3 = torch.randn(P).cuda()
        W4 = torch.randn(Q, P).cuda()
        B4 = torch.randn(Q).cuda()
        inputs = [I, W2, B2, W3, B3, W4, B4]

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(False)
        options.usePrivateMemory(False)
        options.unrollCopyShared(True)
        options.outerScheduleFusionStrategy("Max")
        options.fixParametersBeforeScheduling(False)
        options.tile([4])
        options.tileImperfectlyNested(False)
        options.mapToBlocks([128])
        options.mapToThreads([64])
        options.unroll(128)

        # run with TC, get the outputs and check against reference implementation
        outputs = self.check(lang, "mlp3", options, inputs)
    def test_C3(self):
        # define TC
        lang = """
        def _C3(float(B, WX) I, float(WY, WX) W) -> (C3) {
            C3(b, wy) +=! I(b, wxx) * W(wy, wxx)
        }
        """

        # create input tensors
        B, WX, WY = 128, 1000, 1024
        I = torch.randn(B, WX).cuda()
        W = torch.randn(WY, WX).cuda()
        inputs = [I, W]

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(True)
        options.usePrivateMemory(True)
        options.unrollCopyShared(True)
        options.outerScheduleFusionStrategy("Preserve3Coincident")
        options.fixParametersBeforeScheduling(True)
        options.tile([8, 32, 32])
        options.tileImperfectlyNested(False)
        options.mapToBlocks([128, 128])
        options.mapToThreads([1, 32])
        options.unroll(256)

        # run with TC, get the outputs and check against reference implementation
        outputs = self.check(lang, "_C3", options, inputs)
    def test_group_convolution(self):
        # define TC
        lang = """
        def group_convolution(float(N,G,C,H,W) I, float(G,F,C,KH,KW) W1, float(G,F) B)
        -> (O)
        {
          O(n, g, f, h, w) +=! I(n, g, c, h + kh, w + kw) * W1(g, f, c, kh, kw)
          O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)
        }
        """

        # create input tensors
        N, G, C, F, H, W, KH, KW = 32, 32, 32, 32, 7, 7, 3, 3
        tI = torch.randn(N, G, C, H, W).cuda()
        tW = torch.randn(G, F, C, KH, KW).cuda()
        tB = torch.randn(G, F).cuda()
        inputs = [tI, tW, tB]

        # define the mapping_options
        options = Options("naive")
        options.useSharedMemory(True)
        options.usePrivateMemory(False)
        options.unrollCopyShared(True)
        options.outerScheduleFusionStrategy("Preserve3Coincident")
        options.fixParametersBeforeScheduling(False)
        options.tile([1, 1])
        options.tileImperfectlyNested(False)
        options.mapToBlocks([32, 32, 3])
        options.mapToThreads([8, 7, 7])
        options.unroll(256)

        # run with TC, get the outputs and check against reference implementation
        outputs = self.check(lang, "group_convolution", options, inputs)