Example #1
def test_correctness_layout_rewrite_insert_transform_stage():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N),
                                      target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(
            inp.state,
            layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage)

        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
        np_args = [
            np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype)
            for x in bufs
        ]

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        tvm.testing.assert_allclose(args[0].asnumpy(),
                                    args_ref[0].asnumpy(),
                                    atol=1e-3,
                                    rtol=1e-3)
        tvm.testing.assert_allclose(args[1].asnumpy(),
                                    args_ref[1].asnumpy(),
                                    atol=1e-3,
                                    rtol=1e-3)
        tvm.testing.assert_allclose(args[2].asnumpy(),
                                    args_ref[2].asnumpy(),
                                    atol=1e-3,
                                    rtol=1e-3)
        del measure_ctx
Example #2
    def __init__(self, task, **kwargs):
        self.task = task
        self.measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            min_repeat_ms=300)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = task.func()
            return arg_bufs

        self.auto_task = auto_scheduler.create_task(auto_template, (),
                                                    task.target)
def get_sample_records(number):
    """Generate a list of random MeasureInput and MeasureResult pairs"""
    N = 128
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), "llvm")
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    states = policy.sample_initial_population()[:number]

    inputs = [auto_scheduler.MeasureInput(task, s) for s in states]
    results = [
        auto_scheduler.MeasureResult([np.random.uniform(0.5, 1.0)], 0, "", 0.1, 0)
        for _ in range(len(inputs))
    ]

    return task, inputs, results
def test_task_scheduler_round_robin():
    tasks = []
    for n in [2, 4, 8]:
        tasks.append(
            auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n),
                                       "llvm"))

    def objective_func(costs):
        return sum(costs)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks,
                                                      objective_func,
                                                      strategy="round-robin")
        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the result of round robin
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        for task in tasks:
            assert counters[task.workload_key] == num_trials_per_task

        # test continuous tuning (restoring the status)
        task_scheduler = auto_scheduler.TaskScheduler(tasks,
                                                      objective_func,
                                                      strategy="round-robin",
                                                      load_log_file=log_file)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        task_scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx
Example #5
def test_measure_local_builder_runner():
    if not tvm.testing.device_enabled("llvm"):
        return

    task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm")

    for enable_cpu_cache_flush in [True, False]:
        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        local_builder = auto_scheduler.LocalBuilder()
        local_runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
        )

        bress = local_builder.build([minp])
        assert bress[0].error_no == 0
        mress = local_runner.run([minp], bress)
        assert mress[0].error_no == 0
def test_mutate_parallel():
    """
    The test case initializes the evolutionary search with a batch of "bad" states and checks whether
    the search algorithm can find "good" states by mutating the "bad" states.
    """
    class MockCostModel(PythonBasedModel):
        @staticmethod
        def is_good_state(state):
            for line in str(state).split("\n"):
                if (line.find("parallel i.0@ (0") != -1
                        or line.find("parallel [email protected]@ (0") != -1
                        or line.find("parallel [email protected]@i.1@ (0") != -1):
                    return True
            return False

        def predict(self, task, states):
            scores = []
            for state in states:
                scores.append(1 if self.is_good_state(state) else 0)
            return scores

    task = auto_scheduler.create_task(matmul_auto_scheduler_test,
                                      (1024, 1024, 1024), "llvm")
    policy = auto_scheduler.SketchPolicy(task,
                                         program_cost_model=MockCostModel(),
                                         verbose=0)

    found = False
    retry_ct = 0
    while retry_ct < 10 and not found:
        states = policy.sample_initial_population()[:100]
        bad_states = []
        for state in states:
            if not MockCostModel.is_good_state(state):
                bad_states.append(state)

        new_states = policy.evolutionary_search(bad_states, 50)
        for state in new_states:
            if MockCostModel.is_good_state(state):
                found = True
                break
        retry_ct += 1

    assert found
def test_task_scheduler_gradient():
    tasks = []
    for n in [2, 4]:
        tasks.append(
            auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n),
                                       "llvm"))

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func=objective_func)

        # Forcibly rewrite the initial values.
        # This makes the test more stable on slow CI machines.
        task_scheduler.best_costs = np.array([1e2, 1e-8])

        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the allocation results
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        assert counters[tasks[0].workload_key] == n_trials - 1
        assert counters[tasks[1].workload_key] == 1
        del measure_ctx
Example #8
def test_recover_measure_input():
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm")

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        correct_inp = auto_scheduler.measure_record.recover_measure_input(raw_inp)
        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)

        correct_inp = auto_scheduler.measure_record.recover_measure_input(
            raw_inp, rebuild_state=True
        )
        assert str(correct_inp.state) == str(inp.state)
    k = te.reduce_axis((0, L), name="k")
    matmul = te.compute((N, M),
                        lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
                        name="matmul")
    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")

    return [A, B, C, out]


######################################################################
# Create the search task
# ^^^^^^^^^^^^^^^^^^^^^^
# We then create a search task with N=L=M=128 and dtype="float32"

target = tvm.target.Target("llvm")
task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"),
                                  target)

# Inspect the computational graph
print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler.
#
# * `num_measure_trials` is the number of measurement trials we can use during the search.
#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
#   good value for the search to converge. You can do more trials according to your time budget.
# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`.
#   The measurement records can be used to query the history best, resume the search,
#   and do more analyses later.
# * See :any:`auto_scheduler.TuningOptions` for more parameters; a minimal setup is sketched below.
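######################################################################
# A minimal sketch of such a setup, reusing the ``task`` created above.
# The log file name and the small trial count are illustrative values
# for a quick demonstration, not requirements of the API.

log_file = "matmul.json"  # illustrative record file name
tune_options = auto_scheduler.TuningOptions(
    num_measure_trials=10,  # use ~1000 trials for the search to converge
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)
# Run the search and dump measurement records to the log file
sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_options)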
Example #10
def search_common(
    workload=matmul_auto_scheduler_test,
    target="llvm",
    search_policy="empty",
    seed=random.randint(1, 1 << 30),
    runner="local",
    cost_model=auto_scheduler.RandomModel(),
    num_measure_trials=10,
    init_search_callbacks=None,
):
    print("Test %s schedule search with the default search policy" % (target))

    random.seed(seed)
    N = 128
    target = tvm.target.Target(target)
    task = auto_scheduler.create_task(workload, (N, N, N), target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        init_search_callbacks = init_search_callbacks or []
        init_search_callbacks.append(
            auto_scheduler.PreloadMeasuredStates(log_file))

        if search_policy == "empty":
            search_policy = auto_scheduler.EmptyPolicy(task)
        elif search_policy == "sketch":
            search_policy = auto_scheduler.SketchPolicy(
                task,
                program_cost_model=cost_model,
                init_search_callbacks=init_search_callbacks)

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        sch, args = auto_scheduler.auto_schedule(task, search_policy,
                                                 tuning_options)
        print("*" * 80)
        print(target)
        print("*" * 80)
        inp, res = auto_scheduler.load_best(log_file, task.workload_key,
                                            target)

        print("==== Python Code ====")
        print(task.compute_dag.print_python_code_from_state(inp.state))

        try:
            print("==== Lowered Stmt ====")
            print(tvm.lower(sch, args, simple_mode=True))
            mod = tvm.build(sch, args, target)

            ctx = tvm.context(str(target), 0)
            dtype = task.compute_dag.tensors[0].dtype
            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
            mod(a, b, c)
            tvm.testing.assert_allclose(c.asnumpy(),
                                        np.dot(a.asnumpy(), b.asnumpy()),
                                        rtol=1e-5)
            print("==== Verification passed ====")
        except Exception:
            raise Exception("Error encountered with seed: %d" % (seed))
    print()
Example #11
def test_layout_rewrite_correctness():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N),
                                      target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner="local",
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(inp.state, layout_rewrite=True)
        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state,
                                                     layout_rewrite=False)
        np_args = [
            np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype)
            for x in bufs
        ]
        np_args_ref = [np.array(x) for x in np_args]

        weight = np_args_ref[1]
        # infer shape for the rewritten layout
        if len(weight.shape) >= 6:
            # For cpu tile structure SSRSRS
            base = len(weight.shape) - 6
            red_dim = weight.shape[2 + base] * weight.shape[4 + base]
            out_dim = weight.shape[3 + base] * weight.shape[5 + base]
            for i in range(base + 2):
                out_dim *= weight.shape[i]
            new_order = ([
                2 + base,
                4 + base,
            ] + list(range(base + 2)) + [
                3 + base,
                5 + base,
            ])
            np_args_ref[1] = np_args_ref[1].transpose(new_order)
            np_args_ref[1] = np_args_ref[1].reshape((red_dim, out_dim))

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        # Compare the outputs computed on device; the host numpy copies are never written to.
        np.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
        np.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
Example #12
def main_compute(code_only=False):
  tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
  logging.getLogger('autotvm').setLevel(logging.DEBUG)
  logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

  default_tune_op = importlib.import_module('templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
  if verbose:
    print('  >> Backend = %s, Python PID = %s, Task = %s;' % (backend, os.getpid(), default_tune_op.__name__))

  task = autotvm.task.create("template_op", args=(), target=tvm_target)

  def json_to_config(json_dict, index=-1, code_hash=None):
    if not isinstance(json_dict, list):
      json_list = []
      for key in json_dict:
        json_list.append([key, 'ot' if type(json_dict[key]) is not list else ('sp' if json_dict[key][0:1] == [-1] else 're'), json_dict[key]])
      json_dict = json_list
    config = ConfigEntity.from_json_dict({"index": index, "time": "", "code_hash": code_hash, "entity": json_dict})
    # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
    return config

  def config_to_json(config):
    if config is None:
      return {}
    if isinstance(config, str):
      return json.loads(config)
    jobj = config.to_json_dict()['entity']
    # jobj = config.to_json_dict()['e']
    json_dict = dict()
    for i in range(len(jobj)):
      assert(jobj[i][1] in ['sp', 'ot', 're'])
      json_dict[jobj[i][0]] = jobj[i][2]
    return json_dict

  num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

  config = os.environ.get('CONFIG', '').strip()
  if config != '':
    best_config = config
  elif 'NNI_TRIAL_JOB_ID' in os.environ:
    if os.environ['NNI_TRIAL_JOB_ID'] == '@':
      search_space = get_search_space(task.config_space)
      json_space = json.dumps(search_space)
      dump_to_file='./search_space.json'
      print("\n>> Writing Search Space to '%s', Search Space = %s;" % (dump_to_file, json_space))
      with open("search_space.json", "w") as fp:
        fp.write(json_space)
      sys.exit(0)

    try:
      import nni
      params_given = nni.get_next_parameter()
      if params_given is None:
        raise
      local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
    except:
      params_given = default_tune_op.get_choice_example()
      local_dir_id = '_'
    t = run_config_entity(params_given, local_dir_id)
    gflops = compute_gflops(task.flop, t)
    print('[Antares-engine] Final entity result is: %g' % gflops)
    try:
      nni.report_final_result(gflops)
    except:
      print('[Antares-engine] (not reporting final result to NNI.)')
    exit(0)

  elif num_trials > 0:
    dev_num = platform_config.get_execution_parallism()
    if dev_num <= 0:
        raise Exception("No valid device found for backend: %s." % backend)
    batch_size = int(os.environ.get('BATCH', '16'))

    from concurrent.futures import ThreadPoolExecutor
    try:
      if platform_config.allow_concurrent_compile_execution():
        raise Exception()
      worker_size = 1
    except:
      worker_size = batch_size
    thread_pool = ThreadPoolExecutor(max_workers=worker_size)

    task.antares_helper = Mock()
    task.antares_helper.json_to_config = json_to_config
    task.antares_helper.config_to_json = config_to_json
    task.antares_helper.to_json_search_space = get_search_space

    tuner_type = os.environ.get('TUNER', '')
    if not tuner_type:
      comp = os.environ['COMPUTE_V1']
      if '=!' in comp and 'plan/' not in comp[comp.find(' ##') + 1:] and ';' not in comp and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']:
        tuner_type = 'AutoTVM2'
      else:
        tuner_type = 'XGBoost'
    print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' % (worker_size, batch_size, dev_num, tuner_type))

    auto_commit = os.environ.get('COMMIT', '')
    if auto_commit:
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None and auto_commit != 'force':
        raise Exception("Saved code has existed in codehub. Please try COMMIT=force to override it.")
      os.environ.pop('COMMIT')

    try:
      tuner = importlib.import_module('tuner.%s.main' % tuner_type)
      tuner = tuner.MainTuner(task)
    except:
      raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

    if tuner is not None:
      AntaresGlobal.current_step = 0

      def measure_batch(inputs):
        results, futures = [], []
        best_slot = -1
        expected_timecost = tuner.task.best.timecost
        for i in range(len(inputs)):
          futures.append(thread_pool.submit(run_config_entity, config_to_json(inputs[i].config), AntaresGlobal.current_step + i + 1, expected_timecost, i % dev_num))
        for i in range(len(inputs)):
          t = futures[i].result()
          if t < tuner.task.best.timecost:
            best_slot = AntaresGlobal.current_step + i + 1
            tuner.task.best.timecost = t
            tuner.task.best.config = inputs[i].config
            tuner.task.best.occur = best_slot
          results.append(autotvm.measure.MeasureResult(costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
        AntaresGlobal.current_step += len(results)

        print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
          AntaresGlobal.current_step,
          num_trials,
          json.dumps(config_to_json(tuner.task.best.config)),
          compute_gflops(tuner.task.flop, tuner.task.best.timecost),
          compute_mem_ratio(tuner.task.best.timecost),
          tuner.task.best.occur))

        if auto_commit and best_slot >= 0:
          with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
            device_source = fp.read()
          with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
            t = float(fp.read().split()[0])
          kernel_path = codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, best_slot))
          print('  >> Update current code to codehub: %s' % kernel_path)
        return results

      tuner.task.best = Mock()
      tuner.task.best.timecost = float('inf')
      tuner.task.best.config = None
      tuner.task.best.occur = -1

      tuner.measure_batch = measure_batch
      tuner.measure_batch.n_parallel = batch_size
      callbacks = []

      history_log_for_transfer_learning = os.environ.get('RECORD', '')

      if history_log_for_transfer_learning:
        callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
        # Enable Transfer Learning for Incremental Task
        if os.path.exists(history_log_for_transfer_learning):
          print('  >>  Loading incremental history from log file: %s ..' % history_log_for_transfer_learning)
          tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

      tuner.tune(n_trial=num_trials, measure_option=autotvm.measure_option(
          builder=autotvm.LocalBuilder(n_parallel=batch_size),
          runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
      ), callbacks=callbacks)
      assert not math.isinf(tuner.task.best.timecost), "No valid config found during the whole tuning."
      best_config = json.dumps(config_to_json(tuner.task.best.config))

      if auto_commit:
          device_source = codehub_db(os.environ['COMPUTE_V1'])
          codehub_db(os.environ['COMPUTE_V1'], source_code=device_source + '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

      print("\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
        best_config,
        compute_gflops(tuner.task.flop, tuner.task.best.timecost),
        tuner.task.best.occur,
        num_trials,
        tuner.task.best.timecost))

      if hasattr(tuner, 'cleanup'):
        tuner.cleanup()
    else:
      raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
    exit(0)
  else:
    if os.environ['OP'] == 'auto.generic':
      saved_code = codehub_db(os.environ['COMPUTE_V1'])
      if saved_code is not None:
        print("  >> Using Saved Code from Codehub:")
        print("===========================")
        print(saved_code)
        print("===========================")
        exit(0)
    best_config = ''

  assert isinstance(best_config, str)
  if verbose:
    print("====>> [Current Config Option]", best_config)
  if best_config.startswith('['):
    from tvm import auto_scheduler
    origin_cfg = json.loads(best_config)
    origin_cfg = {
      "i": [['["main_compute.<locals>.auto_template"]', 'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                device_properties().max_threads_per_block, device_properties().warp_size
             )], origin_cfg],
      "r": [[0], 0, 0, 0],
      "v": "v0.2",
    }
    origin_cfg_file = local_get_dir_file('my_kernel.cfg')
    with open(origin_cfg_file, 'w') as fp:
      fp.write(json.dumps(origin_cfg))
    origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)
 
    @auto_scheduler.register_workload
    def auto_template():
      _, arg_bufs = default_tune_op.get_template_op()
      return arg_bufs

    target = tvm.target.Target("cuda")
    auto_task = auto_scheduler.create_task(auto_template, (), target)
    for inp, res in origin_cfg:
      s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
      break
  else:
    config = json_to_config(json.loads(best_config)) if best_config else task.config_space
    with ApplyConfig(config):
      with tvm.target.Target(tvm_target):
        s, arg_bufs = default_tune_op.get_template_op()

  device_source, kernel_path = get_target_source(s, arg_bufs)

  if code_only:
    return device_source

  if verbose:
    print("====================================")
    print(device_source)
    print("====================================\n")

  dev_id = int(os.environ.get('DEV_KEY', '0'))
  result = evaluate_perf(kernel_path, task.flop, dev_id)
  exit(0 if result is not None else 1)
Example #13
    # [[4, 512, 7, 7],  [256, 512, 3, 3],  [2, 2], [1, 1], [2, 2]],
    # [[1, 512, 14, 14],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [[1, 512, 28, 7],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [[1, 512, 7, 28],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [[2, 512, 7, 14],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [[2, 512, 14, 7],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [[4, 512, 7, 7],  [256, 512, 3, 3],  [1, 1], [1, 1], [1, 1]],
    # [512, 768, 768, 1],
    [512, 768, 768, 3],
]

for input_task in input_tasks:
    M, N, K, B = input_task
    # task = auto_scheduler.create_task(
    #     conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding, dilation), target)
    task = auto_scheduler.create_task(bmm_layer, (M, N, K, B), target)
    tasks.append(task)

# for input_task in input_tasks:
#     [[N, CI, H, W], [CO, _, KH, KW], padding, strides, dilation] = input_task
#     assert(CI == _)
#     task = auto_scheduler.create_task(
#         conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding, dilation), target)
#     tasks.append(task)

print('# of tasks = %d' % (len(tasks)))
# Inspect the computational graph
# print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler. These parameters
Example #14
def main_compute(code_only=False):
    tvm_target = 'cuda'
    tvm.register_func('tvm_callback_cuda_compile',
                      compile_source,
                      override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('templates.' + (
        os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    print('  >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key, 'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index,
            "time": "",
            "code_hash": code_hash,
            "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert (jobj[i][1] in ['sp', 'ot', 're'])
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        if config[0] != '[':
            params_given = json.loads(config)
            print("====>> [Current Config Option]", config)
            best_config = json_to_config(params_given)
        else:
            best_config = config

    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" %
                  (dump_to_file, json_space))
            with open("search_space.json", "w") as fp:
                fp.write(json_space)
            sys.exit(0)

        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)

    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', 'XGBoost')
        print('  >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code has existed in codehub. Please try COMMIT=force to overide it."
                )
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           config_to_json(inputs[i].config), i,
                                           expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = i
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = tuner.task.best.curr_step + i + 1
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                tuner.task.best.curr_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, Occur Step = %d;'
                    %
                    (tuner.task.best.curr_step, num_trials,
                     json.dumps(config_to_json(tuner.task.best.config)),
                     compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                     tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot),
                              'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot),
                              'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' %
                        (t, tuner.task.best.curr_step))
                    print('  >> Update current code to codehub: %s' %
                          kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1
            tuner.task.best.curr_step = 0

            tuner.measure_batch = measure_batch
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(
                        history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(
                        '  >>  Loading incremental history from log file: %s ..'
                        % history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(
                            history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3,
                                                      min_repeat_ms=100,
                                                      timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(
                tuner.task.best.timecost
            ), "Not valid config found in the whole tuning."
            best_config = tuner.task.best.config

            print(
                "\n[Best Config] CONFIG='%s'  ==>  Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                %
                (json.dumps(config_to_json(best_config)),
                 compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                 tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print("  >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = task.config_space

    if isinstance(best_config, str):
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [[
                '["main_compute.<locals>.auto_template"]',
                'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d'
                % (device_properties().max_threads_per_block,
                   device_properties().warp_size)
            ], origin_cfg],
            "r": [[0], 0, 0, 0],
            "v":
            "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(
            origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(
                inp.state)
            break
    else:
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

        lower_file = local_get_dir_file('my_kernel.lower')
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify the lowered code
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Not Support Multi Unfuse-able kernels]\n\n' +
                            lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties(
        ).max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, '[Error] Invalid device properties; maybe the device was not detected correctly.'

        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith(
                    'attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(
                    ll.split(' "thread_extent" = ')[-1].split(';')
                    [0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find(
                    '.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.product(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[
                    thread_name] == thread_val, "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
            num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
                shared_memory_inc = int(
                    custom_dtypes[type_name][-1].split('@')[-1])
            else:
                shared_memory_inc = 8 * np.dtype(allocate_type).itemsize
            assert shared_memory_inc % 8 == 0, "The shared memory bit width is not a multiple of 8 bits."
            shared_memory_in_bytes += shared_memory_inc // 8 * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                % (shared_memory_in_bytes, max_shared_memory_per_block))

        # Compile Source Code
        def build_template():
            return tvm.build(s, arg_bufs, tvm_target, name='template_op')

        func = wait_for(build_template, 30)

    assert (len(func.imported_modules) == 1)
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    print("====================================")
    print(device_source)
    print("====================================")

    print()
    try:
        eval_client = importlib.import_module('platforms.%s.evaluator.client' %
                                              backend)
    except ModuleNotFoundError:
        print('>> Evaluator for backend %s not found, skipping evaluation.' %
              backend)
        exit(0)
    except:
        traceback.print_exc()
        exit(1)

    def handle_result(result):
        print('\n[EvalAgent] Results =', json.dumps(result))
        if 'RESULT' in os.environ:
            if abs(float(os.environ['RESULT']) / result['K/0'] - 1.0) > 1e-6:
                result['TPR'] = None

        t = result.get('TPR', None)
        if t is None:
            print("\n[Antares] Incorrect compute kernel from evaluator.")
        else:
            gflops = compute_gflops(task.flop, t)
            print("\n[Antares] Average time cost / run = %g sec, %g gflops." %
                  (t, gflops))
            with open(local_get_dir_file('result.txt'), 'w') as fp:
                fp.write(str(t) + '\n')
                if 'K/0' in result:
                    fp.write(str(result['K/0']) + '\n')
        if os.environ['OP'] == 'auto.generic' and os.environ.get('COMMIT', ''):
            kernel_path = codehub_db(os.environ['COMPUTE_V1'],
                                     source_code=device_source +
                                     '\n// Saved Perf = %g sec / run' % t)
            print('  >> Update current code to codehub: %s' % kernel_path)

    tune_slot_id = int(os.environ.get(unified_slot_key, '0'))

    exec_fd, _ = system_lock([tune_slot_id])
    try:
        expected_timeout = None
        if 'EXPECTED_TIMEOUT' in os.environ and not math.isinf(
                float(os.environ['EXPECTED_TIMEOUT'])):
            expected_timeout = float(os.environ['EXPECTED_TIMEOUT'])
            expected_timeout = max(expected_timeout * 1.1,
                                   expected_timeout + 0.1)

        results = eval_client.eval(
            kernel_path=local_get_dir_file('my_kernel.cc'),
            expected_timeout=expected_timeout,
            func=func,
        )
    except:
        traceback.print_exc()
        exit(1)

    handle_result(results)
    exec_fd()
    exit(0)
Example #15
def create_auto_task(tvm_target):
    return auto_scheduler.create_task(auto_template, (), tvm_target)
    bias = te.placeholder((1, CO, 1, 1), name="bias")
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32")
    out = topi.nn.relu(conv + bias)
    return [data, kernel, bias, out]


######################################################################
# Create the search task
# ^^^^^^^^^^^^^^^^^^^^^^
# We then create a search task for the last convolution layer in the resnet.

target = tvm.target.Target("cuda")

# Use the last layer in ResNet-50
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
task = auto_scheduler.create_task(conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding), target)

# Inspect the computational graph
print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler. These parameters
# mainly specify how we do the measurement during the search.
#
# * :code:`measure_ctx` launches a different process for measurement to
#   provide isolation. It can protect the master process from GPU crashes
#   during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
#   This can warm up the GPU, which is necessary to get accurate measurement results.
#   Typically, we recommend a value > 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
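######################################################################
# A rough sketch of how these measurement options fit together for the
# CUDA task created above; the trial count and log file name are
# illustrative values for a short demonstration.

measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,  # small value for demonstration only
    runner=measure_ctx.runner,  # measure in a separate process for isolation
    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
)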
Example #17
def generate_sketches(workload_func, args, target, print_for_debug=False):
    task = auto_scheduler.create_task(workload_func, args, tvm.target.Target(target))
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    return policy.generate_sketches(print_for_debug)