Exemple #1
0
def init_remote(vta_env, config):
    """Create an RPC session based on the given config."""
    if vta_env.TARGET in ['sim', 'tsim']:
        # To target the simulator, we use a local RPC session as the execution
        # remote.
        remote = rpc.LocalSession()
        return remote, None
    else:
        # Get remote from tracker node if environment variable is set.
        # To set up the tracker, you'll need to follow the "Auto-tuning
        # a convolutional network for VTA" tutorial.
        tracker_host = config.get('tracker_host', None)
        tracker_port = config.get('tracker_port', None)
        # Otherwise if you have a device you want to program directly from
        # the host, make sure you've set the variables below to the IP of
        # your board.
        device_host = config.get('pynq_rpc_host', '192.168.2.99')
        device_port = config.get('pynq_rpc_port', 9091)
        if not tracker_host or not tracker_port:
            remote = rpc.connect(device_host, device_port)
        else:
            remote = autotvm.measure.request_remote(vta_env.TARGET, tracker_host, tracker_port, timeout=10000)

        # Reconfigure the JIT runtime and FPGA.
        # You can program the FPGA with your own custom bitstream
        # by passing the path to the bitstream file instead of None.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
        return remote
Exemple #2
0
def program_rpc_bitstream(path=None):
    """Program the FPGA on the RPC server

    Parameters
    ----------
    path : path to bitstream (optional)
    """
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)
    program_fpga(remote, path)
Exemple #3
0
def program_rpc_bitstream(path=None):
    """Program the FPGA on the RPC server

    Parameters
    ----------
    path : path to bitstream (optional)
    """
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)
    program_fpga(remote, path)
Exemple #4
0
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET not in ["sim", "tsim"]:
             assert tvm.runtime.enabled("rpc")
             program_fpga(remote, bitstream=None)
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     with autotvm.tophub.context(target): # load pre-tuned schedule parameters
         run_gemm(env, remote, target, batch, in_feat, out_feat)
Exemple #5
0
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET not in ["sim", "tsim"]:
             assert tvm.module.enabled("rpc")
             program_fpga(remote, bitstream="/home/did/tvm_did/vta/build/hardware/xilinx/vivado/pynq_1x16_i8w8a32_15_15_18_17/export/vta.bit")
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     with autotvm.tophub.context(target): # load pre-tuned schedule parameters
         for _, wl in resnet_wkls:
             print(wl)
             run_conv2d(env, remote, wl, target)
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET not in ["sim", "tsim"]:
             assert tvm.runtime.enabled("rpc")
             program_fpga(remote, bitstream=None)
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     with autotvm.tophub.context(target):  # load pre-tuned schedule parameters
         for _, wl in dcgan_wklds:
             print(wl)
             run_conv2d_transpose(env, remote, wl, target)
Exemple #7
0
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET != "sim":
             assert tvm.module.enabled("rpc")
             program_fpga(remote, bitstream=None)
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     with autotvm.tophub.context(
             target):  # load pre-tuned schedule parameters
         for _, wl in resnet_wkls:
             print(wl)
             run_conv2d(env, remote, wl, target)
Exemple #8
0
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET not in ["sim", "tsim"]:
             assert tvm.runtime.enabled("rpc")
             program_fpga(remote, bitstream=None)
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     
     for wkls in target_workloads:
         with autotvm.tophub.context(target): # load pre-tuned schedule parameters
             for _, wl in wkls:
                 print(wl)
                 run_conv2d(env, remote, wl, target)
                 # Add to avoid rpc crash
                 time.sleep(1)
 def _run(env, remote):
     if device == "vta":
         target = env.target
         if env.TARGET not in ["sim", "tsim"]:
             assert tvm.module.enabled("rpc")
             custom_bitstream = os.environ.get(
                 "TVM_HOME"
             ) + "/vta/build/hardware/xilinx/vivado/pynq_1x16_i8w8a32_15_15_18_17/export/vta.bit"
             if (not os.path.exists(custom_bitstream)):
                 print("BITSTREAM ERROR: file " + custom_bitstream +
                       " does not exist. Will use a default bitstream...")
                 program_fpga(remote, bitstream=None)
             else:
                 program_fpga(remote, bitstream=custom_bitstream)
             reconfig_runtime(remote)
     elif device == "arm_cpu":
         target = env.target_vta_cpu
     with autotvm.tophub.context(
             target):  # load pre-tuned schedule parameters
         for _, wl in resnet_wkls:
             print(wl)
             run_conv2d(env, remote, wl, target)
    def compile_model(self):
        if device == 'vta':
            self.remote = rpc.connect(self.pynq_addr, 9091)
            vta.reconfig_runtime(self.remote)
            vta.program_fpga(self.remote, bitstream=None)
        else:
            self.remote = rpc.LocalSession()

        self.ctx = self.remote.ext_dev(
            0) if device == 'vta' else self.remote.cpu(0)

        # Load pre-configured AutoTVM schedules
        with autotvm.tophub.context(target):

            # Populate the shape and data type dictionary for ResNet input
            dtype_dict = {'data': 'float32'}
            shape_dict = {'data': (env.BATCH, 3, 224, 224)}

            gluon_model = vision.resnet18_v1(
                pretrained=True, ctx=ctx
            ).features if args.nonsplit else splitnet.resnet18_v1_split(
                self.id + 1)

            # Measure build start time
            build_start = time.time()

            # Start front end compilation
            mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

            # Update shape and type dictionary
            shape_dict.update({k: v.shape for k, v in params.items()})
            dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

            # Perform quantization in Relay
            with relay.quantize.qconfig(global_scale=8.0,
                                        skip_conv_layers=[0]):
                relay_prog = relay.quantize.quantize(mod['main'],
                                                     params=params)

            # Perform graph packing and constant folding for VTA target
            if target.device_name == 'vta':
                assert env.BLOCK_IN == env.BLOCK_OUT
                relay_prog = graph_pack(relay_prog,
                                        env.BATCH,
                                        env.BLOCK_OUT,
                                        env.WGT_WIDTH,
                                        start_name=start_pack,
                                        stop_name=stop_pack)

            # Compile Relay program with AlterOpLayout disabled
            with relay.build_config(opt_level=3,
                                    disabled_pass={'AlterOpLayout'}):
                if target.device_name != 'vta':
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)
                else:
                    with vta.build_config():
                        graph, lib, params = relay.build(
                            relay_prog,
                            target=target,
                            params=params,
                            target_host=env.target_host)

            self.params = params

            # Measure Relay build time
            build_time = time.time() - build_start
            print(f'inference graph for thread {self.id} built in {0:.4f}s!'.
                  format(build_time))

            # Send the inference library over to the remote RPC server
            temp = util.tempdir()
            lib.save(temp.relpath('graphlib.o'))
            self.remote.upload(temp.relpath('graphlib.o'))
            lib = self.remote.load_module('graphlib.o')

            # Graph runtime
            self.m = graph_runtime.create(graph, lib, self.ctx)
Exemple #11
0
    # your board.
    device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
    device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091")
    if not tracker_host or not tracker_port:
        remote = rpc.connect(device_host, int(device_port))
    else:
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                int(tracker_port),
                                                timeout=10000)
    # Reconfigure the JIT runtime and FPGA.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print(
        "Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))

# In simulation mode, host the RPC server locally.
else:
    remote = rpc.LocalSession()

# Get execution context from remote
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

####################################
# Build the inference graph runtime.
# ----------------------------------
# Using Darknet library load downloaded vision model and compile with Relay.
Exemple #12
0
def tune_and_evaluate(tuning_opt):

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack,
                                         stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d, ),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter, hkernel, wkernel,
            hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
Exemple #13
0
def run_through_rpc(measure_input,
                    build_result,
                    number,
                    repeat,
                    min_repeat_ms,
                    cooldown_interval,
                    remote_args,
                    ref_input=None,
                    ref_output=None):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if hasattr(measure_input.target, 'device_name') and \
            measure_input.target.device_name == 'vta':
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime
            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)
        time_f = func.time_evaluator(func.entry_name,
                                     ctx,
                                     number=number,
                                     repeat=repeat,
                                     min_repeat_ms=min_repeat_ms)

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            # create empty arrays on the remote device and copy them once.
            # This can avoid some memory issues that make the measurement results unreliable.
            args = [
                nd.empty(x[0], dtype=x[1], ctx=ctx)
                for x in build_result.arg_info
            ]
            args = [nd.array(x, ctx=ctx) for x in args]
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
        remote.remove('')

        if len(costs
               ) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[:msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[:msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]), )
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost,
                         tstamp)
Exemple #14
0
# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)

# In simulation mode, host the RPC server locally.
elif env.TARGET == "sim":
    remote = rpc.LocalSession()

######################################################################
# Computation Declaration
# -----------------------
# In this example we describe a simple matrix multiplication addition, which
# requires multiple computation stages, as shown in the dataflow diagram below.
# First we describe the input tensors :code:`A` and :code:`B` that are living
# in main memory.
# Second, we need to declare intermediate tensors :code:`A_buf` and
# :code:`B_buf`, which will live in VTA's on-chip buffers.
# Having this extra computational stage allows us to explicitly
Exemple #15
0
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_args,
    ref_input=None,
    ref_output=None,
    enable_cpu_cache_flush=False,
):
    """Run a generated library through rpc

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the generated library.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer to
        its actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be set to 1.
        This is only has effect on CPU task.
    """
    if isinstance(build_result, MeasureResult):
        return build_result

    tic = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)
        # Program the FPGA every single time when targeting VTA
        if (
            hasattr(measure_input.target, "device_name")
            and measure_input.target.device_name == "vta"
        ):
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime

            program_fpga(remote, None)
            reconfig_runtime(remote)
        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)

        # Limitation:
        # We can not get PackFunction directly in the remote mode as it is wrapped
        # under the std::function. We could lift the restriction later once we fold
        # the PackedFunc as an object. Currently, we pass function name to work
        # around it.
        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
        time_f = func.time_evaluator(
            func.entry_name,
            ctx,
            number=number,
            repeat=repeat,
            min_repeat_ms=min_repeat_ms,
            f_preproc=f_prepare,
        )

        # set input
        if ref_input:
            args = [nd.array(x, ctx=ctx) for x in ref_input]
        else:
            try:
                random_fill = remote.get_function("tvm.contrib.random.random_fill")
            except AttributeError:
                raise AttributeError(
                    "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
                )
            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
            for arg in args:
                random_fill(arg)
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
        remote.remove("")

        if len(costs) > 2:  # remove largest and smallest value to reduce variance
            costs = list(costs)
            costs.sort()
            costs = tuple(costs[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        if "Stack trace returned" in msg:
            msg = msg[: msg.index("Stack trace returned")]
        if "CUDA Source" in msg:
            msg = msg[: msg.index("CUDA Source")]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE
    tstamp = time.time()
    time.sleep(cooldown_interval)
    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
Exemple #16
0
# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the config.json file.
if env.TARGET == "pynq":

    # try init
    if os.environ.get('INIT_PYNQ', '').lower() == 'yes':
        print('')
        print('Initializing board ...')

        from tvm import rpc
        from vta import get_bitstream_path, download_bitstream, program_fpga, reconfig_runtime

        assert tvm.module.enabled("rpc")
        remote = rpc.connect(host, port)
        program_fpga(remote, None)  # None -> path
        #       remote = rpc.connect(host, port)
        reconfig_runtime(remote)

        print('')

    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")
    remote = rpc.connect(host, port)

    dt = time.time()

    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)

    # Program the FPGA with a pre-compiled VTA bitstream.