Example #1
    def wrapper():
        cur_dir_path = os.path.dirname(os.path.abspath(__file__))
        build_root_dir = custom_op_tools._get_default_build_root()
        build_path = os.path.join(build_root_dir, "custom_opsrc", "build")

        if os.path.exists(build_path):
            shutil.rmtree(build_path)

        mgb_root_path = os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.dirname(
                    os.path.dirname(cur_dir_path)))))
        extra_include_paths = [
            os.path.join(mgb_root_path, "src", "custom", "include")
        ]
        extra_ld_flags = []

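        # on non-Windows platforms, scan LD_LIBRARY_PATH for the directory that
        # contains the megengine_shared library and link against it with an rpath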
        if sys.platform != "win32":
            ld_path = os.environ.get("LD_LIBRARY_PATH")
            if ld_path is not None:
                ld_dirs = ld_path.split(":")
                for ld_dir in ld_dirs:
                    if os.path.exists(ld_dir) and os.path.isdir(ld_dir):
                        for lib in os.listdir(ld_dir):
                            if "megengine_shared" in lib:
                                extra_ld_flags += [
                                    "-L{} -Wl,-rpath,{}".format(
                                        ld_dir, ld_dir)
                                ]
                                break

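        # build the CUDA sources only when a GPU is present; otherwise fall back
        # to the CPU-only elem_add kernel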
        if get_device_count("gpu") > 0:
            custom_opsrc = [
                os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"),
                os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"),
            ]
        else:
            custom_opsrc = [
                os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")
            ]

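        # compile and load the custom op library, run the test, then unload it;
        # the build directory is removed again no matter how the test ends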
        try:
            lib_path = custom_op_tools.build_and_load(
                "test_op",
                custom_opsrc,
                extra_include_paths=extra_include_paths,
                extra_ldflags=extra_ld_flags,
                build_dir=build_path,
                verbose=False,
            )
            test_func()
            custom.unload(lib_path)

        finally:
            if os.path.exists(build_path):
                shutil.rmtree(build_path)
Example #2
def test_sync_min_max_observer():
    word_size = get_device_count("gpu")
    x = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()

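    # each worker observes a disjoint slice of x, so the synchronized
    # observer should recover the global min and max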
    @dist.launcher
    def worker():
        rank = dist.get_rank()
        m = SyncMinMaxObserver()
        y = mge.tensor(x[rank * 3:(rank + 1) * 3])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    worker()
Example #3
def test_sync_exponential_moving_average_observer():
    word_size = get_device_count("gpu")
    t = np.random.rand()
    x1 = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
    x2 = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
    expected_min = x1.min() * t + x2.min() * (1 - t)
    expected_max = x1.max() * t + x2.max() * (1 - t)

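    # with momentum=t, two updates give t * stats(x1) + (1 - t) * stats(x2),
    # matching expected_min and expected_max computed above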
    @dist.launcher
    def worker():
        rank = dist.get_rank()
        m = SyncExponentialMovingAverageObserver(momentum=t)
        y1 = mge.tensor(x1[rank * 3:(rank + 1) * 3])
        y2 = mge.tensor(x2[rank * 3:(rank + 1) * 3])
        m(y1)
        m(y2)
        np.testing.assert_allclose(m.min_val.numpy(), expected_min, atol=1e-6)
        np.testing.assert_allclose(m.max_val.numpy(), expected_max, atol=1e-6)

    worker()
Example #4
def run_model(args, graph, inputs, outputs, data):
    # must use graph_opt_level 0 to avoid unintended operator modification
    graph.options.graph_opt_level = 0

    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info("  {}: {}".format(k, v.shape))

    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))

    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)

    # embedding inputs must come last, to avoid constant folding
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)

    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))

    outputs, output_dict = tools.convert_outputs(outputs)

    if args.profile:
        profiler = tools.GraphProfiler(graph)

    func = graph.compile(outputs)

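    # a single execution: feed the inputs (unless they were embedded into the
    # graph), run the compiled function, and read back every output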
    def run():
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        logger.info("warming up")
        run()

    total_time = 0

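    # timed iterations; average speed is reported in samples/s when a "data"
    # input exists, otherwise in batches/s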
    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time

        avg_speed = (i + 1) / total_time
        if "data" in data:
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)

        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        if logger.level > logging.INFO:
            print(msg)
        else:
            logger.info(msg)

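    # optionally wrap one extra execution in a CUDA profiler range (needs pycuda)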
    if args.focused_nvprof:
        if get_device_count("gpu") < 1:
            logger.warning(
                "No cuda device detected. ``focused_nvprof`` will be ignored."
            )
        else:
            try:
                import pycuda.driver as D

                D.start_profiler()
                func.execute()
                func.wait()
                D.stop_profiler()
            except ImportError:
                logger.error("`focused_nvprof` needs pycuda", exc_info=True)

    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())

    return avg_speed
Example #5
import numpy as np
import pytest

import megengine as mge
import megengine.functional as F
from megengine.device import get_device_count
from megengine.module import LSTM, RNN, LSTMCell, RNNCell


def assert_tuple_equal(src, ref):
    assert len(src) == len(ref)
    for i, j in zip(src, ref):
        assert i == j


@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda")
@pytest.mark.parametrize(
    "batch_size, input_size, hidden_size, init_hidden",
    [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)],
)
def test_rnn_cell(batch_size, input_size, hidden_size, init_hidden):
    rnn_cell = RNNCell(input_size, hidden_size)
    x = mge.random.normal(size=(batch_size, input_size))
    if init_hidden:
        h = F.zeros(shape=(batch_size, hidden_size))
    else:
        h = None
    h_new = rnn_cell(x, h)
    assert_tuple_equal(h_new.shape, (batch_size, hidden_size))

Example #6
def _get_device_count_worker(queue, device_type):
    num = get_device_count(device_type)
    queue.put(num)
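
The worker above is only half of the pattern. A minimal driver sketch follows, assuming the standard multiprocessing module; the helper name get_device_count_in_subprocess is illustrative and not part of any MegEngine API:

import multiprocessing as mp

from megengine.device import get_device_count


def _get_device_count_worker(queue, device_type):
    num = get_device_count(device_type)
    queue.put(num)


def get_device_count_in_subprocess(device_type):
    # run the query in a fresh process so the parent process itself
    # never has to initialize the device runtime
    queue = mp.Queue()
    proc = mp.Process(target=_get_device_count_worker,
                      args=(queue, device_type))
    proc.start()
    num = queue.get()
    proc.join()
    return num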
Example #7
        result = fn(inp, scores=scores)
        np.testing.assert_equal(result.numpy(),
                                np.array([2, 1, 3], dtype=np.int32))

    x = np.array(
        [],
        dtype=np.float32,
    ).reshape(0, 4)
    inp = tensor(x)
    scores = tensor([], dtype=np.float32)
    for _ in range(3):
        result = fn(inp, scores=scores)
        np.testing.assert_equal(result.numpy(), np.array([], dtype=np.int32))


@pytest.mark.skipif(get_device_count("gpu") > 0,
                    reason="cuda does not support nchw int8")
def test_conv_bias():
    inp_scale = 1.5
    w_scale = 2.5
    outp_scale = 1.5
    inp_dtype = dtype.qint8(inp_scale)
    w_dtype = dtype.qint8(w_scale)
    b_dtype = dtype.qint32(inp_scale * w_scale)
    out_dtype = dtype.qint8(outp_scale)

    def run(
        N,
        IC,
        OC,
        IH,
Example #8
    BetaRNG,
    GammaRNG,
    GaussianRNG,
    PermutationRNG,
    PoissonRNG,
    UniformRNG,
)
from megengine.device import get_device_count
from megengine.jit import trace
from megengine.random import RNG
from megengine.random import seed as set_global_seed
from megengine.random import uniform


@pytest.mark.skipif(
    get_device_count("xpu") <= 2,
    reason="requires more than 2 xpu devices",
)
def test_gaussian_op():
    # FIXME: remove this sync
    mge.core.set_option("async_level", 0)
    set_global_seed(1024)
    shape = (
        8,
        9,
        11,
        12,
    )
    shape = Tensor(shape, dtype="int32")
    op = GaussianRNG(seed=get_global_rng_seed(),
                     mean=1.0,
Example #9
import os
import platform
import sys

import pytest

import megengine.functional
import megengine.module
from megengine import Parameter
from megengine.core._imperative_rt.core2 import sync
from megengine.device import get_device_count
from megengine.jit import trace as _trace
from megengine.module import Linear, Module

sys.path.append(os.path.join(os.path.dirname(__file__), "helpers"))

_ngpu = get_device_count("gpu")


@pytest.fixture(autouse=True)
def skip_by_ngpu(request):
    if request.node.get_closest_marker("require_ngpu"):
        require_ngpu = int(
            request.node.get_closest_marker("require_ngpu").args[0])
        if require_ngpu > _ngpu:
            pytest.skip(
                "skipped for ngpu unsatisfied: {}".format(require_ngpu))

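A hedged usage sketch of the require_ngpu marker consumed by the fixture above; the test name and body are illustrative only:

@pytest.mark.require_ngpu(2)
def test_needs_two_gpus():
    # skipped automatically by skip_by_ngpu when fewer than 2 GPUs are available
    ...
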

@pytest.fixture(autouse=True)
def skip_distributed(request):
    if request.node.get_closest_marker("distributed_isolated"):