def wrapper():
    """Build the custom-op sources, run the wrapped test, then clean up.

    Presumably the closure of a decorator: ``test_func`` is taken from the
    enclosing scope (not visible here) — TODO confirm against the decorator.
    Side effects: creates and removes ``<build_root>/custom_opsrc/build`` on
    disk, and loads/unloads a compiled custom-op library.
    """
    cur_dir_path = os.path.dirname(os.path.abspath(__file__))
    build_root_dir = custom_op_tools._get_default_build_root()
    build_path = os.path.join(build_root_dir, "custom_opsrc", "build")
    # Start from a clean build directory so stale artifacts never leak in.
    if os.path.exists(build_path):
        shutil.rmtree(build_path)
    # Five levels up from this file is the repository root — TODO confirm if
    # the test file ever moves.
    mgb_root_path = os.path.dirname(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(cur_dir_path)))
        )
    )
    extra_include_paths = [os.path.join(mgb_root_path, "src", "custom", "include")]
    extra_ld_flags = []
    if sys.platform != "win32":
        # Point the linker (and rpath) at whichever LD_LIBRARY_PATH entry
        # actually contains the megengine shared library.
        ld_path = os.environ.get("LD_LIBRARY_PATH")
        if ld_path is not None:  # fixed: was `!= None` (PEP 8: use `is not None`)
            ld_dirs = ld_path.split(":")
            for ld_dir in ld_dirs:
                if os.path.exists(ld_dir) and os.path.isdir(ld_dir):
                    for lib in os.listdir(ld_dir):
                        if "megengine_shared" in lib:
                            extra_ld_flags += [
                                "-L{} -Wl,-rpath,{}".format(ld_dir, ld_dir)
                            ]
                            break
    # CUDA sources are only compiled when a GPU is actually present.
    if get_device_count("gpu") > 0:
        custom_opsrc = [
            os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cpp"),
            os.path.join(cur_dir_path, "custom_opsrc", "matmul_scale.cu"),
        ]
    else:
        custom_opsrc = [os.path.join(cur_dir_path, "custom_opsrc", "elem_add.cpp")]
    try:
        lib_path = custom_op_tools.build_and_load(
            "test_op",
            custom_opsrc,
            extra_include_paths=extra_include_paths,
            extra_ldflags=extra_ld_flags,
            build_dir=build_path,
            verbose=False,
        )
        test_func()
        custom.unload(lib_path)
    finally:
        # Always remove the build directory, even if build/test failed.
        if os.path.exists(build_path):
            shutil.rmtree(build_path)
def test_sync_min_max_observer():
    """Every rank sees only its shard, yet the synced observer must report
    the global min/max of the full tensor."""
    world = get_device_count("gpu")
    data = np.random.rand(3 * world, 3, 3, 3).astype("float32")
    np_min, np_max = data.min(), data.max()

    @dist.launcher
    def worker():
        rank = dist.get_rank()
        lo = rank * 3
        # Each rank feeds its own 3-sample slice of the shared array.
        shard = mge.tensor(data[lo:lo + 3])
        observer = SyncMinMaxObserver()
        observer(shard)
        assert observer.min_val == np_min and observer.max_val == np_max

    worker()
def test_sync_exponential_moving_average_observer():
    """After two batches, the EMA observer's stats should equal
    first*t + second*(1-t), aggregated across all ranks."""
    world = get_device_count("gpu")
    # NB: keep the RNG call order (t, then x1, then x2) — the workers close
    # over these arrays.
    t = np.random.rand()
    batch_a = np.random.rand(3 * world, 3, 3, 3).astype("float32")
    batch_b = np.random.rand(3 * world, 3, 3, 3).astype("float32")
    want_min = batch_a.min() * t + batch_b.min() * (1 - t)
    want_max = batch_a.max() * t + batch_b.max() * (1 - t)

    @dist.launcher
    def worker():
        rank = dist.get_rank()
        observer = SyncExponentialMovingAverageObserver(momentum=t)
        lo, hi = rank * 3, (rank + 1) * 3
        for batch in (batch_a, batch_b):
            observer(mge.tensor(batch[lo:hi]))
        np.testing.assert_allclose(observer.min_val.numpy(), want_min, atol=1e-6)
        np.testing.assert_allclose(observer.max_val.numpy(), want_max, atol=1e-6)

    worker()
def run_model(args, graph, inputs, outputs, data):
    """Compile ``outputs`` from ``graph`` and benchmark ``args.iter`` runs.

    Feeds ``data`` (name -> ndarray) as inputs, logs per-iteration timing, and
    returns the average speed of the final iteration (samples/s when a "data"
    entry exists, otherwise batches/s). Optionally dumps a C++ model, profiles
    the graph, or captures a focused nvprof trace, depending on ``args``.
    """
    # must use level0 to avoid unintended opr modification
    graph.options.graph_opt_level = 0
    logger.info("input tensors: ")
    for k, v in data.items():
        logger.info(" {}: {}".format(k, v.shape))
    G.modify_opr_algo_strategy_inplace(outputs, get_execution_strategy(args))
    if args.optimize_for_inference:
        opt_kwargs = get_opt_kwargs(args)
        outputs = G.optimize_for_inference(outputs, **opt_kwargs)
    # embed inputs must be on the last, to avoid const fold
    if args.embed_input:
        outputs, inp_dict = tools.embed_inputs(outputs, data.values(), inputs=inputs)
    else:
        outputs, inp_dict = tools.convert_inputs(outputs, inputs=inputs)
    if args.dump_cpp_model:
        dump_content, _ = G.dump_graph(outputs, keep_var_name=2)
        with open(args.dump_cpp_model, "wb") as file:
            file.write(dump_content)
        logger.info("C++ model written to {}".format(args.dump_cpp_model))
    outputs, output_dict = tools.convert_outputs(outputs)
    if args.profile:
        # Profiler must be attached before graph.compile() — TODO confirm.
        profiler = tools.GraphProfiler(graph)
    func = graph.compile(outputs)

    def run():
        """Execute one iteration and return output arrays (closure over func)."""
        # When inputs are embedded as constants there is nothing to refresh.
        if not args.embed_input:
            for key in inp_dict:
                inp_dict[key].set_value(mge.Tensor(data[key])._dev_tensor())
        func.execute()
        func.wait()
        return [oup_node.get_value().numpy() for oup_node in output_dict.values()]

    if args.warm_up:
        # One untimed run so lazy compilation/allocation doesn't skew iter 0.
        logger.info("warming up")
        run()
    total_time = 0
    for i in range(args.iter):
        logger.info("iter {}".format(i))
        start_time = time.time()
        retval = run()
        cur_time = time.time() - start_time
        total_time += cur_time
        avg_speed = (i + 1) / total_time
        if "data" in data:
            # Scale by batch size so speed is reported per sample.
            avg_speed *= data["data"].shape[0]
            avg_speed_txt = "{:.3f}sample/s".format(avg_speed)
        else:
            avg_speed_txt = "{:.3f}batch/s".format(avg_speed)
        msg = (
            "iter {}: duration={:.4f}({:.4f})s average={:.4f}s "
            "avg_speed={} time={:.4f}s"
        ).format(
            i,
            cur_time,
            func.get_prev_exec_time(),
            total_time / (i + 1),
            avg_speed_txt,
            total_time,
        )
        if args.calc_output_rms:
            rms = []
            for v in retval:
                rms.append("{:.3g}".format(float(((v ** 2).mean()) ** 0.5)))
            msg += " output_rms=[{}]".format(", ".join(rms))
        if logger.level > logging.INFO:
            # Logger is squelched above INFO; fall back to stdout so the
            # per-iteration report is still visible.
            print(msg)
        else:
            logger.info(msg)
        if args.focused_nvprof:
            if get_device_count("gpu") < 1:
                logger.warning(
                    "No cuda device detected. ``focused_nvprof`` will be ignored."
                )
            else:
                try:
                    import pycuda.driver as D
                    # Bracket a single execution so nvprof captures only it.
                    D.start_profiler()
                    func.execute()
                    func.wait()
                    D.stop_profiler()
                except ImportError:
                    logger.error("`focused_nvprof need pycuda`", exc_info=True)
    if args.profile:
        with open(args.profile, "w") as fout:
            fout.write(profiler.get())
    return avg_speed
import numpy as np
import pytest

import megengine as mge
import megengine.functional as F
from megengine.device import get_device_count
from megengine.module import LSTM, RNN, LSTMCell, RNNCell


def assert_tuple_equal(src, ref):
    """Assert two sequences have equal length and pairwise-equal elements."""
    assert len(src) == len(ref)
    for i, j in zip(src, ref):
        assert i == j


@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no algorithm on cuda")
@pytest.mark.parametrize(
    "batch_size, input_size, hidden_size, init_hidden",
    [(3, 10, 20, True), (3, 10, 20, False), (1, 10, 20, False)],
)
def test_rnn_cell(batch_size, input_size, hidden_size, init_hidden):
    """RNNCell must output shape (batch, hidden) with or without an
    explicit initial hidden state."""
    rnn_cell = RNNCell(input_size, hidden_size)
    x = mge.random.normal(size=(batch_size, input_size))
    if init_hidden:
        h = F.zeros(shape=(batch_size, hidden_size))
    else:
        # No initial state supplied; the cell handles h=None itself.
        h = None
    h_new = rnn_cell(x, h)
    assert_tuple_equal(h_new.shape, (batch_size, hidden_size))
def _get_device_count_worker(queue, device_type):
    """Query the count of *device_type* devices and report it via *queue*.

    Intended to run in a child process; the integer result travels back to
    the parent through the shared queue.
    """
    queue.put(get_device_count(device_type))
result = fn(inp, scores=scores) np.testing.assert_equal(result.numpy(), np.array([2, 1, 3], dtype=np.int32)) x = np.array( [], dtype=np.float32, ).reshape(0, 4) inp = tensor(x) scores = tensor([], dtype=np.float32) for _ in range(3): result = fn(inp, scores=scores) np.testing.assert_equal(result.numpy(), np.array([], dtype=np.int32)) @pytest.mark.skipif(get_device_count("gpu") > 0, reason="cuda does not support nchw int8") def test_conv_bias(): inp_scale = 1.5 w_scale = 2.5 outp_scale = 1.5 inp_dtype = dtype.qint8(inp_scale) w_dtype = dtype.qint8(w_scale) b_dtype = dtype.qint32(inp_scale * w_scale) out_dtype = dtype.qint8(outp_scale) def run( N, IC, OC, IH,
BetaRNG, GammaRNG, GaussianRNG, PermutationRNG, PoissonRNG, UniformRNG, ) from megengine.device import get_device_count from megengine.jit import trace from megengine.random import RNG from megengine.random import seed as set_global_seed from megengine.random import uniform @pytest.mark.skipif( get_device_count("xpu") <= 2, reason="xpu counts need > 2", ) def test_gaussian_op(): # FIXME: remove this sync mge.core.set_option("async_level", 0) set_global_seed(1024) shape = ( 8, 9, 11, 12, ) shape = Tensor(shape, dtype="int32") op = GaussianRNG(seed=get_global_rng_seed(), mean=1.0,
import platform import sys import pytest import megengine.functional import megengine.module from megengine import Parameter from megengine.core._imperative_rt.core2 import sync from megengine.device import get_device_count from megengine.jit import trace as _trace from megengine.module import Linear, Module sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) _ngpu = get_device_count("gpu") @pytest.fixture(autouse=True) def skip_by_ngpu(request): if request.node.get_closest_marker("require_ngpu"): require_ngpu = int( request.node.get_closest_marker("require_ngpu").args[0]) if require_ngpu > _ngpu: pytest.skip( "skipped for ngpu unsatisfied: {}".format(require_ngpu)) @pytest.fixture(autouse=True) def skip_distributed(request): if request.node.get_closest_marker("distributed_isolated"):