def worker(rank, data, backend, expect, port_queue):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
    inp = Parameter(data)
    dist.functional.bcast_param(inp, "x")
    assert np.allclose(inp.numpy(), expect)
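# How a worker like the one above is typically driven in these tests: one
# process per rank, joined at the end. A minimal sketch, assuming the
# surrounding test supplies `world_size` and the remaining worker arguments
# (the helper name `launch_workers` is hypothetical):
import multiprocessing as mp

def launch_workers(worker, world_size, *args):
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, *args))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
        assert p.exitcode == 0  # a failed assert in a worker surfaces here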
def worker():
    if not mge.is_cuda_available():
        return
    port = mgb.config.create_mm_server("0.0.0.0", 0)
    assert port > 0
    res = mgb.config.create_mm_server("0.0.0.0", port)
    assert res == -1  # binding an already-used port should fail
def worker(rank, data, backend, expect, port_queue):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
    inp = tensor(data)
    output = dist.functional.all_gather(inp, "x")
    assert np.allclose(output.numpy(), expect)
def test_correctness_use_adaptive_pooling():
    if mge.is_cuda_available():
        model_name = "mnist_model_with_test.mge"
    else:
        model_name = "mnist_model_with_test_cpu.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    set_execution_strategy("HEURISTIC_REPRODUCIBLE")

    run_train(model_path, False, False, max_err=1e-5, use_adaptive_pooling=True)
    run_train(model_path, True, False, max_err=1e-5, use_adaptive_pooling=True)
    run_train(model_path, True, True, max_err=1e-5, use_adaptive_pooling=True)

    # sublinear
    config = SublinearMemoryConfig(genetic_nr_iter=10)
    run_train(
        model_path,
        True,
        True,
        sublinear_memory_config=config,
        max_err=1e-5,
        use_adaptive_pooling=True,
    )

    run_eval(model_path, False, max_err=1e-7, use_adaptive_pooling=True)
    run_eval(model_path, True, max_err=1e-7, use_adaptive_pooling=True)
def test_tensor_serialization():
    with TemporaryFile() as f:
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="cpu0", dtype=np.int32)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        # verify the loaded tensor round-trips the original data
        np.testing.assert_equal(b.numpy(), data)
        assert b.device.logical_name == "cpu0:0"
        assert b.dtype == np.int32

    with TemporaryFile() as f:
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)

    with TemporaryFile() as f:
        a = Tensor(0)
        a.qparams.scale = Tensor(1.0)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b.qparams.scale, Tensor)
        np.testing.assert_equal(b.qparams.scale.numpy(), 1.0)
def worker(rank, backend, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    assert dist.is_distributed()
    assert dist.get_master_ip() == _LOCALHOST
    assert dist.get_master_port() > 0
    assert dist.get_world_size() == world_size
    assert dist.get_rank() == rank
    assert dist.get_backend() == backend
def test_correctness():
    if mge.is_cuda_available():
        model_name = "mnist_model_with_test.mge"
    else:
        model_name = "mnist_model_with_test_cpu.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    set_conv_execution_strategy("HEURISTIC_REPRODUCIBLE")

    run_test(model_path, False, False)
    run_test(model_path, True, False)
    run_test(model_path, True, True)
def worker(rank, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    dist.group_barrier()
    if rank == 0:
        dist.group_barrier()
        q.put(0)  # to be observed in rank 1
    else:
        _assert_q_empty(q)  # q.put(0) is not executed in rank 0
        dist.group_barrier()
        _assert_q_val(q, 0)  # q.put(0) executed in rank 0
def test_tensor_serialization():
    def tensor_eq(a, b):
        assert a.dtype == b.dtype
        assert a.device == b.device
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="xpux", dtype=np.int32)
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)
def worker(rank, data, yv_expect, running_mean, running_var):
    if not mge.is_cuda_available():
        return
    dist.init_process_group("localhost", 2333, 4, rank, rank)
    bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
    data_tensor = tensor()
    for i in range(steps):
        data_tensor.set_value(data[i])
        yv = bn(data_tensor)

    # check the output of the final step and the accumulated statistics
    assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
    assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
    assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
def test_warp_affine():
    inp_shape = (1, 3, 3, 3)
    x = tensor(np.arange(27, dtype=np.float32).reshape(inp_shape))
    weightv = [[[1.26666667, 0.6, -83.33333333], [-0.33333333, 1, 66.66666667]]]
    outp = F.vision.warp_affine(x, tensor(weightv), (2, 2), border_mode="wrap")
    res = np.array(
        [
            [
                [[7.875, 8.875, 9.875], [8.90625, 9.90625, 10.90625]],
                [[18.75, 19.75, 20.75], [14.90625, 15.90625, 16.90625]],
            ]
        ],
        dtype=np.float32,
    )
    if not is_cuda_available():
        np.testing.assert_almost_equal(outp.numpy(), res, 5)
def worker(master_ip, master_port, world_size, rank, dev, trace):
    import megengine.distributed as dist
    import megengine.functional as F
    from megengine import is_cuda_available
    from megengine import jit
    from megengine.module import Linear, Module
    from megengine.optimizer import SGD

    if not is_cuda_available():
        return

    class MLP(Module):
        def __init__(self):
            super().__init__()
            self.fc0 = Linear(3 * 224 * 224, 500)
            self.fc1 = Linear(500, 10)

        def forward(self, x):
            x = self.fc0(x)
            x = F.relu(x)
            x = self.fc1(x)
            return x

    dist.init_process_group(
        master_ip=master_ip,
        master_port=master_port,
        world_size=world_size,
        rank=rank,
        dev=dev,
    )
    net = MLP()
    opt = SGD(net.parameters(requires_grad=True), lr=0.02)

    data = np.random.random((64, 3 * 224 * 224)).astype(np.float32)
    label = np.random.randint(0, 10, size=(64,)).astype(np.int32)

    jit.trace.enabled = trace

    @jit.trace()
    def train_func(data, label):
        pred = net(data)
        loss = F.cross_entropy_with_softmax(pred, label)
        opt.backward(loss)
        return loss

    for i in range(5):
        opt.zero_grad()
        loss = train_func(data, label)
        opt.step()
def worker(rank, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    dist.group_barrier()
    if rank == 0:
        func(0, q)  # q.put(0)
        q.put(2)
    else:
        _assert_q_val(q, 0)  # func executed in rank 0
        _assert_q_empty(q)  # q.put(2) is not executed
        func(1, q)
        _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
        _assert_q_val(q, 2)  # q.put(2) executed in rank 0
import json

import numpy as np

import megengine as mge
import megengine.functional as F
import megengine.hub as hub
import megengine.optimizer as optim
from megengine.jit import trace

# Run this example on a GPU.
assert mge.is_cuda_available(), "Please run with GPU"

# Load a resnet50 model from the megengine hub.
resnet = hub.load("megengine/models", "resnet50")

optimizer = optim.SGD(resnet.parameters(), lr=0.1)

# profiling=True enables collection of performance data.
@trace(symbolic=True, profiling=True)
def train_func(data, label, *, net, optimizer):
    pred = net(data)
    loss = F.cross_entropy_with_softmax(pred, label)
    optimizer.backward(loss)

resnet.train()
batch_size = 64

# Run 10 iterations; the profiling result of the last run is kept.
for i in range(10):
    batch_data = np.random.randn(batch_size, 3, 224, 224).astype(np.float32)
    batch_label = np.random.randint(0, 10, size=(batch_size,)).astype(np.int32)
    train_func(batch_data, batch_label, net=resnet, optimizer=optimizer)
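# Saving the collected performance data to disk; a minimal sketch, assuming
# the profile is read via `get_profile()` (available on MegEngine traces
# created with profiling=True) and that "profiling.json" is an acceptable
# output path:
profile = train_func.get_profile()
with open("profiling.json", "w") as f:
    json.dump(profile, f, indent=2)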
def get_xpu_name():
    if mge.is_cuda_available():
        return get_gpu_name()
    else:
        return get_cpu_name()
def main():
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    opt = optim.SGD(net.parameters(requires_grad=True), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    data = mge.tensor()
    label = mge.tensor(np.zeros((batch_size,)), dtype=np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data.set_value(minibatch["data"])
        label.set_value(minibatch["label"])
        opt.zero_grad()
        _, loss = train_fun(data, label, net=net, opt=opt)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            minibatch = next(val_dataset)
            _, loss = val_fun(data, label, net=net)
            loss = loss.numpy()[0]
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        opt.step()

    test_data = np.array(
        [
            (0.5, 0.5),
            (0.3, 0.7),
            (0.1, 0.9),
            (-0.5, -0.5),
            (-0.3, -0.7),
            (-0.9, -0.1),
            (0.5, -0.5),
            (0.3, -0.7),
            (0.9, -0.1),
            (-0.5, 0.5),
            (-0.3, 0.7),
            (-0.1, 0.9),
        ]
    )
    data.set_value(test_data)
    out = pred_fun(data, net=net)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)
    print("Test data")
    print(test_data)
    with np.printoptions(precision=4, suppress=True):
        print("Predicted probability:")
        print(pred_output)
    print("Predicted label")
    print(pred_label)

    model_name = "xornet_deploy.mge"
    if pred_fun.enabled:
        print("Dump model as {}".format(model_name))
        pred_fun.dump(model_name, arg_names=["data"])
    else:
        print("pred_fun must be run with trace enabled in order to dump model")
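# The script above references `train_fun`, `val_fun`, and `pred_fun` as free
# names. A minimal sketch of how they could be defined under the old trace
# API used here (the exact bodies and signatures are assumptions, not the
# original source):
import megengine.functional as F
from megengine.jit import trace

@trace(symbolic=True)
def train_fun(data, label, *, net, opt):
    net.train()
    pred = net(data)
    loss = F.cross_entropy_with_softmax(pred, label)
    opt.backward(loss)
    return pred, loss

@trace(symbolic=True)
def val_fun(data, label, *, net):
    net.eval()
    pred = net(data)
    loss = F.cross_entropy_with_softmax(pred, label)
    return pred, loss

@trace(symbolic=True)
def pred_fun(data, *, net):
    net.eval()
    pred = net(data)
    return F.softmax(pred)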
def run(
    N, IC, OC, IH, IW, KH, KW, PH, PW, SH, SW,
    has_bias=True,
    nonlinear_mode="IDENTITY",
):
    inp_v = np.random.normal(size=(N, IC, IH, IW))
    w_v = np.random.normal(size=(OC, IC, KH, KW))
    b_v = np.random.normal(size=(1, OC, 1, 1))
    inp_scale = mgb.dtype.get_scale(inp_dtype)
    w_scale = mgb.dtype.get_scale(w_dtype)
    b_scale = mgb.dtype.get_scale(b_dtype)

    inpv = mgb.dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
    wv = mgb.dtype.convert_to_qint8(w_v * w_scale, w_dtype)
    bv = mgb.dtype.convert_to_qint32(b_v * b_scale, b_dtype)

    inp_int8 = tensor(inpv, dtype=inp_dtype)
    w_int8 = Parameter(wv, dtype=w_dtype)
    b_int32 = Parameter(bv, dtype=b_dtype)

    inp_fp32 = inp_int8.astype("float32")
    w_fp32 = w_int8.astype("float32")
    b_fp32 = b_int32.astype("float32")

    jit.trace.enabled = True
    b_symbolic = True

    def convert_to_nchw4(var):
        return var.reshape(
            var.shapeof(0), var.shapeof(1) // 4, 4, var.shapeof(2), var.shapeof(3)
        ).dimshuffle(0, 1, 3, 4, 2)

    @jit.trace(symbolic=b_symbolic)
    def run_conv2d(inp, w, b):
        O = F.conv2d(
            inp, w, b if has_bias else None, stride=(SH, SW), padding=(PH, PW),
        )
        if nonlinear_mode == "RELU":
            return F.relu(O)
        else:
            return O

    @jit.trace(symbolic=b_symbolic)
    def run_conv_bias(inp, w, b, format="NCHW"):
        b = b if has_bias else np.zeros_like(b)
        if format == "NCHW4":
            inp = convert_to_nchw4(inp)
            w = convert_to_nchw4(w)
            b = F.flatten(b)
        return F.conv_bias_activation(
            inp,
            w,
            b,
            stride=(SH, SW),
            padding=(PH, PW),
            dtype=out_dtype,
            nonlinear_mode=nonlinear_mode,
        )

    format = "NCHW4" if is_cuda_available() else "NCHW"

    expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
    expected = expected.astype(out_dtype).astype("float32")
    result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype("float32")
    if format == "NCHW4":
        result = result.dimshuffle(0, 1, 4, 2, 3)
    expected = F.flatten(expected)
    result = F.flatten(result)
    assertTensorClose(result.numpy(), expected.numpy())
def run(
    N, IC, OC, IH, IW, KH, KW, PH, PW, SH, SW,
    has_bias=True,
    nonlinear_mode="identity",
):
    inp_v = np.random.normal(size=(N, IC, IH, IW))
    w_v = np.random.normal(size=(OC, IC, KH, KW))
    b_v = np.random.normal(size=(1, OC, 1, 1))
    inp_scale = dtype.get_scale(inp_dtype)
    w_scale = dtype.get_scale(w_dtype)
    b_scale = dtype.get_scale(b_dtype)

    inpv = dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
    wv = dtype.convert_to_qint8(w_v * w_scale, w_dtype)
    bv = dtype.convert_to_qint32(b_v * b_scale, b_dtype)

    inp_int8 = tensor(inpv, dtype=inp_dtype)
    w_int8 = Parameter(wv, dtype=w_dtype)
    b_int32 = Parameter(bv, dtype=b_dtype)

    inp_fp32 = inp_int8.astype("float32")
    w_fp32 = w_int8.astype("float32")
    b_fp32 = b_int32.astype("float32")

    def convert_to_nchw4(var):
        var = F.reshape(
            var, (var.shape[0], var.shape[1] // 4, 4, var.shape[2], var.shape[3])
        )
        var = F.transpose(var, (0, 1, 3, 4, 2))
        return var

    def run_conv2d(inp, w, b):
        O = F.conv2d(
            inp, w, b if has_bias else None, stride=(SH, SW), padding=(PH, PW),
        )
        if nonlinear_mode == "relu":
            return F.relu(O)
        else:
            return O

    def run_conv_bias(inp, w, b, format="NCHW"):
        b = b if has_bias else Parameter(np.zeros_like(b.numpy()))
        if format == "NCHW4":
            inp = convert_to_nchw4(inp)
            w = convert_to_nchw4(w)
            b = convert_to_nchw4(b)
        return F.quantized.conv_bias_activation(
            inp,
            w,
            b,
            stride=(SH, SW),
            padding=(PH, PW),
            dtype=out_dtype,
            nonlinear_mode=nonlinear_mode,
        )

    format = "NCHW4" if is_cuda_available() else "NCHW"

    expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
    expected = expected.astype(out_dtype).astype("float32")
    result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype("float32")
    if format == "NCHW4":
        result = F.transpose(result, (0, 1, 4, 2, 3))
    expected = F.flatten(expected)
    result = F.flatten(result)
    np.testing.assert_allclose(result.numpy(), expected.numpy(), atol=outp_scale)
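# The closure above reads `inp_dtype`, `w_dtype`, `b_dtype`, `out_dtype`, and
# `outp_scale` from the enclosing scope. A minimal sketch of how such
# quantized dtypes are built with MegEngine's dtype helpers (the concrete
# scale values here are assumptions for illustration):
from megengine.core.tensor import dtype

inp_scale = 0.05
w_scale = 0.04
outp_scale = 0.06
inp_dtype = dtype.qint8(inp_scale)
w_dtype = dtype.qint8(w_scale)
b_dtype = dtype.qint32(inp_scale * w_scale)  # bias scale = input scale * weight scale
out_dtype = dtype.qint8(outp_scale)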
def main():
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    gm = ad.GradManager().attach(net.parameters())
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    def train_fun(data, label):
        opt.clear_grad()
        with gm:
            pred = net(data)
            loss = F.loss.cross_entropy(pred, label)
            gm.backward(loss)
        opt.step()
        return pred, loss

    def val_fun(data, label):
        pred = net(data)
        loss = F.loss.cross_entropy(pred, label)
        return pred, loss

    @trace(symbolic=True, capture_as_const=True)
    def pred_fun(data):
        pred = net(data)
        pred_normalized = F.softmax(pred)
        return pred_normalized

    data = np.random.random((batch_size, 2)).astype(np.float32)
    label = np.zeros((batch_size,)).astype(np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data = mge.tensor(minibatch["data"])
        label = mge.tensor(minibatch["label"])
        net.train()
        # train_fun already clears gradients and steps the optimizer
        _, loss = train_fun(data, label)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            minibatch = next(val_dataset)
            net.eval()
            _, loss = val_fun(data, label)
            loss = loss.numpy()
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))

    test_data = np.array(
        [
            (0.5, 0.5),
            (0.3, 0.7),
            (0.1, 0.9),
            (-0.5, -0.5),
            (-0.3, -0.7),
            (-0.9, -0.1),
            (0.5, -0.5),
            (0.3, -0.7),
            (0.9, -0.1),
            (-0.5, 0.5),
            (-0.3, 0.7),
            (-0.1, 0.9),
        ]
    )
    # tracing only accepts tensor as input
    data = mge.tensor(test_data, dtype=np.float32)
    net.eval()
    out = pred_fun(data)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)
    print("Test data")
    print(test_data)
    with np.printoptions(precision=4, suppress=True):
        print("Predicted probability:")
        print(pred_output)
    print("Predicted label")
    print(pred_label)

    model_name = "xornet_deploy.mge"
    print("Dump model as {}".format(model_name))
    pred_fun.dump(model_name, arg_names=["data"])

    model_with_testcase_name = "xornet_with_testcase.mge"
    print("Dump model with testcase as {}".format(model_with_testcase_name))
    pred_fun.dump(
        model_with_testcase_name,
        arg_names=["data"],
        input_data=["#rand(0.1, 0.8, 4, 2)"],
    )