def test_compile_multi_times_static():
    """Disabled test: compile one static graph several times.

    The function returns immediately, so the body after the ``return`` is
    unreachable; it is kept as a reference for the pending rewrite.
    """
    return  # XXX: rewrite or remove this test

    with Graph() as cg:
        cg.set_option("eager_evaluation", False)
        data = Input("data", shape=(2, 28))
        label = Input("label", shape=(2,), dtype=np.int32)

        net = MLP()
        sgd = SGD(net.parameters(requires_grad=True), lr=0.01)

        logits = net(data)
        pred = F.softmax(logits)
        loss = F.square_loss(pred, label.reshape(2, 1))
        sgd.zero_grad()
        grads = sgd.backward(loss)
        sgd.step()

        infer_fn = compile(pred, None)
        train_fn = compile([pred, loss], grads, copy=True)

        data = np.random.random((2, 28)).astype(np.float32)
        label = np.random.randint(0, 10, (2,)).astype(np.float32)
        infer_out = infer_fn(data=data)
        train_out = train_fn(data=data, label=label)
        assertTensorClose(infer_out[0], train_out[0])

        _ = compile([pred, loss], grads, copy=False)
        with pytest.raises(mgb.MegBrainError):
            infer_fn(data=data)
def test_multi_step_lr():
    """Check the learning rate set by ``MultiStepLR`` over ten iterations.

    Before each ``scheduler.step()`` call, the ``lr`` of every parameter group
    is compared with ``0.01 * 0.1 ** k``, where ``k`` is the number of
    milestones in ``[3, 6, 8]`` that are not greater than the iteration index.
    """
    milestones = (3, 6, 8)
    net = MLP()
    sgd = SGD(net.parameters(), lr=0.01, momentum=0.9)
    scheduler = MultiStepLR(sgd, list(milestones))

    base_lr = np.array(0.01, dtype=np.float32)
    for epoch in range(10):
        # Number of milestones already reached at this iteration.
        passed = bisect_right(milestones, epoch)
        expected_lr = (base_lr * 0.1**passed).astype(np.float32)
        for group in sgd.param_groups:
            actual_lr = np.array(group["lr"], dtype=np.float32)
            assertTensorClose(actual_lr, expected_lr, max_err=5e-6)
        scheduler.step()
def test_dump_model():
    """Run an ``MLP`` forward pass on random input and dump the prediction.

    The prediction is handed to ``mge.dump`` together with the name of a
    temporary file that is deleted when the ``with`` block exits.
    """
    input_shape = (2, 28)
    inp = tensor()
    inp.set_value(np.random.random(input_shape))

    net = MLP()
    output = net(inp)

    with tempfile.NamedTemporaryFile() as tmp:
        mge.dump(output, tmp.name)
def test_load_quantized():
    """Check that ``load_state_dict`` restores the weights of a quantized MLP.

    A prediction is taken, the state dict is saved and loaded through an
    in-memory buffer, the weights are re-quantized with different scales, and
    after ``load_state_dict`` the prediction must match the first one within
    ``5e-6``.
    """

    def requantize_weight(layer, scale):
        # Rebuild the layer weight as a Parameter from its qint8(scale) cast.
        layer.weight = Parameter(
            layer.weight.astype(mgb.dtype.qint8(scale)).numpy()
        )

    input_shape = (2, 28)
    inp = tensor(np.random.random(input_shape), dtype="float32")
    inp = inp.astype(mgb.dtype.qint8(0.1))

    net = MLP()
    quantize_qat(net)
    quantize(net)
    requantize_weight(net.dense0, 0.001)
    requantize_weight(net.dense1, 0.0002)
    net.eval()
    reference = net(inp)

    with BytesIO() as buf:
        mge.save(net.state_dict(), buf)
        buf.seek(0)
        checkpoint = mge.load(buf)

        # Change the weights so that the effect of the load is observable.
        requantize_weight(net.dense0, 0.00001)
        requantize_weight(net.dense1, 0.2)

        net.load_state_dict(checkpoint)
        restored = net(inp)

    assertTensorClose(
        reference.astype("float32").numpy(),
        restored.astype("float32").numpy(),
        max_err=5e-6,
    )
def test_sgd_simple():
    """Compare three eager SGD steps with weight decay against a manual update.

    With ``lr=0.01`` and ``weight_decay=0.1`` each parameter is expected to
    become ``old * 0.999 - grad * 0.01``. Gradients are cleared through the
    optimizer on odd iterations and through the module on even ones, and
    ``param.grad`` is compared with ``F.grad`` before every step.
    """
    data, data_shape, label, label_shape = get_input()
    net = MLP()
    sgd = SGD(net.parameters(), lr=0.01, weight_decay=0.1)

    for it in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        loss = F.square_loss(net(data), label.reshape(-1, 1))

        # Alternate between the two zero_grad entry points.
        grad_owner = sgd if it % 2 else net
        grad_owner.zero_grad()
        sgd.backward(loss)

        saved_grads = TensorDict()
        saved_params = TensorDict()
        for p in net.parameters():
            ref_grad = F.grad(loss, p, use_virtual_grad=False)
            assertTensorClose(ref_grad.numpy(), p.grad.numpy())
            saved_grads[p] = np.copy(ref_grad.numpy())
            saved_params[p] = np.copy(p.numpy())

        sgd.step()

        for p in net.parameters():
            expected = saved_params[p] * 0.999 - saved_grads[p] * 0.01
            assertTensorClose(p.numpy(), expected)
def test_sgd_momentum_static():
    """Compare three traced SGD-with-momentum steps against a manual update.

    A momentum buffer per parameter is tracked by hand as
    ``slot = slot * 0.9 - grad * 0.01`` and each parameter is expected to
    become ``old + slot`` after ``opt.step()``.

    The gradients used for the expected value are the copies taken *before*
    ``opt.step()``, so the check does not depend on whether the optimizer
    leaves ``param.grad`` untouched.
    """
    _, data_shape, _, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)

    # Hand-maintained momentum buffers, one per parameter, starting at zero.
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    for _ in range(3):
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )

        # Snapshot parameters and gradients before the optimizer touches them.
        orig_params = TensorDict()
        grads = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
            grads[param] = np.copy(param.grad.numpy())

        opt.step()

        for param in mlp.parameters():
            slot = slots[param]
            slot *= 0.9
            slot -= grads[param] * 0.01
            assertTensorClose(param.numpy(), orig_params[param] + slot)
def test_update_lr():
    """Check that changing ``lr`` in ``param_groups`` affects later SGD steps.

    After one step with ``lr=0.01`` the learning rate of every group is raised
    by ``0.02``; the following three steps are expected to update each
    parameter as ``old - grad * 0.03``. ``param.grad`` is also compared with
    ``F.grad`` before each of those steps.
    """
    data, data_shape, label, label_shape = get_input()
    net = MLP()
    sgd = SGD(net.parameters(), lr=0.01)

    def forward_backward():
        # One forward pass followed by zero_grad + backward; returns the loss.
        loss = F.square_loss(net(data), label.reshape(-1, 1))
        sgd.zero_grad()
        sgd.backward(loss)
        return loss

    forward_backward()
    sgd.step()

    for group in sgd.param_groups:
        group["lr"] += 0.02

    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        loss = forward_backward()

        for p in net.parameters():
            ref_grad = F.grad(loss, p, use_virtual_grad=False)
            assertTensorClose(ref_grad.numpy(), p.grad.numpy())

        before = [np.copy(p.numpy()) for p in net.parameters()]
        sgd.step()

        for p, old in zip(net.parameters(), before):
            assertTensorClose(p.numpy(), old - p.grad.numpy() * 0.03)
def test_compile_multi_times_eager():
    """Disabled test: compile one eager graph several times.

    The function returns immediately, so the body after the ``return`` is
    unreachable; it is kept as a reference for the pending rewrite.
    """
    return  # XXX: rewrite or remove this test

    data = Input("data", shape=(2, 28))
    label = Input("label", shape=(2,), dtype=np.int32)

    net = MLP()
    sgd = SGD(net.parameters(requires_grad=True), lr=0.01)

    logits = net(data)
    pred = F.softmax(logits)
    loss = F.square_loss(pred, label.reshape(2, 1))
    sgd.zero_grad()
    grads = sgd.backward(loss)
    sgd.step()

    infer_fn = compile(pred, None)
    train_fn = compile([pred, loss], grads, copy=False)

    for _ in range(3):
        data = np.random.random((2, 28)).astype(np.float32)
        label = np.random.randint(0, 10, (2,)).astype(np.float32)
        infer_out = infer_fn(data=data)
        train_out = train_fn(data=data, label=label)
        assertTensorClose(infer_out[0], train_out[0])
def _test_optimizer(opt_str, test_case, check_class, update_lr=False):
    """Run an optimizer for three eager and three traced steps, checking each.

    Args:
        opt_str: attribute name looked up on ``optimizer`` to obtain the
            optimizer class.
        test_case: keyword arguments forwarded to both the optimizer
            constructor and ``check_class``.
        check_class: called as ``check_class(net, **test_case)``; the result
            is invoked after every step as
            ``checker(old_params, net.parameters(), step)``.
        update_lr: when true, ``lr`` of every param group and of the checker
            is raised by ``0.01`` on the second iteration of each phase.
    """
    iter_num = 3
    data, data_shape, label, label_shape = get_input()

    net = MLP()
    opt = getattr(optimizer, opt_str)(net.parameters(), **test_case)
    check_func = check_class(net, **test_case)

    def maybe_raise_lr(iteration):
        # Change the learning rate on the second iteration only.
        if not (update_lr and iteration == 1):
            return
        for group in opt.param_groups:
            group["lr"] += 0.01
        check_func.lr += 0.01

    def snapshot_params():
        # Copy the current parameter values, keyed by parameter.
        saved = TensorDict()
        for p in net.parameters():
            saved[p] = np.copy(p.numpy())
        return saved

    step = 0

    # eager graph
    for i in range(iter_num):
        maybe_raise_lr(i)
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        ori_params = snapshot_params()
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)

    # static graph
    @trace
    def train_func(data, label):
        pred = net(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for i in range(iter_num):
        maybe_raise_lr(i)
        opt.zero_grad()
        ori_params = snapshot_params()
        train_func(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step += 1
        check_func(ori_params, net.parameters(), step)
def test_optimizer_serialization():
    """Round-trip an SGD optimizer state through ``save`` / ``load``.

    One step is taken with an SGD built with ``lr=0.01, momentum=0.9`` while
    the expected momentum buffers are tracked by hand. Its state dict is
    serialized to an in-memory buffer and loaded into a second SGD that was
    constructed with ``lr=0.02, momentum=0.8``. The next step of that second
    optimizer is then checked against the ``0.9`` / ``0.01`` update.
    """
    data, data_shape, label, label_shape = get_input()
    net = MLP()
    sgd = SGD(net.parameters(), lr=0.01, momentum=0.9)

    momentum_bufs = TensorDict()
    for p in net.parameters():
        momentum_bufs[p] = np.zeros(p.shape).astype(np.float32)

    def advance_momentum(p):
        # In-place update of the hand-tracked buffer; returns the buffer.
        buf = momentum_bufs[p]
        buf *= 0.9
        buf -= p.grad.numpy() * 0.01
        return buf

    loss = F.square_loss(net(data), label.reshape(-1, 1))
    sgd.zero_grad()
    sgd.backward(loss)
    sgd.step()
    for p in net.parameters():
        advance_momentum(p)

    with BytesIO() as buf:
        save(sgd.state_dict(), buf)
        buf.seek(0)
        loaded_state = load(buf)

        restored = SGD(net.parameters(), lr=0.02, momentum=0.8)
        restored.load_state_dict(loaded_state)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        loss = F.square_loss(net(data), label.reshape(-1, 1))
        restored.zero_grad()
        restored.backward(loss)

        before = TensorDict()
        for p in net.parameters():
            before[p] = np.copy(p.numpy())

        restored.step()

        for p in net.parameters():
            expected = before[p] + advance_momentum(p)
            assertTensorClose(p.numpy(), expected)
def test_pickle_module():
    """Check that a saved-and-loaded ``MLP`` predicts like the original.

    The module is round-tripped through ``mge.save`` / ``mge.load`` once
    before and once after its first forward pass; both copies must match the
    original prediction within ``5e-6``.
    """
    input_shape = (2, 28)
    inp = tensor()
    inp.set_value(np.random.random(input_shape))
    net = MLP()

    def predict_with_copy(module):
        # Save/load the module via an in-memory buffer and run the copy.
        with BytesIO() as buf:
            mge.save(module, buf)
            buf.seek(0)
            clone = mge.load(buf)
            return clone(inp)

    # pickle before forward
    pred_before = predict_with_copy(net)

    pred_original = net(inp)

    # pickle after forward
    pred_after = predict_with_copy(net)

    assertTensorClose(pred_before.numpy(), pred_original.numpy(), max_err=5e-6)
    assertTensorClose(pred_before.numpy(), pred_after.numpy(), max_err=5e-6)
def test_state_dict():
    """Exercise ``load_state_dict`` with matching, extra and missing keys.

    A saved-and-loaded state dict with an added ``"extra"`` entry must load
    with ``strict=False`` and reproduce the original prediction within
    ``5e-6``. Loading it with the default arguments, or loading a dict that
    lacks ``"dense0.bias"``, must raise ``KeyError``.
    """
    input_shape = (2, 28)
    inp = tensor()
    inp.set_value(np.random.random(input_shape))
    source_net = MLP()
    reference = source_net(inp)

    with BytesIO() as buf:
        mge.save(source_net.state_dict(), buf)
        buf.seek(0)
        loaded = mge.load(buf)

        # An unexpected key is accepted only when strict=False is passed.
        loaded["extra"] = None
        target_net = MLP()
        target_net.load_state_dict(loaded, strict=False)
        restored = target_net(inp)
        assertTensorClose(reference.numpy(), restored.numpy(), max_err=5e-6)

        with pytest.raises(KeyError):
            target_net.load_state_dict(loaded)

        # A missing key is rejected as well.
        del loaded["extra"]
        del loaded["dense0.bias"]
        with pytest.raises(KeyError):
            target_net.load_state_dict(loaded)
def test_adam():
    """Compare ``Adam`` against a hand-written update, eagerly and traced.

    For every parameter the test keeps two buffers ``m`` and ``v`` and, after
    each optimizer step number ``t``, expects::

        m = beta0 * m + (1 - beta0) * grad
        v = beta1 * v + (1 - beta1) * grad * grad
        update = (m / (1 - beta0 ** t)) / (sqrt(v / (1 - beta1 ** t)) + eps)
        param == old_param - lr * update

    Three steps are run in eager mode, then three more through a ``@trace``
    function; the step counter and the buffers carry over between the phases.
    """
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    lr = 0.01
    beta0 = 0.8
    beta1 = 0.9
    eps = 1e-4
    opt = Adam(mlp.parameters(), lr=lr, betas=(beta0, beta1), eps=eps)

    m_slots = TensorDict()
    v_slots = TensorDict()
    for param in mlp.parameters():
        m_slots[param] = np.zeros(param.shape).astype(np.float32)
        v_slots[param] = np.zeros(param.shape).astype(np.float32)

    def snapshot_params():
        # Copy the current parameter values, keyed by parameter.
        saved = TensorDict()
        for param in mlp.parameters():
            saved[param] = np.copy(param.numpy())
        return saved

    def check_value(orig_params, step_size):
        # Advance the reference buffers in place and compare every parameter.
        for param in mlp.parameters():
            grad = param.grad.numpy()
            orig_param = orig_params[param]
            m = m_slots[param]
            v = v_slots[param]
            m *= beta0
            m += (1 - beta0) * grad
            v *= beta1
            v += (1 - beta1) * grad * grad
            update = (m / (1 - beta0**step_size)) / (
                np.sqrt(v / (1 - beta1**step_size)) + eps
            )
            assertTensorClose(param.numpy(), orig_param - lr * update)

    step_size = 0

    # eager
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        orig_params = snapshot_params()
        opt.step()
        step_size += 1
        check_value(orig_params, step_size)

    # static
    @trace
    def f(data, label):
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.backward(loss)

    for _ in range(3):
        opt.zero_grad()
        orig_params = snapshot_params()
        f(
            np.random.random(data_shape).astype(np.float32),
            np.random.randint(0, 10, label_shape).astype(np.int32),
        )
        opt.step()
        step_size += 1
        check_value(orig_params, step_size)