def run(use_trace, symbolic):
    a = tensor(np.array([1926.0817], dtype=np.float32))
    net = Sigmoid()
    func_run = run_saved_context
    if use_trace:
        func_run = trace(run_saved_context, symbolic=symbolic)
    s = func_run(a, net=net)
    s2 = F.sigmoid(a)
    assertTensorClose(s.numpy(), s2.numpy())
    assertTensorClose(
        F.grad(s, a, use_virtual_grad=False).numpy(),
        F.grad(s2, a, use_virtual_grad=False).numpy(),
    )
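# `run` above assumes a module-level `Sigmoid` Function and a
# `run_saved_context` entry point that are not shown in this section. A
# minimal sketch of what they could look like (the names and signatures are
# assumptions, mirroring test_save_context below):


class Sigmoid(Function):
    def forward(self, x):
        y = 1 / (1 + F.exp(-x))
        self.save_for_backward(y)
        return y

    def backward(self, grad_y):
        (y,) = self.saved_tensors
        return grad_y * y * (1 - y)


def run_saved_context(x, *, net):
    # Simply apply the wrapped Function so the whole call can be traced.
    return net(x)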
def test_sgd_simple():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, weight_decay=0.1)
    for idx in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        if idx % 2:
            opt.zero_grad()
        else:
            mlp.zero_grad()
        opt.backward(loss)
        grads = TensorDict()
        orig_params = TensorDict()
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
            grads[param] = np.copy(grad.numpy())
            orig_params[param] = np.copy(param.numpy())
        opt.step()
        for param in mlp.parameters():
            assertTensorClose(
                param.numpy(), orig_params[param] * 0.999 - grads[param] * 0.01
            )
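# The constants in the final check follow from SGD with L2 weight decay folded
# into the gradient: w' = w - lr * (g + wd * w) = (1 - lr * wd) * w - lr * g.
# With lr=0.01 and weight_decay=0.1 this gives w' = 0.999 * w - 0.01 * g,
# which is exactly what the assertTensorClose above verifies. A plain NumPy
# sanity check of the same arithmetic (independent of the framework):


def _check_sgd_update_arithmetic():
    lr, wd = 0.01, 0.1
    w, g = 2.0, 0.5
    assert abs((w - lr * (g + wd * w)) - (w * 0.999 - g * 0.01)) < 1e-12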
def test_update_lr():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01)
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    # Bump the learning rate from 0.01 to 0.03 on every parameter group; the
    # final check below expects updates with the new rate.
    for group in opt.param_groups:
        group["lr"] += 0.02
    for _ in range(3):
        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt.zero_grad()
        opt.backward(loss)
        for param in mlp.parameters():
            grad = F.grad(loss, param, use_virtual_grad=False)
            assertTensorClose(grad.numpy(), param.grad.numpy())
        orig_params = []
        for param in mlp.parameters():
            orig_params.append(np.copy(param.numpy()))
        opt.step()
        for param, orig_param in zip(mlp.parameters(), orig_params):
            assertTensorClose(param.numpy(), orig_param - param.grad.numpy() * 0.03)
def test_Clone_model():
    # The new parameters must be cloned into another model before training
    # can continue.
    train_loader = build_dataloader()
    image_support = meg.tensor(dtype='float32')
    label_support = meg.tensor(dtype='int32')
    model = OmniglotFC(28 * 28, 5)
    model.train()
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(model.parameters(), lr=0.05)
    iters = iter(train_loader)
    (images_support, labels_support, images_query, labels_query) = next(iters)
    i = 0
    image_support.set_value(images_support[i])
    label_support.set_value(labels_support[i])
    image_support = F.remove_axis(image_support, 1)
    label_support = F.remove_axis(label_support, 1)
    support_out = model.forward(image_support)
    support_loss = loss_fn(support_out, label_support)
    # Update only the parameters that require gradients.
    params = list(model.parameters(requires_grad=True))
    params[0] = meg.tensor(np.ones((5)), dtype='float32')
    grads = F.grad(support_loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
def train_func3(x1, y1, x2, y2, *, loss_fn, opt, net, params):
    loss = loss_fn(net(x1, weights=params), y1)
    grads = F.grad(loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
    # forward twice
    loss2 = loss_fn(net(x2, weights=fast_weights), y2)
    opt.backward(loss2)
def test_grad_twice():
    # model define
    model = M.Sequential(M.Linear(10, 20), M.Linear(20, 10), M.Linear(10, 5))
    model.train()
    named_param = dict(list(model.named_parameters(requires_grad=True)))
    named_module = dict(list(model.named_children()))
    name_keys = list(named_param.keys())
    params = list(named_param.values())
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(params, lr=0.003)

    # forward once
    optimizer.zero_grad()
    x1 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y1 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    loss = loss_fn(model(x1), y1)
    grads = F.grad(loss, params, use_virtual_grad=False,
                   return_zero_for_nodep=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
    # manually update the params
    replace_parameter(named_module, dict(zip(name_keys, fast_weights)))

    # forward twice
    x2 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y2 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    loss2 = loss_fn(model(x2), y2)
    # got an error here previously: restore the original parameters before
    # the second backward
    replace_parameter(named_module, named_param)
    optimizer.backward(loss2)
    optimizer.step()
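# `replace_parameter` is an external helper not shown in this section. From
# how it is called above, a plausible sketch (an assumption, not the actual
# implementation) splits each parameter name into its owning child module and
# attribute name, then rebinds the attribute:


def replace_parameter(named_module, named_tensor):
    for name, value in named_tensor.items():
        mod_name, attr = name.rsplit(".", 1)
        setattr(named_module[mod_name], attr, value)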
def test_maml_update_var():
    model = OmniglotFC(28 * 28, 5)
    model.train()
    loss_fn = F.cross_entropy_with_softmax
    old_params = list(model.parameters())
    maml = MAML(model)
    params = list(maml.named_parameters.values())
    optimizer = optim.SGD(old_params, lr=0.05)
    optimizer.zero_grad()
    support_out = model.forward(
        meg.tensor(np.random.randn(5, 28 * 28), dtype='float32'))
    support_loss = loss_fn(
        support_out, meg.tensor(np.random.randint(0, 5, (5)), dtype='int32'))
    optimizer.backward(support_loss)
    optimizer.step()
    assert id(old_params[0]) == id(params[0])
    # manually update
    grads = F.grad(support_loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
    named_update = dict(zip(maml.named_parameters.keys(), fast_weights))
    named_old = dict(zip(maml.named_parameters.keys(), old_params))
    maml.replace_parameter(maml.module_table, named_update)
    # Once replaced with the new values, the old parameters can no longer be
    # found via model.parameters().
    after_params = list(model.parameters())
    # The bare expressions below only exercise attribute/key access on the
    # replaced parameters; nothing is asserted.
    maml.module_table['classifier'].bias
    named_update['classifier.bias']
    mods = list(model.modules())
    mods[1].bias
    maml.replace_parameter(maml.module_table, named_old)
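# `MAML` is a wrapper class defined elsewhere. From its use in this test and
# in main() below, its surface can be sketched roughly as follows (this is an
# assumption about the real class, shown only to make the tests readable):


class MAML:
    def __init__(self, module):
        self.module = module
        # name -> parameter, captured once so the original tensors stay
        # reachable even after being replaced on the module
        self.named_parameters = dict(module.named_parameters(requires_grad=True))
        # child-module table used to rebind parameters by name
        self.module_table = dict(module.named_children())
        self.trainable_params = list(self.named_parameters.values())

    def replace_parameter(self, module_table, named_tensor):
        for name, value in named_tensor.items():
            mod_name, attr = name.rsplit(".", 1)
            setattr(module_table[mod_name], attr, value)

    def replace_fast_parameter(self, fast_weights):
        named = dict(zip(self.named_parameters.keys(), fast_weights))
        self.replace_parameter(self.module_table, named)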
def main():
    nway = 5
    batch_size = 32
    train_loader, val_loader = build_dataset(nway=nway, batch_size=batch_size)
    model = OmniglotFC(28 * 28, nway)
    model.train()
    maml = MAML(model)
    loss_fn = F.cross_entropy_with_softmax
    opt = optim.Adam(maml.trainable_params, lr=0.003)
    accuracy = F.accuracy

    adapt_data = meg.tensor(dtype='float32')
    adapt_label = meg.tensor(dtype='int32')
    eval_data = meg.tensor(dtype='float32')
    eval_label = meg.tensor(dtype='int32')

    iteration = 0
    for ep in range(500):
        for (images_support, labels_support, images_query,
             labels_query) in train_loader:
            opt.zero_grad()
            meta_train_error = 0.0
            meta_train_accuracy = 0.0
            for i in range(batch_size):
                (image_support, label_support, image_query, label_query) = (
                    images_support[i], labels_support[i], images_query[i],
                    labels_query[i])
                adapt_data.set_value(np.squeeze(image_support, 1))
                adapt_label.set_value(np.squeeze(label_support, 1))
                loss = loss_fn(model.forward(adapt_data), adapt_label)
                gradients = F.grad(loss, maml.trainable_params,
                                   use_virtual_grad=False,
                                   return_zero_for_nodep=False)
                fast_weights = [
                    p - 0.5 * g
                    for p, g in zip(maml.trainable_params, gradients)
                ]
                maml.replace_fast_parameter(fast_weights)

                # Evaluate the adapted model
                eval_data.set_value(np.squeeze(image_query, 1))
                eval_label.set_value(np.squeeze(label_query, 1))
                predictions = model.forward(eval_data)
                valid_error = loss_fn(predictions, eval_label)
                valid_accuracy = accuracy(predictions, eval_label)
                opt.backward(valid_error)
                meta_train_error += valid_error.numpy().item()
                meta_train_accuracy += valid_accuracy.numpy().item()
            # for p in maml.trainable_params:
            #     p.grad = p.grad * (1.0 / batch_size)
            opt.step()
            print('Iteration', iteration)
            print('Meta Train Error', meta_train_error / batch_size)
            print('Meta Train Accuracy', meta_train_accuracy / batch_size)
            iteration += 1
def test_none_in_out_grad():
    class Test(Function):
        def forward(self, a, b):
            return a, b

        def backward(self, grad_a, grad_b):
            # `bb` is never used downstream, so its incoming gradient is None.
            assert grad_b is None
            return (grad_a, 0)

    a = tensor(np.array([1.0], dtype=np.float32))
    b = tensor(np.array([2.0], dtype=np.float32))
    aa, bb = Test()(a, b)
    assertTensorClose(
        F.grad(aa, a, use_virtual_grad=False).numpy(),
        np.array([1.0], dtype=np.float32))
    assertTensorClose(
        F.grad(aa, b, use_virtual_grad=False).numpy(),
        np.array([0.0], dtype=np.float32))
def test_a_plus_b():
    data_shape = (1, 9, 2, 6)
    av = np.random.random(data_shape).astype(np.float32)
    bv = np.random.random(data_shape).astype(np.float32)
    a = tensor(av)
    b = tensor(bv)

    class MulFunc(Function):
        def forward(self, a, b):
            return a * b

        def backward(self, grad_o):
            # Deliberately scaled gradients, to verify that whatever backward
            # returns is propagated verbatim.
            return (grad_o * b * 2, grad_o * a * 3)

    c = MulFunc()(a, b).sum()
    assertTensorClose(c.numpy(), (av * bv).sum())
    assertTensorClose(F.grad(c, a, use_virtual_grad=False).numpy(), bv * 2)
    assertTensorClose(F.grad(c, b, use_virtual_grad=False).numpy(), av * 3)
def test_save_context():
    class Sigmoid(Function):
        def forward(self, x):
            y = 1 / (1 + F.exp(-x))
            self.save_for_backward(y)
            return y

        def backward(self, grad_y):
            (y,) = self.saved_tensors
            return grad_y * y * (1 - y)

    a = tensor(np.array([1926.0817], dtype=np.float32))
    s = Sigmoid()(a)
    s2 = F.sigmoid(a)
    assertTensorClose(s.numpy(), s2.numpy())
    assertTensorClose(
        F.grad(s, a, use_virtual_grad=False).numpy(),
        F.grad(s2, a, use_virtual_grad=False).numpy(),
    )
def train_func(x1, y1, x2, y2, *, loss_fn, opt, net, keys, params):
    # The data and label no longer need to be pre-created as tensors and
    # filled via set_value; trace handles this internally.
    logits = net(x1, weights=dict(zip(keys, params)))
    loss = loss_fn(logits, y1)
    grads = F.grad(loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
    # forward twice
    loss2 = loss_fn(net(x2, weights=dict(zip(keys, fast_weights))), y2)
    opt.backward(loss2)
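# A sketch of how train_func might be compiled and driven with trace. The
# driver below is illustrative, not the exact call site: `batches` and the
# keyword objects are supplied by the caller, and `trace` is the same helper
# used in run() above (positional tensor args become graph inputs, while the
# keyword-only args stay Python objects):


def run_traced_train(net, loss_fn, opt, keys, params, batches, symbolic=True):
    traced = trace(train_func, symbolic=symbolic)
    for x1, y1, x2, y2 in batches:
        opt.zero_grad()
        traced(x1, y1, x2, y2,
               loss_fn=loss_fn, opt=opt, net=net, keys=keys, params=params)
        opt.step()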
def test_mge_81():
    np.random.seed(0)
    N, D = 3, 4
    x = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32))
    y = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32))
    z = mge.Parameter(value=np.random.normal(size=(N, D)).astype(np.float32))
    a = x * y
    b = a + z
    c = F.sum(b)
    grad_x = F.grad(c, x, use_virtual_grad=False)
    grad_y = F.grad(c, y, use_virtual_grad=False)
    grad_z = F.grad(c, z, use_virtual_grad=False)
    print(grad_x.numpy())
    print(grad_y.numpy())
    print(grad_z.numpy())

    m = M.BatchNorm2d(4)
    input = tensor(np.zeros((64, 4, 32, 32), dtype=np.float32))
    _ = m(input)
    m = M.BatchNorm2d(4, affine=False)
    _ = m(input)
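# For c = sum(x * y + z) the analytic gradients are dc/dx = y, dc/dy = x and
# dc/dz = 1. A small helper that would assert what test_mge_81 only prints
# (a sketch; the original test does not perform these checks):


def check_mge_81_grads(x, y, z, grad_x, grad_y, grad_z):
    assertTensorClose(grad_x.numpy(), y.numpy())
    assertTensorClose(grad_y.numpy(), x.numpy())
    assertTensorClose(grad_z.numpy(), np.ones_like(z.numpy()))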
def test_skip_invalid_grad():
    data_shape = (1, 9, 2, 6)
    av = np.random.random(data_shape).astype(np.float32)
    bv = np.random.random(data_shape).astype(np.float32)
    a = tensor(av)
    b = tensor(bv)
    cookie = tensor(np.random.random(data_shape).astype(np.float32))

    class EqWithFakeGrad(Function):
        def forward(self, a, b):
            return a == b

        def backward(self, grad_o):
            _ = grad_o
            return cookie, cookie

    c = EqWithFakeGrad()(a, b).sum()
    assertTensorClose(c.numpy(), (av == bv).sum().astype(np.float32))
    assertTensorClose(F.grad(c, a, use_virtual_grad=False).numpy(), cookie)
    assertTensorClose(F.grad(c, b, use_virtual_grad=False).numpy(), cookie)
def test_zero_grad():
    class StopGradient(Function):
        def forward(self, a):
            return a

        def backward(self, *_):
            # Returning None cuts the gradient flow through this path.
            return None

    a = tensor(np.array([1.0], dtype=np.float32))
    b = a * 3.0
    c = a * 4.0
    # The b-path is blocked by StopGradient, so only the c-path (factor 4.0)
    # contributes to d(loss)/da.
    loss = StopGradient()(b) + c
    assertTensorClose(
        F.grad(loss, a, use_virtual_grad=False).numpy(),
        np.array([4.0], dtype=np.float32),
    )
def test_ste():
    class STE(Function):
        def forward(self, x):
            # Uniform int8-style fake quantization: round to multiples of
            # max|x| / 127.
            maxv, minv = x.max(), x.min()
            scale = F.maximum(maxv, -minv) / 127
            return F.round(x / scale) * scale

        def backward(self, grad_y):
            # Straight-through estimator: pass the gradient through as if the
            # rounding never happened.
            return grad_y

    data_shape = (1, 9, 2, 6)
    av = np.random.random(data_shape).astype(np.float32)
    a = tensor(av)
    q = STE()(a)
    q_2 = (q * 2.0).sum()
    assertTensorClose(
        F.grad(q_2, a, use_virtual_grad=False).numpy(),
        np.broadcast_to(np.array([2.0], dtype=np.float32), data_shape),
    )
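# A sketch of how an STE Function like the one above is typically used in
# quantization-aware training: quantize the weights in the forward pass while
# letting gradients flow to the full-precision weights unchanged. `quant` is
# any STE-style Function instance (e.g. the STE class above hoisted to module
# level); this helper is illustrative and not part of the test suite:


def fake_quant_linear(x, weight, quant):
    return F.matmul(x, quant(weight))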