import numpy as np
import tvm
import nnvm
from nnvm.testing.config import ctx_list
# NOTE: `optimizer`, `lr_scheduler`, and `helper` are assumed to be provided
# by the surrounding test module; adjust the import paths to the actual repo.

def test_adam():
    for target, ctx in ctx_list():
        data = nnvm.sym.Variable("data")
        weight = nnvm.sym.Variable("weight")
        out = nnvm.sym.elemwise_mul(data, weight**2)
        dshape = (1, 2, 3)
        wshape = dshape
        base_lr = 0.1
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8
        lr_factor = 0.5
        rescale_grad = 0.2
        wd = 0.1
        clip_gradient = 0.25
        scheduler = lr_scheduler.FactorScheduler(base_lr=base_lr, step=1,
                                                 factor=lr_factor)
        opt = optimizer.Adam(learning_rate=base_lr, beta1=beta1, beta2=beta2,
                             epsilon=epsilon, lr_scheduler=scheduler,
                             rescale_grad=rescale_grad,
                             clip_gradient=clip_gradient, wd=wd)
        opt_sym = opt.minimize(out, var=weight)
        inputs = [("data", dshape, data)]
        params = [("weight", wshape, weight)]

        def update_func(data, weight):
            # Numpy reference for two Adam updates; the gradient of
            # elemwise_mul(data, weight**2) w.r.t. weight is 2 * data * weight.
            rate_0 = np.sqrt(1 - beta2) / (1 - beta1)
            lr_0 = base_lr * lr_factor * rate_0
            gradient_0 = data * 2 * weight * rescale_grad
            gradient_0 = np.clip(gradient_0, -clip_gradient, clip_gradient)
            m_0 = (1 - beta1) * gradient_0
            v_0 = (1 - beta2) * (gradient_0**2)
            weight_0 = weight - lr_0 * (m_0 / (np.sqrt(v_0) + epsilon)
                                        + wd * weight)
            rate_1 = np.sqrt(1 - beta2**2) / (1 - beta1**2)
            lr_1 = base_lr * (lr_factor**2) * rate_1
            gradient_1 = data * 2 * weight_0 * rescale_grad
            gradient_1 = np.clip(gradient_1, -clip_gradient, clip_gradient)
            m_1 = beta1 * m_0 + (1 - beta1) * gradient_1
            v_1 = beta2 * v_0 + (1 - beta2) * (gradient_1**2)
            weight_1 = weight_0 - lr_1 * (m_1 / (np.sqrt(v_1) + epsilon)
                                          + wd * weight_0)
            return weight_1

        helper(opt_sym, inputs, params, update_func, 2, target, ctx)
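# For reference, update_func above unrolls two steps of the standard Adam
# recurrence with the bias correction sqrt(1 - beta2**t) / (1 - beta1**t)
# folded into the effective step size. Below is a minimal standalone sketch
# of the generic step (a hypothetical helper, not part of the test suite);
# calling it with t = 1, 2 and lr = base_lr * lr_factor**t reproduces
# weight_1 from update_func.
def adam_step(weight, grad, m, v, t, lr, beta1=0.9, beta2=0.999,
              epsilon=1e-8, wd=0.1, rescale_grad=0.2, clip_gradient=0.25):
    grad = np.clip(grad * rescale_grad, -clip_gradient, clip_gradient)
    m = beta1 * m + (1 - beta1) * grad             # first-moment estimate
    v = beta2 * v + (1 - beta2) * grad**2          # second-moment estimate
    rate = np.sqrt(1 - beta2**t) / (1 - beta1**t)  # bias-correction factor
    weight = weight - lr * rate * (m / (np.sqrt(v) + epsilon) + wd * weight)
    return weight, m, v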
def test_sgd():
    for target, ctx in ctx_list():
        data = nnvm.sym.Variable("data")
        weight = nnvm.sym.Variable("weight")
        out = nnvm.sym.elemwise_mul(data, weight**2)
        dshape = (1, 2, 3)
        wshape = dshape
        base_lr = 0.1
        lr_factor = 0.5
        rescale_grad = 0.2
        wd = 0.1
        clip_gradient = 0.25
        scheduler = lr_scheduler.FactorScheduler(base_lr=base_lr, step=1,
                                                 factor=lr_factor)
        opt = optimizer.SGD(learning_rate=base_lr, lr_scheduler=scheduler,
                            rescale_grad=rescale_grad,
                            clip_gradient=clip_gradient, wd=wd)
        opt_sym = opt.minimize(out, var=weight)
        inputs = [("data", dshape, data)]
        params = [("weight", wshape, weight)]

        def update_func(data, weight):
            gradient_0 = data * 2 * weight * rescale_grad
            gradient_0 = np.clip(gradient_0, -clip_gradient, clip_gradient)
            weight_0 = weight - base_lr * lr_factor * (gradient_0 + wd * weight)
            gradient_1 = data * 2 * weight_0 * rescale_grad
            gradient_1 = np.clip(gradient_1, -clip_gradient, clip_gradient)
            weight_1 = weight_0 - base_lr * (lr_factor**2) * (gradient_1
                                                              + wd * weight_0)
            return weight_1

        helper(opt_sym, inputs, params, update_func, 2, target, ctx)
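# The gradient used in both reference update functions comes from the test
# graph itself: out = elemwise_mul(data, weight**2), hence
# d(out)/d(weight) = 2 * data * weight. The sketch below rolls out the two
# scheduled SGD steps with illustrative values (not the test's real inputs)
# and matches update_func for data = 0.5, weight = 1.
def sgd_rollout():
    data = np.full((1, 2, 3), 0.5)
    weight = np.ones((1, 2, 3))
    base_lr, lr_factor, rescale_grad = 0.1, 0.5, 0.2
    wd, clip_gradient = 0.1, 0.25
    for t in (1, 2):
        grad = np.clip(2 * data * weight * rescale_grad,
                       -clip_gradient, clip_gradient)
        # FactorScheduler with step=1 decays the lr by lr_factor each update
        weight = weight - base_lr * lr_factor**t * (grad + wd * weight)
    return weight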
new_params = {
    # ... parameter dict truncated in the original excerpt ...
}
module.set_input(**new_params)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))
# convert to numpy and print the flattened output
out_np = out.asnumpy().flatten()
print("----------Output----------")
print(out_np)

base_lr = 0.1
lr_factor = 0.5
rescale_grad = 0.2
wd = 0.1
clip_gradient = 0.25
scheduler = lr_scheduler.FactorScheduler(base_lr=base_lr, step=1,
                                         factor=lr_factor)
opt = optimizer.SGD(learning_rate=base_lr, lr_scheduler=scheduler,
                    rescale_grad=rescale_grad, clip_gradient=clip_gradient,
                    wd=wd)
opt_sym = opt.minimize(tvm.ndarray.array((real_label - out_np)**2, ctx=ctx),
                       var=params['dense0_weight'])
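# Sanity check on the schedule used throughout: FactorScheduler(step=1,
# factor=0.5) is assumed to yield lr = base_lr * lr_factor**num_update,
# which is exactly how the reference update functions scale each step.
for num_update in (1, 2, 3):
    print(num_update, base_lr * lr_factor**num_update)  # 0.05, 0.025, 0.0125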