            d_alphas[i] = np.dot(d_x, V.val)
            X.sub(alpha * V.val)               # Reverse position update
            g = L_grad(X.val, meta, i)         # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_betas[i] = np.dot(d_v, V.val + g)
            d_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v    *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
sgd4 = Differentiable(sgd4, partial(sgd4, forward_pass_only=False))

def sgd4_mad(L_grad, hypers, callback=None, forward_pass_only=True):
    x0, alphas, gammas, meta = hypers
    N_safe_sampling = len(alphas)
    x_init = np.copy(x0)
    x_current = np.copy(x0)
    global v_current
    v_current = np.zeros(x0.size)
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, gammas)
    for i, alpha, gamma in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(gamma).sub((1.0 - gamma) * g)
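        # Momentum update: v <- gamma*v - (1 - gamma)*g. V is kept in an ExactRep
        # so this step can later be undone exactly (cf. the matching
        # V.add(...).div(...) and the assert on x0 in sgd4's hypergrad above).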
        if any([x_size > y_size for x_size, y_size in zip(shapes[_X_]['conv'], shapes[_Y_]['conv'])]):
            new_mode = 'full'
        else:
            new_mode = 'valid'
        result = convolve(g, Y[flipped_idxs(Y.ndim, axes[_Y_]['conv'])],
                          axes     = [axes['out']['conv'],   axes[_Y_]['conv']],
                          dot_axes = [axes['out'][ignore_Y], axes[_Y_]['ignore']],
                          mode     = new_mode)
        new_order = np.argsort(axes[_X_]['ignore'] + axes[_X_]['dot'] + axes[_X_]['conv'])
        return np.transpose(result, new_order)

    return einsum_tensordot(A_view, B_view, all_axes), [None, grad_convolve]
convolve = Differentiable(convolve, partial(convolve, forward_pass_only=False))

def einsum_tensordot(A, B, axes, reverse=False):
    # Does tensor dot product using einsum, which shouldn't require a copy.
    A_axnums = list(range(A.ndim))
    B_axnums = list(range(A.ndim, A.ndim + B.ndim))
    sum_axnum = A.ndim + B.ndim
    for i_sum, (i_A, i_B) in enumerate(zip(*axes)):
        A_axnums[i_A] = sum_axnum + i_sum
        B_axnums[i_B] = sum_axnum + i_sum
    print(A_axnums, B_axnums)
    return np.einsum(A, A_axnums, B, B_axnums)

def pad_to_full(A, B, axes):
    A_pad = [(0, 0)] * A.ndim
    for ax_A, ax_B in zip(*axes):
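        # Presumably sets A_pad[ax_A] from B's extent along ax_B, building an
        # np.pad-style list of (before, after) widths for the convolution axes.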
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs])
            d_x    -= L_hvp_x(X.val, meta, d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, d_v, i)
            d_v    *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
sgd_parsed = Differentiable(sgd_parsed, partial(sgd_parsed, forward_pass_only=False))

def adam(grad, x, callback=None, num_iters=100, step_size=0.1,
         b1=0.1, b2=0.01, eps=10**-4, lam=10**-4):
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in xrange(num_iters):
        b1t = 1 - (1 - b1) * (lam**i)
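        # b1t is the per-iteration first-moment mixing rate: the (1 - b1) part is
        # damped by lam**i, so b1t starts at b1 and moves toward 1 as i grows.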
def sgd_meta_only(L_grad, meta, x0, alpha, beta, N_iters,
                  callback=None, forward_pass_only=True):
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    for i in range(N_iters):
        g = L_grad(X.val, meta, i, record_results=True)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
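        # Same reversible momentum update as sgd4, but with scalar alpha/beta and
        # meta passed in directly rather than packed into a hypers tuple.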