def compare(self, expect='expect', got='got', show_regression=1, scatter=1, **kw):
    from arsenal.maths import compare
    if self.ax is None:
        self.ax = pl.figure().add_subplot(111)
    if self.df.empty:
        return
    with update_ax(self.ax):
        compare(expect, got, data=self.df).plot(ax=self.ax, **kw)
def compare(self, want='want', have='have', show_regression=1, scatter=1, **kw):
    from arsenal.maths import compare
    if self.ax is None:
        self.ax = pl.figure().add_subplot(111)
    if self.df.empty:
        return
    with update_ax(self.ax):
        compare(want, have, data=self.df).plot(ax=self.ax, **kw)
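# A hedged usage sketch (not part of the class above): calling
# `arsenal.maths.compare` directly on two columns of a pandas DataFrame, which
# is essentially what the method above does with `data=self.df`.  The DataFrame
# and the column names 'want'/'have' below are made up for illustration.
import numpy as np
import pandas as pd
from arsenal.maths import compare

df = pd.DataFrame({'want': np.random.uniform(0, 1, 200)})
df['have'] = df['want'] + 0.01 * np.random.randn(200)   # noisy copy of 'want'

c = compare('want', 'have', data=df)   # comparison statistics (e.g., Pearson correlation, max error)
c.show()                               # scatter/regression plot, assuming matplotlib is available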
def test():
    methods = [
        swor_heap1,
        # swor_heap2,
        swor_heap3,
    ]

    R = 50_000
    v = random_dist(4)
    S = {f.__name__: f(v, R) for f in methods}
    D = {name: counts(S[name]) for name in S}

    R = {}
    n = len(v)
    for z in permute(range(n)):
        R[z] = p_perm(v, z)
        for d in D.values():
            d[z] += 0

    # Check that p_perm sums to one.
    np.testing.assert_allclose(sum(R.values()), 1)

    for name, d in sorted(D.items()):
        compare(R, d)   #.show(title=name);

    T = timers()
    R = 50
    for i in range(1, 15):
        n = 2**i
        #print('n=', n, 'i=', i)
        for _ in range(R):
            v = random_dist(n)
            np.random.shuffle(methods)
            for f in methods:
                name = f.__name__
                with T[name](n=n):
                    S = f(v, R=1)
                assert S.shape == (1, n)   # some sort of sanity check

    print('done')

    fig, ax = pl.subplots(ncols=2, figsize=(12, 5))
    T.plot_feature('n', ax=ax[0])
    fig.tight_layout()
    T.plot_feature('n', ax=ax[1])
    ax[1].set_yscale('log')
    ax[1].set_xscale('log')
    T.compare()
    pl.show()
def quick_fdcheck(func, w, g, n_checks=20, eps=1e-5, verbose=1, progressbar=1):
    """
    Check gradient along random directions (a faster alternative to
    axis-aligned directions).

    Tim Vieira (2017) "How to test gradient implementations"
    https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/

    """
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.flatten()
    w = np.asarray(w.flat)
    g = np.asarray(g.flat)

    dim = len(w)

    for k in (iterview(keys) if progressbar else keys):
        d = spherical(dim)
        G[k] = g.dot(d)

        w[:] = was + eps*d
        b = func()

        w[:] = was - eps*d
        a = func()

        w[:] = was
        H[k] = (b-a) / (2*eps)

    return compare(H, G, verbose=verbose)
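# A hedged illustration of the idea behind the random-direction check: for a
# random unit direction d, the directional derivative g.dot(d) should match the
# central finite difference (f(w + eps*d) - f(w - eps*d)) / (2*eps).  The toy
# objective below is made up for illustration.
import numpy as np

def f(w):
    return np.sum(np.sin(w))      # toy objective

def grad_f(w):
    return np.cos(w)              # its exact gradient

w = np.random.randn(50)
eps = 1e-5
d = np.random.randn(50)
d /= np.linalg.norm(d)            # random direction on the unit sphere

fd = (f(w + eps*d) - f(w - eps*d)) / (2*eps)   # finite-difference directional derivative
ad = grad_f(w).dot(d)                          # analytic directional derivative
print(fd, ad)                                  # these should agree to high precision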
def fdcheck(func, w, g, keys=None, eps=1e-5):
    """
    Finite-difference check.

    Returns `arsenal.maths.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if keys is None:
        if hasattr(w, 'keys'):
            keys = list(w.keys())
        else:
            keys = list(range(len(w)))

    fd = {}
    for key in iterview(keys):
        was = w[key]
        w[key] = was + eps
        b = func()
        w[key] = was - eps
        a = func()
        w[key] = was
        fd[key] = (b-a) / (2*eps)

    return compare([fd[k] for k in keys], [g[k] for k in keys])
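# A minimal, hedged usage sketch for `fdcheck`: a made-up quadratic objective
# whose gradient is known in closed form.  `func` takes no arguments and reads
# `w` from the enclosing scope, because `fdcheck` perturbs `w` in place between
# calls.  (`quick_fdcheck` uses the same (func, w, g) calling convention.)
import numpy as np

A = np.random.randn(10, 10)
A = A @ A.T                  # symmetric matrix, so the gradient has a simple form
w = np.random.randn(10)      # parameters; fdcheck perturbs these in place

def func():
    return 0.5 * w @ A @ w   # objective value at the current w

g = A @ w                    # analytic gradient of 0.5 * w'Aw for symmetric A

fdcheck(func, w, g).show()   # finite-difference estimates should closely match g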
def test_stationary(M):
    print('[test stationary]')

    π = random_dist(M.S, M.A)
    [_, _, γ, r] = M = M | π

    T = 1 / (1 - γ)

    d1 = M.d()
    d2 = M.d_by_eigen()
    assert compare(d1, d2).max_relative_error < 1e-5

    J0 = M.J()
    d0 = M.d()

    def estimate(N):
        d = np.zeros(M.S)
        J = 0.0
        for t, [s, r, _] in enumerate(M.run(), start=1):
            if t >= N:
                break
            d += (onehot(s, M.S) - d) / t
            # Note the 'importance sampling correction' T, which accounts for
            # the (1-γ)-resetting dynamics.
            J += (r * T - J) / t
            if t % 1000 == 0:
                yield [
                    t,
                    0.5 * abs(J - J0),
                    0.5 * abs(d - d0).sum(),
                ]

    ns, J_err, d_err = np.array(list(estimate(1_000_000))).T

    dmax = 1
    Jmax = T * r.max()   # scaled by T because of the importance sampling correction.

    # Very loose bounds on total variation distance
    J_bnd = Jmax / np.sqrt(ns)
    d_bnd = M.S * dmax / np.sqrt(ns)

    if 0:
        # Error decays at a rate of 1/sqrt(N)
        pl.title('performance estimate')
        pl.loglog(ns, J_bnd, label='error bound')
        pl.loglog(ns, J_err, label='error observed')
        pl.show()

        pl.title('distribution estimate')
        pl.loglog(ns, d_bnd, label='error bound')
        pl.loglog(ns, d_err, label='error observed')
        pl.show()

    assert (J_err <= J_bnd).all()
    assert (d_err <= d_bnd).all()
def fdcheck(func, w, g, keys=None, eps=1e-5, quiet=0, verbose=1, progressbar=1):
    """
    Finite-difference check.

    Returns `arsenal.maths.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):
            # support for sparse vectors represented as a dictionary-like object.
            keys = list(w.keys())
            d = {}
        else:
            # use flat views, if need be.
            if len(w.shape) > 1:
                w = w.flat
            if len(g.shape) > 1:
                g = g.flat
            d = np.zeros_like(w)
            keys = list(range(len(w)))   # TODO: these keys have lost their names. So not good for debugging.
    else:
        d = {}

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b - a) / (2 * eps)

    return compare([d[k] for k in keys], [g[k] for k in keys], verbose=verbose)
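# A hedged sketch of the dictionary-keyed branch above: parameters held in a
# dict-like "sparse vector", so `keys` is taken from `w.keys()`.  The toy
# objective and the feature names 'bias'/'slope' are made up for illustration.
w = {'bias': 0.3, 'slope': -1.2}

def func():
    # zero-argument objective that reads `w` from the enclosing scope
    return (w['bias'] - 1.0)**2 + 2.0 * w['slope']**2

g = {'bias': 2.0 * (w['bias'] - 1.0),   # analytic partial derivatives of the toy objective
     'slope': 4.0 * w['slope']}

fdcheck(func, w, g).show()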
def test_lp_solver(M):
    # The primary testing strategy is to compare the linear programming (LP)
    # solver to another solver (e.g., value iteration or policy iteration). In
    # addition to checking that the policies found by each solver agree, we
    # also check the other quantities found by the LP: the dual variables
    # should be value functions (equal to VI's), and the primal variables
    # should be the joint state-action distribution of the policy.

    vi = M.solve_by_policy_iteration()
    D = M.solve_by_lp_dual()
    P = M.solve_by_lp_primal()

    π = P['policy']

    assert np.allclose(P['policy'], vi['policy'])
    print('[lp-solver] policy', ok)

    # Objective value matches the solution found by VI.
    assert abs(D['obj'] - vi['obj']) / abs(vi['obj']) < 0.01
    print('[lp-solver] objective value', ok)

    d = D['mu'].sum(axis=1)
    assert is_distribution(d), 'stationary distribution is not a valid distribution.'
    assert compare(D['mu'].sum(axis=1), M.d(π), verbose=False).max_err < 1e-5
    print('[lp-solver] stationary distribution', ok)

    assert np.allclose(vi['V'], D['V'])
    print('[lp-solver] value function', ok)

    # Test the relationships between primal and dual LPs
    # assert np.allclose(P['policy'], D['policy'])   # behavior with ties is different.
    assert np.allclose(P['mu'], D['mu'])
    print('[dual-lp-solver]', ok)

    # Test that the objectives match
    assert np.allclose(D['obj'], M.J(π))
    assert np.allclose(P['obj'], M.J(π))
    print('[lp-objectives]', ok)
def quick_fdcheck(func, w, g, n_checks=20, eps=1e-5, verbose=1, progressbar=1):
    "Check gradient along random directions (a faster alternative to axis-aligned directions)."
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}

    was = w.flatten()
    w = np.asarray(w.flat)
    g = np.asarray(g.flat)

    dim = len(w)

    for k in (iterview(keys) if progressbar else keys):
        d = spherical(dim)
        G[k] = g.dot(d)

        w[:] = was + eps*d
        b = func()

        w[:] = was - eps*d
        a = func()

        w[:] = was
        H[k] = (b-a) / (2*eps)

    return compare(H, G, verbose=verbose)
def fdcheck(E, root, eps=1e-4):
    """Finite-difference approximation of the gradient of the numerator and
    denominator wrt edge probability.

    """

    def fn(W):
        "Evaluate numerator and denominator of risk."
        g = Hypergraph()
        g.root = root
        for e, [_, r, f] in list(E.items()):
            p = LogVal(np.exp(f.dot(W).to_real()))
            g.edge(Semiring1(p, p * r), *e)
        B = g.inside(Semiring1.Zero)
        Q = B[g.root]
        return Q.p.to_real(), Q.r.to_real(), (Q.r / Q.p).to_real()

    features = {k for [_, _, f] in E.values() for k in f}
    W = LogValVector({k: LogVal(np.random.uniform(-1, 1)) for k in features})

    # For the gradient of risk we use <p, p*r, D[p], r*D[p]>, but my code computes
    # <p, p*r, p*s, p*r*s>, so we pass in s = D[p]/p.
    #
    #   D[p] = D[exp(f.dot(W))] = exp(f.dot(W))*D[f.dot(W)] = p*f
    #
    # therefore D[p]/p = f.
    if 0:
        E1 = {}
        for e, [_, r, f] in list(E.items()):
            p = LogVal(np.exp(f.dot(W).to_real()))
            E1[e] = (p, r, f * p)
        #S = secondorder_expectation_semiring(E, root)
        from hypergraphs.insideout3 import inside_outside_speedup
        khat, xhat = inside_outside_speedup(E1, root)
    else:
        E1 = {}
        for e, [_, r, f] in list(E.items()):
            p = LogVal(np.exp(f.dot(W).to_real()))
            E1[e] = (p, r, f)
        #S = secondorder_expectation_semiring(E, root)
        from hypergraphs.insideout import inside_outside_speedup
        khat, xhat = inside_outside_speedup(E1, root)

    ad_Z = xhat.s
    ad_rbar = xhat.t
    Z = khat.p
    rbar = khat.r
    ad_risk = ad_rbar / Z - rbar * ad_Z / Z / Z

    dd = []
    for k in features:
        was = W[k]

        W.x[k] = was + LogVal(eps)
        b_Z, b_rbar, b_risk = fn(W)

        W.x[k] = was - LogVal(eps)
        a_Z, a_rbar, a_risk = fn(W)

        W.x[k] = was

        fd_rbar = (b_rbar - a_rbar) / (2 * eps)
        fd_Z = (b_Z - a_Z) / (2 * eps)
        fd_risk = (b_risk - a_risk) / (2 * eps)

        dd.append({
            'key': k,
            'ad_risk': ad_risk[k].to_real(),
            'fd_risk': fd_risk,
            'ad_Z': ad_Z[k].to_real(),
            'fd_Z': fd_Z,
            'ad_rbar': ad_rbar[k].to_real(),
            'fd_rbar': fd_rbar,
        })

    from arsenal.maths import compare
    from pandas import DataFrame
    df = DataFrame(dd)
    compare(df.fd_Z, df.ad_Z, alphabet=df.key).show()
    compare(df.fd_rbar, df.ad_rbar, alphabet=df.key).show()
    compare(df.fd_risk, df.ad_risk, alphabet=df.key).show()
def active_set(self):
    for outer in range(1, self.outer_iterations+1):
        print()
        print(colors.green % '=====================')
        print(colors.green % 'Outer %s' % outer)

        self.inner_optimization(self.inner_iterations)

        if outer != self.outer_iterations:
            print()
            print(colors.yellow % 'Grow %s' % outer)

            # old feature index
            old = {c: self.context_feature_id(c) for c in self.C}
            w = self.dense.w.copy()
            q = np.array(self.dense.q, copy=1)

            TEST_EXPECT = 0
            if TEST_EXPECT:
                # Record expectations under the previous model. Technically,
                # this is observed-expected features.
                predictions = []
                for x in self.train:
                    S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                    self.gradient(x.N, x.tags, S)   # don't backprop thru scoring model because we don't change the parameters.
                    predictions.append({k: S.d_dense[i] for k, i in old.items()})

            # "Grow" Z by extending active features with one more character.
            active = self.active_features()

            # Heuristic: Use an intelligent guess for 'new' q values in the
            # next iterations.
            #
            # This improves active set's ability to monotonically improve
            # after growing. Otherwise, adagrad will update too aggressively
            # compared to the sensible alternative of starting at the last seen
            # value (if possible) or at the fudge value.
            #
            # In other words, new features get huge learning rates compared
            # to existing ones. Features that used to exist also get pretty
            # big learning rates. This is because adagrad learning rates
            # decrease quickly with time, as they are 1/sqrt(sum-of-squares).
            #
            # I found that guessing the mean q works better than min or max.
            self.dense.w[:] = 0
            self.dense.q[:] = float(q.mean())   # [2018-08-13 Mon] the use of `float` is a workaround for "BufferError: Object is not writable."

            # Grow active contexts to the right.
            cc = {p+(y,) for p in active for y in self.sigma}

            ####
            # Note that just because we extended a bunch of active elements
            # by all elements of sigma, this does not mean that we are
            # last-character closed.
            #
            # Feel free to check via the following (failing) assertion
            #
            #    assert set(prefix_closure(cc)) == set(last_char_sub_closure(self.sigma, prefix_closure(cc)))
            #
            # The reason is that some elements go to zero and, thus, get
            # pruned. This is the same reason why `active` is not
            # automatically prefix closed.
            ####

            # Is the grown set prefix closed by construction?
            #
            # No. The grown set is not prefix closed either, because it's
            # possible for a parent to be zero with nonzero children.
            #
            # Here is an assertion that will fail.
            #
            #    assert set(prefix_closure(cc)) == set(cc)
            #
            #cc = set(prefix_closure(cc))

            ####
            # XXX: In general, we probably do not want to do last-char-sub
            # closure. I've added it because it seems to help us more closely
            # preserve the distribution after manipulating the active set.
            #cc = set(last_char_sub_closure(self.sigma, cc))

            # Filter active set by allowed-context constraints, if supplied.
            if self.allowed_contexts:
                cc &= set(self.allowed_contexts)

            # Update DFA and group lasso data structures.
            self.update(self.sigma, cc)
            self.dense.set_groups(self.group_structure())

            print(colors.yellow % '=> new', '|C| = %s' % len(self.C))

            # Copy previous weights
            for c in self.C:
                i = self.context_feature_id(c)
                if c in old:
                    o = old[c]
                    self.dense.w[i] = w[o]
                    self.dense.q[i] = q[o]

            if 0:
                print()
                print(colors.light.red % 'is accuracy the same???????')
                self.after_inner_pass()
                print(colors.light.red % '^^^^^^^^^^^^^^^^^^^^^^^^^^^')
                print()

            if TEST_EXPECT:
                # DEBUGGING: check that expectations match
                #
                # I'm not sure this test is implemented perfectly because we
                # need to compute the expected value of all the old features
                # under the new model.
                #
                # We get away with using the new model because it has backoff
                # features.
                #
                # In the case of a unigram model (order-0 model), this test
                # fails. Why? Are the unigrams used incorrectly?
                #
                new = {c: self.context_feature_id(c) for c in self.C}
                for x, want in zip(self.train, predictions):
                    S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                    self.gradient(x.N, x.tags, S)   # don't backprop thru scoring model because we don't change the parameters.

                    # just check on *old* features.
                    E = {k: 0 for k in want}
                    E.update({k: S.d_dense[new[k]] for k in want if k in new})

                    # XXX: filter down to features in both vectors, I guess?
                    E = {k: v for k, v in E.items() if k in new}
                    want = {k: v for k, v in want.items() if k in new}

                    c = compare(want, E, verbose=1)
                    if c.pearson <= .99:
                        c.show()
# sample = lazy_sampler()
sample = iter(sampler())
for r in range(1, 1+reps):
    _, z = next(sample)
    c[z] += 1
    if r % 10_000 == 0:
        print(f'err({r})=', 0.5*np.abs(p - c/r).sum())

c /= reps

print(p)
print(c)
compare(p, c)

#pl.plot(c/reps)
#pl.plot(p)
#pl.show()


from hypergraphs.apps.parser2 import parse, load_grammar


def parser(sentence, grammar, w, Weights):
    def binary(sentence, X, Y, Z, i, j, k):
        return Weights(w(X, Y, Z, i, j, k), X)

    def unary(sentence, X, Y, i, k):
        return Weights(w(X, Y, i, k), X)

    def terminal(sentence, W, i):
        return Weights(1.0, W)

    return parse(sentence, grammar, binary, unary, terminal,
                 zero=Weights.zero)[0, len(sentence), 'S']