def compare(self, expect='expect', got='got', show_regression=1, scatter=1, **kw):
    from arsenal.math import compare
    if self.ax is None:
        self.ax = pl.figure().add_subplot(111)
    if self.df.empty:
        return
    with update_ax(self.ax):
        compare(expect, got, data=self.df).plot(ax=self.ax, **kw)
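# Usage sketch (added illustration, not part of the original source): the
# snippets in this file all drive `arsenal.math.compare`, which takes two
# aligned vectors (or two column names plus a `data=` DataFrame) and reports
# how well they agree. The random data below is made up.
def _demo_compare():
    import numpy as np
    from arsenal.math import compare
    expect = np.random.randn(100)
    got = expect + 0.01*np.random.randn(100)   # a slightly noisy copy
    c = compare(expect, got, scatter=1, show_regression=1)
    if c.cosine < .99:    # the pass/fail pattern used in the snippets below
        c.show()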
def fdcheck(func, w, g, keys=None, eps=1e-5):
    """
    Finite-difference check. Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size
    """
    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
        else:
            keys = range(len(w))
    fd = {}
    for key in iterview(keys):
        was = w[key]
        w[key] = was + eps
        b = func()
        w[key] = was - eps
        a = func()
        w[key] = was
        fd[key] = (b - a) / (2 * eps)
    return compare([fd[k] for k in keys], [g[k] for k in keys])
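# Usage sketch (added illustration, not from the original source): `func` must
# be a zero-argument closure over `w`, so perturbing `w[key]` in place changes
# what `func()` returns. The quadratic objective below is hypothetical; its
# gradient is known exactly, so the check should pass.
def _demo_fdcheck():
    import numpy as np
    w = np.random.randn(10)
    func = lambda: 0.5 * w.dot(w)   # f(w) = ||w||^2 / 2
    g = w.copy()                    # exact gradient: grad f(w) = w
    c = fdcheck(func, w, g)         # `compare` instance (see snippets below)
    assert c.cosine > .99           # fd estimate and g should nearly coincide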
def test_gradient(self, data, subsetsize=100):

    def fd(x, i, eps=1e-5):
        """Compute `i`th component of the finite-difference approximation to the
        gradient of log-likelihood at current parameters on example `x`.
        """
        was = self.W[i]        # record value
        self.W[i] = was + eps
        b = self.likelihood(x)
        self.W[i] = was - eps
        a = self.likelihood(x)
        self.W[i] = was        # restore original value
        return (b - a) / 2 / eps

    for x in iterview(data, msg='test grad'):
        g = defaultdict(float)
        for k, v in self.expectation(x).iteritems():
            g[k] -= 1 * v
        for k in x.target_features:
            g[k] += 1
        # pick a subset of features to test
        d = np.random.choice(g.keys(), subsetsize, replace=0)
        f = {}
        for i in iterview(d, msg='fd approx'):   # loop over active features
            f[i] = fd(x, i)
        from arsenal.math import compare
        compare([f[k] for k in d], [g[k] for k in d],
                name='test gradient %s' % x,
                scatter=1, show_regression=1)
        import pylab as pl
        pl.show()
def fdcheck(func, w, g, keys=None, eps=1e-5, quiet=0, verbose=1, progressbar=1):
    """
    Finite-difference check. Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size
    """
    if quiet:
        verbose = 0
        progressbar = 0

    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
            d = {}
        else:
            d = np.zeros_like(w)
            # use flat views, if need be.
            if len(w.shape) > 1:
                w = w.flat
            if len(g.shape) > 1:
                g = g.flat
            if len(d.shape) > 1:
                d = d.flat
            keys = range(len(w))
    else:
        d = {}   # fix: `d` was undefined when the caller supplied `keys`.

    for k in (iterview(keys) if progressbar else keys):
        was = w[k]
        w[k] = was + eps
        b = func()
        w[k] = was - eps
        a = func()
        w[k] = was
        d[k] = (b - a) / (2 * eps)

    return compare([d[k] for k in keys], [g[k] for k in keys], verbose=verbose)
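# Usage sketch for the ndarray case (added illustration; the objective is
# hypothetical): matrix-shaped parameters work because the version above falls
# back to flat views of `w`, `g`, and the finite-difference buffer.
def _demo_fdcheck_matrix():
    import numpy as np
    W = np.random.randn(3, 4)
    func = lambda: np.sin(W).sum()   # f(W) = sum_ij sin(W_ij)
    G = np.cos(W)                    # exact elementwise gradient
    c = fdcheck(func, W, G, quiet=1)
    assert c.cosine > .99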
def test(e, grammar, m):
    """
    Compare the runtime of the pruned parser vs. the inside algorithm.

    Note: this isn't the same as rollouts.
    """
    M = m*1.0
    steps = 2

    # TODO: compare_hypergraph_to_cky(grammar, example, m, steps)

    # Do exp outside the timing loop. It will be cached in the Grammar object.
    grammar.exp_the_grammar_weights()

    with T['dp'](N=e.N):
        expected_recall = InsideOut(e, grammar, M*1.0, steps=steps, with_gradient=0).val

    with T['parse'](N=e.N):
        state = pruned_parser(e.tokens, grammar, m)

    coarse = grammar.coarse_derivation(state.derivation)
    c, _, w = e.recall(coarse)
    recall = c/w

    data.append({'example': e,
                 'N': e.N,
                 'expected_recall': expected_recall,
                 'recall': recall})

    T.compare()

    with axman('compare runtimes') as ax:
        T.plot_feature('N', ax=ax, loglog=1, show='scatter')

    df = DataFrame(data)
    with axman('compare rewards') as ax:
        compare(df.recall, df.expected_recall, scatter=1, show_regression=1, ax=ax)
def quick_fdcheck(func, w, g, n_checks, eps=1e-5, verbose=1, progressbar=1):
    "Check gradient along random directions (a faster alternative to axis-aligned directions)."
    keys = ['rand_%s' % i for i in range(n_checks)]
    H = {}
    G = {}
    was = w.copy()
    for k in (iterview(keys) if progressbar else keys):
        d = spherical(w.shape[0])
        G[k] = g.dot(d)
        w[:] = was + eps * d
        b = func()
        w[:] = was - eps * d
        a = func()
        w[:] = was
        H[k] = (b - a) / (2 * eps)
    return compare(H, G, verbose=verbose)
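# Why the random-direction check is cheap (added note): for a unit vector d,
# the directional derivative g.dot(d) should match the central difference
# (f(w + eps*d) - f(w - eps*d)) / (2*eps), so `n_checks` random projections
# exercise the whole gradient in O(n_checks) function evaluations rather than
# O(len(w)). `spherical` comes from arsenal; a plausible stand-in, assuming it
# samples uniformly from the unit sphere, is:
def _spherical_sketch(n):
    import numpy as np
    d = np.random.randn(n)          # isotropic Gaussian sample ...
    return d / np.linalg.norm(d)    # ... projected onto the unit sphere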
def aggregate_multiple_runtime_trials(Ds, Ps):
    """Collapse multiple dataframes `Ds` from different timing runs into a single
    one, by taking the min over runtimes (i.e., the new runtime will be
    "best-of-k" where k=|Ds|).

    Actually, this function does more than that: it also collapses over
    sentences, e.g., computing corpus-EVALB and avg[best-of-k runtimes].
    """
    D0 = Ds[0]

    # Append trials together
    foo = Ds[0]
    for dd in Ds[1:]:
        foo = foo.append(dd)

    # Take min over time_total for this policy-example pair.
    minz = foo[['policy','example','time_total']].groupby(['policy','example']).min()

    data = []
    for policy in iterview(Ps):
        dump = path(policy).dirname()
        args = cPickle.load(file(dump / 'args.pkl'))
        log = pd.read_csv(dump / 'log.csv')

        # TODO: will need to add extra cases.
        if 'DP' in args.roll_out:
            type_ = 'DP'
        elif 'CP' in args.roll_out:
            type_ = 'CP'
        elif 'HY' in args.roll_out:
            type_ = 'HY'
        elif 'BODEN' in args.roll_out:
            type_ = 'baseline'
        else:
            raise ValueError(args.roll_out)

        min_times = minz.ix[policy]['time_total']

        P = D0[D0.policy == policy]
        f = cgw_f(P.want_and_got.sum(), P.got.sum(), P.want.sum())

        #pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
        #show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear', label=name)
        #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
        #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])

        if 0:
            # log-log plot of pushes v. seconds. Really great correlation!
            PP = P[['example','pushes']].join(min_times, on='example')
            PP['log(pushes)'] = np.log(PP.pushes)
            PP['log(seconds)'] = np.log(PP.time_total)
            compare('log(pushes)', 'log(seconds)', data=PP, scatter=1, show_regression=1)

            #pl.figure()
            # pushes v. seconds. Really great correlation!
            #PP = P[['example','pushes']].join(min_times, on='example')
            #compare('pushes', 'time_total', data=PP, scatter=1, show_regression=1)

            pl.ioff(); pl.show()

        if 0:
            # empirical runtime estimates
            # scatter plot sentence length against runtime.
            n_by_time = P[['example','N']].join(min_times, on='example')
            pl.scatter(n_by_time.N, n_by_time.time_total, alpha=0.5, lw=0)

            # highlight median runtime per sentence length.
            n_by_median_time = n_by_time.groupby('N').median()
            pl.plot(n_by_median_time.index, n_by_median_time.time_total, c='k', lw=2)

            # empirical exponent and constant factor
            compare(np.log(n_by_time.time_total), np.log(n_by_time.N), scatter=1, show_regression=1)

            pl.ioff(); pl.show()

        # use early stopping on dev to pick the policy.
        dev = log.ix[log['dev_new_policy_reward'].argmax()]

        row = {'avg_bestof_time': np.mean(min_times),
               'wps': np.mean(P.N) / np.mean(min_times),
               'pushes': np.mean(P.pushes),
               'pops': np.mean(P.pops),
               'policy': policy,
               'dev_pushes': dev.dev_new_policy_pushes,
               'dev_evalb': dev.dev_new_policy_evalb_corpus,
               'type': type_,
               'evalb': f}

        row.update({'args_'+k: v for k, v in args.__dict__.items()})
        data.append(row)

    # remove unused baselines (sorry this is a bit ugly).
    ddd = pd.DataFrame(data)
    others = ddd[ddd.type != 'baseline']
    B = ddd[ddd.type == 'baseline']
    used = set()
    for _, z in others.iterrows():
        [ix] = B[B.policy == z.args_init_weights].index
        used.add(ix)
    B = B.ix[list(used)]
    ddd = others.append(B)
    return ddd
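# Minimal sketch of the "best-of-k" reduction above (toy data, added for
# illustration; written for modern pandas, where `pd.concat` replaces the
# deprecated `DataFrame.append`):
def _demo_best_of_k():
    import pandas as pd
    trials = [pd.DataFrame({'policy': ['p1', 'p1'],
                            'example': [0, 1],
                            'time_total': t})
              for t in ([1.2, 3.4], [1.0, 3.9])]    # two timing runs
    allruns = pd.concat(trials)
    # min over runs for each policy-example pair => best-of-2 runtimes
    best = allruns.groupby(['policy', 'example'])['time_total'].min()
    print(best)    # (p1, 0) -> 1.0, (p1, 1) -> 3.4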
def active_set(self):
    for outer in xrange(1, self.outer_iterations + 1):
        print
        print colors.green % '====================='
        print colors.green % 'Outer %s' % outer

        self.inner_optimization(self.inner_iterations)

        if outer != self.outer_iterations:
            print
            print colors.yellow % 'Grow %s' % outer

            # old feature index
            old = {c: self.context_feature_id(c) for c in self.C}
            w = self.dense.w.copy()
            q = np.array(self.dense.q, copy=1)

            TEST_EXPECT = 0
            if TEST_EXPECT:
                # Record expectations under previous model. Technically,
                # this is observed-expected features.
                predictions = []
                for x in self.train:
                    S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                    # don't backprop thru scoring model because we don't change the parameters.
                    self.gradient(x.N, x.tags, S)
                    predictions.append({k: S.d_dense[i] for k, i in old.iteritems()})

            # "Grow" Z by extending active features with one more character.
            active = self.active_features()

            # Heuristic: Use an intelligent guess for 'new' q values in the
            # next iterations.
            #
            # This improves the active set's ability to monotonically improve
            # after growing. Otherwise, adagrad will update too aggressively
            # compared to the sensible alternative of starting at the last-seen
            # value (if possible) or at the fudge value.
            #
            # In other words, new features get huge learning rates compared
            # to existing ones. Features that used to exist also get pretty
            # big learning rates too. This is because adagrad learning rates
            # decrease quickly with time as they are 1/sqrt(sum-of-squares).
            #
            # I found that guessing the mean q works better than min or max.
            self.dense.w[:] = 0
            self.dense.q[:] = q.mean()

            # Grow active contexts to the right.
            cc = {p + (y,) for p in active for y in self.sigma}

            ####
            # Note that just because we extended a bunch of active elements
            # by all elements of sigma, this does not mean that we are
            # last-character closed.
            #
            # Feel free to check via the following (failing) assertion
            #
            #   assert set(prefix_closure(cc)) == set(last_char_sub_closure(self.sigma, prefix_closure(cc)))
            #
            # The reason is that some elements go to zero and, thus, get
            # pruned. This is the same reason why `active` is not
            # automatically prefix closed.
            ####

            # Is the grown set prefix closed by construction?
            #
            # No. The grown set is not prefix closed either, because it's
            # possible for a parent to be zero with nonzero children.
            #
            # Here is an assertion that will fail.
            #
            #   assert set(prefix_closure(cc)) == set(cc)
            #
            #cc = set(prefix_closure(cc))

            ####
            # XXX: In general, we probably do not want to do last-char-sub
            # closure. I've added it in because it seems to help us
            # more-closely preserve the distribution after manipulating the
            # active set.
            #cc = set(last_char_sub_closure(self.sigma, cc))

            # Filter active set by allowed-context constraints, if supplied.
            if self.allowed_contexts:
                cc &= set(self.allowed_contexts)

            # Update DFA and group lasso data structures.
            self.update(self.sigma, cc)
            self.dense.set_groups(self.group_structure())

            print colors.yellow % '=> new', '|C| = %s' % len(self.C)

            # Copy previous weights
            for c in self.C:
                i = self.context_feature_id(c)
                if c in old:
                    o = old[c]
                    self.dense.w[i] = w[o]
                    self.dense.q[i] = q[o]

            if 0:
                print
                print colors.light_red % 'is accuracy the same???????'
                self.after_inner_pass()
                print colors.light_red % '^^^^^^^^^^^^^^^^^^^^^^^^^^^'
                print

            if TEST_EXPECT:
                # DEBUGGING: check that expectations match
                #
                # I'm not sure this test is implemented perfectly because we
                # need to compute the expected value of all the old features
                # under the new model.
                #
                # We get away with using the new model because it has backoff
                # features.
                #
                # In the case of a unigram model (order-0 model), this test
                # fails. Why? Are the unigrams used incorrectly?
                #
                new = {c: self.context_feature_id(c) for c in self.C}
                for x, want in zip(self.train, predictions):
                    S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                    # don't backprop thru scoring model because we don't change the parameters.
                    self.gradient(x.N, x.tags, S)
                    # just check on *old* features.
                    E = {k: 0 for k in want}
                    E.update({k: S.d_dense[new[k]] for k in want if k in new})
                    # XXX: filter down to features in both vectors, I guess?
                    E = {k: v for k, v in E.iteritems() if k in new}
                    want = {k: v for k, v in want.iteritems() if k in new}
                    c = compare(want, E, verbose=1)
                    if c.cosine < .99:
                        c.show()
def __test_gradient(example, grammar, m, gamma):
    """
    Finite-difference test for gradient of numerator and denominator.
    """
    assert gamma == 1, 'gamma = %g no longer supported' % gamma
    M = m*1.0
    steps = 2
    # print 'steps = %s' % steps
    # print colors.cyan % '>>> roll-in'

    test_grad = 1
    # test_onebest = 0
    test_linearity = 0
    # test_viterbi = 0   # test that hg-viterbi alg matches cky
    # show_items = 0
    # get_marginals = 0
    # if show_items:
    #     get_marginals = 1
    # if test_viterbi:
    #     compare_hypergraph_to_cky(grammar, example, m, steps)

    f_c = InsideOut(example, grammar, M*1.0, steps=steps, with_gradient=True)
    # f_c = InsideOut2(example, grammar, M*1.0, steps=steps, with_gradient=True, DEBUG=True, IOSPEEDUP=True)
    # f_c = InsideOut2(example, grammar, M*1.0, steps=steps, with_gradient=True, DEBUG=True, IOSPEEDUP=False)
    est = f_c.est

    # import ldp.dp.risk2
    # io2 = ldp.dp.risk2.InsideOut(example, grammar, M*1.0, steps=steps, with_gradient=True)
    # debug = 0
    # from arsenal.math import assert_equal
    # zoo = []
    # for k,v in io2.est.items():
    #     assert_equal(v, est[k], name='est %s' % (k,), throw=0, tol=1e-4)
    #     zoo.append([v, est[k]])
    # assert_equal(f_c.val, io2.val, name='rollin', throw=0, tol=1e-4)
    # boo,bar = zip(*zoo)
    # compare(boo, bar, show_regression=1, name='compare to old version', scatter=1)
    # pl.show()
    # return
    # from arsenal.debug import ip; ip()

    # if test_onebest:
    #     for k,v in f_c.marginals().iteritems():
    #         assert 0 <= v <= 1.000001, [k,v]
    #         if 0.05 <= v <= 0.95:   # not entirely saturated
    #             print 'tie (rollin):', nice(grammar, k), v

    # initial roll-in
    # if test_onebest:
    #     rollout(grammar, example, m)

    old_mask = M.copy()
    del m, M

    data = []
    # for x in iterview(example.nodes, msg='fd'):
    for x in example.nodes:
        d = {'span': x, 'action': 'prune' if old_mask[x] else 'unprune'}

        # print '--------------------------------'
        # print colors.cyan % '>>>', d['action'], x

        ad_num = f_c.A2_r[x[0], x[1]]
        ad_den = f_c.A2_p[x[0], x[1]]

        was = old_mask[x]
        new_mask = old_mask*1
        new_mask[x] = 1-was

        tie = False

        # if test_viterbi:
        #     compare_hypergraph_to_cky(grammar, example, new_mask, steps)

        if test_grad:
            S = InsideOut(example, grammar, new_mask*1.0, steps=steps, with_gradient=0)
            #S = InsideOut2(example, grammar, new_mask*1.0, steps=steps, with_gradient=True, DEBUG=True, IOSPEEDUP=True)

            # Note: Since the function is multi-linear (for gamma=1) we can
            # extrapolate as far as we'd like. Thus, we take the full step
            # (change 0->1 and 1->0).
            if was == 1:
                f_m = S
                f_p = f_c
            else:
                f_p = S
                f_m = f_c

            surrogate = S.val

            # if show_items:
            #     dump_items(S)
            # if get_marginals:
            #     for k,v in S.marginals().iteritems():
            #         assert 0 <= v <= 1.000001, [k,v]
            #         if 0.05 <= v <= 0.95:   # not entirely saturated
            #             print 'tie (unprune, %s):' % (x,), nice(grammar, k), v
            #             tie = True

            fd_num = (f_p.num - f_m.num)
            fd_den = (f_p.den - f_m.den)

            if 0:
                assert fd_num >= 0 and fd_den >= 0, [fd_num, fd_den]
                assert ad_num >= 0 and ad_den >= 0, [ad_num, ad_den]
                # Yup, gradient should always be positive! Why? Well, for
                # unnormalized risk and Z it's always beneficial to increase the
                # score of an edge -- there is no competition among edges (until
                # there is normalization; the gradient of normalized risk would
                # have variation in sign). Thus, the gradient is always
                # positive.
                assert f_c.A2_p[x[0], x[1]] >= 0
                assert f_c.A2_r[x[0], x[1]] >= 0

            d.update({'surrogate': surrogate, 'fd_num': fd_num, 'fd_den': fd_den})

        if test_linearity:
            cross_section(example, grammar, steps, old_mask*1.0, x)

            # NOTE: this test fails when gamma != 1.
            mid_mask = old_mask*1.0
            mid_mask[x] = 0.5
            mid = InsideOut(example, grammar, mid_mask, steps=steps, with_gradient=False)

            # Multilinearity check. Check that three points form a line.
            #assert abs(mid.den - (0.5 * (f_p.den - f_m.den) / 1 + f_m.den)) < 1e-8
            #assert abs(mid.num - (0.5 * (f_p.num - f_m.num) / 1 + f_m.num)) < 1e-8
            assert abs(mid.den - 0.5 * f_p.den) < 1e-8
            assert abs(mid.num - 0.5 * f_p.num) < 1e-8

        show = False

        # if test_onebest:
        #     # one-best rollout
        #     r1 = rollout(grammar, example, new_mask)
        #     d['onebest'] = r1
        #     if abs(surrogate - r1) > 0.0001:
        #         print colors.red % '** error **', \
        #             'surrogate does not equal onebest'
        #         show = True

        estimate = est[x]
        d.update({'estimate': estimate,
                  'ad_num': ad_num,
                  'ad_den': ad_den,
                  'rel-error': relative_error(surrogate, estimate),
                  'abs-error': abs(surrogate-estimate),
                  'tie': tie})

        if abs(f_c.val - surrogate) > 0.001:   # need a big enough change.
            if surrogate < f_c.val:
                d['delta_type'] = 'decr'
            else:
                d['delta_type'] = 'incr'
        else:
            d['delta_type'] = 'same'

        is_error = abs(surrogate - estimate) > 0.001 or not np.isfinite(estimate)
        if is_error:
            print "%s: estimate doesn't match surrogate" % (colors.red % 'error')
            show = True

        # Taylor expansion should match brute-force method.
        #
        # Note: we're not comparing directly to onebest, since it'll just
        # result in confusion.
        Errors.data.append({'action': d['action'],
                            'delta': d['delta_type'],
                            #'zero': float(d['surrogate']==0),
                            'n_error': int(is_error),
                            'tie': tie})

        if show:
            for k, v in sorted(d.items()):
                if isinstance(v, float):
                    print '%30s: %g' % (k, v)
                else:
                    print '%30s: %s' % (k, v)
            #foobar(f_c)

        data.append(d)

    df = DataFrame(data)
    # print df
    if df.empty:
        print '** dataframe empty **'
        return

    Errors.show()

    if 0:
        #scale = 1
        scale = max(np.abs(df.fd_den).max(), np.abs(df.ad_den).max()) or 1
        compare(df.fd_den/scale, df.ad_den/scale, alphabet=example.nodes,
                show_regression=1, scatter=1, name='test_grad denominator')
        #scale = 1
        scale = max(np.abs(df.fd_num).max(), np.abs(df.ad_num).max()) or 1
        compare(df.fd_num/scale, df.ad_num/scale, scatter=1, show_regression=1,
                alphabet=example.nodes, name='test_grad numerator')

    # if test_onebest:
    #     compare(df.onebest,
    #             df.estimate,
    #             alphabet=example.nodes,
    #             show_regression=1,
    #             scatter=1,
    #             name='onebest v. estimate')

    if 0:
        compare(df.surrogate,
                df.estimate,
                alphabet=example.nodes,
                show_regression=1,
                scatter=1,
                name='surrogate v. estimate')

    # if 1:
    #     if test_grad and test_onebest:
    #         compare(df.onebest, df.surrogate,
    #                 alphabet=example.nodes,
    #                 show_regression=1, scatter=1,
    #                 name='onebest v. surrogate')

    # goal = {d['span']: d['surrogate'] for d in data}

    if 0:
        pl.ioff()
        pl.show()
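# The multilinearity property exploited above, in miniature (toy function,
# added for illustration): if f is linear in each mask coordinate separately,
# the full-step difference f(..,1,..) - f(..,0,..) *is* the exact partial
# derivative, and the value at mask = 0.5 sits on the line between the two
# endpoints -- which is exactly what the `test_linearity` assertions check.
def _demo_multilinear():
    import numpy as np
    a, b, c = np.random.randn(3)
    f = lambda m1, m2: a*m1*m2 + b*m1 + c          # multilinear in (m1, m2)
    full_step = f(1.0, 1.0) - f(0.0, 1.0)          # full-step difference
    assert abs(full_step - (a + b)) < 1e-12        # equals df/dm1 at m2 = 1
    mid = f(0.5, 1.0)                              # midpoint of the segment
    assert abs(mid - 0.5*(f(1.0, 1.0) + f(0.0, 1.0))) < 1e-12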