def datawise_optimize(current_sample, data, steps=1000000, inner_steps=10, data_weight=1.0, ll_temperature=1.0, **kwargs):
    """
    Cycle through the data points, taking a few steps in the direction of each data point.

    This uses ll_temperature to simulate having len(data)*data_weight data points.

    steps -- take this many total steps (steps/inner_steps inner loops)
    inner_steps -- how many steps to take on a single data point
    data_weight -- weight each single data point as len(data)*this
    """
    # How many data points? Used for setting the temperature below
    NDATA = len(data)

    for mhi in lot_iter(xrange(steps/inner_steps)):
        for di in lot_iter(data):
            for h in mh_sample(current_sample, [di], steps=inner_steps, ll_temperature=ll_temperature/(NDATA*data_weight), **kwargs):
                current_sample = h
                yield h
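# A hedged usage sketch for datawise_optimize: make_h0() and generate_data()
# are assumptions borrowed from the other snippets here, not part of this
# function. With data_weight=2.0, each single data point is scored as if it
# were 2*len(data) points, via the lowered ll_temperature.
data = generate_data(100)
for h in datawise_optimize(make_h0(), data, steps=10000, inner_steps=10, data_weight=2.0):
    print h.posterior_score, h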
def test_lp_regenerate_propose_to(self):
    # import the grammar
    from LOTlibTest.Grammars import lp_regenerate_propose_to_grammar
    self.G = lp_regenerate_propose_to_grammar.g
    # the RegenerationProposal class
    rp = RegenerationProposal(self.G)
    numTests = 100
    # Sample numTests trees from the grammar, and run a chi-squared test on the proposals for each of them
    for i in lot_iter(range(numTests)):
        # keep track of expected and actual counts
        # expected_counts = defaultdict(int) # a dictionary whose keys are trees and values are the expected number of times we should be proposing to this tree
        actual_counts = defaultdict(int) # same as expected_counts, but stores the actual number of times we proposed to a given tree
        tree = self.G.generate('START')

        # Regenerate some number of trees at random
        numTrees = 1000
        for j in range(numTrees):
            newtree = rp.propose_tree(tree)[0]
            # trees.append(newtree)
            actual_counts[newtree] += 1

        # see if the frequency with which each category of trees is generated matches the
        # expected counts using a chi-squared test
        chisquared, p = self.get_pvalue(tree, actual_counts, numTrees)
        # print chisquared, p

        # if p > 0.01/numTests, the test passes
        self.assertTrue(p > 0.01/numTests, "Trees are not being generated according to the expected log probabilities")

        if i % 10 == 0 and i != 0:
            print i, "lp_regenerate_propose_to tests..."
    print numTests, "lp_regenerate_propose_to tests..."
def generate_unique_trees(grammar, start='START', N=1000):
    """
    Yield the unique trees produced by the grammar, attempting up to N generations.
    """
    seen = set()
    for _ in lot_iter(xrange(N)):
        t = grammar.generate(start)
        if t not in seen:  # only yield trees we have not seen before, as the name promises
            seen.add(t)
            yield t
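# A tiny usage sketch (grammar is an assumption, defined elsewhere as in the
# other snippets): print each distinct tree found in up to 1000 generations,
# along with its log probability.
for t in generate_unique_trees(grammar, start='START', N=1000):
    print t.log_probability(), t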
def next(self):
    if LOTlib.SIG_INTERRUPTED or self.samples_yielded >= self.steps:
        raise StopIteration
    else:
        for _ in lot_iter(xrange(self.skip+1)):

            self.proposal, fb = self.proposer(self.current_sample)

            # either compute this, or use the memoized version
            np, nl = self.compute_posterior(self.proposal, self.data)
            # print np, nl, current_sample.prior, current_sample.likelihood

            # NOTE: It is important that we re-compute from the temperatures, since these may be
            # altered externally by ParallelTempering and others
            prop = (np/self.prior_temperature + nl/self.likelihood_temperature)
            cur = (self.current_sample.prior/self.prior_temperature + self.current_sample.likelihood/self.likelihood_temperature)

            if MH_acceptance(cur, prop, fb, acceptance_temperature=self.acceptance_temperature):
                self.current_sample = self.proposal
                self.was_accepted = True
                self.acceptance_count += 1
            else:
                self.was_accepted = False

            self.internal_sample(self.current_sample)
            self.proposal_count += 1

        if self.trace:
            print self.current_sample.posterior_score, self.current_sample.likelihood, self.current_sample.prior, qq(self.current_sample)

        self.samples_yielded += 1
        return self.current_sample
def plot_sampler(self, opath, sampler):
    """
    Plot the sampler, for cases with many zeros where chi-squared won't work well
    """
    cnt = Counter()
    for h in lot_iter(sampler):
        cnt[h.value] += 1

    Z = logsumexp([t.log_probability() for t in self.trees]) # renormalize to the trees in self.trees
    obsc = [cnt[t] for t in self.trees]
    expc = [exp(t.log_probability()-Z)*sum(obsc) for t in self.trees]

    for t, c, s in zip(self.trees, obsc, expc):
        print c, "\t", s, "\t", t

    expc, obsc, trees = zip(*sorted(zip(expc, obsc, self.trees), reverse=True))

    import matplotlib.pyplot as plt
    from numpy import log, arange

    plt.subplot(111)
    # Log rank here spaces things out at the high end, where we can see it!
    # (ranks are 1-based to avoid log(0) on the first point)
    ranks = log(arange(1, len(trees)+1))
    plt.scatter(ranks, expc, color="red", alpha=1.)
    plt.scatter(ranks, obsc, color="blue", marker="x", alpha=1.)
    plt.savefig(opath)
    plt.clf()
def run():
    # generate some data
    data = generate_data(target, NDATA, data_sd)

    h0 = MAPSymbolicRegressionHypothesis(grammar, args=['x']+CONSTANT_NAMES)
    h0.CONSTANT_VALUES = numpy.zeros(NCONSTANTS) ## TODO: Move this to an initializer

    from LOTlib.Inference.MetropolisHastings import MHSampler
    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP, trace=False)):
        print h.posterior_score, h.likelihood, h.prior, h.CONSTANT_VALUES, qq(h)
def run(): """ Standard run function.""" h0 = SchemeFunction(grammar, ALPHA=ALPHA) for x in lot_iter(MHSampler(h0, data, STEPS)): print x.posterior_score, x for di in data: print "\t", di.input, "->", x(*di.input), " ; should be ", di.output
def save_hypotheses(sampler, filename='numbergame_hypotheses.p'):
    hypotheses = set()
    for h in lot_iter(sampler):
        hypotheses.add(h)
    with open(filename, "wb") as f:  # 'with' ensures the file is closed after pickling
        pickle.dump(hypotheses, f)
    return hypotheses
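# A minimal usage sketch (make_h0 and data are assumptions from the
# surrounding snippets; the step count is arbitrary): pickle every distinct
# hypothesis an MH run visits.
from LOTlib.Inference.MetropolisHastings import MHSampler

hyps = save_hypotheses(MHSampler(make_h0(), data, steps=10000))
print len(hyps), "hypotheses saved"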
def run(llt=1.0):
    h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=llt)

    fbs = FiniteBestSet(N=10)
    from LOTlib.Inference.MetropolisHastings import mh_sample
    for h in lot_iter(mh_sample(h0, data, SAMPLES)):
        fbs.add(h, h.posterior_score)

    return fbs
def __call__(self, generator):
    """Pass this a generator; each element is added as it's yielded.

    This allows us to make a pipeline. See the example in the main docstring: '# Or as a generator...'.
    """
    if hasattr(generator, 'data'):
        self.data = generator.data
    for sample in lot_iter(generator):
        self.add(sample)
        yield sample
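# A hedged pipeline sketch: assuming __call__ above lives on a collector such
# as TopN (the import path is an assumption, as are make_h0/data), wrapping a
# sampler records each sample while still passing it downstream.
from LOTlib.TopN import TopN
from LOTlib.Inference.MetropolisHastings import MHSampler

topn = TopN(N=100)
for h in topn(MHSampler(make_h0(), data, steps=1000)):
    print h.posterior_score, h  # samples pass through; topn keeps the best 100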
def test_eq(self):
    counter = 0
    for i in lot_iter(xrange(10000)):
        x = self.G.generate()
        y = self.G.generate()

        if pystring(x) == pystring(y):
            counter += 1
            # print(counter)
            # print( pystring(x)+'\n'+ pystring(y)+'\n')

        self.assertEqual(pystring(x) == pystring(y), x == y, "Without bvs, trees should be equal exactly when their pystrings are equal")
def run():
    from LOTlib import lot_iter
    from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal
    # mp = MixtureProposal([RegenerationProposal(grammar), InsertDeleteProposal(grammar)])
    mp = RegenerationProposal(grammar)

    from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis
    # ALPHA here trades off with the amount of data. Currently assuming no noise, but that's not necessary.
    h0 = LOTHypothesis(grammar, args=['x', 'y'], ALPHA=0.999, proposal_function=mp)

    from LOTlib.Inference.MetropolisHastings import MHSampler
    for h in lot_iter(MHSampler(h0, data, skip=100)):
        print h.posterior_score, h.likelihood, h.prior, cleanFunctionNodeString(h)
def tempered_transitions_sample(inh, data, steps, proposer=None, skip=0, temperatures=[1.0, 1.05, 1.1], stats=None):
    current_sample = inh
    LT = len(temperatures)

    ## TODO: CHECK THIS--STILL NOT SURE THIS IS RIGHT

    # a helper function for tempered transitions -- one single MH step, returning a new sample
    # this allows different temperatures for top and bottom
    def tt_helper(xi, data, tnew, told, proposer):
        if proposer is None:
            xinew, fb = xi.propose()
        else:
            xinew, fb = proposer(xi)
        xinew.compute_posterior(data)
        r = (xinew.prior + xinew.likelihood)/tnew - (xi.prior + xi.likelihood)/told - fb
        if r > 0.0 or random() < exp(r):
            return xinew
        else:
            return xi

    for mhi in lot_iter(xrange(steps)):
        for skp in xrange(skip+1):
            xi = current_sample # do not need to copy this
            totlp = 0.0 # (xi.lp / temperatures[1]) - (xi.lp / temperatures[0])

            for i in xrange(0, LT-2): # go up
                xi = tt_helper(xi, data, temperatures[i+1], temperatures[i], proposer)
                totlp = totlp + (xi.prior + xi.likelihood)/temperatures[i+1] - (xi.prior + xi.likelihood)/temperatures[i]

            # do the top:
            xi = tt_helper(xi, data, temperatures[LT-1], temperatures[LT-1], proposer)

            for i in xrange(len(temperatures)-2, 0, -1): # go down
                xi = tt_helper(xi, data, temperatures[i], temperatures[i], proposer)
                totlp = totlp + (xi.prior + xi.likelihood)/temperatures[i] - (xi.prior + xi.likelihood)/temperatures[i+1]

            if random() < exp(totlp):
                current_sample = xi # copy this over

        yield current_sample
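# A hedged usage sketch for tempered_transitions_sample (make_h0/data are
# assumptions from the other snippets). The initial sample needs its posterior
# computed first, since the helper reads .prior and .likelihood on the current
# sample before any proposal is accepted.
h0 = make_h0()
h0.compute_posterior(data)
for h in lot_iter(tempered_transitions_sample(h0, data, steps=10000, temperatures=[1.0, 1.05, 1.1])):
    print h.posterior_score, h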
def generate_data(data_size):
    all_words = target.all_words()
    data = []
    for i in lot_iter(xrange(data_size)):
        # a context is a set of men, pirates, and everything. Functions are applied to this to get truth values
        context = sample_context()
        word = target.sample_utterance(all_words, context)
        data.append(UtteranceData(utterance=word, context=context, possible_utterances=all_words))
    return data
def prior_sample(h0, data, N):
    """
    Just use the grammar and returntype of h0 to sample from the prior.
    NOTE: Only implemented for LOTHypothesis
    """
    assert isinstance(h0, LOTHypothesis)

    # extract from the grammar
    grammar = h0.grammar
    rt = h0.value.returntype

    for i in lot_iter(xrange(N)):
        h = type(h0)(grammar, start=rt)
        h.compute_posterior(data)
        yield h
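# A small usage sketch (make_h0/data assumed as elsewhere in these snippets):
# prior samples give a cheap baseline to compare against the MH runs above.
for h in prior_sample(make_h0(), data, N=1000):
    print h.posterior_score, h.prior, h.likelihood, h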
def run(data_size):
    """
    Run on data_size amounts of data and return *all* hypotheses in the top options.TOP_COUNT.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time making data for everything that isn't run

    # initialize the data
    data = generate_data(data_size)

    # starting hypothesis -- here this generates at random
    h0 = Utilities.make_h0()

    hyps = TopN(N=options.TOP_COUNT)
    hyps.add(lot_iter(MHSampler(h0, data, options.STEPS, trace=False)))

    return hyps
def run_mh():
    """Run the vanilla MH sampler. Without steps, it will run infinitely.

    This prints out the posterior (posterior_score), prior, tree grammar probability, and likelihood.
    It yields data like below:

        -10.1447997767 -9.93962659915 -12.2377573418 -0.20517317755 'and_(not_(is_shape_(x, 'triangle')), not_(is_color_(x, 'blue')))'
        -11.9260879461 -8.77647578935 -12.2377573418 -3.14961215672 'and_(not_(is_shape_(x, 'triangle')), not_(is_shape_(x, 'triangle')))'

    """
    # Create an initial hypothesis. Here we use a RationalRulesLOTHypothesis, which
    # is defined in LOTlib.Hypotheses and wraps LOTHypothesis with the rational rules prior
    h0 = RationalRulesLOTHypothesis(grammar=DNF, rrAlpha=1.0)

    for h in lot_iter(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.value.log_probability(), h.likelihood, q(h)
def scheme_generate():
    """
    This generates random scheme code with cons, cdr, and car, and evaluates it on some simple
    list structures. No inference here -- just random sampling from a grammar.
    """
    ## Generate some and print out unique ones
    seen = set()
    for i in lot_iter(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
def evaluate_sampler(my_sampler, print_every=1000, out_hypotheses=sys.stdout, out_aggregate=sys.stdout, trace=False, prefix=""):
    """
    Print the stats for a single sampler run.

    *my_sampler* -- a generator of samples
    print_every -- display the output every this many steps
    out_hypotheses -- where we put hypothesis stats
    out_aggregate -- where we put aggregate stats
    trace -- print every sample
    prefix -- displayed before each line
    """
    visited_at = defaultdict(list)

    startt = time()
    for n, s in lot_iter(enumerate(my_sampler)): # each sample should have a .posterior_score defined
        if trace:
            print "#", n, s

        visited_at[s].append(n)

        if (n % print_every) == 0 and n > 0:
            post = sorted([x.posterior_score for x in visited_at.keys()], reverse=True) # the unnormalized posteriors of everything found
            ll = sorted([x.likelihood for x in visited_at.keys()], reverse=True)
            Z = logsumexp(post) # just compute total probability mass found -- the main measure

            out_aggregate.write('\t'.join(map(str, [prefix, n, r3(time()-startt), r5(Z), len(post)] + mydisplay(post))) + '\n')

    # Now once we're done, output the hypothesis stats
    for k, v in visited_at.items():
        mean_diff = "NA"
        if len(v) > 1:
            mean_diff = mean(diff(v))
        out_hypotheses.write('\t'.join(map(str, [prefix, k.posterior_score, k.prior, k.likelihood, len(v), min(v), max(v), mean_diff, sum(diff(v) == 0)])) + '\n') # the last column is the number of rejects from this hypothesis

    return 0.0
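# A hedged usage sketch: evaluate a vanilla MH run, writing aggregate stats to
# one file and per-hypothesis stats to another (make_h0/data are assumptions
# from the other snippets; the filenames are arbitrary).
from LOTlib.Inference.MetropolisHastings import MHSampler

with open("agg.txt", "w") as agg, open("hyps.txt", "w") as hyp_out:
    evaluate_sampler(MHSampler(make_h0(), data, steps=100000),
                     print_every=1000, out_hypotheses=hyp_out, out_aggregate=agg,
                     prefix="run0")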
def evaluate_sampler(self, sampler):
    cnt = Counter()
    for h in lot_iter(sampler):
        cnt[h.value] += 1

    ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
    # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

    Z = logsumexp([t.log_probability() for t in self.trees]) # renormalize to the trees in self.trees
    obsc = [cnt[t] for t in self.trees]
    expc = [exp(t.log_probability()-Z)*sum(obsc) for t in self.trees]

    csq, pv = chisquare(obsc, expc)
    assert abs(sum(obsc) - sum(expc)) < 0.01
    # assert min(expc) > 5 # or else chisq sux

    for t, c, s in zip(self.trees, obsc, expc):
        print c, s, t
    print (csq, pv), sum(obsc)

    self.assertGreater(pv, PVALUE, msg="Sampler failed chi squared!")
    return csq, pv
h0 = NumberExpression(grammar)
'''
from LOTlib.Inference.Proposals.InsertDeleteProposal import InsertDeleteProposal
h0 = NumberExpression(grammar, proposal_function=InsertDeleteProposal(grammar))
'''
# store hypotheses we've found
allhyp = TopN(N=1000)

# ========================================================================================================
# Run the standard RationalRules sampler

mh_sampler = MHSampler(h0, data, STEPS, skip=SKIP)

for h in lot_iter(mh_sampler):
    if TRACE:
        print q(get_knower_pattern(h)), h.posterior_score, h.compute_prior(), h.compute_likelihood(data), qq(h)

    # add h to our priority queue, with priority of its log probability, h.posterior_score
    allhyp.add(h)

# ========================================================================================================
# now re-evaluate everything we found on new data
'''
huge_data = generate_data(LARGE_DATA_SIZE)

# save this with a huge data set -- eval with average ll
H = allhyp.get_sorted()

# compute the posterior for each hypothesis
'''
import pickle

from LOTlib.FiniteBestSet import FiniteBestSet
from LOTlib.Inference.MetropolisHastings import MHSampler
from Model import *

NDATA = 50 # How many total data points?
NSTEPS = 10000
BEST_N = 100 # How many from each hypothesis to store
OUTFILE = "hypotheses.pkl"

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = FiniteBestSet()

if __name__ == "__main__":

    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TARGET_CONCEPTS):

        # Set up the hypothesis
        h0 = LOTHypothesis(grammar, start='START', args=['x'])

        # Set up some data
        data = generate_data(NDATA, f)

        # Now run some MCMC
        fs = FiniteBestSet(N=BEST_N, key="posterior_score")
        fs.add(lot_iter(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.merge(fs)

    pickle.dump(all_hypotheses, open(OUTFILE, 'wb'))  # binary mode for pickle
# -*- coding: utf-8 -*-
"""
A simple symbolic regression demo
"""
from LOTlib import lot_iter
from LOTlib.Hypotheses.GaussianLOTHypothesis import GaussianLOTHypothesis
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import qq
from LOTlib.Examples.SymbolicRegression.Grammar import grammar
from Data import generate_data

CHAINS = 4
STEPS = 50000
SKIP = 0

if __name__ == "__main__":

    print grammar

    # generate some data
    data = generate_data(50) # how many data points?

    # starting hypothesis -- here this generates at random
    h0 = GaussianLOTHypothesis(grammar)

    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP)):
        print h.posterior_score, qq(h)
for h in sorted(H, key=lambda h: h.posterior_score):
    print h.posterior_score, h.prior, h.likelihood, h.likelihood_temperature
    print h

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Play around with some different inference schemes
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
#for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
    #print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))

from LOTlib.Inference.IncreaseTemperatureMH import increase_temperature_mh_sample
h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
for i, h in lot_iter(enumerate(increase_temperature_mh_sample(h0, data, 400000000, skip=0, increase_amount=1.50))):
    print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Run on a single computer, printing out
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#fbs = FiniteBestSet(N=100)
#h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.051)
#for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
    #fbs.add(h, h.posterior_score)
    #if i%100==0:
        #print h.posterior_score, h.prior, h.likelihood #, re.sub(r"\n", ";", str(h))
        #print h
# -*- coding: utf-8 -*-

"""
A demo of "syntax" learning using a SimpleGenerativeHypothesis.

This searches over probabilistic generating functions, running them forward to estimate
the likelihood of the data. Very very simple.
"""
from LOTlib import lot_iter
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Hypotheses.SimpleGenerativeHypothesis import SimpleGenerativeHypothesis
from Model import *

if __name__ == "__main__":
    # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    h0 = SimpleGenerativeHypothesis(grammar, args=[''])

    ## populate the finite sample by running the sampler for this many steps
    for h in lot_iter(MHSampler(h0, data, 100000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, h
        print h.llcounts
class ParticleSwarmPriorResample(ParticleSwarm):
    """
    Like ParticleSwarm, but resamples from the prior.
    """
    def refresh(self):
        """
        Resample by resampling those below the median from the prior.
        """
        m = median(self.chainZ)

        for i in range(self.nchains):
            if self.chainZ[i] < m:
                self.chains[i] = self.make_h0(**self.kwargs)
                self.chainZ[i] = -Infinity # reset this

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib.Examples.Number.Global import generate_data, grammar, make_h0

    data = generate_data(300)
    ps = ParticleSwarm(make_h0, data)
    for h in lot_iter(ps):
        print h.posterior_score, h

        if len(ps.seen) > 0:
            print "#", sorted(ps.seen, key=lambda x: x.posterior_score, reverse=True)[0]
## TODO: Vary resample_p to make sure that works here!

from LOTlib.Grammar import Grammar

grammar = Grammar()

grammar.add_rule('START', '', ['A'], 1.0)

grammar.add_rule('A', 'A', ['A', 'A'], 0.2)
grammar.add_rule('A', 'A', ['a'], 0.7)

grammar.add_rule('A', 'apply_', ['L', 'A'], 0.10)
grammar.add_rule('L', 'lambda', ['A'], 0.11, bv_p=0.07, bv_type='A')

grammar.add_rule('A', 'apply_', ['LF', 'A'], 0.10)
grammar.add_rule('LF', 'lambda', ['A'], 0.11, bv_p=0.07, bv_type='A', bv_args=['A'], bv_prefix='F')

## NOTE: DOES NOT HANDLE THE CASE WITH TWO A->APPLY, L->LAMBDAS

if __name__ == "__main__":

    from LOTlib import lot_iter

    for t in lot_iter(grammar.enumerate()):
        print t
# the priors cancel, so this represents the posterior
cur  = self.ll_at_temperature(i, self.chains[i].likelihood_temperature)   + self.ll_at_temperature(i+1, self.chains[i+1].likelihood_temperature)
prop = self.ll_at_temperature(i, self.chains[i+1].likelihood_temperature) + self.ll_at_temperature(i+1, self.chains[i].likelihood_temperature)

if MH_acceptance(cur, prop, 0.0):
    tmp = self.chains[i].current_sample
    self.chains[i].set_state(self.chains[i+1].current_sample, False)
    self.chains[i+1].set_state(tmp, False)
    # OLD: self.chains[i].current_sample, self.chains[i+1].current_sample = self.chains[i+1].current_sample, self.chains[i].current_sample

if self.yield_only_t0 and self.chain_idx != 0:
    return self.next() # keep going until we're on the one we yield ## TODO: FIX THIS SINCE IT WILL BREAK FOR HUGE NUMBERS OF CHAINS
else:
    return self.chains[self.chain_idx].next()


if __name__ == "__main__":

    from LOTlib import lot_iter
    from LOTlib.Miscellaneous import Infinity
    from LOTlib.Examples.Number.Model import generate_data, NumberExpression, grammar

    data = generate_data(300)
    make_h0 = lambda: NumberExpression(grammar)

    for h in lot_iter(ParallelTemperingSampler(make_h0, data, steps=Infinity, yield_only_t0=True)):
        print h.posterior_score, h
# initialize each chain
MultipleChainMCMC.__init__(self, lambda: None, data, steps=steps, nchains=len(partitions), **kwargs)

# And set each to the partition
for c, p in zip(self.chains, partitions):
    c.set_state(make_h0(value=p))

# and store these
self.partitions = map(copy, partitions)


if __name__ == "__main__":

    from LOTlib.Examples.Number.Model.Utilities import grammar, make_h0, generate_data
    data = generate_data(300)

    #from LOTlib.Examples.RegularExpression.Shared import grammar, make_h0, data
    #from LOTlib.Examples.RationalRules.Shared import grammar, data, make_h0

    #PartitionMCMC(grammar, make_h0, data, 2, skip=0)
    for h in lot_iter(PartitionMCMC(grammar, make_h0, data, max_N=10, skip=0)):
        print h.posterior_score, h
def print_subtree_adaptations(hypotheses, posteriors, subtrees, relative_KL=True):
    """
    Determine how useful it would be to explicitly define each subtree in H across all of the
    (corresponding) posteriors, as measured by KL from prior to posterior.

    - hypotheses - a list of LOTHypotheses
    - posteriors - [ [P(h|data) for h in hypotheses] x problems ]
    - subtrees - a collection of (possibly partial) subtrees to try adapting
    - relative_KL - compute the summed KL divergence absolutely, or relative to h.compute_prior()?

    We treat hypotheses as a fixed finite hypothesis space, and assume every subtree considered
    is *not* derived compositionally (although this could change in future variants).
    """

    # compute the normalized posteriors
    Ps = map(lognormalize, posteriors)

    # Compute the baseline KL divergence so we can score relative to this
    if relative_KL:
        oldpriors = lognormalize(numpy.array([h.compute_prior() for h in hypotheses]))
        KL0s = [sum(exp(oldpriors)*(oldpriors-P)) for P in Ps]
    else:
        KL0s = [1.0 for P in Ps] # pretend everything just had KL of 1, so we score relatively

    ## Now process each, starting with the most simple
    for t in lot_iter(sorted(subtrees, key=lambda t: t.log_probability(), reverse=True)):

        # Get some stats on t:
        tlp = t.log_probability()
        tnt = count_identical_nonterminals(t.returntype, t) # How many times is this nonterminal used?

        # How many matches of t are there in each H?
        m = numpy.array([count_subtree_matches(t, h.value) for h in hypotheses])
        ## TODO: There is a complication: partial patterns matching themselves.
        ##       For simplicity, we'll just take the *first* match, setting max(m)=1.
        ##       In the future, we should change this to correctly handle and count
        ##       partial matches matching themselves
        m = (m >= 1)*1
        assert max(m) == 1, "Error: " + str(t) + "\t" + str(m)

        # How many times is the nonterminal used, NOT counting in t?
        nt = numpy.array([count_identical_nonterminals(t.returntype, h.value) for h in hypotheses]) - (tnt-1)*m
        assert min(nt) >= 0, "Error: " + str(t)

        # And the PCFG prior *not* counting t
        q = lognormalize(numpy.array([h.value.log_probability() for h in hypotheses]) - tlp*m)

        # The function to optimize: p is the probability of the new rule for t,
        # and we score the prior that results from defining t with probability p
        def fnc(p):
            if p <= 0. or p >= 1.:
                return float("inf") # enforce bounds

            newprior = lognormalize(q + log(p)*m + log(1.-p)*nt)

            kl = 0.0
            for P, kl0 in zip(Ps, KL0s):
                kl += sum(numpy.exp(newprior)*(newprior-P)) / kl0

            return kl

        ### TODO: This optimization should be analytically tractable...
        ###       but we need to check that it is convex! Any ideas?
        o = scipy.optimize.fmin(fnc, numpy.array([0.1]), xtol=0.0001, ftol=0.0001, disp=0)

        print fnc(o[0]), o[0], log(o[0]), t.log_probability(), qq(t)
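# A hedged call sketch for print_subtree_adaptations: all setup names here
# (make_h0, datasets) are assumptions. posteriors is one vector of posterior
# scores per problem, aligned with hypotheses; candidate subtrees are drawn
# from the hypotheses themselves (FunctionNode iteration over subnodes is
# assumed, as used elsewhere in LOTlib).
hypotheses = [make_h0() for _ in xrange(100)]
posteriors = []
for d in datasets:  # datasets: one data list per problem (assumption)
    for h in hypotheses:
        h.compute_posterior(d)
    posteriors.append([h.posterior_score for h in hypotheses])

subtrees = set()
for h in hypotheses:
    for st in h.value:  # iterate subtrees of each hypothesis
        subtrees.add(st)

print_subtree_adaptations(hypotheses, posteriors, subtrees)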
grammar.add_rule('LAMBDA_WORD', 'lambda', ['WORD'], 1.0, bv_type='WORD')
grammar.add_rule('WORD', 'apply_', ['LAMBDA_WORD', 'WORD'], 1.0)

p = InverseInlineProposal(grammar)

"""
# Just look at some proposals
for _ in xrange(200):
    t = grammar.generate()
    print ">>", t
    #assert t.check_generation_probabilities(grammar)
    #assert t.check_parent_refs()

    for _ in xrange(10):
        t = p.propose_tree(t)[0]
        print "\t", t
"""

# Run MCMC -- more informative about f-b errors
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Inference.Proposals.MixtureProposal import MixtureProposal
from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal

h = make_h0(proposal_function=MixtureProposal([InverseInlineProposal(grammar), RegenerationProposal(grammar)]))
data = generate_data(100)
for h in lot_iter(MHSampler(h, data)):
    print h.posterior_score, h.prior, h.likelihood, get_knower_pattern(h), h
""" Define a new kind of LOTHypothesis, that gives regex strings. These have a special interpretation function that compiles differently than straight python eval. """ from LOTlib import lot_iter from LOTlib.Inference.MetropolisHastings import MHSampler from LOTlib.Miscellaneous import qq from Model import * if __name__ == "__main__": for h in lot_iter(MHSampler(make_h0(), data, steps=10000)): print h.posterior_score, h.prior, h.likelihood, qq(h)
""" Define a new kind of LOTHypothesis, that gives regex strings. These have a special interpretation function that compiles differently than straight python eval. """ from LOTlib import lot_iter from LOTlib.Inference.MetropolisHastings import MHSampler from LOTlib.Miscellaneous import qq from Shared import data, make_h0 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == "__main__": for h in lot_iter(MHSampler(make_h0(), data, steps=10000)): print h.posterior_score, h.prior, h.likelihood, qq(h)