def runparts(x, datamt):
    # problem: right now only recording last partition, never saving from others.
    print "Start: " + str(x) + " on this many: " + str(datamt)

    messup = TopN(options.top)

    try:
        # make new TopN for each data amount
        topn = TopN(N=200, key="posterior_score")

        for p in break_ctrlc(partitions):
            print "Starting on partition ", p

            # Now we have to go in and fill in the nodes that are nonterminals
            # We can do this with generate
            v = grammar.generate(deepcopy(p))
            h0 = MyHypothesis(grammar, value=v)

            size = datamt
            data = [FunctionData(input=[], output={
                'n i k': size, 'h i N': size, 'f a n': size, 'g i f': size, 'm a N': size, 'f a m': size,
                'g i k': size, 'k a n': size, 'f a f': size, 'g i n': size, 'g i m': size, 'g i s': size,
                's i f': size, 's i n': size, 'n i s': size, 's i m': size, 's i k': size, 'h a N': size,
                'f i N': size, 'h i m': size, 'h i n': size, 'h a m': size, 'n i N': size, 'h i k': size,
                'f a s': size, 'f i n': size, 'h i f': size, 'n i m': size, 'g i N': size, 'h a g': size,
                's i N': size, 'n i n': size, 'f i m': size, 's i s': size, 'h i s': size, 'n a s': size,
                'k a s': size, 'f i s': size, 'n i f': size, 'm i n': size, 's a s': size, 'f a g': size,
                'k a g': size, 'k a f': size, 's a m': size, 'n a f': size, 'n a g': size, 'm i N': size,
                's a g': size, 'f i k': size, 'k a m': size, 'n a n': size, 's a f': size, 'n a m': size,
                'm a s': size, 'h a f': size, 'h a s': size, 'n a N': size, 'm i s': size, 's a n': size,
                's a N': size, 'm i k': size, 'f a N': size, 'm i m': size, 'm a g': size, 'm a f': size,
                'f i f': size, 'k a N': size, 'h a n': size, 'm a n': size, 'm a m': size, 'm i f': size})]

            for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, trace=False)):
                topn.add(h)

        return set(topn)

    except:
        # if we fail, we can return a blank TopN
        return messup
def generate_unique_trees(grammar, start='START', N=1000):
    """ Yield a bunch of unique trees, produced from the grammar """
    seen = set()
    for _ in break_ctrlc(xrange(N)):
        t = grammar.generate(start)
        if t not in seen:  # only yield trees we have not generated before
            seen.add(t)
            yield t
def runTest(self):
    NSAMPLES = 10000

    from LOTlib.DefaultGrammars import finiteTestGrammar as grammar
    from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis

    class MyH(LOTHypothesis):
        @attrmem('likelihood')
        def compute_likelihood(self, *args, **kwargs):
            return 0.0

        @attrmem('prior')
        def compute_prior(self):
            return grammar.log_probability(self.value)

    print "# Taking MHSampler for a test run"
    cnt = Counter()
    h0 = MyH(grammar=grammar)
    for h in break_ctrlc(MHSampler(h0, [], steps=NSAMPLES, skip=10)):  # huh the skip here seems to be important
        cnt[h] += 1
    trees = list(cnt.keys())
    print "# Done taking MHSampler for a test run"

    ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
    # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

    Z = logsumexp([grammar.log_probability(t.value) for t in trees])  # renormalize to the trees we found
    obsc = [cnt[t] for t in trees]
    expc = [exp(grammar.log_probability(t.value) - Z) * sum(obsc) for t in trees]

    # And plot here
    expc, obsc, trees = zip(*sorted(zip(expc, obsc, trees), reverse=True))

    import matplotlib.pyplot as plt
    plt.subplot(111)
    # Log here spaces things out at the high end, where we can see it!
    plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
    plt.scatter(log(range(len(trees))), obsc, color="blue", marker="x", alpha=1.)
    plt.savefig('finite-sampler-test.pdf')
    plt.clf()

    # Do chi squared test
    csq, pv = chisquare(obsc, expc)
    self.assertAlmostEqual(sum(obsc), sum(expc))

    # And examine
    for t, c, s in zip(trees, obsc, expc):
        print c, s, t
    print (csq, pv), sum(obsc)

    self.assertGreater(pv, 0.01, msg="Sampler failed chi squared!")
def scheme_generate():
    """
    This generates random scheme code with cons, cdr, and car, and evaluates it on some simple
    list structures. No inference here -- just random sampling from a grammar.
    """
    example_input = [[], [[]], [[], []], [[[]]]]

    ## Generate some and print out unique ones
    seen = set()
    for i in break_ctrlc(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
def test_refs(self):
    """ Test the setting of parents and rules """
    for _ in break_ctrlc(xrange(1000)):
        x = self.G.generate()
        self.assertTrue(x.check_parent_refs())
def standard_sample(make_hypothesis, make_data, skip=9, show=True, N=100,
                    save_top='top.pkl', alsoprint='None', **kwargs):
    """
    Just a simplified interface for sampling, allowing printing (showing), returning the top, and saving.
    This is used by many examples, and is meant to easily allow running with a variety of parameters.

    NOTE: This skip is a skip *only* on printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, **kwargs))):
        best_hypotheses.add(h)

        if show and i % (skip+1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
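# A minimal usage sketch for standard_sample (illustrative, not part of the library).
# It assumes an example module exposing make_hypothesis/make_data, following the convention
# used throughout the LOTlib examples; extra keyword arguments (e.g. steps) pass through
# to MHSampler.
from LOTlib.Examples.Number.Model import make_hypothesis, make_data

# Keep the 50 best hypotheses from a 10000-step chain, printing every 10th sample.
top = standard_sample(make_hypothesis,
                      lambda: make_data(300),  # data gets built inside standard_sample
                      N=50, skip=9, steps=10000,
                      save_top=None)           # skip pickling for a quick interactive run

for h in top.get_all():
    print h.posterior_score, h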
def plot_sampler(self, opath, sampler):
    """ Plot the sampler, for cases with many zeros where chisquared won't work well """
    cnt = Counter()
    for h in break_ctrlc(sampler):
        cnt[h.value] += 1

    Z = logsumexp([self.grammar.log_probability(t) for t in self.trees])  # renormalize to the trees in self.trees
    obsc = [cnt[t] for t in self.trees]
    expc = [exp(self.grammar.log_probability(t) - Z) * sum(obsc) for t in self.trees]

    for t, c, s in zip(self.trees, obsc, expc):
        print c, "\t", s, "\t", t

    expc, obsc, trees = zip(*sorted(zip(expc, obsc, self.trees), reverse=True))

    import matplotlib.pyplot as plt
    from numpy import log
    plt.subplot(111)
    # Log here spaces things out at the high end, where we can see it!
    plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
    plt.scatter(log(range(len(trees))), obsc, color="blue", marker="x", alpha=1.)
    plt.savefig(opath)
    plt.clf()
def test_check_generation_probabilities(self):
    """ Test the generation probabilities """
    for _ in break_ctrlc(xrange(1000)):
        x = self.G.generate()
        self.assertTrue(x.check_generation_probabilities(self.G))
def test_lp_regenerate_propose_to(self):
    # import the grammar
    from Grammars import lp_regenerate_propose_to_grammar
    self.G = lp_regenerate_propose_to_grammar.g

    # the RegenerationProposal class
    rp = RegenerationProposal(self.G)

    numTests = 100
    # Sample numTests trees from the grammar, and run a chi-squared test on the proposals from each of them
    for i in break_ctrlc(range(numTests)):

        # keep track of expected and actual counts
        # expected_counts = defaultdict(int)  # a dictionary whose keys are trees and values are the expected number of times we should be proposing to this tree
        actual_counts = defaultdict(int)  # same as expected_counts, but stores the actual number of times we proposed to a given tree
        tree = self.G.generate('START')

        # Regenerate some number of trees at random
        numTrees = 1000
        for j in range(numTrees):
            newtree = rp.propose_tree(tree)[0]
            # trees.append(newtree)
            actual_counts[newtree] += 1

        # see if the frequency with which each category of trees is generated matches the
        # expected counts using a chi-squared test
        chisquared, p = self.get_pvalue(tree, actual_counts, numTrees)
        # print chisquared, p

        # if p > 0.01/numTests, the test passes
        self.assertTrue(p > 0.01/numTests,
                        "Trees are not being generated according to the expected log probabilities")

        if i % 10 == 0 and i != 0:
            print i, "lp_regenerate_propose_to tests..."

    print numTests, "lp_regenerate_propose_to tests..."
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeVariableLexiconData(eval(options.word), options.word, the_context,
                                   n=data_amount, s=options.s, alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(words=[options.word], alpha=options.alpha)
    h0.set_word(options.word, LOTHypothesis(grammar, value=None, display='lambda recurse_, C, X:%s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        if samples_yielded % 1000 == 0:
            print h.prior, h.likelihood, h
        hyps.add(h)

    return hyps
def save_hypotheses(sampler, filename='numbergame_hypotheses.p'):
    hypotheses = set()
    for h in break_ctrlc(sampler):
        hypotheses.add(h)

    with open(filename, "wb") as f:  # use a with-block so the file is closed
        pickle.dump(hypotheses, f)

    return hypotheses
def partitionMCMC(data, partitions):
    print data
    topn = TopN(N=200, key="posterior_score")

    for p in break_ctrlc(partitions):
        print "Starting on partition ", p

        # Now we have to go in and fill in the nodes that are nonterminals
        v = grammar.generate(deepcopy(p))

        # h0 = MyHypothesis(grammar, value=v)
        h0 = make_hypothesis()
        print h0

        for h in break_ctrlc(MHSampler(h0, data, steps=5000, skip=0)):
            # Show the partition and the hypothesis
            print h.posterior_score, p, h, howyoudoin(h)
            topn.add(h)

    return set(topn)
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar, alphabet_size=len(language.terminals()))
    h0.set_word(0, h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?
        if LOTlib.SIG_INTERRUPTED:
            return 0, set()

        # and re-set the posterior or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
def run():
    h0 = make_hypothesis()
    data = make_data()
    for x in break_ctrlc(MHSampler(h0, data, STEPS)):
        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(*di.input), " ; should be ", di.output
def __call__(self, generator):
    """Pass this a generator, add each element as it's yielded.

    This allows us to make a pipeline. See the example in the main docstring: '# Or as a generator...'.
    """
    if hasattr(generator, 'data'):
        self.data = generator.data

    for sample in break_ctrlc(generator):
        self.add(sample)
        yield sample
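# A minimal pipeline sketch (illustrative, not part of the library), assuming this __call__
# lives on a TopN-style collector as its docstring suggests, and that an example module
# provides make_hypothesis/make_data. Wrapping a sampler in the collector records every
# sample while still yielding it to downstream code.
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from LOTlib.Examples.Number.Model import make_hypothesis, make_data

topn = TopN(N=10)

# topn(...) adds each sample as it streams past and re-yields it to this loop.
for h in topn(MHSampler(make_hypothesis(), make_data(300), steps=1000)):
    print h.posterior_score, h

# Afterwards topn holds the 10 best-scoring hypotheses seen anywhere in the stream.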
def test_eq(self):
    counter = 0
    for i in break_ctrlc(xrange(10000)):
        x = self.G.generate()
        y = self.G.generate()

        if pystring(x) == pystring(y):
            counter += 1
            # print(counter)
            # print( pystring(x)+'\n'+ pystring(y)+'\n')

        self.assertEqual(pystring(x) == pystring(y), x == y,
                         "Without bvs, the pystrings should be the same")
def generate_data(data_size):
    all_words = target.all_words()

    data = []
    for i in break_ctrlc(xrange(data_size)):
        # a context is a set of men, pirates, and everything. functions are applied to this to get truth values
        context = sample_context()
        word = target.sample_utterance(all_words, context)
        data.append(UtteranceData(utterance=word, context=context, possible_utterances=all_words))

    return data
def runme(chain, dataamt):
    if LOTlib.SIG_INTERRUPTED:
        return ()

    data = make_data(dataamt)

    tn = TopN(options.top)
    h0 = make_hypothesis()
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, skip=0)):
        # print h.posterior_score, h.prior, h.likelihood, h
        h.likelihood_per_data = h.likelihood / dataamt
        tn.add(h)

    return tn
def myrun(observed_set):
    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)
    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)

    return set(tn.get_all())
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount, alpha=options.alpha)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(my_grammar, args=['recurse_', 'C', 'X']))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    return hyps
def test_setto(self):
    """ Test the operation of setting a function node to another. """
    for _ in break_ctrlc(xrange(1000)):
        x = self.G.generate()
        x0 = copy(x)
        y = self.G.generate()
        y_subnodes = y.subnodes()

        x.setto(y)

        self.assertTrue(x.check_parent_refs())
        self.assertTrue(x.check_generation_probabilities(self.G))

        for xi in x:
            self.assertTrue(xi in y_subnodes)
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100,
                    save_top='top.pkl', alsoprint='None', **kwargs):
    """
    Just a simplified interface for sampling, allowing printing (showing), returning the top, and saving.
    This is used by many examples, and is meant to easily allow running with a variety of parameters.

    NOTE: This skip is a skip *only* on printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

    # # TODO change acceptance temperature over time
    # sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):

        # if i % 10000 == 0 and i != 0:
        #     sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature+0.1)
        #     print '='*50
        #     print 'change acc temperature to', sampler.acceptance_temperature

        best_hypotheses.add(h)

        if show and i % (show_skip+1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule("ATOM", q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    with open(prefix + "hypotheses_" + options.LANG + "_" + str(rank) + "_" + str(ndata) + "_" + suffix + ".txt", "a") as ofile:

        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i % options.SKIP == 0 and h.posterior_score > -Infinity:
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata
                print >>ofile, getattr(h, "ll_counts", None)
                print >>ofile, h, "\0"  # must add \0 when not Lexicon

    return tn
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?

        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer + 1
        assert len(h0.value.keys()) == h0.N == outer + 1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            print h.posterior_score, h
            print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h)  # must deepcopy

    return ndata, tn
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?

        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer + 1
        assert len(h0.value.keys()) == h0.N == outer + 1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)
            # print h.posterior_score, h
            # print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h)  # must deepcopy

    return ndata, tn
def run(data_pts):
    print "Start run on ", str(data_pts)

    y = [pt.Y for pt in data_pts]
    filename = "".join(y)

    hyps = TopN(N=options.TOP_COUNT)
    h0 = KinshipLexicon(alpha=options.ALPHA)
    h0.set_word('Word', LOTHypothesis(my_grammar, value=None, display='lambda recurse_, C, X:%s'))

    mhs = MHSampler(h0, data_pts, options.STEPS, likelihood_temperature=options.llt)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    with open(options.OUT_PATH + filename + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return filename, hyps
def run():
    """A version that cares more about recent data, showing how to use Hypotheses.DecayedLikelihoodHypothesis."""
    G = grammar

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(G, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Run the MH

    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score (posterior_score), prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeVariableLexiconData(eval(options.word), options.word, the_context,
                                   n=data_amount, s=options.s, alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(words=[options.word], alpha=options.alpha)
    h0.set_word(options.word, LOTHypothesis(grammar, value=None, args=['recurse_', 'C', 'X']))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        # if samples_yielded % 100 == 0:
        #     print h.prior, h.likelihood, h
        hyps.add(h)

    return hyps
def run_mh():
    """Run the MH."""
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Somewhat weirdly, we'll make an upper node above "START" for the two concepts
    # and require it to check if concept (an argument below) is 'A'
    grammar.add_rule('TWO_CONCEPT_START', 'if_', ['(concept==\'A\')', 'START', 'START'], 1.0)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, ALPHA=0.9,
                                    start='TWO_CONCEPT_START', args=['concept', 'x'])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score (posterior_score), prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    tn = TopN(N=options.TOP_COUNT)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
        print h.posterior_score, h
        print getattr(h, 'll_counts', None)

    # with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
    #     for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    #         tn.add(h)
    #         # print h.posterior_score, getattr(h, 'll_counts', None), h
    #         if i%options.SKIP == 0:
    #             print >>ofile, "\n"
    #             print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
    #             print >>ofile, getattr(h,'ll_counts', None),
    #             print >>ofile, h  # ends in \0 so we can sort with sort -g -z

    return tn
def scheme_generate():
    """
    This generates random scheme code with cons, cdr, and car, and evaluates it on some simple
    list structures. No inference here -- just random sampling from a grammar.
    """
    ## Generate some and print out unique ones
    seen = set()
    for i in break_ctrlc(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount, alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(my_grammar, display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl'
    with open('Chains/' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
def test_substitute(self):
    """ Test how substitution works """
    for _ in break_ctrlc(xrange(1000)):
        x = self.G.generate()

        y, _ = x.sample_subnode()
        oldy = copy(y)

        repl = self.G.generate(y.returntype)

        # pick a novel replacement (must NOT equal y)
        # NOTE: We can't just rejection sample here to pick repl because it may not be possible for a given x
        if repl == y:
            continue

        x.replace_subnodes(lambda z: z == y, repl)

        self.assertTrue(x.check_parent_refs())
        # We cannot check generation_probabilities because replace_subnodes breaks that!

        # and ensure that y is not left!
        for xi in x:
            self.assertTrue(xi != oldy, "\n%s\n%s\n%s\n%s" % (x, y, repl, xi))
def run(hypothesis, data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount, alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(grammar=my_grammar, value=hypothesis.value[w].value,
                                     display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        if samples_yielded % 100 == 0:
            pass  # print h.likelihood, h.prior, h
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl'
    with open('Chains/' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
def evaluate_sampler(self, sampler):
    cnt = Counter()
    for h in break_ctrlc(sampler):
        cnt[h.value] += 1

    ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
    # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

    Z = logsumexp([self.grammar.log_probability(t) for t in self.trees])  # renormalize to the trees in self.trees
    obsc = [cnt[t] for t in self.trees]
    expc = [exp(self.grammar.log_probability(t) - Z) * sum(obsc) for t in self.trees]

    csq, pv = chisquare(obsc, expc)
    assert abs(sum(obsc) - sum(expc)) < 0.01
    # assert min(expc) > 5 # or else chisq sux

    for t, c, s in zip(self.trees, obsc, expc):
        print c, s, t
    print (csq, pv), sum(obsc)

    self.assertGreater(pv, PVALUE, msg="Sampler failed chi squared!")

    return csq, pv
def evaluate_sampler(my_sampler, print_every=1000, out_aggregate=sys.stdout, trace=False, pthreshold=0.999, prefix=""):
    """
    Print the stats for a single sampler run.

    *my_sampler* -- a generator of samples
    print_every -- display the output every this many steps
    out_aggregate -- where we put aggregate stats
    trace -- print every sample
    prefix -- display before lines
    """
    visited_at = defaultdict(list)

    startt = time()
    for n, s in break_ctrlc(enumerate(my_sampler)):  # each sample should have a .posterior_score defined
        if trace:
            print "#", n, s

        visited_at[s].append(n)

        if (n % print_every) == 0 and n > 0:
            post = sorted([x.posterior_score for x in visited_at.keys()], reverse=True)  # the unnormalized posteriors of everything found
            ll = sorted([x.likelihood for x in visited_at.keys()], reverse=True)
            Z = logsumexp(post)  # just compute total probability mass found -- the main measure

            # determine how many you need to get pthreshold of the posterior mass
            J = 0
            while J < len(post):
                if logsumexp(post[J:]) < Z + log(1.0 - pthreshold):
                    break
                J += 1

            out_aggregate.write('\t'.join(map(str, [prefix, n, r3(time() - startt), r5(Z), r5(post[0]), J, len(post)])) + '\n')
            out_aggregate.flush()

    return
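# A minimal usage sketch for evaluate_sampler (illustrative, not part of the library),
# assuming an example module providing make_hypothesis/make_data. This streams aggregate
# statistics for a single MH chain to stdout every 5000 samples.
import sys
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from LOTlib.Examples.Number.Model import make_hypothesis, make_data

sampler = MHSampler(make_hypothesis(), make_data(300), steps=50000)
evaluate_sampler(sampler, print_every=5000, out_aggregate=sys.stdout, prefix="chain0")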
class ParticleSwarmPriorResample(ParticleSwarm):
    """
    Like ParticleSwarm, but resamples from the prior
    """

    def refresh(self):
        """ Resample by resampling those below the median from the prior. """
        m = median(self.chainZ)

        for i in range(self.nchains):
            if self.chainZ[i] < m:
                self.chains[i] = self.make_h0(**self.kwargs)
                self.chainZ[i] = -Infinity  # reset this

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib.Examples.Number.Global import generate_data, make_h0

    data = generate_data(300)

    ps = ParticleSwarm(make_h0, data)
    for h in break_ctrlc(ps):
        print h.posterior_score, h

        if len(ps.seen) > 0:
            print "#", sorted(ps.seen, key=lambda x: x.posterior_score, reverse=True)[0]
                                          probs=[v.posterior_score for v in population], log=True)

            try:
                kid = mutate(crossover(mom, dad))
            except (ProposalFailedException, NodeSamplingException):
                continue

            kid.compute_posterior(data)
            yield kid

            nextpopulation.append(kid)

            # # if MH_acceptance(population[i].posterior_score, kid.posterior_score, 0.0):
            # if kid.posterior_score > population[i].posterior_score:
            #     population[i] = kid
            #     yield kid

        population = nextpopulation


if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Examples.Number.Model import make_hypothesis, make_data
    from LOTlib.Miscellaneous import qq

    data = make_data(400)

    for h in break_ctrlc(genetic_algorithm(make_hypothesis, data, mutate_lot, crossover_lot)):
        print h.posterior_score, h.get_knower_pattern(), qq(h)
    def __iter__(self):
        for i, t in enumerate(self.grammar.enumerate()):
            if i >= self.steps:
                raise StopIteration

            h = self.make_h(value=t)
            h.compute_posterior(self.data)
            yield h


if __name__ == "__main__":

    from LOTlib import break_ctrlc

    # from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    # data = generate_data(100)

    # from LOTlib.Examples.RegularExpression.Shared import grammar, make_h0, data

    from LOTlib.Examples.Magnetism.Simple import make_data, make_hypothesis
    from LOTlib.Examples.Magnetism.Simple.Grammar import grammar

    # from LOTlib.Examples.RationalRules.Shared import grammar, data, make_h0

    for h in break_ctrlc(EnumerationInference(grammar, make_hypothesis, make_data(), steps=10000)):
        print h.posterior_score, h
def print_subtree_adaptations(grammar, hypotheses, posteriors, subtrees, relative_KL=True):
    """
    Determine how useful it would be to explicitly define each subtree in H across all of the
    (corresponding) posteriors, as measured by KL from prior to posterior.

    - hypotheses - a list of LOTHypotheses
    - posteriors - [ [P(h|data) for h in hypotheses] x problems ]
    - subtrees - a collection of (possibly partial) subtrees to try adapting
    - relative_KL - compute summed KL divergence absolutely, or relative to the h.compute_prior()?

    We treat hyps as a fixed finite hypothesis space, and assume every subtree considered
    is *not* derived compositionally (although this could change in future versions).
    """

    # compute the normalized posteriors
    Ps = map(lognormalize, posteriors)

    # Compute the baseline KL divergence so we can score relative to this
    if relative_KL:
        oldpriors = lognormalize(numpy.array([h.compute_prior() for h in hypotheses]))
        KL0s = [sum(exp(oldpriors) * (oldpriors - P)) for P in Ps]
    else:
        KL0s = [1.0 for P in Ps]  # pretend everything just had KL of 1, so we score relatively

    ## Now process each, starting with the most simple
    for t in break_ctrlc(sorted(subtrees, key=lambda t: grammar.log_probability(t), reverse=True)):

        # Get some stats on t:
        tlp = grammar.log_probability(t)
        tnt = count_identical_nonterminals(t.returntype, t)  # How many times is this nonterminal used?

        # How many matches of t are there in each H?
        m = numpy.array([count_subtree_matches(t, h.value) for h in hypotheses])

        ## TODO: There is a complication: partial patterns matching themselves.
        ## For simplicity, we'll just take the *first* match, setting max(m)=1
        ## In the future, we should change this to correctly handle and count
        ## partial matches matching themselves
        m = (m >= 1) * 1
        assert max(m) == 1, "Error: " + str(t) + "\t" + str(m)

        # How many times is the nonterminal used, NOT counting in t?
        nt = numpy.array([count_identical_nonterminals(t.returntype, h.value) for h in hypotheses]) - (tnt - 1) * m
        assert min(nt) >= 0, "Error: " + str(t)

        # And the PCFG prior *not* counting t
        q = lognormalize(numpy.array([grammar.log_probability(h.value) for h in hypotheses]) - tlp * m)

        # The function to optimize
        def fnc(p):
            if p <= 0. or p >= 1.:
                return float("inf")  # enforce bounds

            newprior = lognormalize(q + log(p) * m + log(1. - p) * nt)

            kl = 0.0
            for P, kl0 in zip(Ps, KL0s):
                kl += sum(numpy.exp(newprior) * (newprior - P)) / kl0

            return kl

        ### TODO: This optimization should be analytically tractable...
        ### but we need to check that it is convex! Any ideas?
        o = scipy.optimize.fmin(fnc, numpy.array([0.1]), xtol=0.0001, ftol=0.0001, disp=0)

        print fnc(o[0]), o[0], log(o[0]), grammar.log_probability(t), qq(t)
import pickle

from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from Model import *
from TargetConcepts import TargetConcepts

NDATA = 20      # How many data points for each function?
NSTEPS = 100000
BEST_N = 500    # How many top hypotheses to store for each concept

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = TopN(N=BEST_N)

if __name__ == "__main__":

    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TargetConcepts):

        # Set up the hypothesis
        h0 = make_hypothesis()

        # Set up some data
        data = make_data(NDATA, f)

        # Now run some MCMC
        fs = TopN(N=BEST_N, key="posterior_score")
        fs.add(break_ctrlc(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.update(fs)

    pickle.dump(all_hypotheses, open("hypotheses.pkl", 'w'))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis

def make_hypothesis(grammar=grammar, **kwargs):
    return RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)


if __name__ == "__main__":

    from LOTlib.TopN import TopN
    hyps = TopN(N=1000)

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib import break_ctrlc

    mhs = MHSampler(make_hypothesis(), make_data(), 1000000,
                    likelihood_temperature=1., prior_temperature=1.)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        h.ll_decay = 0.
        hyps.add(h)

    import pickle
    with open('HypothesisSpace.pkl', 'w') as f:
        pickle.dump(hyps, f)
        pickle.dump(hyps, f)

    return hyps

###################################################################################
# Main Running
###################################################################################

argarray = map(lambda x: [x], options.data_pts * options.chains)

if is_master_process():
    display_option_summary(options)

seen = set()
for fs in break_ctrlc(MPI_map(run, numpy.random.permutation(argarray), progress_bar=False)):
    for h in fs.get_all():
        if h not in seen:
            seen.add(h)

            if h.prior > -Infinity:
                print h.prior, \
                    h.likelihood, \
                    h

            # sys.stdout.flush()

# sys.stdout.flush()

import pickle
with open(options.out_path, 'w') as f:
    pickle.dump(seen, f)
grammar.start = 'TWO_CONCEPT_START'

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from Model import make_hypothesis

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = make_hypothesis(grammar=grammar, args=['concept', 'x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score (posterior_score), prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
                     log(before_same_children) - log(nrk)) + old_lp_below

        return [newt, f - b]


if __name__ == "__main__":

    from LOTlib import break_ctrlc

    # from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    # data = generate_data(300)

    ## NOTE: TO NORMALLY USE THIS, YOU MUST MIX WITH REGENERATION PROPOSAL -- ELSE NOT ERGODIC
    from LOTlib.Examples.Magnetism.Simple.Run import grammar, make_h0, data
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

    idp = InsertDeleteProposal(grammar)

    # data = generate_data(100)
    h = make_h0(proposal_function=idp)
    for h in break_ctrlc(MHSampler(h, data, 100000)):
        print h.posterior_score, h

    """
    for _ in xrange(100):
        t = grammar.generate()
        print "\n\n", t
        for _ in xrange(10):
            print "\t", idp.propose_tree(t)
    """
def make_hypothesis(**kwargs):
    return MyHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(grammar, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely.
    # This prints out the posterior score (posterior_score), prior, and likelihood.
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100, shortcut_likelihood=False)):
        print h.posterior_score, h.prior, h.likelihood, q(h)

    # This setup requires the *later* data to be upweighted, meaning that hypotheses that get
    # later data wrong should be given lower likelihood. But also with the decay, the overall
    # magnitude of the likelihood decreases.
print "# Created L, NYes, NTrials, and HOutput of size %s" % len(L) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Run inference # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ from LOTlib import break_ctrlc from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output) mhs = MHSampler(h0, [], 100000, skip=0) for s, h in break_ctrlc(enumerate(mhs)): print mhs.acceptance_ratio(), h.prior, h.likelihood,\ h.value['alpha'].value[0], h.value['beta'].value[0],\ h.value['prior_temperature'].value, h.value['likelihood_temperature'].value,\ 'RULES',\ ' '.join([str(x) for x in h.value['rulep']['BOOL'].value ]),\ ' '.join([str(x) for x in h.value['rulep']['PREDICATE'].value ]),\ ' '.join([str(x) for x in h.value['rulep']['START'].value ]),\ ' '.join([str(x) for x in h.value['rulep']['SET'].value ]) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Run gradient ascent # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # from LOTlib import break_ctrlc #
        self.penalty = penalty
        self.seen = Counter()

    def next(self):
        v = MHSampler.next(self)
        self.seen[v] += 1
        return v

    def compute_posterior(self, h, data, **kwargs):
        """
        Compute prior & likelihood for `h`, penalizing the posterior by how many times
        `h` has been sampled so far.
        """
        return self.seen[h] * self.penalty + h.compute_posterior(data, **kwargs)


if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Examples.Number.Model import *
    from LOTlib.Miscellaneous import q

    data = make_data(500)
    h0 = NumberExpression(grammar)

    tmc = TabooMCMC(h0, data, steps=10000)

    for h in break_ctrlc(tmc):
        print tmc.seen[h], h.posterior_score, h.prior, h.likelihood, q(h)