Example #1
def datawise_optimize(current_sample,
                      data,
                      steps=1000000,
                      inner_steps=10,
                      data_weight=1.0,
                      ll_temperature=1.0,
                      **kwargs):
    """
            cycle through data points, taking a few steps in the direction of that data point
            This uses ll_temperature to simulate having len(data)*data_weight number of data points

            steps -- you take this many total steps (steps/inner_steps inner loops)
            inner steps -- how many steps to take on a single data point
            data_weight -- weight each single data point as len(data)*this


    """

    # How many data points? Used for setting the temperature below
    NDATA = len(data)

    for mhi in lot_iter(xrange(steps / inner_steps)):

        for di in lot_iter(data):

            for h in mh_sample(current_sample, [di],
                               steps=inner_steps,
                               ll_temperature=ll_temperature /
                               (NDATA * data_weight),
                               **kwargs):
                current_sample = h
                yield h
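A hypothetical driver for datawise_optimize, sketched under the assumption that make_h0() and data are defined as in the other examples on this page (the names are illustrative, not part of the function itself):

# Hypothetical usage sketch -- make_h0() and data are stand-ins for a model setup
# like the ones in the other examples; not part of datawise_optimize itself.
best = None
for h in datawise_optimize(make_h0(), data, steps=10000, inner_steps=10):
    if best is None or h.posterior_score > best.posterior_score:
        best = h
print best.posterior_score, best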
Example #2
    def test_lp_regenerate_propose_to(self):
        # import the grammar
        from LOTlibTest.Grammars import lp_regenerate_propose_to_grammar
        self.G = lp_regenerate_propose_to_grammar.g
        # the RegenerationProposal class
        rp = RegenerationProposal(self.G)
        numTests = 100
        # Sample numTests trees from the grammar, and run a chi-squared test for each of them
        for i in lot_iter(range(numTests)):
            # keep track of expected and actual counts
            # expected_counts = defaultdict(int) # a dictionary whose keys are trees and values are the expected number of times we should be proposing to this tree
            actual_counts = defaultdict(int) # same as expected_counts, but stores the actual number of times we proposed to a given tree
            tree = self.G.generate('START')

            # Regenerate some number of trees at random
            numTrees = 1000
            for j in range(numTrees):
                newtree = rp.propose_tree(tree)[0]
                # trees.append(newtree)
                actual_counts[newtree] += 1
            # see if the frequency with which each category of trees is generated matches the
            # expected counts using a chi-squared test
            chisquared, p = self.get_pvalue(tree, actual_counts, numTrees)
            # print chisquared, p
            # if p > 0.01/numTests, the test passes
            self.assertTrue(p > 0.01/numTests, "Trees are not being generated according to the expected log probabilities")
            if i % 10 == 0 and i != 0: print i, "lp_regenerate_propose_to tests..."
        print numTests, "lp_regenerate_propose_to tests..."
Example #3
def generate_unique_trees(grammar, start='START', N=1000):
    """
            Yield a bunch of unique trees, produced from the grammar
    """
    for _ in lot_iter(xrange(N)):
        t = grammar.generate(start)
        yield t
Example #4
    def test_lp_regenerate_propose_to(self):
        # import the grammar
        from LOTlibTest.Grammars import lp_regenerate_propose_to_grammar
        self.G = lp_regenerate_propose_to_grammar.g
        # the RegenerationProposal class
        rp = RegenerationProposal(self.G)
        numTests = 100
        # Sample numTests trees from the grammar, and run a chi-squared test for each of them
        for i in lot_iter(range(numTests)):
            # keep track of expected and actual counts
            # expected_counts = defaultdict(int) # a dictionary whose keys are trees and values are the expected number of times we should be proposing to this tree
            actual_counts = defaultdict(
                int
            )  # same as expected_counts, but stores the actual number of times we proposed to a given tree
            tree = self.G.generate('START')

            # Regenerate some number of trees at random
            numTrees = 1000
            for j in range(numTrees):
                newtree = rp.propose_tree(tree)[0]
                # trees.append(newtree)
                actual_counts[newtree] += 1
            # see if the frequency with which each category of trees is generated matches the
            # expected counts using a chi-squared test
            chisquared, p = self.get_pvalue(tree, actual_counts, numTrees)
            # print chisquared, p
            # if p > 0.01/numTests, the test passes
            self.assertTrue(
                p > 0.01 / numTests,
                "Trees are not being generated according to the expected log probabilities"
            )
            if i % 10 == 0 and i != 0:
                print i, "lp_regenerate_propose_to tests..."
        print numTests, "lp_regenerate_propose_to tests..."
Example #5
    def next(self):
        if LOTlib.SIG_INTERRUPTED or self.samples_yielded >= self.steps:
            raise StopIteration
        else:
            for _ in lot_iter(xrange(self.skip+1)):

                self.proposal, fb = self.proposer(self.current_sample)

                # either compute this, or use the memoized version
                np, nl = self.compute_posterior(self.proposal, self.data)

                #print np, nl, current_sample.prior, current_sample.likelihood
                # NOTE: It is important that we re-compute with the temperatures here, since these may be altered externally by ParallelTempering and others
                prop = (np/self.prior_temperature+nl/self.likelihood_temperature)
                cur  = (self.current_sample.prior/self.prior_temperature + self.current_sample.likelihood/self.likelihood_temperature)

                if MH_acceptance(cur, prop, fb, acceptance_temperature=self.acceptance_temperature):
                    self.current_sample = self.proposal
                    self.was_accepted = True
                    self.acceptance_count += 1
                else:
                    self.was_accepted = False

                self.internal_sample(self.current_sample)
                self.proposal_count += 1

            if self.trace:
                print self.current_sample.posterior_score, self.current_sample.likelihood, self.current_sample.prior, qq(self.current_sample)

            self.samples_yielded += 1
            return self.current_sample
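For reference, MH_acceptance above is used as a standard Metropolis-Hastings accept/reject test. A minimal sketch of that rule under the same call signature (an assumption about its behavior, not LOTlib's exact implementation):

from math import exp
from random import random

def mh_acceptance_sketch(cur, prop, fb, acceptance_temperature=1.0):
    # fb is the forward-minus-backward proposal log-probability correction
    r = (prop - cur - fb) / acceptance_temperature
    return r >= 0.0 or random() < exp(r)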
Example #6
    def plot_sampler(self, opath, sampler):
        """
        Plot the sampler, for cases with many zeros where chisquared won't work well
        """
        cnt = Counter()
        for h in lot_iter(sampler):
            cnt[h.value] += 1

        Z = logsumexp([t.log_probability() for t in self.trees]) # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in self.trees]
        expc = [exp(t.log_probability()-Z)*sum(obsc) for t in self.trees]

        for t, c, s in zip(self.trees, obsc, expc):
            print c, "\t", s, "\t", t


        expc, obsc, trees = zip(*sorted(zip(expc, obsc, self.trees), reverse=True))

        import matplotlib.pyplot as plt
        from numpy import log
        plt.subplot(111)
        # Log here spaces things out at the high end, where we can see it!
        plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
        plt.scatter(log(range(len(trees))), obsc, color="blue", marker="x", alpha=1.)
        plt.savefig(opath)
        plt.clf()
Example #7
def run():
    data = generate_data(target, NDATA, data_sd) # generate some data
    h0 = MAPSymbolicRegressionHypothesis(grammar, args=['x']+CONSTANT_NAMES)
    h0.CONSTANT_VALUES = numpy.zeros(NCONSTANTS) ## TODO: Move this to an initializer

    from LOTlib.Inference.MetropolisHastings import MHSampler
    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP, trace=False)):
        print h.posterior_score, h.likelihood, h.prior, h.CONSTANT_VALUES, qq(h)
Example #8
def run():
    """ Standard run function."""
    h0 = SchemeFunction(grammar, ALPHA=ALPHA)
    for x in lot_iter(MHSampler(h0, data, STEPS)):

        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(*di.input), " ; should be ", di.output
Example #9
def save_hypotheses(sampler, filename='numbergame_hypotheses.p'):
    hypotheses = set()
    for h in lot_iter(sampler):
        hypotheses.add(h)

    f = open(filename, "wb")
    pickle.dump(hypotheses, f)
    return hypotheses
Example #10
def run(llt=1.0):
    h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=llt)

    fbs = FiniteBestSet(N=10)
    from LOTlib.Inference.MetropolisHastings import mh_sample
    for h in lot_iter(mh_sample(h0, data, SAMPLES)):
        fbs.add(h, h.posterior_score)

    return fbs
Example #11
def run(llt=1.0):

    h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=llt)

    fbs = FiniteBestSet(N=10)
    from LOTlib.Inference.MetropolisHastings import mh_sample
    for h in lot_iter(mh_sample(h0, data, SAMPLES)):
        fbs.add(h, h.posterior_score)

    return fbs
Example #12
    def __call__(self, generator):
        """Pass this a generator, add each element as it's yielded.

        This allows us to make a pipeline. See Example in main docstring: '# Or as a generator...'.

        """
        if hasattr(generator, 'data'):
            self.data = generator.data
        for sample in lot_iter(generator):
            self.add(sample)
            yield sample
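A hypothetical pipeline built on this __call__; the collector object (a TopN-style class assumed to define the method above), make_h0(), and data mirror the usage in the other examples:

# Hypothetical pipeline sketch -- `collector` is an instance of a class defining the
# __call__ above (e.g. a TopN-style collector); make_h0() and data as elsewhere.
collector = TopN(N=100)
for h in collector(MHSampler(make_h0(), data, steps=10000)):
    print h.posterior_score, h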
Example #13
    def test_eq(self):
        counter = 0
        for i in lot_iter(xrange(10000)):
            x = self.G.generate()
            y = self.G.generate()

            if pystring(x) == pystring(y):
                counter += 1
                # print(counter)
                #print( pystring(x)+'\n'+ pystring(y)+'\n')

            self.assertEqual( pystring(x) == pystring(y), x == y, "Without bvs, the pystrings should be the same")
Example #14
def run():
    from LOTlib import lot_iter
    from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal
    #mp = MixtureProposal([RegenerationProposal(grammar), InsertDeleteProposal(grammar)] )
    mp = RegenerationProposal(grammar)

    from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis
    h0 = LOTHypothesis(grammar, args=['x', 'y'], ALPHA=0.999, proposal_function=mp) # alpha here trades off with the amount of data. Currently assuming no noise, but that's not necessary

    from LOTlib.Inference.MetropolisHastings import MHSampler
    for h in lot_iter(MHSampler(h0, data, skip=100)):
        print h.posterior_score, h.likelihood, h.prior, cleanFunctionNodeString(h)
Example #15
def datawise_optimize(current_sample, data, steps=1000000, inner_steps=10, data_weight=1.0, ll_temperature=1.0, **kwargs):
    """
            cycle through data points, taking a few steps in the direction of that data point
            This uses ll_temperature to simulate having len(data)*data_weight number of data points

            steps -- you take this many total steps (steps/inner_steps inner loops)
            inner steps -- how many steps to take on a single data point
            data_weight -- weight each single data point as len(data)*this


    """

    # How many data points? Used for setting the temperature below
    NDATA = len(data)

    for mhi in lot_iter(xrange(steps/inner_steps)):

        for di in lot_iter(data):

            for h in mh_sample(current_sample, [di], steps=inner_steps, ll_temperature=ll_temperature/(NDATA*data_weight), **kwargs):
                current_sample = h
                yield h
Example #16
def tempered_transitions_sample(inh,
                                data,
                                steps,
                                proposer=None,
                                skip=0,
                                temperatures=[1.0, 1.05, 1.1],
                                stats=None):
    current_sample = inh

    LT = len(temperatures)

    ## TODO: CHECK THIS--STILL NOT SURE THIS IS RIGHT
    # a helper function for temperature transitions -- one single MH step, returning a new sample
    # this allows diff. temps for top and bottom
    def tt_helper(xi, data, tnew, told, proposer):
        if proposer is None: xinew, fb = xi.propose()
        else: xinew, fb = proposer(xi)
        xinew.compute_posterior(data)
        r = (xinew.prior +
             xinew.likelihood) / tnew - (xi.prior + xi.likelihood) / told - fb
        if r > 0.0 or random() < exp(r):
            return xinew
        else:
            return xi

    for mhi in lot_iter(xrange(steps)):
        for skp in xrange(skip + 1):

            xi = current_sample  # do not need to copy this
            totlp = 0.0  #(xi.lp / temperatures[1]) - (xi.lp / temperatures[0])

            for i in xrange(0, LT - 2):  # go up
                xi = tt_helper(xi, data, temperatures[i + 1], temperatures[i],
                               proposer)
                totlp = totlp + (xi.prior + xi.likelihood) / temperatures[
                    i + 1] - (xi.prior + xi.likelihood) / temperatures[i]

            # do the top:
            xi = tt_helper(xi, data, temperatures[LT - 1],
                           temperatures[LT - 1], proposer)

            for i in xrange(len(temperatures) - 2, 0, -1):  # go down
                xi = tt_helper(xi, data, temperatures[i], temperatures[i],
                               proposer)
                totlp = totlp + (xi.prior + xi.likelihood) / temperatures[
                    i] - (xi.prior + xi.likelihood) / temperatures[i + 1]

            if random() < exp(totlp):
                current_sample = xi  # copy this over

        yield current_sample
Example #17
    def test_eq(self):
        counter = 0
        for i in lot_iter(xrange(10000)):
            x = self.G.generate()
            y = self.G.generate()

            if pystring(x) == pystring(y):
                counter += 1
                # print(counter)
                #print( pystring(x)+'\n'+ pystring(y)+'\n')

            self.assertEqual(
                pystring(x) == pystring(y), x == y,
                "Without bvs, the pystrings should be the same")
Example #18
def generate_data(data_size):

    all_words = target.all_words()

    data = []
    for i in lot_iter(xrange(data_size)):

        # a context is a set of men, pirates, and everything. functions are applied to this to get truth values
        context = sample_context()

        word = target.sample_utterance(all_words, context)

        data.append( UtteranceData(utterance=word, context=context, possible_utterances=all_words) )

    return data
Example #19
def prior_sample(h0, data, N):
    """
            Just use the grammar and returntype of h0 to sample from the prior
            NOTE: Only implemented for LOTHypothesis
    """
    assert isinstance(h0, LOTHypothesis)

    # extract from the grammar
    grammar = h0.grammar
    rt = h0.value.returntype

    for i in lot_iter(xrange(N)):

        h = type(h0)(grammar, start=rt)
        h.compute_posterior(data)

        yield h
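A hypothetical use of prior_sample, assuming make_h0() returns a LOTHypothesis and data is a list of data points, as in the other examples:

for h in prior_sample(make_h0(), data, 1000):
    print h.posterior_score, h.prior, h.likelihood, h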
Example #20
def prior_sample(h0, data, N):
    """
            Just use the grammar and returntype of h0 to sample from the prior
            NOTE: Only implemented for LOTHypothesis
    """
    assert isinstance(h0, LOTHypothesis)

    # extract from the grammar
    grammar = h0.grammar
    rt = h0.value.returntype

    for i in lot_iter(xrange(N)):

        h = type(h0)(grammar, start=rt)
        h.compute_posterior(data)

        yield h
Example #21
def run(data_size):
    """
    This out on the DATA_RANGE amounts of data and returns *all* hypothese in the top options.TOP_COUNT
    """

    if LOTlib.SIG_INTERRUPTED: return TopN()  # So we don't waste time making data for everything that isn't run

    # initialize the data
    data = generate_data(data_size)

    # starting hypothesis -- here this generates at random
    h0 = Utilities.make_h0()

    hyps = TopN(N=options.TOP_COUNT)
    
    hyps.add(lot_iter(MHSampler(h0, data, options.STEPS, trace=False)))

    return hyps
Example #22
def run_mh():
    """Run the MH; Run the vanilla sampler.

    Without steps, it will run infinitely. This prints out posterior (posterior_score), prior, tree grammar
    probability, likelihood,

    This yields data like below:
        -10.1447997767 -9.93962659915 -12.2377573418 -0.20517317755 'and_(not_(is_shape_(x, 'triangle')),
            not_(is_color_(x, 'blue')))'
        -11.9260879461 -8.77647578935 -12.2377573418 -3.14961215672 'and_(not_(is_shape_(x, 'triangle')),
            not_(is_shape_(x, 'triangle')))'

    """
    # Create an initial hypothesis. Here we use a RationalRulesLOTHypothesis, which
    # is defined in LOTlib.Hypotheses and wraps LOTHypothesis with the rational rules prior
    h0 = RationalRulesLOTHypothesis(grammar=DNF, rrAlpha=1.0)

    for h in lot_iter(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.value.log_probability(), h.likelihood, q(h)
Example #23
def scheme_generate():
    """ This generates random scheme code with cons, cdr, and car, and evaluates it on some simple list
    structures.

    No inference here -- just random sampling from a grammar.

    """
    ## Generate some and print out unique ones
    seen = set()
    for i in lot_iter(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
Example #24
def tempered_transitions_sample(inh, data, steps, proposer=None, skip=0, temperatures=[1.0, 1.05, 1.1], stats=None):
    current_sample = inh

    LT = len(temperatures)

    ## TODO: CHECK THIS--STILL NOT SURE THIS IS RIGHT
    # a helper function for temperature transitions -- one single MH step, returning a new sample
    # this allows diff. temps for top and bottom
    def tt_helper(xi, data, tnew, told, proposer):
        if proposer is None: xinew, fb = xi.propose()
        else:                xinew, fb = proposer(xi)
        xinew.compute_posterior(data)
        r = (xinew.prior + xinew.likelihood) / tnew - (xi.prior + xi.likelihood) / told - fb
        if r > 0.0 or random() < exp(r):
            return xinew
        else:   return xi


    for mhi in lot_iter(xrange(steps)):
        for skp in xrange(skip+1):

            xi = current_sample # do not need to copy this
            totlp = 0.0 #(xi.lp / temperatures[1]) - (xi.lp / temperatures[0])

            for i in xrange(0,LT-2): # go up
                xi = tt_helper(xi, data, temperatures[i+1], temperatures[i], proposer)
                totlp = totlp + (xi.prior + xi.likelihood) / temperatures[i+1] - (xi.prior + xi.likelihood) / temperatures[i]

            # do the top:
            xi = tt_helper(xi, data, temperatures[LT-1], temperatures[LT-1], proposer)

            for i in xrange(len(temperatures)-2, 0, -1): # go down
                xi = tt_helper(xi, data, temperatures[i], temperatures[i], proposer)
                totlp = totlp + (xi.prior + xi.likelihood) / temperatures[i] - (xi.prior + xi.likelihood) / temperatures[i+1]

            if random() < exp(totlp):
                current_sample = xi # copy this over

        yield current_sample
Example #25
    def next(self):
        if LOTlib.SIG_INTERRUPTED or self.samples_yielded >= self.steps:
            raise StopIteration
        else:
            for _ in lot_iter(xrange(self.skip + 1)):

                self.proposal, fb = self.proposer(self.current_sample)

                # either compute this, or use the memoized version
                np, nl = self.compute_posterior(self.proposal, self.data)

                #print np, nl, current_sample.prior, current_sample.likelihood
                # NOTE: It is important that we re-compute with the temperatures here, since these may be altered externally by ParallelTempering and others
                prop = (np / self.prior_temperature +
                        nl / self.likelihood_temperature)
                cur = (self.current_sample.prior / self.prior_temperature +
                       self.current_sample.likelihood /
                       self.likelihood_temperature)

                if MH_acceptance(
                        cur,
                        prop,
                        fb,
                        acceptance_temperature=self.acceptance_temperature):
                    self.current_sample = self.proposal
                    self.was_accepted = True
                    self.acceptance_count += 1
                else:
                    self.was_accepted = False

                self.internal_sample(self.current_sample)
                self.proposal_count += 1

            if self.trace:
                print self.current_sample.posterior_score, self.current_sample.likelihood, self.current_sample.prior, qq(
                    self.current_sample)

            self.samples_yielded += 1
            return self.current_sample
Example #26
def evaluate_sampler(my_sampler, print_every=1000, out_hypotheses=sys.stdout, out_aggregate=sys.stdout, trace=False, prefix=""):
    """
            Print the stats for a single sampler run

            *my_sampler* -- a generator of samples
            print_every -- display the output every this many steps
            out_hypothesis -- where we put hypothesis stats
            out_aggregate  -- where we put aggregate stats

            trace -- print every sample
            prefix -- display before lines
    """
    visited_at = defaultdict(list)

    startt = time()
    for n, s in lot_iter(enumerate(my_sampler)): # each sample should have a .posterior_score defined
        if trace: print "#", n, s

        visited_at[s].append(n)

        if (n%print_every)==0 and n>0:
            post =  sorted([x.posterior_score for x in visited_at.keys()], reverse=True) # the unnormalized posteriors of everything found
            ll   =  sorted([x.likelihood for x in visited_at.keys()], reverse=True)
            Z = logsumexp(post) # just compute total probability mass found -- the main measure

            out_aggregate.write('\t'.join(map(str, [prefix, n, r3(time()-startt), r5(Z), len(post)]+mydisplay(post))) + '\n')

    # Now once we're done, output the hypothesis stats
    for k,v in visited_at.items():

        mean_diff = "NA"
        if len(v) > 1: mean_diff = mean(diff(v))

        out_hypotheses.write('\t'.join(map(str, [prefix, k.posterior_score, k.prior, k.likelihood, len(v), min(v), max(v), mean_diff, sum(diff(v)==0) ])) +'\n') # number of rejects from this

    return 0.0
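A hypothetical call to evaluate_sampler, assuming make_h0() and data are set up as in the other examples and writing the two output streams to files:

from LOTlib.Inference.MetropolisHastings import MHSampler

with open("hypotheses.txt", "w") as out_h, open("aggregate.txt", "w") as out_a:
    evaluate_sampler(MHSampler(make_h0(), data, 100000),
                     print_every=1000,
                     out_hypotheses=out_h,
                     out_aggregate=out_a,
                     prefix="run1")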
Example #27
    def evaluate_sampler(self, sampler):

        cnt = Counter()
        for h in lot_iter(sampler):
            cnt[h.value] += 1

        ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
        # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

        Z = logsumexp([t.log_probability() for t in self.trees]) # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in self.trees]
        expc = [exp(t.log_probability()-Z)*sum(obsc) for t in self.trees]
        csq, pv = chisquare(obsc, expc)
        assert abs(sum(obsc) - sum(expc)) < 0.01

        # assert min(expc) > 5 # or else chisq sux

        for t, c, s in zip(self.trees, obsc, expc):
            print c, s, t
        print (csq, pv), sum(obsc)

        self.assertGreater(pv, PVALUE, msg="Sampler failed chi squared!")

        return csq, pv
Example #28
h0 = NumberExpression(grammar)

'''
from LOTlib.Inference.Proposals.InsertDeleteProposal import InsertDeleteProposal
h0 = NumberExpression(grammar, proposal_function=InsertDeleteProposal(grammar))
'''

# store hypotheses we've found
allhyp = TopN(N=1000)

# ========================================================================================================
# Run the standard RationalRules sampler

mh_sampler = MHSampler(h0, data, STEPS, skip=SKIP)

for h in lot_iter(mh_sampler):
    if TRACE:
        print q(get_knower_pattern(h)), h.posterior_score, h.compute_prior(), h.compute_likelihood(data), qq(h)

    # add h to our priority queue, with priority of its log probability, h.posterior_score
    allhyp.add(h)

# ========================================================================================================
#  now re-evaluate everything we found on new data
'''
huge_data = generate_data(LARGE_DATA_SIZE)

save this with a huge data set -- eval with average ll
H = allhyp.get_sorted()

compute the posterior for each hypothesis
Example #29
from LOTlib.FiniteBestSet import FiniteBestSet
from LOTlib.Inference.MetropolisHastings import MHSampler
from Model import *


NDATA = 50 # How many total data points?
NSTEPS = 10000
BEST_N = 100 # How many from each hypothesis to store
OUTFILE = "hypotheses.pkl"

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = FiniteBestSet()

if __name__ == "__main__":
    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TARGET_CONCEPTS):

        # Set up the hypothesis
        h0 = LOTHypothesis(grammar, start='START', args=['x'])

        # Set up some data
        data = generate_data(NDATA, f)

        # Now run some MCMC
        fs = FiniteBestSet(N=BEST_N, key="posterior_score")
        fs.add(lot_iter(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.merge(fs)

    pickle.dump(all_hypotheses, open(OUTFILE, 'w'))
Example #30
# -*- coding: utf-8 -*-
"""
A simple symbolic regression demo

"""
from LOTlib import lot_iter
from LOTlib.Hypotheses.GaussianLOTHypothesis import GaussianLOTHypothesis
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import qq
from LOTlib.Examples.SymbolicRegression.Grammar import grammar
from Data import generate_data

CHAINS = 4
STEPS = 50000
SKIP = 0

if __name__ == "__main__":

    print grammar

    # generate some data
    data = generate_data(50) # how many data points?

    # starting hypothesis -- here this generates at random
    h0 = GaussianLOTHypothesis(grammar)

    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP)):
        print h.posterior_score, qq(h)
Example #31
    for h in sorted(H, key=lambda h: h.posterior_score):
        print h.posterior_score, h.prior, h.likelihood, h.likelihood_temperature
        print h

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Play around with some different inference schemes
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
#for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
    #print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))

from LOTlib.Inference.IncreaseTemperatureMH import increase_temperature_mh_sample

h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
for i, h in lot_iter(enumerate(increase_temperature_mh_sample(h0, data, 400000000, skip=0, increase_amount=1.50))):
    print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Run on a single computer, printing out
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#fbs = FiniteBestSet(N=100)
#h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.051)
#for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
    #fbs.add(h, h.posterior_score)

    #if i%100==0:
        #print h.posterior_score, h.prior, h.likelihood #, re.sub(r"\n", ";", str(h))
        #print h
Example #32
# -*- coding: utf-8 -*-

"""
 A demo of "syntax" learning using a SimpleGenerativeHypothesis.

 This searches over probabilistic generating functions, running them forward to estimate
 the likelihood of the data. Very very simple.

"""
from LOTlib import lot_iter
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Hypotheses.SimpleGenerativeHypothesis import SimpleGenerativeHypothesis
from Model import *

if __name__ == "__main__":

    # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    h0 = SimpleGenerativeHypothesis(grammar, args=[''] )

    ## populate the finite sample by running the sampler for this many steps
    for h in lot_iter(MHSampler(h0, data, 100000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, h
        print h.llcounts
Example #33
class ParticleSwarmPriorResample(ParticleSwarm):
    """
    Like ParticleSwarm, but resamples from the prior
    """

    def refresh(self):
        """
            Resample by resampling those below the median from the prior.
        """
        m = median(self.chainZ)

        for i in range(self.nchains):
            if self.chainZ[i] < m:
                self.chains[i] = self.make_h0(**self.kwargs)
            self.chainZ[i] = -Infinity  # reset this


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~			
if __name__ == "__main__":
    from LOTlib.Examples.Number.Global import generate_data, grammar, make_h0

    data = generate_data(300)

    ps = ParticleSwarm(make_h0, data)
    for h in lot_iter(ps):
        print h.posterior_score, h

        if len(ps.seen) > 0:
            print "#", sorted(ps.seen, key=lambda x: x.posterior_score, reverse=True)[0]
Example #34
## TODO: Vary resample_p to make sure that works here!


from LOTlib.Grammar import Grammar

grammar = Grammar()

grammar.add_rule('START', '', ['A'], 1.0)

grammar.add_rule('A', 'A', ['A', 'A'], 0.2)
grammar.add_rule('A', 'A', ['a'], 0.7)

grammar.add_rule('A', 'apply_', ['L', 'A'], 0.10)
grammar.add_rule('L', 'lambda', ['A'], 0.11, bv_p=0.07, bv_type='A')

grammar.add_rule('A', 'apply_', ['LF', 'A'], 0.10)
grammar.add_rule('LF', 'lambda', ['A'], 0.11, bv_p=0.07, bv_type='A', bv_args=['A'], bv_prefix='F')

## NOTE: DOES NOT HANDLE THE CASE WITH TWO A->APPLY, L->LAMBDAS

if __name__ == "__main__":
    from LOTlib import lot_iter

    for t in lot_iter(grammar.enumerate()):
        print t
Example #35
                # the priors cancel, so this represents the posterior
                cur  = self.ll_at_temperature(i, self.chains[i].likelihood_temperature) + self.ll_at_temperature(i+1,   self.chains[i+1].likelihood_temperature)
                prop = self.ll_at_temperature(i, self.chains[i+1].likelihood_temperature) + self.ll_at_temperature(i+1, self.chains[i].likelihood_temperature)

                if MH_acceptance(cur, prop, 0.0):
                    tmp = self.chains[i].current_sample
                    self.chains[i].set_state( self.chains[i+1].current_sample, False)
                    self.chains[i+1].set_state(tmp, False)

                    # OLD: self.chains[i].current_sample, self.chains[i+1].current_sample = self.chains[i+1].current_sample, self.chains[i].current_sample

        if self.yield_only_t0 and self.chain_idx != 0:
            return self.next() # keep going until we're on the one we yield
            ## TODO: FIX THIS SINCE IT WILL BREAK FOR HUGE NUMBERS OF CHAINS
        else:
            return self.chains[self.chain_idx].next()


if __name__ == "__main__":

    from LOTlib import lot_iter
    from LOTlib.Miscellaneous import Infinity
    from LOTlib.Examples.Number.Model import generate_data, NumberExpression, grammar
    data = generate_data(300)

    make_h0 = lambda : NumberExpression(grammar)

    for h in lot_iter(ParallelTemperingSampler(make_h0, data, steps=Infinity, yield_only_t0=True)):
        print h.posterior_score, h
Example #36
        # initialize each chain
        MultipleChainMCMC.__init__(self, lambda: None, data, steps=steps, nchains=len(partitions), **kwargs)
        
        # And set each to the partition
        for c, p in zip(self.chains, partitions):
            c.set_state(make_h0(value=p))
        
        # and store these
        self.partitions = map(copy, partitions)


if __name__ == "__main__":
    
    from LOTlib.Examples.Number.Model.Utilities import grammar, make_h0, generate_data
    data = generate_data(300)
    
    #from LOTlib.Examples.RegularExpression.Shared import grammar, make_h0, data
        
    #from LOTlib.Examples.RationalRules.Shared import grammar, data, make_h0
    
    #PartitionMCMC(grammar, make_h0, data, 2, skip=0)
    for h in lot_iter(PartitionMCMC(grammar, make_h0, data, max_N=10, skip=0)):
        print h.posterior_score, h

    
 
Example #37
def print_subtree_adaptations(hypotheses,
                              posteriors,
                              subtrees,
                              relative_KL=True):
    """
            Determine how useful it would be to explicitly define each subtree in H across
            all of the (corresponding) posteriors, as measured by KL from prior to posterior

            - hypotheses - a list of LOThypotheses
            - posteriors - [ [P(h|data) for h in hypotheies] x problems ]
            - subtrees   - a collection of (possibly partial) subtrees to try adapting

            We treat hyps as a fixed finite hypothesis space, and assume every subtree considered
            is *not* derived compositionally (although thi scould change in future variants)

            p - the probability of going to kids in randomly generating a subtree
            subtree_multiplier - how many times we sample a subtree from *each* node in each hypothesis
            relative_KL - compute summed KL divergence absolutely, or relative to the h.compute_prior()?

    """

    # compute the normalized posteriors
    Ps = map(lognormalize, posteriors)

    # Compute the baseline KL divergence so we can score relative to this
    if relative_KL:
        oldpriors = lognormalize(
            numpy.array([h.compute_prior() for h in hypotheses]))
        KL0s = [sum(exp(oldpriors) * (oldpriors - P)) for P in Ps]
    else:
        KL0s = [
            1.0 for P in Ps
        ]  # pretend everything just had KL of 1, so we score relatively

    ## Now process each, starting with the most simple
    for t in lot_iter(
            sorted(subtrees, key=lambda t: t.log_probability(), reverse=True)):

        # Get some stats on t:
        tlp = t.log_probability()
        tnt = count_identical_nonterminals(
            t.returntype, t)  # How many times is this nonterminal used?

        # How many matches of t are there in each H?
        m = numpy.array(
            [count_subtree_matches(t, h.value) for h in hypotheses])
        ## TODO: There is a complication: partial patterns matching themselves.
        ##       For simplicity, we'll just take the *first* match, setting max(m)=1.
        ##       In the future, we should change this to correctly handle and count
        ##       partial matches matching themselves.
        m = (m >= 1) * 1
        assert max(m) == 1, "Error: " + str(t) + "\t" + str(m)

        # How many times is the nonterminal used, NOT counting in t?
        nt = numpy.array([
            count_identical_nonterminals(t.returntype, h.value)
            for h in hypotheses
        ]) - (tnt - 1) * m
        assert min(nt) >= 0, "Error: " + str(t)

        # And the PCFG prior *not* counting t
        q = lognormalize(
            numpy.array([h.value.log_probability()
                         for h in hypotheses]) - tlp * m)

        # The function to optimize
        def fnc(p):
            if p <= 0. or p >= 1.: return float("inf")  # enforce bounds

            newprior = lognormalize(q + log(p) * m + log(1. - p) * nt)

            kl = 0.0
            for P, kl0 in zip(Ps, KL0s):
                kl += sum(numpy.exp(newprior) * (newprior - P)) / kl0

            return kl

        ### TODO: This optimization should be analytically tractable...
        ###       but we need to check that it is convex! Any ideas?
        o = scipy.optimize.fmin(fnc,
                                numpy.array([0.1]),
                                xtol=0.0001,
                                ftol=0.0001,
                                disp=0)

        print fnc(o[0]), o[0], log(o[0]), t.log_probability(), qq(t)
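The KL terms above are sums of exp(logp)*(logp - logq) over log-normalized vectors. A minimal illustrative sketch of that computation (the function name is hypothetical; numpy only):

import numpy

def kl_from_lognormalized(logp, logq):
    # KL(p || q), where logp and logq are already log-normalized numpy arrays
    return float(numpy.sum(numpy.exp(logp) * (logp - logq)))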
Example #38
        grammar.add_rule('LAMBDA_WORD', 'lambda', ['WORD'], 1.0, bv_type='WORD')
        grammar.add_rule('WORD', 'apply_', ['LAMBDA_WORD', 'WORD'], 1.0)
        
        p = InverseInlineProposal(grammar)
        
        """
        # Just look at some proposals
        for _ in xrange(200):    
            t = grammar.generate()
            print ">>", t
            #assert t.check_generation_probabilities(grammar)
            #assert t.check_parent_refs()
            
            for _ in xrange(10):
                t =  p.propose_tree(t)[0]
                print "\t", t
            
        """
        # Run MCMC -- more informative about f-b errors    
        from LOTlib.Inference.MetropolisHastings import MHSampler

        from LOTlib.Inference.Proposals.MixtureProposal import MixtureProposal          
        from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal
                
        h = make_h0(proposal_function=MixtureProposal([InverseInlineProposal(grammar), RegenerationProposal(grammar)] ))
        data = generate_data(100)
        for h in lot_iter(MHSampler(h, data)):
            print h.posterior_score, h.prior, h.likelihood, get_knower_pattern(h), h
        
            
Example #39
"""
Define a new kind of LOTHypothesis, that gives regex strings.

These have a special interpretation function that compiles differently than straight python eval.

"""
from LOTlib import lot_iter
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import qq
from Model import *

if __name__ == "__main__":
    for h in lot_iter(MHSampler(make_h0(), data, steps=10000)):
        print h.posterior_score, h.prior, h.likelihood, qq(h)
"""
        Define a new kind of LOTHypothesis, that gives regex strings.
        These have a special interpretation function that compiles differently than straight python eval.
"""

from LOTlib import lot_iter
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import qq

from Shared import data, make_h0

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == "__main__":

    for h in lot_iter(MHSampler(make_h0(), data, steps=10000)):
        print h.posterior_score, h.prior, h.likelihood, qq(h)