Example #1
def runparts(x,datamt):
    # Problem: right now only the last partition's results are recorded; the others are never saved.
    print "Start: " + str(x) + " on this many: " + str(datamt)
    messup = TopN(options.top)
    try:
        #make new TopN for each data amount
        topn= TopN(N=200, key="posterior_score")
        for p in break_ctrlc(partitions):
            print "Starting on partition ", p

            # Now we have to go in and fill in the nodes that are nonterminals
            # We can do this with generate
            v = grammar.generate(deepcopy(p))

            h0 = MyHypothesis(grammar, value=v)
            size = datamt
            data = [FunctionData(input=[],
                             output={'n i k': size, 'h i N': size, 'f a n': size, 'g i f': size, 'm a N': size, 'f a m': size, 'g i k': size, 'k a n': size, 'f a f': size, 'g i n': size, 'g i m': size, 'g i s': size, 's i f': size, 's i n': size, 'n i s': size, 's i m': size, 's i k': size, 'h a N': size, 'f i N': size, 'h i m': size, 'h i n': size, 'h a m': size, 'n i N': size, 'h i k': size, 'f a s': size, 'f i n': size, 'h i f': size, 'n i m': size, 'g i N': size, 'h a g': size, 's i N': size, 'n i n': size, 'f i m': size, 's i s': size, 'h i s': size, 'n a s': size, 'k a s': size, 'f i s': size, 'n i f': size, 'm i n': size, 's a s': size, 'f a g': size, 'k a g': size, 'k a f': size, 's a m': size, 'n a f': size, 'n a g': size, 'm i N': size, 's a g': size, 'f i k': size, 'k a m': size, 'n a n': size, 's a f': size, 'n a m': size, 'm a s': size, 'h a f': size, 'h a s': size, 'n a N': size, 'm i s': size, 's a n': size, 's a N': size, 'm i k': size, 'f a N': size, 'm i m': size, 'm a g': size, 'm a f': size, 'f i f': size, 'k a N': size, 'h a n': size, 'm a n': size, 'm a m': size, 'm i f': size})]



            for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, trace=False)):
                topn.add(h)
        return set(topn)

    except Exception:
        #if we fail, we can return a blank TopN
        return messup
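The comment above notes that only the last partition's results were being kept. A minimal sketch of one way to keep results from every partition, with a separate TopN per partial tree (the partition_tops dict is illustrative and not part of the original code; `data` is built exactly as in runparts above):

# Sketch: keep one TopN per partition instead of a single shared one,
# so every partition's results survive (illustrative only).
partition_tops = {}
for p in break_ctrlc(partitions):
    local_top = TopN(N=200, key="posterior_score")
    v = grammar.generate(deepcopy(p))
    h0 = MyHypothesis(grammar, value=v)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, trace=False)):
        local_top.add(h)
    partition_tops[str(p)] = local_top

# merge across partitions when a single set of hypotheses is needed
merged = set()
for t in partition_tops.values():
    merged.update(t)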
Example #2
def generate_unique_trees(grammar, start='START', N=1000):
    """
            Yield a bunch of unique trees, produced from the grammar
    """
    for _ in break_ctrlc(xrange(N)):
        t = grammar.generate(start)
        yield t
Example #3
    def runTest(self):
        NSAMPLES = 10000

        from LOTlib.DefaultGrammars import finiteTestGrammar as grammar

        from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis

        class MyH(LOTHypothesis):
            @attrmem('likelihood')
            def compute_likelihood(self, *args, **kwargs):
                return 0.0

            @attrmem('prior')
            def compute_prior(self):
                return grammar.log_probability(self.value)

        print "# Taking MHSampler for a test run"
        cnt = Counter()
        h0 = MyH(grammar=grammar)
        for h in break_ctrlc(
                MHSampler(h0, [], steps=NSAMPLES,
                          skip=10)):  # huh the skip here seems to be important
            cnt[h] += 1
        trees = list(cnt.keys())
        print "# Done taking MHSampler for a test run"

        ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
        # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

        Z = logsumexp([grammar.log_probability(t.value) for t in trees
                       ])  # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in trees]
        expc = [
            exp(grammar.log_probability(t.value) - Z) * sum(obsc) for t in trees
        ]

        # And plot here
        expc, obsc, trees = zip(*sorted(zip(expc, obsc, trees), reverse=True))
        import matplotlib.pyplot as plt
        plt.subplot(111)
        # Log here spaces things out at the high end, where we can see it!
        plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
        plt.scatter(log(range(len(trees))),
                    obsc,
                    color="blue",
                    marker="x",
                    alpha=1.)
        plt.savefig('finite-sampler-test.pdf')
        plt.clf()

        # Do chi squared test
        csq, pv = chisquare(obsc, expc)
        self.assertAlmostEqual(sum(obsc), sum(expc))

        # And examine
        for t, c, s in zip(trees, obsc, expc):
            print c, s, t
        print(csq, pv), sum(obsc)

        self.assertGreater(pv, 0.01, msg="Sampler failed chi squared!")
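The test compares how often the sampler visits each tree against the probability the grammar assigns it, using a chi-squared goodness-of-fit test. A self-contained sketch of that check with made-up numbers (the log probabilities and counts below are purely illustrative):

# Illustrative chi-squared check: expected counts are renormalized probabilities
# scaled to the total number of observed samples.
from numpy import exp, array
from scipy.stats import chisquare

logps = array([-0.5, -1.2, -2.3])        # hypothetical log probabilities of three trees
obs   = array([610, 290, 100])           # hypothetical observed visit counts
probs = exp(logps) / exp(logps).sum()    # renormalize to the trees we have
expc  = probs * obs.sum()                # expected counts on the same scale
csq, pv = chisquare(obs, expc)
print csq, pv                            # a large p-value means the counts match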
Example #4
def scheme_generate():
    """ This generates random scheme code with cons, cdr, and car, and evaluates it on some simple list
    structures.

    No inference here -- just random sampling from a grammar.
    """

    example_input = [
        [],
        [[]],
        [[], []],
        [[[]]]
        ]

    ## Generate some and print out unique ones
    seen = set()
    for i in break_ctrlc(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
Example #5
 def test_refs(self):
     """
         Test the setting of parents and rules
     """
     for _ in break_ctrlc(xrange(1000)):
         x = self.G.generate()
         self.assertTrue(x.check_parent_refs())
Example #6
File: Demo.py  Project: pratiksha/LOTlib
def standard_sample(make_hypothesis, make_data, skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
        Just a simplified interface for sampling, allowing printing (showing), returning the top, and saving.
        This is used by many examples, and is meant to easily allow running with a variety of parameters.
        NOTE: This skip is a skip *only* on printing
        **kwargs get passed to sampler
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, **kwargs))):
        best_hypotheses.add(h)

        if show and i%(skip+1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
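A hedged usage sketch for standard_sample, assuming a Model module exposing make_hypothesis and make_data as in the other examples on this page:

# Illustrative call; the Model import follows the pattern of the other examples.
from Model import make_hypothesis, make_data

top = standard_sample(make_hypothesis, make_data,
                      N=100,          # size of the returned TopN
                      skip=9,         # print every 10th sample
                      steps=100000)   # forwarded to MHSampler via **kwargs
for h in top.get_all():
    print h.posterior_score, h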
Example #7
    def plot_sampler(self, opath, sampler):
        """
        Plot the sampler, for cases with many zeros where chisquared won't work well
        """
        cnt = Counter()
        for h in break_ctrlc(sampler):
            cnt[h.value] += 1

        Z = logsumexp([ self.grammar.log_probability(t) for t in self.trees]) # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in self.trees]
        expc = [exp(self.grammar.log_probability(t)-Z)*sum(obsc) for t in self.trees]

        for t, c, s in zip(self.trees, obsc, expc):
            print c, "\t", s, "\t", t


        expc, obsc, trees = zip(*sorted(zip(expc, obsc, self.trees), reverse=True))

        import matplotlib.pyplot as plt
        from numpy import log
        plt.subplot(111)
        # Log here spaces things out at the high end, where we can see it!
        plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
        plt.scatter(log(range(len(trees))), obsc, color="blue", marker="x", alpha=1.)
        plt.savefig(opath)
        plt.clf()
Example #8
File: Subtrees.py  Project: ebigelow/LOTlib
def generate_unique_trees(grammar, start='START', N=1000):
    """
            Yield a bunch of unique trees, produced from the grammar
    """
    for _ in break_ctrlc(xrange(N)):
        t = grammar.generate(start)
        yield t
Example #9
 def test_check_generation_probabilities(self):
     """
         Test the generation probabilities
     """
     for _ in break_ctrlc(xrange(1000)):
         x = self.G.generate()
         self.assertTrue(x.check_generation_probabilities(self.G))
Example #10
    def test_lp_regenerate_propose_to(self):
        # import the grammar
        from Grammars import lp_regenerate_propose_to_grammar
        self.G = lp_regenerate_propose_to_grammar.g
        # the RegenerationProposal class
        rp = RegenerationProposal(self.G)
        numTests = 100
        # Sample 1000 trees from the grammar, and run a chi-squared test for each of them
        for i in break_ctrlc(range(numTests)):
            # keep track of expected and actual counts
            # expected_counts = defaultdict(int) # a dictionary whose keys are trees and values are the expected number of times we should be proposing to this tree
            actual_counts = defaultdict(int) # same as expected_counts, but stores the actual number of times we proposed to a given tree
            tree = self.G.generate('START')

            # Regenerate some number of trees at random
            numTrees = 1000
            for j in range(numTrees):
                newtree = rp.propose_tree(tree)[0]
                # trees.append(newtree)
                actual_counts[newtree] += 1
            # see if the frequency with which each category of trees is generated matches the
            # expected counts using a chi-squared test
            chisquared, p = self.get_pvalue(tree, actual_counts, numTrees)
            # print chisquared, p
            # if p > 0.01/1000, test passes
            self.assertTrue(p > 0.01/numTests, "Trees are not being generated according to the expected log probabilities")
            if i % 10 == 0 and i != 0: print i, "lp_regenerate_propose_to tests..."
        print numTests, "lp_regenerate_propose_to tests..."
Example #11
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeVariableLexiconData(eval(options.word),
                                   options.word,
                                   the_context,
                                   n=data_amount,
                                   s=options.s,
                                   alpha=options.alpha,
                                   verbose=True)

    h0 = KinshipLexicon(words=[options.word], alpha=options.alpha)
    h0.set_word(
        options.word,
        LOTHypothesis(grammar, value=None, display='lambda recurse_, C, X:%s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0,
                    data,
                    options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        if samples_yielded % 1000 == 0:
            print h.prior, h.likelihood, h
        hyps.add(h)

    return hyps
Example #12
def save_hypotheses(sampler, filename='numbergame_hypotheses.p'):
    hypotheses = set()
    for h in break_ctrlc(sampler):
        hypotheses.add(h)

    f = open(filename, "wb")
    pickle.dump(hypotheses, f)
    return hypotheses
Example #13
def save_hypotheses(sampler, filename='numbergame_hypotheses.p'):
    hypotheses = set()
    for h in break_ctrlc(sampler):
        hypotheses.add(h)

    f = open(filename, "wb")
    pickle.dump(hypotheses, f)
    return hypotheses
Example #14
def partitionMCMC(data,partitions):
    print data
    topn= TopN(N=200, key="posterior_score")
    for p in break_ctrlc(partitions):
        print "Starting on partition ", p

        # Now we have to go in and fill in the nodes that are nonterminals
        v = grammar.generate(deepcopy(p))

        #h0 = MyHypothesis(grammar, value=v)
        h0= make_hypothesis()
        print h0
        for h in break_ctrlc(MHSampler(h0, data, steps=5000, skip=0)):
            # Show the partition and the hypothesis
            print h.posterior_score, p, h, howyoudoin(h)
            topn.add(h)
    return set(topn)
Example #15
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED: return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)

    assert len(data) == 1
    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar,
                                      alphabet_size=len(language.terminals()))
    h0.set_word(
        0,
        h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?
        if LOTlib.SIG_INTERRUPTED: return 0, set()

        # and re-set the posterior or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
Example #16
File: Run.py  Project: ebigelow/LOTlib
def run():

    h0 = make_hypothesis()
    data = make_data()

    for x in break_ctrlc(MHSampler(h0, data, STEPS)):

        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(*di.input), " ; should be ", di.output
Example #17
def run():

    h0 = make_hypothesis()
    data = make_data()

    for x in break_ctrlc(MHSampler(h0, data, STEPS)):

        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(
                *di.input), " ; should be ", di.output
Example #18
    def __call__(self, generator):
        """Pass this a generator, add each element as it's yielded.

        This allows us to make a pipeline. See Example in main docstring: '# Or as a generator...'.

        """
        if hasattr(generator, 'data'):
            self.data = generator.data
        for sample in break_ctrlc(generator):
            self.add(sample)
            yield sample
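A minimal sketch of the pipeline this enables, assuming the method above lives on a TopN-like collector (the add() call suggests as much); the Model import is illustrative:

# Illustrative pipeline: samples stream through while the collector keeps the best ones.
from Model import make_hypothesis, make_data

collector = TopN(N=10)
for h in collector(MHSampler(make_hypothesis(), make_data(), steps=10000)):
    print h.posterior_score, h          # each sample was also add()-ed to collector
for best in collector.get_all():
    print "#", best.posterior_score, best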
Example #19
    def __call__(self, generator):
        """Pass this a generator, add each element as it's yielded.

        This allows us to make a pipeline. See Example in main docstring: '# Or as a generator...'.

        """
        if hasattr(generator, 'data'):
            self.data = generator.data
        for sample in break_ctrlc(generator):
            self.add(sample)
            yield sample
Example #20
    def test_eq(self):
        counter = 0
        for i in break_ctrlc(xrange(10000)):
            x = self.G.generate()
            y = self.G.generate()

            if pystring(x) == pystring(y):
                counter += 1
                # print(counter)
                #print( pystring(x)+'\n'+ pystring(y)+'\n')

            self.assertEqual( pystring(x) == pystring(y), x == y, "Without bvs, the pystrings should be the same")
Example #21
File: search.py  Project: piantado/LOTlib
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED: return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)

    assert len(data) == 1
    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([ (p/z)*log(p/z) for p in data[0].output.values() ])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar, alphabet_size=len(language.terminals()))
    h0.set_word(0, h0.make_hypothesis(grammar=grammar)) # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many do we add?
        if LOTlib.SIG_INTERRUPTED: return 0, set()

        # and re-set the posterior or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True )
                print "{" + ', '.join(["'%s':%s"% i for i in sortedv]) + "}"


        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
Example #22
    def runTest(self):
        NSAMPLES = 10000

        from LOTlib.DefaultGrammars import finiteTestGrammar as grammar

        from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis
        class MyH(LOTHypothesis):

            @attrmem('likelihood')
            def compute_likelihood(self, *args, **kwargs):
                return 0.0

            @attrmem('prior')
            def compute_prior(self):
                return grammar.log_probability(self.value)

        print "# Taking MHSampler for a test run"
        cnt = Counter()
        h0 = MyH(grammar=grammar)
        for h in break_ctrlc(MHSampler(h0, [], steps=NSAMPLES, skip=10)): # huh the skip here seems to be important
            cnt[h] += 1
        trees = list(cnt.keys())
        print "# Done taking MHSampler for a test run"

        ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
        # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

        Z = logsumexp([grammar.log_probability(t.value) for t in trees]) # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in trees]
        expc = [exp(grammar.log_probability(t.value) - Z)*sum(obsc) for t in trees]

        # And plot here
        expc, obsc, trees = zip(*sorted(zip(expc, obsc, trees), reverse=True))
        import matplotlib.pyplot as plt
        plt.subplot(111)
        # Log here spaces things out at the high end, where we can see it!
        plt.scatter(log(range(len(trees))), expc, color="red", alpha=1.)
        plt.scatter(log(range(len(trees))), obsc, color="blue", marker="x", alpha=1.)
        plt.savefig('finite-sampler-test.pdf')
        plt.clf()

        # Do chi squared test
        csq, pv = chisquare(obsc, expc)
        self.assertAlmostEqual(sum(obsc), sum(expc))

        # And examine
        for t, c, s in zip(trees, obsc, expc):
            print c, s, t
        print (csq, pv), sum(obsc)

        self.assertGreater(pv, 0.01, msg="Sampler failed chi squared!")
Example #23
def generate_data(data_size):
    all_words = target.all_words()
    data = []

    for i in break_ctrlc(xrange(data_size)):
        # a context is a set of men, pirates, and everything. functions are applied to this to get truth values
        context = sample_context()
        word = target.sample_utterance(all_words, context)
        data.append(
            UtteranceData(utterance=word,
                          context=context,
                          possible_utterances=all_words))

    return data
Example #24
File: Model.py  Project: TerryLew/BinLOTlib
def runme(chain, dataamt):

    if LOTlib.SIG_INTERRUPTED: return ()

    data = make_data(dataamt)

    tn = TopN(options.top)

    h0 = make_hypothesis()
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, skip=0)):
        # print h.posterior_score, h.prior, h.likelihood, h
        h.likelihood_per_data = h.likelihood/dataamt
        tn.add(h)

    return tn
Example #25
def myrun(observed_set):

    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)

    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)

    return set(tn.get_all())
Example #26
def myrun(observed_set):

    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)

    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)

    return set(tn.get_all())
Example #27
def run(data_amount):
    print "Starting chain on %s data points"%data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount, alpha=options.alpha)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(my_grammar, args=['recurse_','C', 'X']))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps, likelihood_temperature=options.llt, prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    return hyps
Example #28
 def test_setto(self):
     """
         Test the operation of setting a function node to another. 
     """
     for _ in break_ctrlc(xrange(1000)):
         x = self.G.generate()
         x0 = copy(x)
         y = self.G.generate()
         y_subnodes = y.subnodes()
         x.setto(y)
         
         self.assertTrue(x.check_parent_refs())
         self.assertTrue(x.check_generation_probabilities(self.G))
         
         for xi in x:
             self.assertTrue(xi in y_subnodes)
Example #29
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
        Just a simplified interface for sampling, allowing printing (showing), returning the top, and saving.
        This is used by many examples, and is meant to easily allow running with a variety of parameters.
        NOTE: This skip is a skip *only* on printing
        **kwargs get passed to sampler
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()


    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

#    # TODO change acceptance temperature over times
#    sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):

#        if i % 10000 == 0 and i != 0:
#            sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature+0.1)
#            print '='*50
#            print 'change acc temperature to', sampler.acceptance_temperature 

        best_hypotheses.add(h)

        if show and i%(show_skip+1) == 0:

            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
Example #30
File: search3.py  Project: piantado/LOTlib
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule("ATOM", q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")
    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    with open(
        prefix + "hypotheses_" + options.LANG + "_" + str(rank) + "_" + str(ndata) + "_" + suffix + ".txt", "a"
    ) as ofile:

        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i % options.SKIP == 0 and h.posterior_score > -Infinity:
                print >> ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata
                print >> ofile, getattr(h, "ll_counts", None)
                print >> ofile, h, "\0"  # must add \0 when not Lexicon

    return tn
Example #31
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    #print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many do we add?
        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer+1
        assert len(h0.value.keys())==h0.N==outer+1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            print h.posterior_score, h
            print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h) # must deepcopy

    return ndata, tn
Example #32
File: search.py  Project: joshrule/LOTlib
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    #print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many do we add?
        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer+1
        assert len(h0.value.keys())==h0.N==outer+1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            # print h.posterior_score, h
            # print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h) # must deepcopy

    return ndata, tn
Example #33
def run(data_pts):
    print "Start run on ", str(data_pts)

    y = [pt.Y for pt in data_pts]
    filename = "".join(y)

    hyps = TopN(N=options.TOP_COUNT)
    h0 = KinshipLexicon(alpha=options.ALPHA)
    h0.set_word('Word', LOTHypothesis(my_grammar, value=None, display='lambda recurse_, C, X:%s'))
    mhs = MHSampler(h0, data_pts, options.STEPS, likelihood_temperature=options.llt)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    with open(options.OUT_PATH + filename + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return filename, hyps
Example #34
File: Run.py  Project: ebigelow/LOTlib
def run():
    """A version that cares more about recent data, showing how to use
    Hypotheses.DecayedLikelihoodHypothesis.

    """
    G = grammar
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(G, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Run the MH

    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
Example #35
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")
    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:

        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i%options.SKIP == 0 and h.posterior_score > -Infinity:
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
                print >>ofile, getattr(h,'ll_counts', None)
                print >>ofile, h, '\0' # must add \0 when not Lexicon


    return tn
Example #36
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeVariableLexiconData(eval(options.word), options.word, the_context, n=data_amount, s=options.s,
                                   alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(words=[options.word], alpha=options.alpha)
    h0.set_word(options.word, LOTHypothesis(grammar, value=None, args=['recurse_', 'C', 'X']))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps, likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        #if samples_yielded % 100 == 0:
         #   print h.prior, h.likelihood, h
        hyps.add(h)

    return hyps
Example #37
File: Run.py  Project: ebigelow/LOTlib
def run_mh():
    """Run the MH."""
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # somewhat weirdly, we'll make an upper node above "START" for the two concepts
    # and require it to check if concept (an argument below) is 'A'
    grammar.add_rule('TWO_CONCEPT_START', 'if_', ['(concept==\'A\')', 'START', 'START'], 1.0)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, ALPHA=0.9, start='TWO_CONCEPT_START',
                                    args=['concept', 'x'])

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
Example #38
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    print data
    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    tn = TopN(N=options.TOP_COUNT)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data,
                                                steps=options.STEPS))):
        print h.posterior_score, h
        print getattr(h, 'll_counts', None)

    # with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
    #
    #     for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    #         tn.add(h)
    #         # print h.posterior_score, getattr(h, 'll_counts', None), h
    #         if i%options.SKIP == 0:
    #             print >>ofile, "\n"
    #             print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
    #             print >>ofile, getattr(h,'ll_counts', None),
    #             print >>ofile, h # ends in \0 so we can sort with sort -g -z

    return tn
Example #39
File: search2.py  Project: joshrule/LOTlib
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    print data
    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule("ATOM", q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    tn = TopN(N=options.TOP_COUNT)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
        print h.posterior_score, h
        print getattr(h, "ll_counts", None)

    # with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
    #
    #     for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    #         tn.add(h)
    #         # print h.posterior_score, getattr(h, 'll_counts', None), h
    #         if i%options.SKIP == 0:
    #             print >>ofile, "\n"
    #             print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
    #             print >>ofile, getattr(h,'ll_counts', None),
    #             print >>ofile, h # ends in \0 so we can sort with sort -g -z

    return tn
Example #40
File: Run.py  Project: ebigelow/LOTlib
def scheme_generate():
    """ This generates random scheme code with cons, cdr, and car, and evaluates it on some simple list
    structures.

    No inference here -- just random sampling from a grammar.

    """
    ## Generate some and print out unique ones
    seen = set()
    for i in break_ctrlc(xrange(10000)):
        x = grammar.generate('START')

        if x not in seen:
            seen.add(x)

            # make the function node version
            f = LOTHypothesis(grammar, value=x, args=['x'])

            print x.log_probability(), x
            for ei in example_input:
                print "\t", ei, " -> ", f(ei)
Example #41
def run(data_amount):
    print "Starting chain on %s data points"%data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount, alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(my_grammar, display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps, likelihood_temperature=options.llt, prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl'
    with open('Chains/' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
Example #42
 def test_substitute(self):
     """
         Test how substitution works
     """
     for _ in break_ctrlc(xrange(1000)):
         x = self.G.generate()
         y, _ = x.sample_subnode()
         oldy = copy(y)
         repl = self.G.generate(y.returntype)
         
         # pick a novel replacement (must NOT equal y)
         # NOTE: We can't just rejection sample here to pick repl because it may not be possible for a given x
         if repl == y: continue
         
         x.replace_subnodes(lambda z: z==y, repl)
         
         self.assertTrue(x.check_parent_refs())
         # We cannot check generation_probabilities because replace_subnodes breaks that!
         
         # and ensure that y is not left!
         for xi in x:
             self.assertTrue(xi != oldy, "\n%s\n%s\n%s\n%s"%(x,y,repl,xi))
Example #43
def run(hypothesis, data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target,
                           four_gen_tree_context,
                           n=data_amount,
                           alpha=options.alpha,
                           verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(
            w,
            LOTHypothesis(grammar=my_grammar,
                          value=hypothesis.value[w].value,
                          display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0,
                    data,
                    options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        if samples_yielded % 100 == 0:
            pass  #print h.likelihood, h.prior, h
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(
        data_amount) + data[0].word + '.pkl'
    with open(
            'Chains/' + data[0].X + data[0].Y + str(data_amount) +
            data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
Example #44
    def evaluate_sampler(self, sampler):

        cnt = Counter()
        for h in break_ctrlc(sampler):
            cnt[h.value] += 1

        ## TODO: When the MCMC methods get cleaned up for how many samples they return, we will assert that we got the right number here
        # assert sum(cnt.values()) == NSAMPLES # Just make sure we aren't using a sampler that returns fewer samples! I'm looking at you, ParallelTempering

        Z = logsumexp([self.grammar.log_probability(t) for t in self.trees]) # renormalize to the trees in self.trees
        obsc = [cnt[t] for t in self.trees]
        expc = [exp(self.grammar.log_probability(t) - Z)*sum(obsc) for t in self.trees]
        csq, pv = chisquare(obsc, expc)
        assert abs(sum(obsc) - sum(expc)) < 0.01

        # assert min(expc) > 5 # or else chisq sux

        for t, c, s in zip(self.trees, obsc, expc):
            print c, s, t
        print (csq, pv), sum(obsc)

        self.assertGreater(pv, PVALUE, msg="Sampler failed chi squared!")

        return csq, pv
Example #45
def evaluate_sampler(my_sampler, print_every=1000, out_aggregate=sys.stdout, trace=False, pthreshold=0.999, prefix=""):
    """
            Print the stats for a single sampler run

            *my_sampler* -- a generator of samples
            print_every -- display the output every this many steps
            out_hypothesis -- where we put hypothesis stats
            out_aggregate  -- where we put aggregate stats

            trace -- print every sample
            prefix -- display before lines
    """
    visited_at = defaultdict(list)

    startt = time()
    for n, s in break_ctrlc(enumerate(my_sampler)): # each sample should have an .posterior_score defined
        if trace: print "#", n, s

        visited_at[s].append(n)

        if (n%print_every)==0 and n>0:
            post =  sorted([x.posterior_score for x in visited_at.keys()], reverse=True) # the unnormalized posteriors of everything found
            ll   =  sorted([x.likelihood for x in visited_at.keys()], reverse=True)
            Z = logsumexp(post) # just compute total probability mass found -- the main measure

            # determine how many you need to get pthreshold of the posterior mass
            J=0
            while J < len(post):
                if logsumexp(post[J:]) < Z + log(1.0-pthreshold):
                    break
                J += 1

            out_aggregate.write('\t'.join(map(str, [prefix, n, r3(time()-startt), r5(Z), r5(post[0]), J, len(post)] )) + '\n')
            out_aggregate.flush()

    return
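A hedged usage sketch for evaluate_sampler, which only needs a generator of scored samples; the hypothesis and data constructors follow the pattern of the other examples here:

# Illustrative call; aggregate stats go to stdout every 1000 samples.
import sys
from Model import make_hypothesis, make_data

sampler = MHSampler(make_hypothesis(), make_data(), steps=100000)
evaluate_sampler(sampler, print_every=1000, out_aggregate=sys.stdout,
                 trace=False, prefix="MH")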
Example #46
class ParticleSwarmPriorResample(ParticleSwarm):
    """
    Like ParticleSwarm, but resamples from the prior
    """
    def refresh(self):
        """
            Resample by resampling those below the median from the prior.
        """
        m = median(self.chainZ)

        for i in range(self.nchains):
            if self.chainZ[i] < m:
                self.chains[i] = self.make_h0(**self.kwargs)
            self.chainZ[i] = -Infinity  # reset this


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == "__main__":
    from LOTlib.Examples.Number.Global import generate_data, make_h0

    data = generate_data(300)

    ps = ParticleSwarm(make_h0, data)
    for h in break_ctrlc(ps):
        print h.posterior_score, h

        if len(ps.seen) > 0:
            print "#", sorted(ps.seen,
                              key=lambda x: x.posterior_score,
                              reverse=True)[0]
Example #47
                probs=[v.posterior_score for v in population],
                log=True)

            try:
                kid = mutate(crossover(mom, dad))
            except (ProposalFailedException, NodeSamplingException):
                continue

            kid.compute_posterior(data)
            yield kid

            nextpopulation.append(kid)

            # # if MH_acceptance(population[i].posterior_score, kid.posterior_score, 0.0):
            # if kid.posterior_score > population[i].posterior_score:
            #     population[i] = kid
            #     yield kid
        population = nextpopulation


if __name__ == "__main__":
    from LOTlib import break_ctrlc
    from LOTlib.Examples.Number.Model import make_hypothesis, make_data
    from LOTlib.Miscellaneous import qq
    data = make_data(400)

    for h in break_ctrlc(
            genetic_algorithm(make_hypothesis, data, mutate_lot,
                              crossover_lot)):
        print h.posterior_score, h.get_knower_pattern(), qq(h)
Example #48
    def __iter__(self):
        for i, t in enumerate(self.grammar.enumerate()):

            if i >= self.steps:
                raise StopIteration

            h = self.make_h(value=t)
            h.compute_posterior(self.data)
            yield h


if __name__ == "__main__":

    from LOTlib import break_ctrlc

    #from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    #data = generate_data(100)
    #from LOTlib.Examples.RegularExpression.Shared import grammar, make_h0, data

    from LOTlib.Examples.Magnetism.Simple import make_data, make_hypothesis
    from LOTlib.Examples.Magnetism.Simple.Grammar import grammar

    #from LOTlib.Examples.RationalRules.Shared import grammar, data, make_h0

    for h in break_ctrlc(
            EnumerationInference(grammar,
                                 make_hypothesis,
                                 make_data(),
                                 steps=10000)):
        print h.posterior_score, h
Example #49
def print_subtree_adaptations(grammar,
                              hypotheses,
                              posteriors,
                              subtrees,
                              relative_KL=True):
    """
            Determine how useful it would be to explicitly define each subtree in H across
            all of the (corresponding) posteriors, as measured by KL from prior to posterior

            - hypotheses - a list of LOThypotheses
            - posteriors - [ [P(h|data) for h in hypotheses] x problems ]
            - subtrees   - a collection of (possibly partial) subtrees to try adapting

            We treat hyps as a fixed finite hypothesis space, and assume every subtree considered
            is *not* derived compositionally (although this could change in future versions)

            p - the probability of going to kids in randomly generating a subtree
            subtree_multiplier - how many times we sample a subtree from *each* node in each hypothesis
            relative_KL - compute summed KL divergence absolutely, or relative to the h.compute_prior()?

    """

    # compute the normalized posteriors
    Ps = map(lognormalize, posteriors)

    # Compute the baseline KL divergence so we can score relative to this
    if relative_KL:
        oldpriors = lognormalize(
            numpy.array([h.compute_prior() for h in hypotheses]))
        KL0s = [sum(exp(oldpriors) * (oldpriors - P)) for P in Ps]
    else:
        KL0s = [
            1.0 for P in Ps
        ]  # pretend everything just had KL of 1, so we score relatively

    ## Now process each, starting with the most simple
    for t in break_ctrlc(
            sorted(subtrees,
                   key=lambda t: grammar.log_probability(t),
                   reverse=True)):

        # Get some stats on t:
        tlp = grammar.log_probability(t)
        tnt = count_identical_nonterminals(
            t.returntype, t)  # How many times is this nonterminal used?

        # How many matches of t are there in each H?
        m = numpy.array(
            [count_subtree_matches(t, h.value) for h in hypotheses])
        ## TODO: There is a complication: partial patterns matching themselves.
        ##       For simplicity, we'll just take the *first* match, setting max(m)=1
        ##       In the future, we should change this to correctly handle and count
        ##       partial matches matching themselves
        m = (m >= 1) * 1
        assert max(m) == 1, "Error: " + str(t) + "\t" + str(m)

        # How many times is the nonterminal used, NOT counting in t?
        nt = numpy.array([
            count_identical_nonterminals(t.returntype, h.value)
            for h in hypotheses
        ]) - (tnt - 1) * m
        assert min(nt) >= 0, "Error: " + str(t)

        # And the PCFG prior *not* counting t
        q = lognormalize(
            numpy.array([grammar.log_probability(h.value)
                         for h in hypotheses]) - tlp * m)

        # The function to optimize
        def fnc(p):
            if p <= 0. or p >= 1.: return float("inf")  # enforce bounds

            newprior = lognormalize(q + log(p) * m + log(1. - p) * nt)

            kl = 0.0
            for P, kl0 in zip(Ps, KL0s):
                kl += sum(numpy.exp(newprior) * (newprior - P)) / kl0

            return kl

        ### TODO: This optimization should be analytically tractable...
        ###       but we need to check that it is convex! Any ideas?
        o = scipy.optimize.fmin(fnc,
                                numpy.array([0.1]),
                                xtol=0.0001,
                                ftol=0.0001,
                                disp=0)

        print fnc(o[0]), o[0], log(o[0]), grammar.log_probability(t), qq(t)
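A hedged sketch of how the inputs described in the docstring might be assembled: a fixed set of hypotheses, one vector of (log) posterior scores per problem, and candidate subtrees sampled from the grammar. The data construction, sizes, and make_data helper are illustrative only:

# Illustrative setup mirroring the docstring's description of the arguments.
hypotheses = [LOTHypothesis(grammar, value=grammar.generate()) for _ in xrange(100)]

posteriors = []                                   # one vector of scores per "problem"
for data in [make_data(10), make_data(100)]:      # make_data is assumed, as elsewhere
    posteriors.append([h.compute_posterior(data) for h in hypotheses])

subtrees = [grammar.generate() for _ in xrange(50)]   # candidate subtrees to adapt

print_subtree_adaptations(grammar, hypotheses, posteriors, subtrees)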
Example #50
File: Run.py  Project: TerryLew/BinLOTlib
from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from Model import *
from TargetConcepts import TargetConcepts
import pickle

NDATA = 20 # How many data points for each function?
NSTEPS = 100000
BEST_N = 500 # How many from each hypothesis to store

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = TopN(N=BEST_N)

if __name__ == "__main__":
    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TargetConcepts):

        # Set up the hypothesis
        h0 = make_hypothesis()

        # Set up some data
        data = make_data(NDATA, f)

        # Now run some MCMC
        fs = TopN(N=BEST_N, key="posterior_score")
        fs.add(break_ctrlc(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.update(fs)

    pickle.dump(all_hypotheses, open("hypotheses.pkl", 'w'))
Example #51
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis


def make_hypothesis(grammar=grammar, **kwargs):
    return RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)


if __name__ == "__main__":

    from LOTlib.TopN import TopN
    hyps = TopN(N=1000)

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib import break_ctrlc
    mhs = MHSampler(make_hypothesis(),
                    make_data(),
                    1000000,
                    likelihood_temperature=1.,
                    prior_temperature=1.)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        h.ll_decay = 0.
        hyps.add(h)

    import pickle
    with open('HypothesisSpace.pkl', 'w') as f:
        pickle.dump(hyps, f)
Example #52
        pickle.dump(hyps, f)

    return hyps


###################################################################################
# Main Running
###################################################################################

argarray = map(lambda x: [x], options.data_pts * options.chains)

if is_master_process():
    display_option_summary(options)

seen = set()
for fs in break_ctrlc(
        MPI_map(run, numpy.random.permutation(argarray), progress_bar=False)):
    for h in fs.get_all():
        if h not in seen:
            seen.add(h)
            if h.prior > -Infinity:
                print h.prior, \
                    h.likelihood, \
                    h

        #sys.stdout.flush()

#sys.stdout.flush()

import pickle
with open(options.out_path, 'w') as f:
    pickle.dump(seen, f)
Example #53
grammar.start = 'TWO_CONCEPT_START'

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from Model import make_hypothesis

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q


    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = make_hypothesis(grammar=grammar, args=['concept', 'x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
Example #54
                         log(before_same_children) - log(nrk)) + old_lp_below

        return [newt, f - b]


if __name__ == "__main__":

    from LOTlib import break_ctrlc
    #from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    #data = generate_data(300)

    ## NOTE: TO NORMALLY USE THIS, YOU MUST MIX WITH REGENERATION PROPOSAL -- ELSE NOT ERGODIC

    from LOTlib.Examples.Magnetism.Simple.Run import grammar, make_h0, data

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

    idp = InsertDeleteProposal(grammar)

    #data = generate_data(100)
    h = make_h0(proposal_function=idp)
    for h in break_ctrlc(MHSampler(h, data, 100000)):
        print h.posterior_score, h
    """
    for _ in xrange(100):
        t = grammar.generate()
        print "\n\n", t
        for _ in xrange(10):
            print "\t", idp.propose_tree(t)
    """
Example #55
def make_hypothesis(**kwargs):
    return MyHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(grammar, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(
            MHSampler(h0, data, 10000, skip=100, shortcut_likelihood=False)):
        print h.posterior_score, h.prior, h.likelihood, q(h)

    # This setup requires the *later* data to be upweighted, meaning that hypotheses that get
    # later data wrong should be given lower likelhood. But also with the decay, the overall
    # magnitude of the likelihood decreases.
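A minimal sketch of the decayed-likelihood idea the comment describes, assuming an exponential down-weighting of older data points (an illustration of the idea, not LOTlib's exact implementation):

# Illustration: the i-th oldest data point is weighted by exp(-decay * age),
# so later data count more and the total magnitude shrinks as decay grows.
from math import exp

def decayed_log_likelihood(single_lls, decay=1.0):
    """single_lls[i] is the log likelihood of the i-th data point, oldest first."""
    n = len(single_lls)
    return sum(exp(-decay * (n - 1 - i)) * ll for i, ll in enumerate(single_lls))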
Example #56
print "# Created L, NYes, NTrials, and HOutput of size %s" % len(L)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run inference
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from LOTlib import break_ctrlc

from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis

from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials,
                           Output)
mhs = MHSampler(h0, [], 100000, skip=0)

for s, h in break_ctrlc(enumerate(mhs)):

    print mhs.acceptance_ratio(), h.prior, h.likelihood,\
          h.value['alpha'].value[0], h.value['beta'].value[0],\
          h.value['prior_temperature'].value, h.value['likelihood_temperature'].value,\
          'RULES',\
          ' '.join([str(x) for x in h.value['rulep']['BOOL'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['PREDICATE'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['START'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['SET'].value ])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run gradient ascent
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# from LOTlib import break_ctrlc
#
Example #57
        self.penalty = penalty
        self.seen = Counter()

    def next(self):
        v = MHSampler.next(self)
        self.seen[v] += 1
        return v

    def compute_posterior(self, h, data, **kwargs):
        """
        Compute prior & likelihood for `h`, penalizing prior by how many samples have been generated so far.

        """
        return self.seen[h] * self.penalty + h.compute_posterior(
            data, **kwargs)


if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Examples.Number.Model import *
    from LOTlib.Miscellaneous import q

    data = make_data(500)
    h0 = NumberExpression(grammar)

    tmc = TabooMCMC(h0, data, steps=10000)

    for h in break_ctrlc(tmc):
        print tmc.seen[h], h.posterior_score, h.prior, h.likelihood, q(h)