Example #1
def probe_MHsampler(h,
                    language,
                    options,
                    name,
                    size=64,
                    data=None,
                    init_size=None,
                    iters_per_stage=None,
                    sampler=None,
                    ret_sampler=False):
    get_data = language.sample_data_as_FuncData
    evaluation_data = get_data(size, max_length=options.FINITE)

    if data is None:
        if init_size is None:
            data = evaluation_data
        else:
            data = get_data(n=size, max_length=init_size)

    if sampler is None:
        sampler = MHSampler(h, data)
    else:
        sampler.data = data

    best_hypotheses = TopN(N=options.TOP_COUNT)

    iter = 0

    for h in sampler:
        if iter == options.STEPS: break
        if iter % 100 == 0: print '---->', iter

        best_hypotheses.add(h)

        if iter % options.PROBE == 0:

            for h in best_hypotheses:
                h.compute_posterior(evaluation_data)
            Z = logsumexp([h.posterior_score for h in best_hypotheses])

            pr_data = get_data(1024, max_length=options.FINITE)
            weighted_score = 0
            for h in best_hypotheses:
                precision, recall = language.estimate_precision_and_recall(
                    h, pr_data)
                if precision + recall != 0:
                    f_score = precision * recall / (precision + recall)
                    weighted_score += np.exp(h.posterior_score - Z) * f_score
            weighted_score *= 2

            to_file([[iter, Z, weighted_score]], name)

        if init_size is not None and iter % iters_per_stage == 0:
            init_size += 2
            sampler.data = get_data(n=size, max_length=init_size)

        iter += 1

    if ret_sampler:
        return sampler
Example #2
    def __init__(self, h0, data, prior_schedule=None, likelihood_schedule=None, **kwargs):
        MHSampler.__init__(self, h0, data, **kwargs)

        if prior_schedule is None:
            prior_schedule = ConstantSchedule(1.0)
        if likelihood_schedule is None:
            likelihood_schedule = ConstantSchedule(1.0)

        self.prior_schedule = prior_schedule
        self.likelihood_schedule = likelihood_schedule
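Example #2 above is only the constructor half of a schedule-driven sampler; the matching next() override appears later as Example #30. A minimal sketch of how the two fragments fit together is below (the class name ScheduledMHSampler is an assumption, and ConstantSchedule is assumed to be a schedule object exposing next()):

class ScheduledMHSampler(MHSampler):

    def __init__(self, h0, data, prior_schedule=None, likelihood_schedule=None, **kwargs):
        MHSampler.__init__(self, h0, data, **kwargs)

        # default to constant (untempered) schedules
        if prior_schedule is None:
            prior_schedule = ConstantSchedule(1.0)
        if likelihood_schedule is None:
            likelihood_schedule = ConstantSchedule(1.0)

        self.prior_schedule = prior_schedule
        self.likelihood_schedule = likelihood_schedule

    def next(self):
        # set the temperatures from the schedules before each MH step
        self.prior_temperature = self.prior_schedule.next()
        self.likelihood_temperature = self.likelihood_schedule.next()

        return MHSampler.next(self)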
Example #4
def mpirun(d):
    """
    Generate NumberGameHypotheses using MPI.

    """
    if options.grammar_scale:
        grammar_ = grammar_gamma(grammar, options.grammar_scale)
    else:
        grammar_ = grammar
    h0 = NumberGameHypothesis(grammar=grammar_, domain=100, alpha=0.9)
    mh_sampler = MHSampler(h0, d.input, options.iters)
    # hypotheses = TopN(N=options.N)
    hypotheses = set()

    # This is a dict so we don't add duplicate hypothesis sets, e.g. h1() == [4], h2() == [4]
    h_sets = {}

    for h in break_ctrlc(mh_sampler):
        h_set = str(h())
        if h_set in h_sets:
            if h.prior > h_sets[h_set].prior:
                hypotheses.remove(h_sets[h_set])
                h_sets[h_set] = h
                hypotheses.add(h)
        else:
            h_sets[h_set] = h
            hypotheses.add(h)

    top1000 = sorted(hypotheses, key=lambda h: -h.posterior_score)[0:1000]
    return top1000
Example #5
def construct_hypothesis_space(data_size):
    all_hypotheses = TopN()
    print 'Data size: ', data_size
    for i in range(RUNS):
        print 'Run: ', i
        hypotheses = TopN(25)
        data = generate_data(data_size)
        learner = GriceanQuantifierLexicon(make_my_hypothesis,
                                           my_weight_function)
        for w in target.all_words():
            learner.set_word(w, make_my_hypothesis())
        j = 0
        for h in MHSampler(learner, data, SAMPLES, skip=0):
            hypotheses.add(h)
            j += 1
            if j > 0 and j % 1000 == 0:
                pickle.dump(
                    hypotheses,
                    open(
                        'data/hypset_' + GRAMMAR_TYPE + '_' + str(data_size) +
                        '_' + str(j) + '.pickle', 'w'))
            #sstr = str(h)
            #sstr = re.sub("[_ ]", "", sstr)
            #sstr = re.sub("presup", u"\u03BB A B . presup", sstr)
            #print sstr
        all_hypotheses.update(hypotheses)
    return all_hypotheses
Example #6
def some_stuff():


    #simulate some data with high probability of
    #center embedding
    #and a lower probability of tail recursion
    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]
    d = simulateData(lst, randomTstLists, p=1.0, N=12)

    for k in d.keys():
        if d[k] > 0.0:
            print k, d[k]
    print len(d)

    #print isValidCenterEmbed(("(", "]"))
    #print isValidCenterEmbed(('(', '(', ')', ']'))

    data = [ FunctionData(input=(), output=d, alpha=0.9) ]
    h0 = MyHypothesis()
    from numpy import exp
    for h in MHSampler(h0, data, steps=5000):
        None
        #exp(h.compute_likelihood(data)) > 0.0 or
        y = h()
        #print h, y
        if isValidCenterEmbed(y):
            #print "hello"
            #None
            print exp(h.compute_posterior(data)), h, y#, isValidCenterEmbed(y)
        #else:
           # print h.compute_likelihood(data), h, y
Example #7
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED: return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)

    assert len(data) == 1
    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar,
                                      alphabet_size=len(language.terminals()))
    h0.set_word(
        0,
        h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many do we add?
        if LOTlib.SIG_INTERRUPTED: return 0, set()

        # and re-set the posterior or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
Example #8
def get_top_N(pair1, pair2):
    priors = {}
    complete = 0
    for p1 in pair1:
        for p2 in pair2:
            data = [
                FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})
            ]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()

            chains = 0
            while ((len(seen) < n_top and chains < max_chains)
                   or (len(seen) < 3)):
                chains += 1
                x = 0
                for h in MHSampler(h0,
                                   data,
                                   steps=steps,
                                   acceptance_temperature=acc_temp):
                    #print y
                    out = h(p1)[:len(p2)]

                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:  #and "from" not in str_h[14:]:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)

                    if x % 1000 == 0:
                        print_star(x, h, out, p2, h.value.get_rule_signature(),
                                   len(seen))

                    x += 1

            print_star()
            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append(
                    (copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))
            complete += 1
            print "complete: %d" % complete

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p

        print "***"

    return priors
Example #9
def run():

    h0 = make_hypothesis()
    data = make_data()

    for x in break_ctrlc(MHSampler(h0, data, STEPS)):

        print x.posterior_score, x
        for di in data:
            print "\t", di.input, "->", x(
                *di.input), " ; should be ", di.output
Example #10
def run_one(iteration, model, model2data, sampler_type):
    """
    Run one iteration of a sampling method
    """

    if LOTlib.SIG_INTERRUPTED: return

    # Take model and load the function to create hypotheses
    # Data is passed in to be constant across runs
    if re.search(r":", model):
        m, d = re.split(r":", model)
        make_hypothesis, _ = load_example(m)
    else:
        make_hypothesis, _ = load_example(model)


    h0 = make_hypothesis()
    grammar = h0.grammar
    data = model2data[model]

    # Create a sampler
    if sampler_type == 'mh_sample_A':               sampler = MHSampler(h0, data, options.SAMPLES,  likelihood_temperature=1.0)
    # elif sampler_type == 'mh_sample_B':             sampler = MHSampler(h0, data, options.SAMPLES,  likelihood_temperature=1.1)
    # elif sampler_type == 'mh_sample_C':             sampler = MHSampler(h0, data, options.SAMPLES,  likelihood_temperature=1.25)
    # elif sampler_type == 'mh_sample_D':             sampler = MHSampler(h0, data, options.SAMPLES,  likelihood_temperature=2.0 )
    # elif sampler_type == 'mh_sample_E':             sampler = MHSampler(h0, data, options.SAMPLES,  likelihood_temperature=5.0 )
    elif sampler_type == 'particle_swarm_A':        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=10)
    elif sampler_type == 'particle_swarm_B':        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=100)
    elif sampler_type == 'particle_swarm_C':        sampler = ParticleSwarm(make_hypothesis, data, steps=options.SAMPLES, within_steps=200)
    elif sampler_type == 'particle_swarm_prior_sample_A':        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=10)
    elif sampler_type == 'particle_swarm_prior_sample_B':        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=100)
    elif sampler_type == 'particle_swarm_prior_sample_C':        sampler = ParticleSwarmPriorResample(make_hypothesis, data, steps=options.SAMPLES, within_steps=200)
    elif sampler_type == 'multiple_chains_A':       sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=10)
    elif sampler_type == 'multiple_chains_B':       sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=100)
    elif sampler_type == 'multiple_chains_C':       sampler = MultipleChainMCMC(make_hypothesis, data, steps=options.SAMPLES, nchains=1000)
    elif sampler_type == 'parallel_tempering_A':    sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10, temperatures=[1.0, 1.025, 1.05], swaps=1, yield_only_t0=False)
    elif sampler_type == 'parallel_tempering_B':    sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10, temperatures=[1.0, 1.25, 1.5], swaps=1, yield_only_t0=False)
    elif sampler_type == 'parallel_tempering_C':    sampler = ParallelTemperingSampler(make_hypothesis, data, steps=options.SAMPLES, within_steps=10, temperatures=[1.0, 2.0, 5.0], swaps=1, yield_only_t0=False)
    elif sampler_type == 'taboo_A':                 sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty= 0.001)
    elif sampler_type == 'taboo_B':                 sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty= 0.010)
    elif sampler_type == 'taboo_C':                 sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty= 0.100)
    elif sampler_type == 'taboo_D':                 sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty= 1.000)
    elif sampler_type == 'taboo_E':                 sampler = TabooMCMC(h0, data, steps=options.SAMPLES, skip=0, penalty=10.000)
    # elif sampler_type == 'partitionMCMC_A':         sampler = PartitionMCMC(grammar, make_hypothesis, data, 10, steps=options.SAMPLES)
    # elif sampler_type == 'partitionMCMC_B':         sampler = PartitionMCMC(grammar, make_hypothesis, data, 100, steps=options.SAMPLES)
    # elif sampler_type == 'partitionMCMC_C':         sampler = PartitionMCMC(grammar, make_hypothesis, data, 1000, steps=options.SAMPLES)
    elif sampler_type == 'enumeration_A':           sampler = EnumerationInference(grammar, make_hypothesis, data, steps=options.SAMPLES)
    else: assert False, "Bad sampler type: %s" % sampler_type

    # And open our output and evaluate
    with open("output/out-aggregate.%s" % get_rank(), 'a') as out_aggregate:
        evaluate_sampler(sampler, trace=False, prefix="\t".join(map(str, [model, iteration, sampler_type])),
                         out_aggregate=out_aggregate, print_every=options.PRINTEVERY)
Example #11
def runme(chain, dataamt):

    if LOTlib.SIG_INTERRUPTED: return ()

    data = make_data(dataamt)

    tn = TopN(options.top)

    h0 = make_hypothesis()
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, skip=0)):
        # print h.posterior_score, h.prior, h.likelihood, h
        h.likelihood_per_data = h.likelihood/dataamt
        tn.add(h)

    return tn
Example #12
def run(data, TOP=100, STEPS=1000):
    #if LOTlib.SIG_INTERRUPTED:
      #  return ""
    #data = [FunctionData(input=(), output={lst: len(lst)})]
    h0 = MyHypothesis()
    tn = TopN(N=TOP)
    # run the sampler
    counter = Counter()
    for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=1.0, likelihood_temperature=1.0):#, likelihood_temperature=10.0):
        # counter[h] += 1
        tn.add(h)

    z = logsumexp([h.posterior_score for h in tn])
    sort_post_probs = [(h, exp(h.posterior_score - z)) for h in tn.get_all(sorted=True)][::-1]
    return sort_post_probs
Example #13
def myrun(observed_set):

    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)

    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)

    return set(tn.get_all())
Example #14
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
        Just a simplified interface for sampling, allowing printing (showing), returning the top, and saving.
        This is used by many examples, and is meant to easily allow running with a variety of parameters.
        NOTE: show_skip is a skip *only* on printing, not on sampling.
        **kwargs are passed on to the sampler (MHSampler).
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()


    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

#    # TODO change acceptance temperature over times
#    sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):

#        if i % 10000 == 0 and i != 0:
#            sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature+0.1)
#            print '='*50
#            print 'change acc temperature to', sampler.acceptance_temperature 

        best_hypotheses.add(h)

        if show and i%(show_skip+1) == 0:

            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
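Going by the docstring and signature above, a typical call looks like the sketch below; make_hypothesis and make_data stand for the per-model constructors used throughout these examples, the keyword values are illustrative only, and extra keywords such as steps are forwarded to MHSampler:

if __name__ == "__main__":
    # keep the 100 best hypotheses, print every 10th sample, skip saving,
    # and forward steps=... to MHSampler through **kwargs
    top = standard_sample(make_hypothesis, make_data,
                          show_skip=9, show=True, N=100,
                          save_top=None, steps=100000)

    for h in top.get_all(sorted=True):
        print h.posterior_score, h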
Example #15
def run(save_file, alpha, iters, propose_scale, propose_n, skip, summary_cap):
    # Faux data
    data = [
        HumanData(
            data=FunctionData(input=[2,4,6,8], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (17, 3), (15, 5), (19, 1), (20, 0), (2, 18))
        ),
        HumanData(
            data=FunctionData(input=[10, 40], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (20, 0), (20, 0), (2, 18), (19, 1), (2, 18))
        )
    ]


    # Enumerate all 'domain level' hypotheses generated by our grammar
    hypotheses = []
    for fn in simple_grammar.enumerate(d=10):
        h = NumberGameHypothesis(grammar=simple_grammar, domain=100, alpha=alpha)
        h.set_value(fn)
        h.compute_prior()
        hypotheses.append(h)

    grammar_h0 = GrammarHypothesisVectorized(simple_grammar, hypotheses,
                                             propose_scale=propose_scale, propose_n=propose_n)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=summary_cap)

    print '^*'*60, '\nGenerating GrammarHypothesis Samples\n', '^*'*60

    # Initialize csv file
    mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):
        if (i % 10 == 0):
            print i, " ITERATIONS"
            print '\n', '#'*100

        # Save to CSV & print grammar rule values
        if (i % skip == 0):
            mh_grammar_summary.csv_appendfiles(save_file, data)
            for idx in grammar_h0.get_propose_idxs():
                print idx, '\t|  ', grammar_h0.rules[idx]

    mh_grammar_summary.pickle_summary(filename=save_file + '_summary.p')
Example #16
def get_top_N(pair1, pair2):
    priors = {}
    for p1 in pair1:
        for p2 in pair2:
            data = [
                FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})
            ]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()

            x = 0
            while len(top_hyps) < n_top * 2:
                for h in MHSampler(h0, data, steps=steps):
                    #print y
                    out = h(p1)[:len(p2)]

                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)

                    if x % 1000 == 0:
                        print p1, p2
                        print_star(x, h, out, p2, h.value.get_rule_signature())

                    x += 1
            print_star()
            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append(
                    (copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p

        print "***"

    return priors
Example #17
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    #print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many do we add?
        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer+1
        assert len(h0.value.keys())==h0.N==outer+1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            print h.posterior_score, h
            print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h) # must deepcopy

    return ndata, tn
Example #18
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")
    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:

        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i%options.SKIP == 0 and h.posterior_score > -Infinity:
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
                print >>ofile, getattr(h,'ll_counts', None)
                print >>ofile, h, '\0' # must add \0 when not Lexicon


    return tn
Example #19
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    print data
    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    tn = TopN(N=options.TOP_COUNT)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data,
                                                steps=options.STEPS))):
        print h.posterior_score, h
        print getattr(h, 'll_counts', None)

    # with open(prefix+'hypotheses_'+options.LANG+'_'+str(rank)+'_'+str(ndata)+'_'+suffix+".txt", 'a') as ofile:
    #
    #     for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    #         tn.add(h)
    #         # print h.posterior_score, getattr(h, 'll_counts', None), h
    #         if i%options.SKIP == 0:
    #             print >>ofile, "\n"
    #             print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood/ndata
    #             print >>ofile, getattr(h,'ll_counts', None),
    #             print >>ofile, h # ends in \0 so we can sort with sort -g -z

    return tn
Example #20
    def runTest(self):

        for model in [
                'EvenOdd', 'FOL', 'Magnetism.Simple', 'Magnetism.Complex',
                'NAND', 'Number', 'RegularExpression', 'RationalRules',
                'StochasticGrammarInduction', 'SymbolicRegression.Galileo',
                'SymbolicRegression.Symbolic', 'Prolog', 'PureLambda', 'Lua'
        ]:
            print "# Testing loading of example", model

            make_hypothesis, make_data = load_example(model)

            d = make_data()
            d = make_data(10)  # require an amount

            # Let's just try initializing a bunch of times
            for _ in xrange(100):
                h0 = make_hypothesis()

            # and ensure that the sampling will run
            for _ in MHSampler(h0, d, steps=100):
                pass
Example #21
def run(data_size, my_finite_trees):
    data = generate_data(data_size)

    # the prior for each tree
    prior = np.array([x.compute_prior() for x in my_finite_trees])
    prior = prior - logsumexp(prior)

    # the likelihood weights for each hypothesis
    weights = np.array([my_weight_function(h) for h in my_finite_trees])
    # response[h,di] gives the response of the h'th tree to data di
    response = np.array(
        [mapto012(get_tree_set_responses(t, data)) for t in my_finite_trees])

    # Now actually run:
    hypset = TopN(N=TOP_COUNT)

    learner = VectorizedLexicon_DistanceMetricProposal(target.all_words(),
                                                       my_finite_trees, prior)
    databundle = [response, weights]
    generator = MHSampler(learner, databundle, STEPS, skip=SKIP)
    for g in generator:
        hypset.add(VectorizedLexicon_to_SimpleLexicon(g), g.posterior_score)
    return hypset
Example #22
        h0.start_counts = start_counts
        #for h in grammar.get_rule
        #print h0.__dict__.get('rrAlpha', 1.0)

        h0.set_value(value=h)
        h0.compute_prior()
        h0.compute_likelihood(data)
        print s, h0.value, exp(h0.prior)  #, h0.likelihood
        s += 1
    #unit_tests()
    assert (False)
    stp = 0
    t1 = time.time()
    best = None
    best_posterior = None
    for h in SampleStream(MHSampler(h0, data, steps=stps)):

        r = h()
        if best_posterior == None or h.posterior_score >= best_posterior:
            best = copy.deepcopy(h)
            best_out = r
            best_posterior = h.posterior_score

        if stp % 500 == 0:
            print stp, float(stp + 1) / (time.time() - t1)
            try:
                print hamming_distance(lst, r[:len(lst)])
            except:
                print len(lst)
            print best
            print best_out
Example #23
    # print "#\t Loaded human data for concept %s" % concept

print "# Created L, NYes, NTrials, and HOutput of size %s" % len(L)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run inference
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from LOTlib import break_ctrlc

from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis

from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)
mhs = MHSampler(h0, [], 100000, skip=0)

for s, h in break_ctrlc(enumerate(mhs)):

    print mhs.acceptance_ratio(), h.prior, h.likelihood,\
          h.value['alpha'].value[0], h.value['beta'].value[0],\
          h.value['prior_temperature'].value, h.value['likelihood_temperature'].value,\
          'RULES',\
          ' '.join([str(x) for x in h.value['rulep']['BOOL'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['PREDICATE'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['START'].value ]),\
          ' '.join([str(x) for x in h.value['rulep']['SET'].value ])


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run gradient ascent
Example #24
def make_hypothesis(**kwargs):
    return MyHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q

    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    h0 = MyHypothesis(grammar, ll_decay=1.0, rrAlpha=1.0, args=['x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(
            MHSampler(h0, data, 10000, skip=100, shortcut_likelihood=False)):
        print h.posterior_score, h.prior, h.likelihood, q(h)

    # This setup requires the *later* data to be upweighted, meaning that hypotheses that get
    # later data wrong should be given lower likelihood. But also with the decay, the overall
    # magnitude of the likelihood decreases.
Example #26
            #output=stim)]

            h0 = MyHypothesis()

            MAP = None
            best_post = -float("inf")
            best_out = ""

            n_comp = n_compatible(stim, concepts)
            print stim, n_comp

            while n_compatible(stim, concepts) < 2:

                for h in MHSampler(h0,
                                   data,
                                   steps=STEPS,
                                   acceptance_temperature=2.):

                    out = h(all_C)
                    str_h = str(h)

                    if out not in concepts:
                        concepts[out] = []

                    if str_h not in seen:
                        if len(concepts[out]) < 200 or np.exp(h.prior) > min(
                            [x[0] for x in concepts[out]]):
                            # if len(concepts[out]) > 0:
                            # print len(concepts[out]), np.exp(h.prior), min([x[0] for x in concepts[out]])

                            concepts[out].append(
Example #27
def run(pairs):

    priors = {}
    complete = 0
    top_hyps = set()
    already_done = set()
    t_start = time.time()

    for pair in pairs:
        p1 = pair[0]
        p2 = pair[1]

        h0 = MyHypothesis()
        t_pair = time.time()
        #h0.start_counts = add_counts

        seen = set()
        #for ind in xrange(2, 3):
        for ind in xrange(len(p1) + 1):

            seen_round = set()
            x = 0
            p1_i = p1[:ind]
            p2_i = p2[:ind]
            if (p1, p2_i) not in already_done:
                already_done.add((p1, p2_i))
                data = [
                    FunctionData(alpha=alpha,
                                 input=[p1],
                                 output={p2_i: len(p2_i)})
                ]

                while len(seen_round) < n_top:
                    for h in MHSampler(h0,
                                       data,
                                       steps=steps,
                                       acceptance_temperature=acc_temp,
                                       prior_temperature=prior_temp):
                        if len(seen_round) >= n_top:
                            break
                        str_h = str(h.value)
                        out = h(p1)[:len(p2_i)]
                        if (len(out) == len(p2_i)
                                and (hamming_distance(out, p2_i) == 0)
                                and (len(h(p1)[:len(p1)]) == len(p1))):
                            if str_h not in seen:  #and "from" not in str_h[14:]:
                                l_rules = [
                                    str(i) for i in list(
                                        numpy.hstack(
                                            get_rule_counts(grammar, h.value)))
                                ]
                                top_hyps.add(
                                    (toAB(p1), ind, copy.deepcopy(h), toAB(p2),
                                     toAB(h(p1_i)[:len(p1)]),
                                     ",".join(l_rules), str(h.value)))
                                seen.add(str_h)
                            if str_h not in seen_round:
                                seen_round.add(str_h)

                        if x % 1000 == 0:
                            print_star(
                                "seen:%d" % len(seen_round), "steps:%d" % x,
                                "hyp:%s" % str_h, "p2:%s" % p2_i,
                                "out:%s" % out, "prior:%f" % h.prior,
                                "pair_time:%.2f" % (time.time() - t_pair),
                                "tot_time:%.2f" % (time.time() - t_start))

                        x += 1

        for h in top_hyps:
            print_star(h[0], h[1], h[2], h[3], h[4], h[5])

    return top_hyps
Example #28
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis


def make_hypothesis(grammar=grammar, **kwargs):
    return RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)


if __name__ == "__main__":

    from LOTlib.TopN import TopN
    hyps = TopN(N=1000)

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib import break_ctrlc
    mhs = MHSampler(make_hypothesis(),
                    make_data(),
                    1000000,
                    likelihood_temperature=1.,
                    prior_temperature=1.)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        h.ll_decay = 0.
        hyps.add(h)

    import pickle
    with open('HypothesisSpace.pkl', 'w') as f:
        pickle.dump(hyps, f)
Example #29
# Stash counts for viz
with open('Viz/Counts_' + MODEL + '.csv', 'w') as f:
    f.writelines('\n'.join([','.join([str(r) for r in h0]) + ',' + ','.join([str(r) for r in h]) for h0, h in zip(counts['BOOL'], counts['PREDICATE'])]))

print "# Computed counts for each hypothesis & nonterminal"

from LOTlib.Inference.GrammarInference.SimpleGrammarHypothesis import SimpleGrammarHypothesis
from LOTlib.Inference.GrammarInference.FullGrammarHypothesis import FullGrammarHypothesis

from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

h0 = SimpleGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)
# h0 = FullGrammarHypothesis(counts, L, GroupLength, prior_offset, NYes, NTrials, Output)

writ = []
mhs = MHSampler(h0, [], 100, skip=500)
for s, h in break_ctrlc(enumerate(mhs)):


    if isinstance(h, SimpleGrammarHypothesis):
        a = str(mhs.acceptance_ratio()) + ',' + str(h.prior) + ',' + str(h.likelihood) +  ',BOOLS,' +\
            ','.join([str(x) for x in h.value['BOOL'].value ]) + ',PREDS,' + ','.join([str(x) for x in h.value['PREDICATE'].value ])
    else:
        assert isinstance(h, FullGrammarHypothesis)
        a = str(mhs.acceptance_ratio()) + ',' + str(h.prior) + ',' + str(h.likelihood) +  ',' + \
        str(h.value['alpha'].value[0]) + ',' + str(h.value['beta'].value[0]) + ',' + \
        str(h.value['prior_temperature']) + ',' + str(h.value['likelihood_temperature'])  + ',RULES,' +\
            ','.join([str(x) for x in h.value['rulep']['PREDICATE'].value ])
    print a
    writ.append(a)
Example #30
    def next(self):
        # Just set the temperatures by the schedules
        self.prior_temperature      = self.prior_schedule.next()
        self.likelihood_temperature = self.likelihood_schedule.next()

        return MHSampler.next(self)
Example #31
import pickle  # used to save the collected hypotheses at the end

from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from Model import *
from TargetConcepts import TargetConcepts

NDATA = 20 # How many data points for each function?
NSTEPS = 100000
BEST_N = 500 # How many from each hypothesis to store

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = TopN(N=BEST_N)

if __name__ == "__main__":
    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TargetConcepts):

        # Set up the hypothesis
        h0 = make_hypothesis()

        # Set up some data
        data = make_data(NDATA, f)

        # Now run some MCMC
        fs = TopN(N=BEST_N, key="posterior_score")
        fs.add(break_ctrlc(MHSampler(h0, data, steps=NSTEPS, trace=False)))

        all_hypotheses.update(fs)

    pickle.dump(all_hypotheses, open("hypotheses.pkl", 'w'))
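The pickled TopN can be read back in a later session; a minimal loading sketch (the filename simply mirrors the dump above, and get_all(sorted=True) is used the same way as in the other examples here):

import pickle

with open("hypotheses.pkl", 'r') as f:
    hypotheses = pickle.load(f)

for h in hypotheses.get_all(sorted=True):
    print h.posterior_score, h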
Example #32
                         log(before_same_children) - log(nrk)) + old_lp_below

        return [newt, f - b]


if __name__ == "__main__":

    from LOTlib import break_ctrlc
    #from LOTlib.Examples.Number.Shared import grammar, make_h0, generate_data
    #data = generate_data(300)

    ## NOTE: TO NORMALLY USE THIS, YOU MUST MIX WITH REGENERATION PROPOSAL -- ELSE NOT ERGODIC

    from LOTlib.Examples.Magnetism.Simple.Run import grammar, make_h0, data

    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

    idp = InsertDeleteProposal(grammar)

    #data = generate_data(100)
    h = make_h0(proposal_function=idp)
    for h in break_ctrlc(MHSampler(h, data, 100000)):
        print h.posterior_score, h
    """
    for _ in xrange(100):
        t = grammar.generate()
        print "\n\n", t
        for _ in xrange(10):
            print "\t", idp.propose_tree(t)
    """
Example #33
grammar.start = 'TWO_CONCEPT_START'

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from Model import make_hypothesis

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":

    from LOTlib import break_ctrlc
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
    from LOTlib.Miscellaneous import q


    # Create an initial hypothesis
    # This is where we set a number of relevant variables -- whether to use RR, alpha, etc.
    # Here we give args as "concept" (used in TWO_CONCEPT_START above) and "x"
    h0 = make_hypothesis(grammar=grammar, args=['concept', 'x'])

    data = make_data()

    # Run the vanilla sampler. Without steps, it will run infinitely
    # this prints out posterior (posterior_score), prior, likelihood,
    for h in break_ctrlc(MHSampler(h0, data, 10000, skip=100)):
        print h.posterior_score, h.prior, h.likelihood, q(h)
Example #34
def run(grammar=lot_grammar, mixture_model=0, data=toy_exp_3,
        iters=10000, skip=10, cap=100, print_stuff='sgr',
        ngh='out/ngh_100k', hypotheses=None, domain=100, alpha=0.9,
        save_file='', csv_freq=500,
        pickle_summary=False, pickle_gh=0):
    """
    Enumerate some NumberGameHypotheses, then use these to sample some GrammarHypotheses over `data`.

    Arguments
    ---------
    grammar : LOTlib.Grammar
        This is our grammar.
    mixture_model : bool
        Whether to use MixtureGrammarHypothesis (otherwise NoConstGrammarHypothesis).
    data : list
        List of FunctionData to use as input/output data.
    ngh : str
        File to save/load our NumberGameHypotheses to/from; if the string contains
        'enum', hypotheses are instead enumerated to the depth given by its digits.
    iters : int
        Number of GrammarHypotheses to sample.
    skip : int
        Collect 1 gh sample every `skip` samples.
    cap : int
        VectorSummary will collect this many GrammarHypothesis samples.
    print_stuff : str
        What do we print? ['s' | 'g' | 'r']
    save_file : str
        If we're pickling or saving csvs, this is the file name to save to.
    # csv_file : str
    #     If saving to csv, this is the file name to save to (don't include .csv!).
    # csv_compare_model : int
    #     Do we save model comparison (regression) plots as we iterate? These take ~15 minutes to save.

    """
    # --------------------------------------------------------------------------------------------------------

    if mixture_model:
        ParameterHypothesis = MixtureGrammarHypothesis
    else:
        ParameterHypothesis = NoConstGrammarHypothesis

    # --------------------------------------------------------------------------------------------------------
    # Load NumberGameHypotheses

    if hypotheses is None:
        # In case we want to enumerate hypotheses instead of loading from file
        if 'enum' in ngh:
            hypotheses = []
            for fn in grammar.enumerate(d=int(re.sub('[a-z]', '', ngh))):
                h = NumberGameHypothesis(grammar=grammar, domain=domain, alpha=alpha)
                h.set_value(fn)
                h.compute_prior()
                hypotheses.append(h)
            ngh += '.p'
        # Load NumberGameHypotheses
        else:
            f = open(ngh, "rb")
            hypotheses = pickle.load(f)
            for h in hypotheses:
                h.grammar = grammar

    # --------------------------------------------------------------------------------------------------------
    # Fill VectorSummary

    grammar_h0 = ParameterHypothesis(grammar, hypotheses, ngh_file=ngh, propose_scale=.1, propose_n=1)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=cap)

    # Print all GrammarRules in grammar with corresponding value index
    if 'r' in print_stuff:
        print '='*100, '\nGrammarRules:'
        for idx in grammar_h0.get_propose_idxs():
            print idx, '\t|  ', grammar_h0.rules[idx]

    if 's' in print_stuff:
        print '^*'*60, '\nGenerating GrammarHypothesis Samples\n', '^*'*60

    # Initialize csv file
    if save_file:
        mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):

        if save_file and csv_freq and (i % csv_freq == 0):
            mh_grammar_summary.csv_appendfiles(save_file, data)

        # Save to N samples, where N=pickle_gh
        if pickle_gh and (i % pickle_gh == 0):
            mh_grammar_summary.pickle_MAPsample(save_file+'_map_'+str(i/pickle_gh)+'.p')
            mh_grammar_summary.pickle_cursample(save_file+'_cur_'+str(i/pickle_gh)+'.p')

        # Print every N/20 samples
        if 's' in print_stuff:
            if i % (iters/20) == 0:
                for idx in gh.get_propose_idxs():  print idx, '\t|  ', gh.rules[idx], ' --> ', gh.value[idx]
                # print i, '-'*100, '\n', {idx:gh.value[idx] for idx in gh.get_propose_idxs()}
                print gh.prior, gh.likelihood, gh.posterior_score

    # Save summary & print top samples
    if pickle_summary:
        mh_grammar_summary.pickle_summary(filename=save_file+'_summary.p')
    if 'g' in print_stuff:
        mh_grammar_summary.print_top_samples()
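Based on the ngh branch above, run() can either enumerate NumberGameHypotheses to a fixed depth or load them from a pickle. Both invocations below are illustrative sketches only (the depth and file names are assumptions taken from the defaults in the signature):

if __name__ == "__main__":
    # enumerate hypotheses to the depth given by the digits in the ngh string,
    # then sample GrammarHypotheses over the toy data
    run(grammar=lot_grammar, data=toy_exp_3, ngh='enum7', iters=10000, skip=10, cap=100)

    # or load previously pickled NumberGameHypotheses from a file
    # run(grammar=lot_grammar, data=toy_exp_3, ngh='out/ngh_100k', iters=10000)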