class Top(SampleStream):
    """
    This stores the top samples and then *only* on exit does it allow them to pass through.
    (It can't before exit, since it won't know what the top samples are!)
    """

    def __init__(self, N=1000, key='posterior_score', sorted=True):
        """
        :param N: How many samples to store.
        :param key: The key we sort by.
        :param sorted: When we output, do we output sorted? (slightly slower)
        :return:
        """
        self.__dict__.update(locals())
        SampleStream.__init__(self)
        self.top = TopN(N=N, key=key)

    def process(self, x):
        """ Override process so that outputs are NOT sent to children (until exit). """
        self.top.add(x)
        return None  # Do not pass through

    def __exit__(self, t, value, traceback):
        # Only on exit do I give my data (the top samples) to my outputs.
        # NOTE: this must be self.sorted; the original passed the builtin
        # sorted function here, which is always truthy.
        for v in self.top.get_all(sorted=self.sorted):
            # Cannot just call self.process_and_push, since self.process always returns None
            if v is not None:
                for a in self.outputs:
                    a.process_and_push(v)
        return SampleStream.__exit__(self, t, value, traceback)
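# Usage sketch for Top (hedged): EchoStream is an illustrative stand-in child
# stream, not confirmed LOTlib API; it is attached via .outputs, which the
# __exit__ above iterates over. Assumes SampleStream supports the
# context-manager protocol, as the __exit__ override implies.
class EchoStream(SampleStream):
    def process(self, x):
        print x.posterior_score, x  # fires only when Top flushes on __exit__
        return x

top = Top(N=10)
top.outputs.append(EchoStream())
with top:
    for h in break_ctrlc(MHSampler(make_hypothesis(), make_data(), steps=1000)):
        top.process(h)  # buffered in the TopN; nothing reaches EchoStream yet
# exiting the with-block pushes the stored top 10 to EchoStream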
def run(damount):
    lexicon, L, hugeData = normalize(damount)
    words = target.all_words()

    def propose(current_state, bag=lexicon, probs=L):
        # Round-robin over the words: each call resamples one word's
        # hypothesis from the (log-)weighted bag.
        mod = len(current_state.all_words())
        proposal = copy(current_state)
        w = words[propose.inx % mod]
        proposal.value[w].value = weighted_sample(bag[w], probs=probs[w], log=True).value
        propose.inx += 1
        return proposal
    propose.inx = 0

    proposer = lambda x: propose(x)

    h0 = KinshipLexicon(alpha=options.alpha, epsilon=options.epsilon, s=options.s)
    for w in target.all_words():
        h0.set_word(w, LOTHypothesis(my_grammar, display='lambda recurse_, C, X: %s'))

    gs = Gibbs(h0, hugeData, proposer=proposer, steps=options.samples)
    hyps = TopN(N=options.top_count)
    for s, h in enumerate(gs):
        hyps.add(h)
        print h.prior, h.likelihood, h

    return hyps
def runparts(x, datamt):
    # problem: right now only recording the last partition, never saving from the others.
    print "Start: " + str(x) + " on this many: " + str(datamt)
    messup = TopN(options.top)
    try:
        # make a new TopN for each data amount
        topn = TopN(N=200, key="posterior_score")
        for p in break_ctrlc(partitions):
            print "Starting on partition ", p
            # Now we have to go in and fill in the nodes that are nonterminals.
            # We can do this with generate.
            v = grammar.generate(deepcopy(p))
            h0 = MyHypothesis(grammar, value=v)
            size = datamt
            data = [FunctionData(input=[], output={
                'n i k': size, 'h i N': size, 'f a n': size, 'g i f': size, 'm a N': size, 'f a m': size, 'g i k': size, 'k a n': size,
                'f a f': size, 'g i n': size, 'g i m': size, 'g i s': size, 's i f': size, 's i n': size, 'n i s': size, 's i m': size,
                's i k': size, 'h a N': size, 'f i N': size, 'h i m': size, 'h i n': size, 'h a m': size, 'n i N': size, 'h i k': size,
                'f a s': size, 'f i n': size, 'h i f': size, 'n i m': size, 'g i N': size, 'h a g': size, 's i N': size, 'n i n': size,
                'f i m': size, 's i s': size, 'h i s': size, 'n a s': size, 'k a s': size, 'f i s': size, 'n i f': size, 'm i n': size,
                's a s': size, 'f a g': size, 'k a g': size, 'k a f': size, 's a m': size, 'n a f': size, 'n a g': size, 'm i N': size,
                's a g': size, 'f i k': size, 'k a m': size, 'n a n': size, 's a f': size, 'n a m': size, 'm a s': size, 'h a f': size,
                'h a s': size, 'n a N': size, 'm i s': size, 's a n': size, 's a N': size, 'm i k': size, 'f a N': size, 'm i m': size,
                'm a g': size, 'm a f': size, 'f i f': size, 'k a N': size, 'h a n': size, 'm a n': size, 'm a m': size, 'm i f': size})]
            for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, trace=False)):
                topn.add(h)
        return set(topn)
    except Exception:
        # (was a bare except, which would also swallow KeyboardInterrupt/SystemExit)
        # if we fail, we can return a blank TopN
        return messup
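# The output dictionary above assigns the same count to every syllable string;
# a sketch that builds it programmatically instead of as a literal (the
# syllable list is copied from the literal above, elided here for brevity):
syllables = ['n i k', 'h i N', 'f a n', 'g i f', 'm a N', 'f a m', 'g i k', 'k a n']
# ... plus the remaining strings from the literal ...
data = [FunctionData(input=[], output={s: size for s in syllables})]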
def construct_hypothesis_space(data_size):
    all_hypotheses = TopN()
    print 'Data size: ', data_size
    for i in range(RUNS):
        print 'Run: ', i
        hypotheses = TopN(25)
        data = generate_data(data_size)
        learner = GriceanQuantifierLexicon(make_my_hypothesis, my_weight_function)
        for w in target.all_words():
            learner.set_word(w, make_my_hypothesis())
        j = 0
        for h in MHSampler(learner, data, SAMPLES, skip=0):
            hypotheses.add(h)
            j += 1
            if j > 0 and j % 1000 == 0:
                # checkpoint the current hypothesis set every 1000 samples
                with open('data/hypset_' + GRAMMAR_TYPE + '_' + str(data_size) + '_' + str(j) + '.pickle', 'w') as f:
                    pickle.dump(hypotheses, f)
                # sstr = str(h)
                # sstr = re.sub("[_ ]", "", sstr)
                # sstr = re.sub("presup", u"\u03BB A B . presup", sstr)
                # print sstr
        all_hypotheses.update(hypotheses)
    return all_hypotheses
def probe_MHsampler(h, language, options, name, size=64, data=None,
                    init_size=None, iters_per_stage=None, sampler=None, ret_sampler=False):
    get_data = language.sample_data_as_FuncData
    evaluation_data = get_data(size, max_length=options.FINITE)

    if data is None:
        if init_size is None:
            data = evaluation_data
        else:
            data = get_data(n=size, max_length=init_size)

    if sampler is None:
        sampler = MHSampler(h, data)
    else:
        sampler.data = data

    best_hypotheses = TopN(N=options.TOP_COUNT)

    iter = 0
    for h in sampler:
        if iter == options.STEPS:
            break
        if iter % 100 == 0:
            print '---->', iter

        best_hypotheses.add(h)

        if iter % options.PROBE == 0:
            # re-score the stored top hypotheses on held-out evaluation data
            for h in best_hypotheses:
                h.compute_posterior(evaluation_data)
            Z = logsumexp([h.posterior_score for h in best_hypotheses])

            pr_data = get_data(1024, max_length=options.FINITE)
            weighted_score = 0
            for h in best_hypotheses:
                precision, recall = language.estimate_precision_and_recall(h, pr_data)
                if precision + recall != 0:
                    f_score = precision * recall / (precision + recall)
                    weighted_score += np.exp(h.posterior_score - Z) * f_score
            weighted_score *= 2  # completes F1 = 2PR/(P+R)

            to_file([[iter, Z, weighted_score]], name)

        if init_size is not None and iter % iters_per_stage == 0:
            # staged curriculum: grow the max sequence length every iters_per_stage steps
            init_size += 2
            sampler.data = get_data(n=size, max_length=init_size)

        iter += 1

    if ret_sampler:
        return sampler
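# The probe computes a posterior-weighted F1 over the stored top hypotheses.
# Isolated as a helper (a sketch; estimate_precision_and_recall is the method
# used above, posterior_weighted_f1 itself is a hypothetical name):
import numpy as np

def posterior_weighted_f1(hyps, Z, language, pr_data):
    total = 0.0
    for h in hyps:
        p, r = language.estimate_precision_and_recall(h, pr_data)
        if p + r != 0:
            total += np.exp(h.posterior_score - Z) * (2.0 * p * r / (p + r))
    return total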
def run(options, ndata):
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    # best_ll is the per-observation log-likelihood an exact model could
    # achieve: sum_k p_k log p_k over the empirical distribution
    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar, alphabet_size=len(language.terminals()))
    h0.set_word(0, h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many words do we add?
        if LOTlib.SIG_INTERRUPTED:
            return 0, set()

        # re-set the posterior, or else it's something weird
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
def run(data, TOP=100, STEPS=1000):
    # if LOTlib.SIG_INTERRUPTED:
    #     return ""
    # data = [FunctionData(input=(), output={lst: len(lst)})]

    h0 = MyHypothesis()
    tn = TopN(N=TOP)

    # run the sampler
    # counter = Counter()
    for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=1.0, likelihood_temperature=1.0):
        # counter[h] += 1
        tn.add(h)

    # normalize the stored scores and return (hypothesis, probability) pairs
    z = logsumexp([h.posterior_score for h in tn])
    sort_post_probs = [(h, exp(h.posterior_score - z)) for h in tn.get_all(sorted=True)][::-1]
    return sort_post_probs
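# Consuming the result (a sketch; assumes tn.get_all(sorted=True) yields
# worst-to-best, so the [::-1] above puts the best hypothesis first):
# pairs = run(data)
# best_h, best_p = pairs[0]
# print best_p, best_h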
def runme(chain, dataamt):
    if LOTlib.SIG_INTERRUPTED:
        return ()

    data = make_data(dataamt)
    tn = TopN(options.top)
    h0 = make_hypothesis()
    for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, skip=0)):
        # print h.posterior_score, h.prior, h.likelihood, h
        h.likelihood_per_data = h.likelihood / dataamt
        tn.add(h)
    return tn
def myrun(observed_set):
    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)
    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)
    return set(tn.get_all())
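# Example invocation (a sketch; the observed set is illustrative):
# for h in myrun(set([2, 4, 8, 16])):
#     print h.posterior_score, h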
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100,
                    save_top='top.pkl', alsoprint='None', **kwargs):
    """
    Just a simplified interface for sampling, allowing printing (showing), returning the top,
    and saving. This is used by many examples, and is meant to easily allow running with a
    variety of parameters.

    NOTE: show_skip is a skip *only* on printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

    # # TODO change acceptance temperature over time
    # sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):
        # if i % 10000 == 0 and i != 0:
        #     sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature + 0.1)
        #     print '=' * 50
        #     print 'change acc temperature to', sampler.acceptance_temperature

        best_hypotheses.add(h)

        if show and i % (show_skip + 1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
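# Typical call (a sketch; make_hypothesis/make_data come from the caller's
# model, and steps is forwarded to MHSampler via **kwargs):
# top = standard_sample(make_hypothesis, make_data, N=50, steps=100000, show_skip=999)
# for h in top.get_all(sorted=True):
#     print h.posterior_score, h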
def partitionMCMC(data, partitions):
    print data
    topn = TopN(N=200, key="posterior_score")
    for p in break_ctrlc(partitions):
        print "Starting on partition ", p
        # Now we have to go in and fill in the nodes that are nonterminals
        v = grammar.generate(deepcopy(p))
        # h0 = MyHypothesis(grammar, value=v)
        h0 = make_hypothesis()
        print h0
        for h in break_ctrlc(MHSampler(h0, data, steps=5000, skip=0)):
            # Show the partition and the hypothesis
            print h.posterior_score, p, h, howyoudoin(h)
            topn.add(h)
    return set(topn)
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many words do we add?
        # add to the grammar
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more word to the lexicon
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer + 1
        assert len(h0.value.keys()) == h0.N == outer + 1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)
            # print h.posterior_score, h
            # print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h)  # must deepcopy

    return ndata, tn
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    # print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = AugustHypothesis(grammar=grammar, display="lambda recurse_ :%s")

    print "# Starting on ", h0

    tn = TopN(N=options.TOP_COUNT)

    # print h0.compute_posterior(data)
    # for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
    # # for h in MHSampler(h0, data, steps=options.STEPS, trace=True):
    #     print h.posterior_score, h
    #     print getattr(h, 'll_counts', None)

    outpath = prefix + 'hypotheses_' + options.LANG + '_' + str(rank) + '_' + str(ndata) + '_' + suffix + ".txt"
    with open(outpath, 'a') as ofile:
        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i % options.SKIP == 0 and h.posterior_score > -Infinity:
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata
                print >>ofile, getattr(h, 'll_counts', None)
                print >>ofile, h, '\0'  # must add \0 when not a Lexicon

    return tn
def run(data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount,
                           alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(my_grammar, display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl'
    with open('Chains/' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
def run(options, ndata):
    """
    This runs on the DATA_RANGE amounts of data and returns all hypotheses in the top count.
    """
    if LOTlib.SIG_INTERRUPTED:
        return set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = MyHypothesis(grammar=grammar, N=options.N)
    tn = TopN(N=options.TOP_COUNT)

    outpath = prefix + 'hypotheses_' + options.LANG + '_' + str(rank) + '_' + str(ndata) + '_' + suffix + ".txt"
    with open(outpath, 'a') as ofile:
        for i, h in enumerate(break_ctrlc(MHSampler(h0, data, steps=options.STEPS))):
            tn.add(h)
            # print h.posterior_score, getattr(h, 'll_counts', None), h
            if i % options.SKIP == 0:
                print >>ofile, "\n"
                print >>ofile, i, ndata, h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata
                print >>ofile, getattr(h, 'll_counts', None),
                print >>ofile, h  # ends in \0 so we can sort with sort -g -z

    return tn
def run(hypothesis, data_amount):
    print "Starting chain on %s data points" % data_amount
    data = makeLexiconData(target, four_gen_tree_context, n=data_amount,
                           alpha=options.alpha, verbose=True)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target_words:
        h0.set_word(w, LOTHypothesis(grammar=my_grammar, value=hypothesis.value[w].value,
                                     display='lambda recurse_, C, X: %s'))

    hyps = TopN(N=options.top_count)

    mhs = MHSampler(h0, data, options.steps,
                    likelihood_temperature=options.llt,
                    prior_temperature=options.prior_temp)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        if samples_yielded % 100 == 0:
            pass  # print h.likelihood, h.prior, h
        hyps.add(h)

    import pickle
    print 'Writing ' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl'
    with open('Chains/' + data[0].X + data[0].Y + str(data_amount) + data[0].word + '.pkl', 'w') as f:
        pickle.dump(hyps, f)

    return hyps
def run(damount):
    lexicon, L, hugeData = normalize(damount)
    words = target.all_words()

    def propose(current_state, bag=lexicon, probs=L):
        mod = len(current_state.all_words())
        proposal = copy(current_state)
        w = words[propose.inx % mod]
        proposal.value[w].value = weighted_sample(bag[w], probs=probs[w], log=True).value
        propose.inx += 1
        return proposal
    propose.inx = 0

    proposer = lambda x: propose(x)

    h0 = KinshipLexicon(alpha=options.alpha)
    for w in target.all_words():
        h0.set_word(w, LOTHypothesis(my_grammar, display='lambda recurse_, C, X: %s'))

    gs = Gibbs(h0, hugeData, proposer=proposer, steps=options.samples)
    hyps = TopN(N=options.top_count)
    for s, h in enumerate(gs):
        hyps.add(h)
    return hyps
def run(data_size, my_finite_trees):
    data = generate_data(data_size)

    # the prior for each tree
    prior = np.array([x.compute_prior() for x in my_finite_trees])
    prior = prior - logsumexp(prior)

    # the likelihood weights for each hypothesis
    weights = np.array([my_weight_function(h) for h in my_finite_trees])

    # response[h, di] gives the response of the h'th tree to data di
    response = np.array([mapto012(get_tree_set_responses(t, data)) for t in my_finite_trees])

    # Now actually run:
    hypset = TopN(N=TOP_COUNT)

    learner = VectorizedLexicon_DistanceMetricProposal(target.all_words(), my_finite_trees, prior)
    databundle = [response, weights]
    generator = MHSampler(learner, databundle, STEPS, skip=SKIP)
    for g in generator:
        hypset.add(VectorizedLexicon_to_SimpleLexicon(g), g.posterior_score)

    return hypset
grammar.add_rule('PREDICATE', 'is_size_(x, "miniature")', None, 1.0)
grammar.add_rule('PREDICATE', 'is_size_(x, "intermediate")', None, 1.0)
grammar.add_rule('PREDICATE', 'is_size_(x, "colossal")', None, 1.0)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Hypothesis
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.Hypotheses.RationalRulesLOTHypothesis import RationalRulesLOTHypothesis

def make_hypothesis(grammar=grammar, **kwargs):
    return RationalRulesLOTHypothesis(grammar=grammar, rrAlpha=1.0, **kwargs)

if __name__ == "__main__":
    import pickle
    from LOTlib import break_ctrlc
    from LOTlib.TopN import TopN
    from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler

    hyps = TopN(N=1000)
    mhs = MHSampler(make_hypothesis(), make_data(), 1000000,
                    likelihood_temperature=1., prior_temperature=1.)

    for samples_yielded, h in break_ctrlc(enumerate(mhs)):
        h.ll_decay = 0.
        hyps.add(h)

    with open('HypothesisSpace.pkl', 'w') as f:
        pickle.dump(hyps, f)
# (fragment: the lines below appear to be the tail of add_to_collapsed_trees(t),
#  inside an `if resps in collapsed_forms:` branch whose head is not shown)
        collapsed_prob = grammar.log_probability(collapsed_forms[resps])
        collapsed_forms[resps].my_log_probability = logplusexp(collapsed_prob, tprior)
        if tprior > collapsed_forms[resps].display_tree_probability:  # display the most concise form
            collapsed_forms[resps] = t
            collapsed_forms[resps].display_tree_probability = tprior
    else:
        collapsed_forms[resps] = t
        collapsed_forms[resps].display_tree_probability = tprior
        t.my_log_probability = tprior  # FunctionNode uses this value when we call log_probability()
        print ">>", all_tree_count, len(collapsed_forms), t, tprior

############################################
### Now actually enumerate trees

for t in grammar.enumerate(d=DEPTH):
    if 'presup_(False' in str(t):
        continue
    if not check_expansion(t):
        continue
    if t.count_subnodes() <= MAX_NODES:
        add_to_collapsed_trees(t)
        all_tree_count += 1
        print ">", t, grammar.log_probability(t)

## for kinder saving and unsaving:
upq = TopN()
for k in collapsed_forms.values():
    upq.add(LOTHypothesis(grammar, k, display='lambda context: %s'), 0.0)
with open(OUT, 'w') as f:
    pickle.dump(upq, f)

print "Total tree count: ", all_tree_count
print "Loading Space 1: " + options.filename
with open(options.filename, 'r') as f:
    d.update(pickle.load(f))

if options.filename2 is not None:
    print "Loading Space 2: " + options.filename2
    with open(options.filename2, 'r') as f:
        d.update(pickle.load(f))

Mass = set()

for a in range(1, 25, 2) + range(25, 251, 25):
    print "Grabbing Top " + str(options.Nsize) + " from " + str(a) + ' dp'
    data = makeLexiconData(target, four_gen_tree_context, n=a)
    simplicity_mass = TopN(N=options.Nsize)
    reuse_mass = TopN(N=options.Nsize)
    for h in d:
        # score each hypothesis under the reuse prior...
        h.posterior_score = h.compute_likelihood(data) + compute_reuse_prior(h)
        reuse_mass.add(h)
        # ...and under the standard (simplicity) posterior
        h.compute_posterior(data)
        simplicity_mass.add(h)
    Mass.update(simplicity_mass)
    Mass.update(reuse_mass)

print "Writing output file for " + str(len(Mass)) + ' hypotheses.'
with open(options.out_path, 'w') as f:
    pickle.dump(Mass, f)
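# The rescoring pattern above, isolated as a helper (a sketch; `score` is any
# function assigning a log score to a hypothesis, and TopN ranks by
# posterior_score by default):
def top_under(hyps, score, N):
    best = TopN(N=N)
    for h in hyps:
        h.posterior_score = score(h)
        best.add(h)
    return best

# e.g. (hypothetical):
# reuse_mass = top_under(d, lambda h: h.compute_likelihood(data) + compute_reuse_prior(h), options.Nsize)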
print "Loading Space 1: " + options.filename  # (opening reconstructed by analogy with the block above)
with open(options.filename, 'r') as f:
    d.update(pickle.load(f))

if options.filename2 is not None:
    print "Loading Space 2: " + options.filename2
    with open(options.filename2, 'r') as f:
        d.update(pickle.load(f))

Mass = set()

for a in range(1, 25, 2) + range(25, 251, 25):
    print "Grabbing Top " + str(options.Nsize) + " from " + str(a) + ' dp'
    data = makeZipfianLexiconData(target, four_gen_tree_context, n=a, alpha=0.9, s=0.0, epsilon=0.0)
    simplicity_mass = TopN(N=options.Nsize)
    reuse_mass = TopN(N=options.Nsize)
    for h in d:
        h.posterior_score = h.compute_likelihood(data) + compute_reuse_prior(h)
        reuse_mass.add(h)
        h.compute_posterior(data)
        simplicity_mass.add(h)
    Mass.update(simplicity_mass)
    Mass.update(reuse_mass)

print "Writing output file for " + str(len(Mass)) + ' hypotheses.'
with open(options.out_path, 'w') as f:
    pickle.dump(Mass, f)
def runparts(size, x, p):
    # problem: right now only recording the last partition, never saving from the others.
    print "Start: " + str(x) + " on this many: " + str(size)
    try:
        # make a new TopN for each data amount
        topn = TopN(N=200, key="posterior_score")
        print "Starting on partition ", p
        # Now we have to go in and fill in the nodes that are nonterminals.
        # We can do this with generate.
        v = grammar.generate(copy(p))
        h0 = MyHypothesis(grammar, value=v)
        data = [FunctionData(input=[], output={
            'n i k': size, 'h i N': size, 'f a n': size, 'g i f': size, 'm a N': size, 'f a m': size, 'g i k': size, 'k a n': size,
            'f a f': size, 'g i n': size, 'g i m': size, 'g i s': size, 's i f': size, 's i n': size, 'n i s': size, 's i m': size,
            's i k': size, 'h a N': size, 'f i N': size, 'h i m': size, 'h i n': size, 'h a m': size, 'n i N': size, 'h i k': size,
            'f a s': size, 'f i n': size, 'h i f': size, 'n i m': size, 'g i N': size, 'h a g': size, 's i N': size, 'n i n': size,
            'f i m': size, 's i s': size, 'h i s': size, 'n a s': size, 'k a s': size, 'f i s': size, 'n i f': size, 'm i n': size,
            's a s': size, 'f a g': size, 'k a g': size, 'k a f': size, 's a m': size, 'n a f': size, 'n a g': size, 'm i N': size,
            's a g': size, 'f i k': size, 'k a m': size, 'n a n': size, 's a f': size, 'n a m': size, 'm a s': size, 'h a f': size,
            'h a s': size, 'n a N': size, 'm i s': size, 's a n': size, 's a N': size, 'm i k': size, 'f a N': size, 'm i m': size,
            'm a g': size, 'm a f': size, 'f i f': size, 'k a N': size, 'h a n': size, 'm a n': size, 'm a m': size, 'm i f': size})]
        for h in break_ctrlc(MHSampler(h0, data, steps=options.steps, trace=False)):
            # print "\t", h.posterior_score, h
            topn.add(h)
        return size, set(topn)
    except Exception as e:
        print "*** Exception ignored: ", e
        # if we fail, we can return a blank TopN
        return size, set()
print_star("")
print from_seq, to_seq

data = [FunctionData(alpha=ALPHA, input=[from_seq], output={to_seq: len(to_seq)})]

h0 = MyHypothesis()
step = 0
tn = TopN(N=N_H)

# Stream from the sampler to a printer
for h in MHSampler(h0, data, steps=STEPS, acceptance_temperature=5.):
    tn.add(h)

print
for h in tn.get_all(sorted=True):
    out = h(from_seq)
    if len(out) >= len(to_seq):
        hd = hamming_distance(out[:len(to_seq)], to_seq)
    else:
        hd = 15  # fixed penalty when the output is shorter than the target
    print h.posterior_score, h.likelihood, h.prior, h, hd
    print out[:len(to_seq)]
print_star("")
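# hamming_distance is assumed here to count mismatched positions between
# equal-length sequences; a minimal sketch if it isn't already in scope:
def hamming_distance(a, b):
    return sum(x != y for x, y in zip(a, b))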
import pickle

from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from Model import *
from TargetConcepts import TargetConcepts

NDATA = 20       # How many data points for each function?
NSTEPS = 100000
BEST_N = 500     # How many from each hypothesis to store

# Where we keep track of all hypotheses (across concepts)
all_hypotheses = TopN(N=BEST_N)

if __name__ == "__main__":
    # Now loop over each target concept and get a set of hypotheses
    for i, f in enumerate(TargetConcepts):
        # Set up the hypothesis
        h0 = make_hypothesis()

        # Set up some data
        data = make_data(NDATA, f)

        # Now run some MCMC, adding each sample individually (the original
        # passed the whole sampler generator to fs.add in a single call)
        fs = TopN(N=BEST_N, key="posterior_score")
        for h in break_ctrlc(MHSampler(h0, data, steps=NSTEPS, trace=False)):
            fs.add(h)

        all_hypotheses.update(fs)

    with open("hypotheses.pkl", 'w') as out_f:  # out_f avoids shadowing the loop's f
        pickle.dump(all_hypotheses, out_f)
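# Reloading the saved hypothesis space later (sketch):
# with open("hypotheses.pkl", 'r') as f:
#     all_hypotheses = pickle.load(f)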
# for h in MHSampler(h0, data_1, steps=5000):
#     print h.posterior_score

# Running and show only the top choice
from LOTlib.TopN import TopN

topChoice = TopN(N=10)
posProbs = []
stepNum = 40000

for step, h in enumerate(MHSampler(h0, make_data(data_size=1), steps=stepNum)):
    if step % 5000 == 0:
        print('current step: %d, current posterior score: %f' % (step, h.posterior_score))
    posProbs.append(h.posterior_score)
    topChoice.add(h)
    h0 = h

# The commented blocks below repeat the same loop for data_size = 2 and 3,
# chaining h0 across stages (see the consolidated sketch after this block).
# for step, h in enumerate(MHSampler(h0, make_data(data_size=2), steps=stepNum)):
#     if step % 5000 == 0:
#         print('current step: %d, current posterior score: %f' % (step, h.posterior_score))
#     posProbs.append(h.posterior_score)
#     topChoice.add(h)
#     h0 = h

# for step, h in enumerate(MHSampler(h0, make_data(data_size=3), steps=stepNum)):
#     if step % 5000 == 0:
#         print('current step: %d, current posterior score: %f' % (step, h.posterior_score))
#     posProbs.append(h.posterior_score)
#     topChoice.add(h)
#     h0 = h
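# The repeated per-data_size blocks above collapse into one loop (a sketch):
# for ds in [1, 2, 3]:
#     for step, h in enumerate(MHSampler(h0, make_data(data_size=ds), steps=stepNum)):
#         if step % 5000 == 0:
#             print('current step: %d, current posterior score: %f' % (step, h.posterior_score))
#         posProbs.append(h.posterior_score)
#         topChoice.add(h)
#         h0 = h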