def standard_sample(make_hypothesis, make_data, skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
    A simplified interface for sampling that supports printing (showing), returning the top N,
    and saving. This is used by many examples and is meant to make it easy to run with a
    variety of parameters.

    NOTE: `skip` applies *only* to printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    for i, h in enumerate(break_ctrlc(MHSampler(h0, data, **kwargs))):
        best_hypotheses.add(h)

        if show and i % (skip+1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
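# Usage sketch (an illustration, not part of the module): any example exposing
# make_hypothesis/make_data callables can be run through standard_sample; extra kwargs
# such as steps= are forwarded to MHSampler. The example module path below follows
# LOTlib's Examples layout, and get_all(sorted=True) assumes TopN supports the same
# accessor used elsewhere in these snippets.
#
#   from LOTlib.Examples.Number.Model import make_hypothesis, make_data
#
#   top = standard_sample(make_hypothesis, make_data, N=10, steps=10000, save_top=None)
#   for h in top.get_all(sorted=True):
#       print h.posterior_score, qq(h)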
def run():
    data = generate_data(target, NDATA, data_sd)  # generate some data
    h0 = MAPSymbolicRegressionHypothesis(grammar, args=['x'] + CONSTANT_NAMES)
    h0.CONSTANT_VALUES = numpy.zeros(NCONSTANTS)  # TODO: Move this to an initializer

    from LOTlib.Inference.MetropolisHastings import MHSampler
    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP, trace=False)):
        print h.posterior_score, h.likelihood, h.prior, h.CONSTANT_VALUES, qq(h)
def __str__(self): """ This defaultly puts a \0 at the end so that we can sort -z if we want (e.g. if we print out a posterior first) """ return '\n' + '\n'.join([ "%-15s: %s" % (qq(w), str(v)) for w, v in sorted(self.value.iteritems()) ]) + '\0'
def process(self, x):
    # print "PrintH.process ", x
    print >>self.file_, self.prefix, \
        round(x.posterior_score, 3), \
        round(x.prior, 3), \
        round(x.likelihood, 3), \
        qq(x)  # qq(cleanFunctionNodeString(x))
    return x
def standard_sample(make_hypothesis, make_data, show_skip=9, show=True, N=100, save_top='top.pkl', alsoprint='None', **kwargs):
    """
    A simplified interface for sampling that supports printing (showing), returning the top N,
    and saving. This is used by many examples and is meant to make it easy to run with a
    variety of parameters.

    NOTE: `show_skip` applies *only* to printing.
    **kwargs get passed to the sampler.
    """
    if LOTlib.SIG_INTERRUPTED:
        return TopN()  # So we don't waste time!

    h0 = make_hypothesis()
    data = make_data()

    best_hypotheses = TopN(N=N)

    f = eval(alsoprint)

    sampler = MHSampler(h0, data, **kwargs)

    # # TODO: change acceptance temperature over time
    # sampler.acceptance_temperature = 0.5

    for i, h in enumerate(break_ctrlc(sampler)):
        # if i % 10000 == 0 and i != 0:
        #     sampler.acceptance_temperature = min(1.0, sampler.acceptance_temperature + 0.1)
        #     print '='*50
        #     print 'change acc temperature to', sampler.acceptance_temperature

        best_hypotheses.add(h)

        if show and i % (show_skip+1) == 0:
            print i, \
                h.posterior_score, \
                h.prior, \
                h.likelihood, \
                f(h) if f is not None else '', \
                qq(cleanFunctionNodeString(h))

    if save_top is not None:
        print "# Saving top hypotheses"
        with open(save_top, 'w') as f:
            pickle.dump(best_hypotheses, f)

    return best_hypotheses
                   bv_type='INNER-BOOL', bv_args=['OBJECT'], bv_prefix='F')

# Define a predicate that will just check if something is in a BASE-SET
g.add_rule('lambdaDefinePredicate', 'lambda', ['lambdaDefinePredicateINNER'], 1.0, bv_type='OBJECT', bv_args=None, bv_prefix='z')

# the function on objects that allows them to be put into classes (analogous to a logical model here)
g.add_rule('lambdaDefinePredicateINNER', 'is_in_', ['OBJECT', 'BASE-SET'], 1.0)

# After we've defined F, these are used to construct the concept
g.add_rule('INNER-BOOL', 'and_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
g.add_rule('INNER-BOOL', 'or_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
g.add_rule('INNER-BOOL', 'not_', ['INNER-BOOL'], 1.0)

g.add_rule('OBJECT', 'x', None, 1.0)
g.add_rule('OBJECT', 'y', None, 1.0)
g.add_rule('OBJECT', '', ['BASE-OBJECT'], 1.0)  # maybe or maybe not?

# BASE-SET is here a set of BASE-OBJECTs (non-args)
g.add_rule('BASE-SET', 'set_add_', ['BASE-OBJECT', 'BASE-SET'], 1.0)
g.add_rule('BASE-SET', 'set_', [], 1.0)

g.add_rule('BASE-OBJECT', qq('p1'), None, 1.0)
g.add_rule('BASE-OBJECT', qq('p2'), None, 1.0)
g.add_rule('BASE-OBJECT', qq('n1'), None, 1.0)
g.add_rule('BASE-OBJECT', qq('n2'), None, 1.0)
# -*- coding: utf-8 -*-
"""
A simple symbolic regression demo.
"""
from LOTlib import lot_iter
from LOTlib.Hypotheses.GaussianLOTHypothesis import GaussianLOTHypothesis
from LOTlib.Inference.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import qq
from LOTlib.Examples.SymbolicRegression.Grammar import grammar
from Data import generate_data

CHAINS = 4
STEPS = 50000
SKIP = 0

if __name__ == "__main__":

    print grammar

    # generate some data
    data = generate_data(50)  # how many data points?

    # starting hypothesis -- here this generates at random
    h0 = GaussianLOTHypothesis(grammar)

    for h in lot_iter(MHSampler(h0, data, STEPS, skip=SKIP)):
        print h.posterior_score, qq(h)
def next(self):
    if LOTlib.SIG_INTERRUPTED or self.samples_yielded >= self.steps:
        raise StopIteration
    else:
        for _ in lot_iter(xrange(self.skip+1)):
            self.proposal, fb = self.proposer(self.current_sample)

            # either compute this, or use the memoized version
            np, nl = self.compute_posterior(self.proposal, self.data)
            # print np, nl, self.current_sample.prior, self.current_sample.likelihood

            # NOTE: It is important that we re-compute from the temperatures, since these may be
            # altered externally by ParallelTempering and others
            prop = (np/self.prior_temperature + nl/self.likelihood_temperature)
            cur = (self.current_sample.prior/self.prior_temperature +
                   self.current_sample.likelihood/self.likelihood_temperature)

            if MH_acceptance(cur, prop, fb, acceptance_temperature=self.acceptance_temperature):
                self.current_sample = self.proposal
                self.was_accepted = True
                self.acceptance_count += 1
            else:
                self.was_accepted = False

            self.internal_sample(self.current_sample)
            self.proposal_count += 1

        if self.trace:
            print self.current_sample.posterior_score, self.current_sample.likelihood, self.current_sample.prior, qq(self.current_sample)

        self.samples_yielded += 1
        return self.current_sample
def next(self): """Generate another sample.""" if self.samples_yielded >= self.steps: raise StopIteration else: for _ in xrange(self.skip+1): self.proposal, fb = self.proposer(self.current_sample) # print self.proposal assert self.proposal is not self.current_sample, "*** Proposal cannot be the same as the current sample!" assert self.proposal.value is not self.current_sample.value, "*** Proposal cannot be the same as the current sample!" # Call myself so memoized subclasses can override self.compute_posterior(self.proposal, self.data) np, nl = self.proposal.prior, self.proposal.likelihood # Note: It is important that we re-compute from the temperature since these may be altered # externally from ParallelTempering and others prop = (np/self.prior_temperature + nl/self.likelihood_temperature) cur = (self.current_sample.prior/self.prior_temperature + self.current_sample.likelihood/self.likelihood_temperature) # print "# Current:", self.current_sample # print "# Proposal:", self.proposal if MH_acceptance(cur, prop, fb, acceptance_temperature=self.acceptance_temperature): self.current_sample = self.proposal self.was_accepted = True self.acceptance_count += 1 else: self.was_accepted = False self.internal_sample(self.current_sample) self.proposal_count += 1 if self.trace: print self.current_sample.posterior_score, self.current_sample.likelihood, self.current_sample.prior, qq(self.current_sample) self.samples_yielded += 1 return self.current_sample
def display(self):
    for h in self.get_all():
        print h.posterior_score, h.prior, h.likelihood, qq(h)
# After we've defined F, these are used to construct the concept
grammar.add_rule('INNER-BOOL', 'and_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'or_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'not_', ['INNER-BOOL'], 1.0)

grammar.add_rule('OBJECT', 'x', None, 1.0)
grammar.add_rule('OBJECT', 'y', None, 1.0)

# BASE-SET is here a set of BASE-OBJECTs (non-args)
grammar.add_rule('BASE-SET', 'set_add_', ['BASE-OBJECT', 'BASE-SET'], 1.0)
grammar.add_rule('BASE-SET', 'set_', [], 1.0)

objects = [t + str(i) for t, i in itertools.product('pnx', range(3))]
for o in objects:
    grammar.add_rule('BASE-OBJECT', qq(o), None, 1.0)

# from LOTlib.Subtrees import *
# for t in generate_trees(grammar):
#     print t

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Set up data -- true output means attraction (p=positive; n=negative)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

data = []
for a, b in itertools.product(objects, objects):
    myinput = [a, b]

    # opposites (n/p) interact; x interacts with nothing
grammar.add_rule('lambdaDefinePredicateINNER', 'is_in_', ['OBJECT', 'BASE-SET'], 1.0)

# After we've defined F, these are used to construct the concept
grammar.add_rule('INNER-BOOL', 'and_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'or_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'not_', ['INNER-BOOL'], 1.0)

grammar.add_rule('OBJECT', 'x', None, 1.0)
grammar.add_rule('OBJECT', 'y', None, 1.0)

# BASE-SET is here a set of BASE-OBJECTs (non-args)
grammar.add_rule('BASE-SET', 'set_add_', ['BASE-OBJECT', 'BASE-SET'], 1.0)
grammar.add_rule('BASE-SET', 'set_', [], 1.0)

for o in OBJECTS:
    grammar.add_rule('BASE-OBJECT', qq(o), None, 1.0)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from LOTlib.DataAndObjects import FunctionData

# Set up data -- true output means attraction (p=positive; n=negative)
def make_data(n=1):
    data = []
    for _ in xrange(n):
        for a, b in itertools.product(OBJECTS, OBJECTS):
        return str(self.value)

    def __call__(self, *args):
        try:
            return LOTHypothesis.__call__(self, *args)
        except EvaluationException:
            return None


def make_hypothesis(**kwargs):
    """Define a new kind of LOTHypothesis that gives regex strings.

    These have a special interpretation function that compiles differently than straight python eval.
    """
    return RegexHypothesis(**kwargs)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == "__main__":
    from LOTlib.Inference.Samplers.StandardSample import standard_sample
    from LOTlib import break_ctrlc
    from LOTlib.Miscellaneous import qq

    for h in break_ctrlc(standard_sample(make_hypothesis, make_data, steps=10000)):
        print h.posterior_score, h.prior, h.likelihood, qq(h)
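# A hypothetical illustration of the "special interpretation" mentioned above: instead of
# eval-ing the value as python, a regex hypothesis can compile its string and test it
# against an input. This is a sketch of the idea only; RegexHypothesis's actual compile
# step lives in the Model module, and the helper name below is made up.

import re

def regex_call_sketch(value, s):
    """Return True iff regex string `value` matches s, or None if the regex is malformed."""
    try:
        return re.match(value, s) is not None
    except re.error:
        return None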
        print h.posterior_score, h.prior, h.likelihood, h.likelihood_temperature
        print h

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Play around with some different inference schemes
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
# for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
#     print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))

from LOTlib.Inference.IncreaseTemperatureMH import increase_temperature_mh_sample
h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.01)
for i, h in lot_iter(enumerate(increase_temperature_mh_sample(h0, data, 400000000, skip=0, increase_amount=1.50))):
    print h.posterior_score, h.prior, h.likelihood, qq(re.sub(r"\n", ";", str(h)))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run on a single computer, printing out
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# fbs = FiniteBestSet(N=100)
# h0 = CCGLexicon(make_hypothesis, words=all_words, alpha=0.9, palpha=0.9, likelihood_temperature=0.051)
# for i, h in lot_iter(enumerate(mh_sample(h0, data, 400000000, skip=0, debug=False))):
#     fbs.add(h, h.posterior_score)
#     if i % 100 == 0:
#         print h.posterior_score, h.prior, h.likelihood  # , re.sub(r"\n", ";", str(h))
#         print h

# for h in fbs.get_all(sorted=True):
from Data import data
from Grammar import grammar
from Utilities import make_h0


def run(*args):
    """The running function."""
    # starting hypothesis -- here this generates at random
    h0 = GaussianLOTHypothesis(grammar)

    # We store the top 100 from each run
    pq = FiniteBestSet(N=100, max=True, key="posterior_score")
    pq.add(MHSampler(h0, data, STEPS, skip=SKIP))

    return pq

if __name__ == "__main__":
    CHAINS = 10
    STEPS = 10000000
    SKIP = 0

    finitesample = FiniteBestSet(max=True)  # the finite sample of all
    results = map(run, [[None]] * CHAINS)   # Run on a single core
    finitesample.merge(results)

    # and display
    for r in finitesample.get_all(decreasing=False, sorted=True):
        print r.posterior_score, r.prior, r.likelihood, qq(str(r))
from LOTlib.DataAndObjects import FunctionData
from LOTlib.Hypotheses.LOTHypothesis import LOTHypothesis
from LOTlib.Miscellaneous import qq
from MAPSymbolicRegressionHypothesis import MAPSymbolicRegressionHypothesis, grammar
from Data import generate_data
from Grammar import grammar, NCONSTANTS

STEPS = 500000
SKIP = 0
data_sd = 0.1   # the SD of the data
NDATA = 50
MEMOIZE = 1000  # 0 means don't memoize

# The target function for symbolic regression
target = lambda x: 3.*x + sin(4.3/x)

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# starting hypothesis -- here this generates at random
data = generate_data(target, NDATA, data_sd)  # generate some data
h0 = MAPSymbolicRegressionHypothesis(grammar)
h0.CONSTANT_VALUES = numpy.zeros(NCONSTANTS)  # TODO: Move this to an initializer

from LOTlib.Inference.MetropolisHastings import mh_sample
for h in mh_sample(h0, data, STEPS, skip=SKIP, trace=False, debug=False, memoize=MEMOIZE):
    print h.posterior_score, h.likelihood, h.prior, h.CONSTANT_VALUES, qq(h)
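# A minimal sketch of what the generate_data(target, n, sd) helper used above could look
# like (hypothetical; the real one is in the local Data module): sample x values and add
# gaussian noise with standard deviation sd to target(x).

def generate_data_sketch(target, n, sd):
    import numpy
    from LOTlib.DataAndObjects import FunctionData
    xs = numpy.random.uniform(0.1, 10.0, n)  # avoid x=0, where sin(4.3/x) blows up
    return [FunctionData(input=[x], output=target(x) + numpy.random.normal(0.0, sd))
            for x in xs]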
from LOTlib.Miscellaneous import qq

# What are the objects we may use?
OBJECTS = ['JOHN', 'MARY', 'SUSAN', 'BILL']
SEMANTIC_1PREDICATES = ['SMILED', 'LAUGHED', 'MAN', 'WOMAN']
SEMANTIC_2PREDICATES = ['SAW', 'LOVED']

# Define the grammar
grammar = Grammar()

grammar.add_rule('START', '', ['FUNCTION'], 2.0)
grammar.add_rule('START', '', ['BOOL'], 1.0)
grammar.add_rule('START', '', ['OBJECT'], 1.0)

for m in SEMANTIC_1PREDICATES:
    grammar.add_rule('BOOL', 'C.relation_', [qq(m), 'OBJECT'], 1.0)

for m in SEMANTIC_2PREDICATES:
    grammar.add_rule('BOOL', 'C.relation_', [qq(m), 'OBJECT', 'OBJECT'], 1.0)

for o in OBJECTS:
    grammar.add_rule('OBJECT', qq(o), None, 1.0)

grammar.add_rule('BOOL', 'exists_', ['FUNCTION.O2B', 'C.objects'], 1.00)  # can quantify over objects->bool functions
grammar.add_rule('BOOL', 'forall_', ['FUNCTION.O2B', 'C.objects'], 1.00)
grammar.add_rule('FUNCTION.O2B', 'lambda', ['BOOL'], 1.0, bv_type='OBJECT')

grammar.add_rule('BOOL', 'and_', ['BOOL', 'BOOL'], 1.0)
grammar.add_rule('BOOL', 'or_', ['BOOL', 'BOOL'], 1.0)
grammar.add_rule('BOOL', 'not_', ['BOOL'], 1.0)
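# A quick way to sanity-check a grammar like this is to sample a few trees from it; a
# sketch, assuming LOTlib's Grammar exposes generate() as in its other examples:
#
#   for _ in xrange(5):
#       print grammar.generate()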
display_option_summary(options)

eval_data = None
if options.EVAL_DATA > 0:
    eval_data = make_data(options.EVAL_DATA)

# choose the appropriate map function
args = list(itertools.product([make_hypothesis], [make_data], data_amounts * options.CHAINS))

# set the output codec -- needed to display lambda to stdout
sys.stdout = codecs.getwriter('utf8')(sys.stdout)

seen = set()
for fs in MPI_unorderedmap(run, numpy.random.permutation(args)):
    assert is_master_process()

    for h in fs:
        if h not in seen:
            seen.add(h)

            if eval_data is not None:
                h.compute_posterior(eval_data)  # evaluate on the big data

            print h.prior, h.likelihood / options.EVAL_DATA, qq(cleanFunctionNodeString(h))

import pickle
with open(options.OUT_PATH, 'w') as f:
    pickle.dump(seen, f)
grammar = lot_grammar

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the hypotheses
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# map each concept to a hypothesis
with open('hypotheses/lot_hypotheses-10.pkl', 'r') as f:
    hypotheses = pickle.load(f)
print "# Loaded hypotheses: ", len(hypotheses)

# - - logging - - - - - - - -
with open(LOG+"/hypotheses.txt", 'w') as f:
    for i, h in enumerate(hypotheses):
        print >>f, i, qq(h)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the human data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Load the concepts from the human data
from Data import load_human_data
human_nyes, human_ntrials = load_human_data()
print "# Loaded human data"

observed_sets = set([k[0] for k in human_nyes.keys()])

## TRIM TO FEWER
# observed_sets = set(list(observed_sets)[:100])
def __str__(self):
    return ('\n'.join([u"%-15s: %s" % (qq(w), lambdastring(v.value)) for w, v in sorted(self.value.iteritems())]) + '\0').encode('utf-8')
'''
from LOTlib.Inference.Proposals.InsertDeleteProposal import InsertDeleteProposal
h0 = NumberExpression(grammar, proposal_function=InsertDeleteProposal(grammar))
'''

# store hypotheses we've found
allhyp = TopN(N=1000)

# ========================================================================================================
# Run the standard RationalRules sampler

mh_sampler = MHSampler(h0, data, STEPS, skip=SKIP)
for h in lot_iter(mh_sampler):
    if TRACE:
        print q(get_knower_pattern(h)), h.posterior_score, h.compute_prior(), h.compute_likelihood(data), qq(h)

    # add h to our priority queue, with priority of its log probability, h.posterior_score
    allhyp.add(h)

# ========================================================================================================
# now re-evaluate everything we found on new data
'''
huge_data = generate_data(LARGE_DATA_SIZE)

save this with a huge data set -- eval with average ll
H = allhyp.get_sorted()

compute the posterior for each hypothesis
[h.compute_posterior(huge_data) for h in H]
if __name__ == "__main__": # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Main running if is_master_process(): display_option_summary(options) huge_data = generate_data(options.LARGE_DATA_SIZE) # choose the appropriate map function argarray = map(lambda x: [x], options.DATA_AMOUNTS * options.CHAINS) seen = set() for fs in MPI_unorderedmap(run, numpy.random.permutation(argarray)): for h in fs.get_all(): if h not in seen: seen.add(h) h.compute_posterior(huge_data) if h.prior > float("-inf"): print h.prior, \ h.likelihood /float(options.LARGE_DATA_SIZE), \ q(get_knower_pattern(h)), \ qq(h) sys.stdout.flush() import pickle with open(options.OUT_PATH, 'w') as f: pickle.dump(seen, f)
def make_h0(value=None):
    return GaussianLOTHypothesis(grammar, value=value)

if __name__ == "__main__":
    # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    # the running function
    def run(*args):
        # starting hypothesis -- here this generates at random
        h0 = GaussianLOTHypothesis(grammar, prior_temperature=PRIOR_TEMPERATURE)

        # We store the top 100 from each run
        pq = FiniteBestSet(100, max=True, key="posterior_score")
        pq.add(mh_sample(h0, data, STEPS, skip=SKIP))

        return pq

    finitesample = FiniteBestSet(max=True)  # the finite sample of all
    results = map(run, [[None]] * CHAINS)   # Run on a single core
    finitesample.merge(results)

    # and display
    for r in finitesample.get_all(decreasing=False, sorted=True):
        print r.posterior_score, r.prior, r.likelihood, qq(str(r))
def __repr__(self):
    return qq(str(self.utterance)) + ' in ' + str(self.context) + " from " + str(self.possible_utterances)
# After we've defined F, these are used to construct the concept
grammar.add_rule('INNER-BOOL', 'and_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'or_', ['INNER-BOOL', 'INNER-BOOL'], 1.0)
grammar.add_rule('INNER-BOOL', 'not_', ['INNER-BOOL'], 1.0)

grammar.add_rule('OBJECT', 'x', None, 1.0)
grammar.add_rule('OBJECT', 'y', None, 1.0)
grammar.add_rule('OBJECT', '', ['BASE-OBJECT'], 1.0)  # maybe or maybe not?

# BASE-SET is here a set of BASE-OBJECTs (non-args)
grammar.add_rule('BASE-SET', 'set_add_', ['BASE-OBJECT', 'BASE-SET'], 1.0)
grammar.add_rule('BASE-SET', 'set_', [], 1.0)

grammar.add_rule('BASE-OBJECT', qq('p1'), None, 1.0)
grammar.add_rule('BASE-OBJECT', qq('p2'), None, 1.0)
grammar.add_rule('BASE-OBJECT', qq('n1'), None, 1.0)
grammar.add_rule('BASE-OBJECT', qq('n2'), None, 1.0)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Set up data -- true output means attraction (p=positive; n=negative)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

data = [
    FunctionData(input=["p1", "n1"], output=True),
    FunctionData(input=["p1", "n2"], output=True),
    FunctionData(input=["p1", "p1"], output=False),
    FunctionData(input=["p1", "p2"], output=False),
    FunctionData(input=["p2", "n1"], output=True),
    FunctionData(input=["p2", "n2"], output=True),
# Load the hypotheses
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# map each concept to a hypothesis
with open('hypotheses.pkl', 'r') as f:
# with open('hypotheses/hypotheses-1.pkl', 'r') as f:
    concept2hypotheses = pickle.load(f)

hypotheses = set()
for hset in concept2hypotheses.values():
    hypotheses.update(hset)

print "# Loaded %s hypotheses" % len(hypotheses)

with open(LOG + "/hypotheses.txt", 'w') as f:
    for i, h in enumerate(hypotheses):
        print >> f, i, qq(h)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the human data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# We will map tuples of concept-list, set, response to counts.
import pandas
import math
from collections import Counter

human_data = pandas.read_csv('HumanData/TurkData-Accuracy.txt', sep='\t', low_memory=False, index_col=False)

human_yes, human_no = Counter(), Counter()
for r in xrange(human_data.shape[0]):  # for each row
""" Define a new kind of LOTHypothesis, that gives regex strings. These have a special interpretation function that compiles differently than straight python eval. """ from LOTlib import lot_iter from LOTlib.Inference.MetropolisHastings import MHSampler from LOTlib.Miscellaneous import qq from Model import * if __name__ == "__main__": for h in lot_iter(MHSampler(make_h0(), data, steps=10000)): print h.posterior_score, h.prior, h.likelihood, qq(h)
pr_data = language.sample_data_as_FuncData(1024, max_length=options.FINITE)
p = []
r = []
print 'compute precision and recall..'
for h in hypotheses:
    precision, recall = language.estimate_precision_and_recall(h, pr_data)
    p.append(precision)
    r.append(recall)

# Now go through each hypothesis and print out some summary stats
for data_size in DATA_RANGE:
    print 'get stats for size:', data_size
    evaluation_data = language.sample_data_as_FuncData(data_size, max_length=options.FINITE)

    # Now update everyone's posterior
    for h in hypotheses:
        h.compute_posterior(evaluation_data)

    # compute the normalizing constant. This is the log of the sum of the probabilities
    Z = logsumexp([h.posterior_score for h in hypotheses])

    f = open('out' + suffix, 'a')
    cnt = 0
    for h in hypotheses:
        # compute the number of different strings we generate
        generated_strings = set([h() for _ in xrange(1000)])

        print >> f, data_size, np.exp(h.posterior_score - Z), h.posterior_score, h.prior, \
            h.likelihood, len(generated_strings), qq(h), p[cnt], r[cnt]
        cnt += 1
    f.close()
eval_data = None
if options.EVAL_DATA > 0:
    eval_data = make_data(options.EVAL_DATA)

# choose the appropriate map function
args = list(itertools.product([make_hypothesis], [make_data], data_amounts * options.CHAINS))

# set the output codec -- needed to display lambda to stdout
sys.stdout = codecs.getwriter('utf8')(sys.stdout)

seen = set()
for fs in MPI_unorderedmap(run, numpy.random.permutation(args)):
    assert is_master_process()

    for h in fs:
        if h not in seen:
            seen.add(h)

            if eval_data is not None:
                h.compute_posterior(eval_data)  # evaluate on the big data

            print h.posterior_score, h.prior, h.likelihood / options.EVAL_DATA, \
                alsoprint(h) if alsoprint is not None else '', \
                qq(cleanFunctionNodeString(h))

import pickle
with open(options.OUT_PATH, 'w') as f:
    pickle.dump(seen, f)
args = list(itertools.product([make_hypothesis], [make_data], DATA_RANGE))

# run on MPI
results = MPI_map(run, args)

# collapse all returned sets
hypotheses = set()
for r in results:
    hypotheses.update(r)  # add the ith run's results to the set

# Now go through each hypothesis and print out some summary stats
for data_size in DATA_RANGE:

    evaluation_data = make_data(data_size)

    # Now update everyone's posterior
    for h in hypotheses:
        h.compute_posterior(evaluation_data)

    # compute the normalizing constant. This is the log of the sum of the probabilities
    Z = logsumexp([h.posterior_score for h in hypotheses])

    for h in hypotheses:
        # compute the number of different strings we generate
        generated_strings = set([h() for _ in xrange(1000)])

        # Print some info. We can use np.exp(h.posterior_score - Z) here because Z was computed
        # via logsumexp, so the subtraction is numerically stable. This is the posterior
        # probability of h at this amount of data.
        print data_size, np.exp(h.posterior_score - Z), h.posterior_score, h.prior, h.likelihood, len(generated_strings), qq(h)
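# Worked example of the normalization above: Z = logsumexp(scores) makes
# np.exp(score - Z) a proper posterior probability that sums to 1 across hypotheses.
# (This assumes logsumexp is the one from LOTlib.Miscellaneous, as used elsewhere here.)

import numpy as np
from LOTlib.Miscellaneous import logsumexp

scores = [-10.1, -12.4, -9.7]              # unnormalized log posteriors
Z = logsumexp(scores)
posterior = [np.exp(s - Z) for s in scores]
assert abs(sum(posterior) - 1.0) < 1e-9    # normalized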
def __str__(self): """ This defaultly puts a \0 at the end so that we can sort -z if we want (e.g. if we print out a posterior first) """ return '\n'+'\n'.join(["%-15s: %s" % (qq(w), str(v)) for w, v in sorted(self.value.iteritems())]) + '\0'
from LOTlib.Grammar import Grammar
from LOTlib.Miscellaneous import qq
from Shared import OBJECTS, SEMANTIC_1PREDICATES, SEMANTIC_2PREDICATES

grammar = Grammar()

grammar.add_rule('START', '', ['FUNCTION'], 2.0)
grammar.add_rule('START', '', ['BOOL'], 1.0)
grammar.add_rule('START', '', ['OBJECT'], 1.0)

for m in SEMANTIC_1PREDICATES:
    grammar.add_rule('BOOL', 'C.relation_', [qq(m), 'OBJECT'], 1.0)

for m in SEMANTIC_2PREDICATES:
    grammar.add_rule('BOOL', 'C.relation_', [qq(m), 'OBJECT', 'OBJECT'], 1.0)

for o in OBJECTS:
    grammar.add_rule('OBJECT', qq(o), None, 1.0)

grammar.add_rule('BOOL', 'exists_', ['FUNCTION.O2B', 'C.objects'], 1.00)  # can quantify over objects->bool functions
grammar.add_rule('BOOL', 'forall_', ['FUNCTION.O2B', 'C.objects'], 1.00)
grammar.add_rule('FUNCTION.O2B', 'lambda', ['BOOL'], 1.0, bv_type='OBJECT')

grammar.add_rule('BOOL', 'and_', ['BOOL', 'BOOL'], 1.0)
grammar.add_rule('BOOL', 'or_', ['BOOL', 'BOOL'], 1.0)
grammar.add_rule('BOOL', 'not_', ['BOOL'], 1.0)

# And for outermost functions
grammar.add_rule('FUNCTION', 'lambda', ['START'], 1.0, bv_type='OBJECT')
        mom = weighted_sample(population, probs=[v.posterior_score for v in population], log=True)
        dad = weighted_sample(population, probs=[v.posterior_score for v in population], log=True)

        try:
            kid = mutate(crossover(mom, dad))
        except (ProposalFailedException, NodeSamplingException):
            continue

        kid.compute_posterior(data)
        yield kid

        nextpopulation.append(kid)

        # # if MH_acceptance(population[i].posterior_score, kid.posterior_score, 0.0):
        # if kid.posterior_score > population[i].posterior_score:
        #     population[i] = kid
        #     yield kid

    population = nextpopulation

if __name__ == "__main__":
    from LOTlib import break_ctrlc
    from LOTlib.Examples.Number.Model import make_hypothesis, make_data
    from LOTlib.Miscellaneous import qq

    data = make_data(400)

    for h in break_ctrlc(genetic_algorithm(make_hypothesis, data, mutate_lot, crossover_lot)):
        print h.posterior_score, h.get_knower_pattern(), qq(h)
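# A minimal sketch of log-weighted sampling as weighted_sample(population, probs=..., log=True)
# is used above (an illustration of the idea, not LOTlib's implementation): normalize the
# log-weights, exponentiate, and draw one individual in proportion to its weight.

import numpy as np

def weighted_sample_sketch(population, log_probs):
    """Draw one element of population with probability proportional to exp(log_prob)."""
    lp = np.array(log_probs, dtype=float)
    lp -= lp.max()        # stabilize before exponentiating
    w = np.exp(lp)
    w /= w.sum()
    return population[np.random.choice(len(population), p=w)]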