def setUp(self):
    config.StringParameter('test.param0', 'a param', default='foo')
    config.StringParameter('test.param1', 'a param', config.oneof('foo', 'bar'))
    config.IntParameter('test.param2', 'a param', default=20)
    config.IntParameter('test.param3', 'a param', config.atmost(100))
    config.IntParameter('test.param4', 'a param', config.atleast(100))
    config.IntParameter('test.param5', 'a param', config.between(10, 100))
    config.IntParameter('test.param6', 'a param', lambda x: x == 50)
    config.FloatParameter('test.param7', 'a param', config.between(1.3, 2.7))
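# A minimal sketch of how these validators behave at set-time (this assumes
# the pebl convention that config.set validates the value and raises on
# failure):
#
#   config.set('test.param1', 'bar')   # accepted: oneof('foo', 'bar')
#   config.set('test.param5', 50)      # accepted: between(10, 100)
#   config.set('test.param6', 49)      # rejected: the lambda accepts only 50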
class EC2Controller(IPython1Controller):
    _params = (
        config.StringParameter('ec2.config', 'EC2 config file', default=''),
        config.IntParameter(
            'ec2.min_count',
            'Minimum number of EC2 instances to create (default=1).',
            default=1),
        config.IntParameter(
            'ec2.max_count',
            """Maximum number of EC2 instances to create
            (default=0 means the same number as ec2.min_count).""",
            default=0)
    )

    def __init__(self, **options):
        config.setparams(self, options)
        self.ec2 = ec2ipy1.EC2Cluster(self.config)
        self.start()

    def __del__(self):
        self.stop()

    def start(self):
        self.ec2.create_instances(self.min_count, self.max_count)

        print "Updating pebl on worker nodes"
        self.ec2.remote_all(
            "cd /usr/local/src/pebl; svn update; python setup.py install")

        self.ec2.start_ipython1(engine_on_controller=True)
        self.ipy1taskcontroller = IPython1Controller(
            self.ec2.task_controller_url)

    def stop(self):
        self.ec2.terminate_instances()

    def submit(self, tasks):
        return self.ipy1taskcontroller.submit(tasks)

    def retrieve(self, deferred_results):
        return self.ipy1taskcontroller.retrieve(deferred_results)

    def run(self, tasks):
        return self.ipy1taskcontroller.run(tasks)
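# A hypothetical usage sketch (the config filename and instance counts are
# illustrative, not from the pebl docs; options map to the ec2.* parameters
# above):
#
#   controller = EC2Controller(config='my-ec2.conf', min_count=2, max_count=4)
#   results = controller.run(tasks)   # delegates to the IPython1Controller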
def test_configobj1(self):
    expected = \
"""[test]
param1 = foo
param0 = foo
[test1]
param1 = 5
"""

    config.IntParameter('test1.param1', 'a param', default=5)
    config.set('test.param1', 'foo')
    params = [
        config._parameters.get(x)
        for x in ('test.param0', 'test.param1', 'test1.param1')
    ]

    tmpfile = NamedTemporaryFile(prefix="pebl.test")
    config.configobj(params).write(tmpfile)
    tmpfile.file.seek(0)
    actual = tmpfile.read()

    assert actual == expected
class LearnerResult:
    """Class for storing any and all output of a learner.

    This is a mutable container for networks and scores. In the future, it
    will also be the place to collect statistics related to the learning
    task.

    """

    #
    # Parameters
    #
    _params = (
        config.StringParameter(
            'result.filename',
            'The name of the result output file',
            default='result.pebl'),
        config.StringParameter(
            'result.format',
            'The format for the pebl result file (pickle or html)',
            config.oneof('pickle', 'html'),
            default='pickle'),
        config.StringParameter(
            'result.outdir',
            'Directory for html report.',
            default='result'),
        config.IntParameter(
            'result.size',
            """Number of top-scoring networks to save. Specify 0 to indicate
            that all scored networks should be saved.""",
            default=1000)
    )

    def __init__(self, learner_=None, size=None):
        self.data = learner_.data if learner_ else None
        self.nodes = self.data.variables if self.data else None
        self.size = size or config.get('result.size')
        self.networks = []
        self.nethashes = {}
        self.runs = []

    def start_run(self):
        """Indicates that the learner is starting a new run."""
        self.runs.append(LearnerRunStats(time.time()))

    def stop_run(self):
        """Indicates that the learner is stopping a run."""
        self.runs[-1].end = time.time()

    def add_network(self, net, score):
        """Add a network and score to the results."""
        nets = self.networks
        nethashes = self.nethashes
        nethash = hash(net.edges)

        if self.size == 0 or len(nets) < self.size:
            if nethash not in nethashes:
                snet = _ScoredNetwork(copy(net.edges), score)
                insort(nets, snet)
                nethashes[nethash] = 1
        elif score > nets[0].score and nethash not in nethashes:
            # replace the lowest-scoring saved network with this one
            nethashes.pop(hash(nets[0].edges))
            nets.remove(nets[0])

            snet = _ScoredNetwork(copy(net.edges), score)
            insort(nets, snet)
            nethashes[nethash] = 1

    def tofile(self, filename=None):
        """Save the result to a python pickle file.

        The result can be later read using the result.fromfile function.

        """
        filename = filename or config.get('result.filename')
        with open(filename, 'w') as fp:
            cPickle.dump(self, fp)

    def tohtml(self, outdir=None):
        """Create a html report of the result.

        outdir is a directory to create html files inside.

        """
        if _can_create_html:
            HtmlFormatter().htmlreport(
                self, outdir or config.get('result.outdir'))
        else:
            print "Cannot create html reports because some dependencies are missing."

    @property
    def posterior(self):
        """Returns a posterior object for this result."""
        return posterior.from_sorted_scored_networks(
            self.nodes, list(reversed(self.networks)))
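# A minimal sketch of consuming a LearnerResult (assumes a learner whose
# run() returns one, as the learner classes below do; filenames are
# illustrative):
#
#   res = learner.run()
#   res.tofile('result.pebl')    # pickle for later analysis
#   res.tohtml('report')         # html report, if dependencies are installed
#   post = res.posterior         # scored networks, best-first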
class SimulatedAnnealingLearner(Learner):
    #
    # Parameters
    #
    _params = (
        config.FloatParameter(
            'simanneal.start_temp',
            "Starting temperature for a run.",
            config.atleast(0.0),
            default=100.0),
        config.FloatParameter(
            'simanneal.delta_temp',
            'Change in temp between steps.',
            config.atleast(0.0),
            default=0.5),
        config.IntParameter(
            'simanneal.max_iters_at_temp',
            'Max iterations at any temperature.',
            config.atleast(0),
            default=100),
        config.StringParameter(
            'simanneal.seed',
            'Starting network for the search.',
            default='')
    )

    def __init__(self, data_=None, prior_=None, **options):
        """Create a Simulated Annealing learner.

        For more information about Simulated Annealing algorithms, consult:

            1. http://en.wikipedia.org/wiki/Simulated_annealing
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks.
               Microsoft Technical Report MSR-TR-95-06, 1995. p.35-36.

        Any config param for 'simanneal' can be passed in via options.
        Use just the option part of the parameter name.

        """
        super(SimulatedAnnealingLearner, self).__init__(data_, prior_)
        config.setparams(self, options)
        if not isinstance(self.seed, network.Network):
            self.seed = network.Network(self.data.variables, self.seed)

    def run(self):
        """Run the learner."""
        self.stats = SALearnerStatistics(self.start_temp, self.delta_temp,
                                         self.max_iters_at_temp)
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, self.seed, self.prior)
        self.evaluator.score_network(self.seed.copy())

        self.result.start_run()
        curscore = self.evaluator.score_network()

        # temperature decays exponentially, so we'll never get to 0.
        # So, we continue until temp < 1.
        while self.stats.temp >= 1:
            try:
                newscore = self._alter_network_randomly_and_score()
            except CannotAlterNetworkException:
                return

            self.result.add_network(self.evaluator.network, newscore)

            if self._accept(newscore):
                # accept the alteration and update the current score
                self.stats.current_score = newscore
                if self.stats.current_score > self.stats.best_score:
                    self.stats.best_score = self.stats.current_score
            else:
                # undo network alteration
                self.evaluator.restore_network()

            # temp not updated EVERY iteration, just whenever criteria met.
            self.stats.update()

        self.result.stop_run()
        return self.result

    def _accept(self, newscore):
        oldscore = self.stats.current_score

        if newscore >= oldscore:
            return True
        elif random.random() < exp((newscore - oldscore)/self.stats.temp):
            return True
        else:
            return False
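# A minimal usage sketch (the data file name is illustrative; the keyword
# options map to the simanneal.* parameters defined above):
#
#   from pebl import data
#   from pebl.learner import simanneal
#
#   dataset = data.fromfile('mydata.txt')
#   learner = simanneal.SimulatedAnnealingLearner(dataset, start_temp=50.0)
#   result = learner.run()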
#
# Module parameters
#
_pfilename = config.StringParameter(
    'data.filename',
    'File to read data from.',
    config.fileexists(),
)

_ptext = config.StringParameter(
    'data.text',
    'The text of a dataset included in config file.',
    default='')

_pdiscretize = config.IntParameter(
    'data.discretize',
    'Number of bins used to discretize data. Specify 0 to indicate that '
    'data should not be discretized.',
    default=0
)

#
# Exceptions
#
class ParsingError(Exception):
    """Error encountered while parsing an ill-formed datafile."""
    pass

class IncorrectArityError(Exception):
    """Error encountered when the datafile specifies an incorrect variable
    arity.

    """
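# These module parameters are usually driven from a pebl config file; a
# sketch of the corresponding INI-style section (values are illustrative,
# matching the format written by config.configobj in the tests above):
#
#   [data]
#   filename = mydata.txt
#   discretize = 3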
class MissingDataNetworkEvaluator(SmartNetworkEvaluator):
    #
    # Parameters
    #
    _params = (
        config.IntParameter(
            'gibbs.burnin',
            """Burn-in period for the gibbs sampler (specified as a multiple
            of the number of missing values)""",
            default=10),
        config.StringParameter(
            'gibbs.max_iterations',
            """Stopping criterion for the gibbs sampler.

            The number of Gibbs sampler iterations to run. Should be a valid
            python expression using the variable n (number of missing
            values). Examples:

                * n**2 (for n-squared iterations)
                * 100 (for 100 iterations)

            """,
            default="n**2")
    )

    def __init__(self, data_, network_, prior_=None, localscore_cache=None,
                 **options):
        """Create a network evaluator for use with missing values.

        This evaluator uses a Gibbs sampler to sample over the space of
        possible completions for the missing values.

        For more information about Gibbs sampling, consult:

            1. http://en.wikipedia.org/wiki/Gibbs_sampling
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks.
               Microsoft Technical Report MSR-TR-95-06, 1995. p.21-22.

        Any config param for 'gibbs' can be passed in via options.
        Use just the option part of the parameter name.

        """
        super(MissingDataNetworkEvaluator, self).__init__(data_, network_,
                                                          prior_)
        self._localscore = None  # no cache w/ missing data
        config.setparams(self, options)

    def _init_state(self):
        parents = self.network.edges.parents

        self.cpds = [self._cpd(n, parents(n)) for n in self.datavars]
        self.localscores = N.array([cpd.loglikelihood() for cpd in self.cpds],
                                   dtype=float)
        self.data_dirtynodes = set(self.datavars)

    def _update_dirtynodes(self, add, remove):
        # With hidden nodes:
        #   1. dirtynode calculation is more expensive (need to look beyond
        #      markov blanket).
        #   2. time spent rescoring observed nodes is insignificant compared
        #      to scoring hidden/missing nodes.
        self.dirtynodes = set(self.datavars)

    def _score_network_with_tempdata(self):
        # update localscore for data_dirtynodes, then calculate globalscore.
        for n in self.data_dirtynodes:
            self.localscores[n] = self.cpds[n].loglikelihood()

        self.data_dirtynodes = set()
        self.score = self._globalscore(self.localscores)
        return self.score

    def _alter_data(self, row, col, value):
        oldrow = self.data.observations[row].copy()
        self.data.observations[row, col] = value

        # update data_dirtynodes
        affected_nodes = set(self.network.edges.children(col) + [col])
        self.data_dirtynodes.update(affected_nodes)

        # update cpds
        for node in affected_nodes:
            datacols = [node] + self.network.edges.parents(node)
            if not self.data.interventions[row, node]:
                self.cpds[node].replace_data(
                    oldrow[datacols],
                    self.data.observations[row][datacols])

    def _alter_data_and_score(self, row, col, value):
        self._alter_data(row, col, value)
        return self._score_network_with_tempdata()

    def _calculate_score(self, chosenscores, gibbs_state):
        # discard the burnin period scores and average the rest
        burnin_period = self.burnin * \
                        self.data.missing[self.data.missing==True].size

        if gibbs_state:
            # resuming from a previous gibbs run, so no burnin required.
            scoresum = logsum(
                N.concatenate((chosenscores, [gibbs_state.scoresum])))
            numscores = len(chosenscores) + gibbs_state.numscores
        elif len(chosenscores) > burnin_period:
            # remove scores from burnin period.
            nonburn_scores = chosenscores[burnin_period:]
            scoresum = logsum(nonburn_scores)
            numscores = len(nonburn_scores)
        else:
            # this occurs when gibbs iterations were fewer than the burnin
            # period; fall back to the single most recent score.
            scoresum = chosenscores[-1]
            numscores = 1

        score = scoresum - log(numscores)
        return score, numscores

    def _assign_missingvals(self, indices, gibbs_state):
        if gibbs_state:
            assignedvals = gibbs_state.assignedvals
        else:
            arities = [v.arity for v in self.data.variables]
            assignedvals = [
                random.randint(0, arities[col] - 1) for row, col in indices
            ]

        self.data.observations[unzip(indices)] = assignedvals

    def score_network(self, net=None, gibbs_state=None):
        """Score a network.

        If net is provided, scores that. Otherwise, scores the network
        previously set. The default stopping criterion is to run for n**2
        iterations.

        gibbs_state is the state of a previous run of the Gibbs sampler.
        With this, one can do the following::

            myeval = evaluator.MissingDataNetworkEvaluator(...)
            myeval.score_network(...)
            gibbs_state = myeval.gibbs_state
            cPickle.dump(gibbs_state, open('gibbs_state.txt', 'w'))

            # look at results, do other analysis, etc.
            # If we decide that we need further Gibbs sampler iterations, we
            # don't need to restart.
            gibbs_state = cPickle.load(open('gibbs_state.txt'))
            config.set('gibbs.max_iterations', "200*n**2")
            myeval = evaluator.MissingDataNetworkEvaluator(...)

            # continue with the previous run of the Gibbs sampler
            myeval.score_network(gibbs_state=gibbs_state)

        """
        self.gibbs_state = gibbs_state
        return super(MissingDataNetworkEvaluator, self).score_network(net)

    def _score_network_core(self):
        # create some useful lists and local variables
        missing_indices = unzip(N.where(self.data.missing == True))
        num_missingvals = len(missing_indices)
        n = num_missingvals
        max_iterations = eval(self.max_iterations)
        arities = [v.arity for v in self.data.variables]
        chosenscores = []

        self._assign_missingvals(missing_indices, self.gibbs_state)
        self._init_state()

        # Gibbs Sampling:
        # For each missing value:
        #   1) score net with each possible value (based on node's arity)
        #   2) using a probability wheel, sample a value from the possible
        #      values
        iters = 0
        while iters < max_iterations:
            for row, col in missing_indices:
                scores = [self._alter_data_and_score(row, col, val)
                          for val in xrange(arities[col])]
                chosenval = logscale_probwheel(range(len(scores)), scores)
                self._alter_data(row, col, chosenval)
                chosenscores.append(scores[chosenval])

            iters += num_missingvals

        self.chosenscores = N.array(chosenscores)
        self.score, numscores = self._calculate_score(self.chosenscores,
                                                      self.gibbs_state)

        # save state of gibbs sampler
        self.gibbs_state = GibbsSamplerState(
            avgscore=self.score,
            numscores=numscores,
            assignedvals=self.data.observations[
                unzip(missing_indices)].tolist())

        return self.score
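# A sketch of tuning the sampler via config (gibbs.max_iterations is eval'd
# with n bound to the number of missing values, as in _score_network_core
# above; the evaluator arguments are illustrative):
#
#   config.set('gibbs.max_iterations', "200*n**2")
#   myeval = MissingDataNetworkEvaluator(dataset, net)
#   score = myeval.score_network()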
class LocalscoreCache(object):
    """An LRU cache for local scores.

    Based on code from http://code.activestate.com/recipes/498245/

    """

    _params = (
        config.IntParameter(
            'localscore_cache.maxsize',
            "Max number of localscores to cache. Default=-1 means unlimited "
            "size.",
            default=-1),
    )

    def __init__(self, evaluator, cachesize=None):
        self._cache = {}
        self._queue = deque()
        self._refcount = {}
        self.cachesize = cachesize or config.get('localscore_cache.maxsize')

        self.neteval = evaluator
        self.hits = 0
        self.misses = 0

    def __call__(self, node, parents):
        # make variables local
        _len = len
        _queue = self._queue
        _refcount = self._refcount
        _cache = self._cache
        _maxsize = self.cachesize

        index = tuple([node] + parents)

        # get from cache or compute
        try:
            score = _cache[index]
            self.hits += 1
        except KeyError:
            score = _cache[index] = \
                self.neteval._cpd(node, parents).loglikelihood()
            self.misses += 1

        # if using LRU cache (maxsize > 0)
        if _maxsize > 0:
            # record that key was accessed
            _queue.append(index)
            _refcount[index] = _refcount.get(index, 0) + 1

            # purge LRU entry
            while _len(_cache) > _maxsize:
                k = _queue.popleft()
                _refcount[k] -= 1
                if not _refcount[k]:
                    del _cache[k]
                    del _refcount[k]

            # periodically compact the queue by removing duplicate keys
            if _len(_queue) > _maxsize * 4:
                for i in xrange(_len(_queue)):
                    k = _queue.popleft()
                    if _refcount[k] == 1:
                        _queue.append(k)
                    else:
                        _refcount[k] -= 1

        return score
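# A minimal sketch of the cache in use (the evaluator argument must provide
# a _cpd(node, parents) method, as the evaluators above do; node and parents
# are illustrative):
#
#   cache = LocalscoreCache(myeval, cachesize=10000)
#   s1 = cache(node, parents)   # miss: computes and stores the loglikelihood
#   s2 = cache(node, parents)   # hit: returned straight from the cache
#   print cache.hits, cache.misses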
#
# Module parameters
#
_plearnertype = config.StringParameter(
    'learner.type',
    """Type of learner to use.

    The following learners are included with pebl:
        * greedy.GreedyLearner
        * simanneal.SimulatedAnnealingLearner
        * exhaustive.ListLearner

    """,
    default='greedy.GreedyLearner'
)

_ptasks = config.IntParameter(
    'learner.numtasks',
    "Number of learner tasks to run.",
    config.atleast(0),
    default=1
)

class Learner(Task):
    def __init__(self, data_=None, prior_=None, **kw):
        self.data = data_ or data.fromconfig()
        self.prior = prior_ or prior.fromconfig()
        self.__dict__.update(kw)

        # parameters
        self.numtasks = config.get('learner.numtasks')

        # stats
        self.reverse = 0
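# A sketch of selecting a learner via config instead of code (this assumes a
# fromconfig-style factory that resolves the 'module.ClassName' string in
# learner.type; values are illustrative):
#
#   config.set('learner.type', 'simanneal.SimulatedAnnealingLearner')
#   config.set('learner.numtasks', 5)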
class MultiProcessController(_BaseController):
    #
    # Parameters
    #
    _params = (
        config.IntParameter(
            'multiprocess.poolsize',
            'Number of processes to run concurrently (0 means no limit)',
            default=0),
    )

    def __init__(self, poolsize=None):
        """Creates a task controller that runs tasks on multiple processes.

        This task controller uses a pool of processes rather than spawning
        all processes concurrently. poolsize is the size of this pool and by
        default it is big enough to run all processes concurrently.

        """
        self.poolsize = poolsize or config.get('multiprocess.poolsize')

    def run(self, tasks):
        """Run tasks by creating multiple processes.

        If poolsize was specified when creating this controller, additional
        tasks will be queued.

        """
        tasks = copy(tasks)  # because we do tasks.pop() below
        numtasks = len(tasks)
        poolsize = self.poolsize or numtasks
        running = {}
        done = []
        opjoin = os.path.join

        while len(done) < numtasks:
            # submit tasks (if below poolsize and tasks remain)
            for i in xrange(min(poolsize - len(running), len(tasks))):
                task = tasks.pop()
                task.cwd = tempfile.mkdtemp()
                cPickle.dump(task, open(opjoin(task.cwd, 'task.pebl'), 'w'))
                pid = os.spawnlp(os.P_NOWAIT, PEBL, PEBL, "runtask",
                                 opjoin(task.cwd, "task.pebl"))
                running[pid] = task

            # wait for any child process to finish
            pid, status = os.wait()
            done.append(running.pop(pid, None))

        results = [result.fromfile(opjoin(t.cwd, 'result.pebl'))
                   for t in done]

        # to make the results look like deferred results
        for r in results:
            r.taskid = 0

        # clean up
        for t in done:
            shutil.rmtree(t.cwd)

        return results
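# A minimal usage sketch (the tasks would be pebl Task instances such as the
# learners above; poolsize=0 runs everything concurrently):
#
#   controller = MultiProcessController(poolsize=2)
#   results = controller.run([learner1, learner2, learner3])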