def setUp(self):
    config.StringParameter('test.param0', 'a param', default='foo')
    config.StringParameter('test.param1', 'a param', config.oneof('foo', 'bar'))
    config.IntParameter('test.param2', 'a param', default=20)
    config.IntParameter('test.param3', 'a param', config.atmost(100))
    config.IntParameter('test.param4', 'a param', config.atleast(100))
    config.IntParameter('test.param5', 'a param', config.between(10, 100))
    config.IntParameter('test.param6', 'a param', lambda x: x == 50)
    config.FloatParameter('test.param7', 'a param', config.between(1.3, 2.7))
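# A minimal usage sketch (not part of the test fixture above) showing how such
# parameters are typically read and overridden. 'example.threshold' is a
# hypothetical parameter name, and it is assumed that config.set checks new
# values against the attached validator (here config.between).
from pebl import config

config.IntParameter('example.threshold', 'a bounded param',
                    config.between(10, 100), default=20)

assert config.get('example.threshold') == 20   # the declared default
config.set('example.threshold', 50)            # accepted: within between(10, 100)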
class SparseLearner(Learner):
    """docstring for SparseLearner"""

    _params = (
        config.StringParameter(
            'sparselearner.experiments',
            """List with variables covered by each experiment""",
            default='')
    )

    def __init__(self, data_=None, prior_=None, experiments=None):
        """Create a SparseLearner learner.

        experiments should be a list of experiments, consisting of a tuple of
        (which variables are covered, experiment size).

        """
        super(SparseLearner, self).__init__(data_, prior_)
        self.experiments = experiments

    def run(self):
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, prior_=self.prior)

        self.result.start_run()

        # algorithm here!
        print 'Yay! Nothing to see here...'
        # get prior!
        # complete datasets!
        # self.evaluator.score_network()

        self.result.stop_run()
        return self.result
class EC2Controller(IPython1Controller):
    _params = (
        config.StringParameter(
            'ec2.config',
            'EC2 config file',
            default=''),
        config.IntParameter(
            'ec2.min_count',
            'Minimum number of EC2 instances to create (default=1).',
            default=1),
        config.IntParameter(
            'ec2.max_count',
            """Maximum number of EC2 instances to create
            (default=0 means the same number as ec2.min_count).""",
            default=0)
    )

    def __init__(self, **options):
        config.setparams(self, options)
        self.ec2 = ec2ipy1.EC2Cluster(self.config)
        self.start()

    def __del__(self):
        self.stop()

    def start(self):
        self.ec2.create_instances(self.min_count, self.max_count)

        print "Updating pebl on worker nodes"
        self.ec2.remote_all(
            "cd /usr/local/src/pebl; svn update; python setup.py install")

        self.ec2.start_ipython1(engine_on_controller=True)
        self.ipy1taskcontroller = IPython1Controller(
            self.ec2.task_controller_url)

    def stop(self):
        self.ec2.terminate_instances()

    def submit(self, tasks):
        return self.ipy1taskcontroller.submit(tasks)

    def retrieve(self, deferred_results):
        return self.ipy1taskcontroller.retrieve(deferred_results)

    def run(self, tasks):
        return self.ipy1taskcontroller.run(tasks)
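# A hedged usage sketch for EC2Controller: the constructor boots the EC2
# instances, updates pebl on the workers and starts IPython1; deleting the
# controller terminates the cluster. 'my_ec2.cfg' and 'tasks' are
# hypothetical.
tc = EC2Controller(config='my_ec2.cfg', min_count=2, max_count=4)
results = tc.run(tasks)   # submit/retrieve are delegated to IPython1Controller
del tc                    # __del__ calls stop(), terminating the instances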
class IPython1Controller(_BaseSubmittingController):
    _params = (
        config.StringParameter(
            'ipython1.controller',
            'IPython1 TaskController (default is 127.0.0.1:10113)',
            default='127.0.0.1:10113')
    )

    def __init__(self, tcserver=None):
        """Create an IPython1Controller instance.

        tcserver is the server and port of the IPython1 TaskController.
        It should be of the form <ip>:<port> (default is "127.0.0.1:10113").

        """
        if not ipy1kernel:
            print "IPython1 not found."
            return None

        self.tcserver = tcserver or config.get('ipython1.controller')
        self.tc = ipy1kernel.TaskController(tuple(self.tcserver.split(':')))

    def submit(self, tasks):
        drs = []
        for task in tasks:
            # create an ipython1 task from the pebl task
            ipy1task = ipy1kernel.Task(
                "from pebl.pebl_script import runtask_picklestr; "
                "result = runtask_picklestr(task)",
                resultNames=['result'],
                setupNS={'task': cPickle.dumps(task)})

            task.ipy1_taskid = self.tc.run(ipy1task)
            drs.append(IPython1DeferredResult(self.tc, task.ipy1_taskid))
        return drs

    def retrieve(self, deferred_results):
        # block/wait for all tasks
        taskids = [dr.taskid for dr in deferred_results]
        self.tc.barrier(taskids)

        return [dr.result for dr in deferred_results]
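# A sketch of the submit/retrieve protocol shared by the submitting
# controllers ('mytasks' is assumed to be a list of pebl tasks): submit()
# returns deferred results immediately, and retrieve() blocks on the
# TaskController barrier until every task has finished.
tc = IPython1Controller('127.0.0.1:10113')
deferred = tc.submit(mytasks)
results = tc.retrieve(deferred)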
class LearnerResult:
    """Class for storing any and all output of a learner.

    This is a mutable container for networks and scores. In the future, it
    will also be the place to collect statistics related to the learning task.

    """

    #
    # Parameters
    #
    _params = (
        config.StringParameter(
            'result.filename',
            'The name of the result output file',
            default='result.pebl'),
        config.StringParameter(
            'result.format',
            'The format for the pebl result file (pickle or html)',
            config.oneof('pickle', 'html'),
            default='pickle'),
        config.StringParameter(
            'result.outdir',
            'Directory for html report.',
            default='result'),
        config.IntParameter(
            'result.size',
            """Number of top-scoring networks to save. Specify 0 to indicate
            that all scored networks should be saved.""",
            default=1000)
    )

    def __init__(self, learner_=None, size=None):
        self.data = learner_.data if learner_ else None
        self.nodes = self.data.variables if self.data else None
        self.size = size or config.get('result.size')
        self.networks = []
        self.nethashes = {}
        self.runs = []

    def start_run(self):
        """Indicates that the learner is starting a new run."""
        self.runs.append(LearnerRunStats(time.time()))

    def stop_run(self):
        """Indicates that the learner is stopping a run."""
        self.runs[-1].end = time.time()

    def add_network(self, net, score):
        """Add a network and score to the results."""
        nets = self.networks
        nethashes = self.nethashes
        nethash = hash(net.edges)

        if self.size == 0 or len(nets) < self.size:
            if nethash not in nethashes:
                snet = _ScoredNetwork(copy(net.edges), score)
                insort(nets, snet)
                nethashes[nethash] = 1
        elif score > nets[0].score and nethash not in nethashes:
            nethashes.pop(hash(nets[0].edges))
            nets.remove(nets[0])

            snet = _ScoredNetwork(copy(net.edges), score)
            insort(nets, snet)
            nethashes[nethash] = 1

    def tofile(self, filename=None):
        """Save the result to a python pickle file.

        The result can be later read using the result.fromfile function.

        """
        filename = filename or config.get('result.filename')
        with open(filename, 'w') as fp:
            cPickle.dump(self, fp)

    def tohtml(self, outdir=None):
        """Create a html report of the result.

        outdir is a directory to create html files inside.

        """
        if _can_create_html:
            HtmlFormatter().htmlreport(
                self, outdir or config.get('result.outdir'))
        else:
            print "Cannot create html reports because some dependencies are missing."

    @property
    def posterior(self):
        """Returns a posterior object for this result."""
        return posterior.from_sorted_scored_networks(
            self.nodes, list(reversed(self.networks)))
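# A usage sketch, assuming 'learner' is any pebl learner that has finished
# running; the pickle written by tofile can later be read back with
# result.fromfile, as noted in its docstring.
res = learner.run()
res.tofile('myresult.pebl')   # pickle the result
res.tohtml('myreport')        # html report (needs the optional html deps)
post = res.posterior          # posterior over the saved top-scoring networks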
class SimulatedAnnealingLearner(Learner):
    #
    # Parameters
    #
    _params = (
        config.FloatParameter(
            'simanneal.start_temp',
            "Starting temperature for a run.",
            config.atleast(0.0),
            default=100.0),
        config.FloatParameter(
            'simanneal.delta_temp',
            'Change in temp between steps.',
            config.atleast(0.0),
            default=0.5),
        config.IntParameter(
            'simanneal.max_iters_at_temp',
            'Max iterations at any temperature.',
            config.atleast(0),
            default=100),
        config.StringParameter(
            'simanneal.seed',
            'Starting network for the search.',
            default='')
    )

    def __init__(self, data_=None, prior_=None, **options):
        """Create a Simulated Annealing learner.

        For more information about simulated annealing algorithms, consult:

        1. http://en.wikipedia.org/wiki/Simulated_annealing
        2. D. Heckerman. A Tutorial on Learning with Bayesian Networks.
           Microsoft Technical Report MSR-TR-95-06, 1995. p.35-36.

        Any config param for 'simanneal' can be passed in via options.
        Use just the option part of the parameter name.

        """
        super(SimulatedAnnealingLearner, self).__init__(data_, prior_)
        config.setparams(self, options)
        if not isinstance(self.seed, network.Network):
            self.seed = network.Network(self.data.variables, self.seed)

    def run(self):
        """Run the learner."""
        self.stats = SALearnerStatistics(self.start_temp, self.delta_temp,
                                         self.max_iters_at_temp)
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, self.seed, self.prior)
        self.evaluator.score_network(self.seed.copy())

        self.result.start_run()
        curscore = self.evaluator.score_network()

        # temperature decays exponentially, so we'll never get to 0.
        # So, we continue until temp < 1.
        while self.stats.temp >= 1:
            try:
                newscore = self._alter_network_randomly_and_score()
            except CannotAlterNetworkException:
                return

            self.result.add_network(self.evaluator.network, newscore)

            if self._accept(newscore):
                # set current score
                self.stats.current_score = newscore
                if self.stats.current_score > self.stats.best_score:
                    self.stats.best_score = self.stats.current_score
            else:
                # undo network alteration
                self.evaluator.restore_network()

            # temp not updated EVERY iteration, just whenever criteria are met.
            self.stats.update()

        self.result.stop_run()
        return self.result

    def _accept(self, newscore):
        oldscore = self.stats.current_score

        if newscore >= oldscore:
            return True
        elif random.random() < exp((newscore - oldscore)/self.stats.temp):
            return True
        else:
            return False
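# A usage sketch, assuming a pebl dataset in 'dataset'; keyword options map
# onto the simanneal.* parameters declared above via config.setparams.
learner = SimulatedAnnealingLearner(dataset, start_temp=50.0, delta_temp=0.5)
result = learner.run()
result.tofile('simanneal-result.pebl')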
from pebl import config

#
# Module Parameters
#
_pcontrollertype = config.StringParameter(
    'taskcontroller.type',
    'The task controller to use.',
    default='serial.SerialController'
)

#TODO:test
def fromconfig():
    tctype = config.get('taskcontroller.type')
    tcmodule, tcclass = tctype.split('.')

    mymod = __import__("pebl.taskcontroller.%s" % tcmodule,
                       fromlist=['pebl.taskcontroller'])
    mytc = getattr(mymod, tcclass)
    return mytc()
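# Sketch of how the config-driven lookup above resolves to a controller
# instance. The value set here is the declared default; other controllers are
# assumed to follow the same '<module>.<ClassName>' convention.
config.set('taskcontroller.type', 'serial.SerialController')
tc = fromconfig()   # imports pebl.taskcontroller.serial, returns a SerialController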
import re
import copy
from itertools import groupby

import numpy as N

from pebl.util import *
from pebl import discretizer
from pebl import config

#
# Module parameters
#
_pfilename = config.StringParameter(
    'data.filename',
    'File to read data from.',
    config.fileexists(),
)

_ptext = config.StringParameter(
    'data.text',
    'The text of a dataset included in config file.',
    default=''
)

_pdiscretize = config.IntParameter(
    'data.discretize',
    'Number of bins used to discretize data. Specify 0 to indicate that '
    'data should not be discretized.',
    default=0
)
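# The 'data.text' parameter above implies that a dataset can be embedded
# directly in a pebl config file. A hedged sketch of the more common
# file-based setup (the INI-style section/option layout is an assumption
# about the config module, and 'mydata.txt' is hypothetical):
#
#   [data]
#   filename = mydata.txt
#   discretize = 3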
class ListLearner(Learner):
    #
    # Parameter
    #
    _params = (
        config.StringParameter(
            'listlearner.networks',
            """List of networks, one per line, in network.Network.as_string()
            format.""",
            default='')
    )

    def __init__(self, data_=None, prior_=None, networks=None):
        """Create a ListLearner learner.

        networks should be a list of networks (as network.Network instances).

        """
        super(ListLearner, self).__init__(data_, prior_)
        self.networks = networks
        if not networks:
            variables = self.data.variables
            _net = lambda netstr: network.Network(variables, netstr)
            netstrings = config.get('listlearner.networks').splitlines()
            self.networks = (_net(s) for s in netstrings if s)

    def run(self):
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, prior_=self.prior)

        self.result.start_run()
        for net in self.networks:
            self.result.add_network(net, self.evaluator.score_network(net))
        self.result.stop_run()

        return self.result

    def split(self, count):
        """Split the learner into multiple learners.

        Splits self.networks into `count` parts. This is similar to MPI's
        scatter functionality.

        """
        nets = list(self.networks)
        numnets = len(nets)
        netspertask = numnets / count

        # divide list into parts
        indices = [[i, i + netspertask]
                   for i in xrange(0, numnets, netspertask)]
        if len(indices) > count:
            indices.pop(-1)
            indices[-1][1] = numnets  # extend last part to cover the remaining networks

        return [ListLearner(self.data, self.prior, nets[i:j])
                for i, j in indices]

    def __getstate__(self):
        # convert self.networks from an iterator or generator to a list
        d = self.__dict__
        d['networks'] = list(d['networks'])
        return d
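# Sketch of the scatter-style split described in split()'s docstring;
# 'dataset' and 'nets' (a list of network.Network instances) are assumed.
learner = ListLearner(dataset, networks=nets)
sublearners = learner.split(4)              # four ListLearners over slices of nets
results = [sub.run() for sub in sublearners]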
class MissingDataNetworkEvaluator(SmartNetworkEvaluator):
    #
    # Parameters
    #
    _params = (
        config.IntParameter(
            'gibbs.burnin',
            """Burn-in period for the gibbs sampler (specified as a multiple
            of the number of missing values)""",
            default=10),
        config.StringParameter(
            'gibbs.max_iterations',
            """Stopping criterion for the gibbs sampler.

            The number of Gibbs sampler iterations to run. Should be a valid
            python expression using the variable n (the number of missing
            values). Examples:

                * n**2 (for n-squared iterations)
                * 100  (for 100 iterations)
            """,
            default="n**2")
    )

    def __init__(self, data_, network_, prior_=None, localscore_cache=None,
                 **options):
        """Create a network evaluator for use with missing values.

        This evaluator uses a Gibbs sampler for sampling over the space of
        possible completions for the missing values.

        For more information about Gibbs sampling, consult:

        1. http://en.wikipedia.org/wiki/Gibbs_sampling
        2. D. Heckerman. A Tutorial on Learning with Bayesian Networks.
           Microsoft Technical Report MSR-TR-95-06, 1995. p.21-22.

        Any config param for 'gibbs' can be passed in via options.
        Use just the option part of the parameter name.

        """
        super(MissingDataNetworkEvaluator, self).__init__(data_, network_,
                                                          prior_)
        self._localscore = None  # no cache w/ missing data
        config.setparams(self, options)

    def _init_state(self):
        parents = self.network.edges.parents

        self.cpds = [self._cpd(n, parents(n)) for n in self.datavars]
        self.localscores = N.array([cpd.loglikelihood() for cpd in self.cpds],
                                   dtype=float)
        self.data_dirtynodes = set(self.datavars)

    def _update_dirtynodes(self, add, remove):
        # With hidden nodes:
        #   1. dirtynode calculation is more expensive (need to look beyond
        #      the markov blanket).
        #   2. time spent rescoring observed nodes is insignificant compared
        #      to scoring hidden/missing nodes.
        self.dirtynodes = set(self.datavars)

    def _score_network_with_tempdata(self):
        # update localscore for data_dirtynodes, then calculate globalscore.
        for n in self.data_dirtynodes:
            self.localscores[n] = self.cpds[n].loglikelihood()

        self.data_dirtynodes = set()
        self.score = self._globalscore(self.localscores)
        return self.score

    def _alter_data(self, row, col, value):
        oldrow = self.data.observations[row].copy()
        self.data.observations[row, col] = value

        # update data_dirtynodes
        affected_nodes = set(self.network.edges.children(col) + [col])
        self.data_dirtynodes.update(affected_nodes)

        # update cpds
        for node in affected_nodes:
            datacols = [node] + self.network.edges.parents(node)
            if not self.data.interventions[row, node]:
                self.cpds[node].replace_data(
                    oldrow[datacols],
                    self.data.observations[row][datacols])

    def _alter_data_and_score(self, row, col, value):
        self._alter_data(row, col, value)
        return self._score_network_with_tempdata()

    def _calculate_score(self, chosenscores, gibbs_state):
        # discard the burn-in period scores and average the rest
        burnin_period = self.burnin * \
                        self.data.missing[self.data.missing == True].size

        if gibbs_state:
            # resuming from a previous gibbs run, so no burn-in required.
            scoresum = logsum(
                N.concatenate((chosenscores, [gibbs_state.scoresum])))
            numscores = len(chosenscores) + gibbs_state.numscores
        elif len(chosenscores) > burnin_period:
            # remove scores from the burn-in period.
            nonburn_scores = chosenscores[burnin_period:]
            scoresum = logsum(nonburn_scores)
            numscores = len(nonburn_scores)
        else:
            # this occurs when gibbs iterations were less than the burn-in
            # period.
            scoresum = chosenscores[-1]
            numscores = 1

        score = scoresum - log(numscores)
        return score, numscores

    def _assign_missingvals(self, indices, gibbs_state):
        if gibbs_state:
            assignedvals = gibbs_state.assignedvals
        else:
            arities = [v.arity for v in self.data.variables]
            assignedvals = [random.randint(0, arities[col] - 1)
                            for row, col in indices]

        self.data.observations[unzip(indices)] = assignedvals

    def score_network(self, net=None, gibbs_state=None):
        """Score a network.

        If net is provided, scores that. Otherwise, scores the network
        previously set.

        The default stopping criterion is to run for n**2 iterations.

        gibbs_state is the state of a previous run of the Gibbs sampler.
        With this, one can do the following::

            myeval = evaluator.MissingDataNetworkEvaluator(...)
            myeval.score_network(...)
            gibbs_state = myeval.gibbs_state
            cPickle.dump(gibbs_state, open('gibbs_state.txt', 'w'))

            # look at results, do other analysis, etc.
            # If we decide that we need further Gibbs sampler iterations,
            # we don't need to restart.
            gibbs_state = cPickle.load(open('gibbs_state.txt'))
            myeval = evaluator.MissingDataNetworkEvaluator(...)

            # continue with the previous run of the Gibbs sampler
            myeval.score_network(gibbs_state=gibbs_state)

        """
        self.gibbs_state = gibbs_state
        return super(MissingDataNetworkEvaluator, self).score_network(net)

    def _score_network_core(self):
        # create some useful lists and local variables
        missing_indices = unzip(N.where(self.data.missing == True))
        num_missingvals = len(missing_indices)
        n = num_missingvals
        max_iterations = eval(self.max_iterations)
        arities = [v.arity for v in self.data.variables]
        chosenscores = []

        self._assign_missingvals(missing_indices, self.gibbs_state)
        self._init_state()

        # Gibbs sampling:
        # For each missing value:
        #   1) score the net with each possible value (based on the node's
        #      arity)
        #   2) using a probability wheel, sample a value from the possible
        #      values
        iters = 0
        while iters < max_iterations:
            for row, col in missing_indices:
                scores = [self._alter_data_and_score(row, col, val)
                          for val in xrange(arities[col])]
                chosenval = logscale_probwheel(range(len(scores)), scores)
                self._alter_data(row, col, chosenval)
                chosenscores.append(scores[chosenval])

            iters += num_missingvals

        self.chosenscores = N.array(chosenscores)
        self.score, numscores = self._calculate_score(self.chosenscores,
                                                      self.gibbs_state)

        # save state of gibbs sampler
        self.gibbs_state = GibbsSamplerState(
            avgscore=self.score,
            numscores=numscores,
            assignedvals=self.data.observations[unzip(
                missing_indices)].tolist())

        return self.score
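# A usage sketch for the Gibbs-sampling evaluator, assuming 'dataset' has
# missing values and 'net' is a network.Network; the gibbs.* options map onto
# the parameters declared above.
myeval = MissingDataNetworkEvaluator(dataset, net, max_iterations="n**2")
score = myeval.score_network()
saved_state = myeval.gibbs_state   # picklable; pass back in to resume sampling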
            assignedvals=self.data.observations[N.where(
                self.data.missing == True)].tolist())

        return self.score


#
# Parameters
#
_pmissingdatahandler = config.StringParameter(
    'evaluator.missingdata_evaluator',
    """
    Evaluator to use for handling missing data. Choices include:

        * gibbs: Gibbs sampling
        * maxentropy_gibbs: Gibbs sampling over all completions of the missing
          values that result in maximum entropy discretization for the
          variables.
        * exact: exact enumeration of all possible missing values (only
          usable when there are few missing values)
    """,
    config.oneof('gibbs', 'exact', 'maxentropy_gibbs'),
    default='gibbs'
)

_missingdata_evaluators = {
    'gibbs': MissingDataNetworkEvaluator,
    'exact': MissingDataExactNetworkEvaluator,
    'maxentropy_gibbs': MissingDataMaximumEntropyNetworkEvaluator
}

def fromconfig(data_=None, network_=None, prior_=None):
#
# Exceptions
#
class CannotAlterNetworkException(Exception):
    pass

#
# Module parameters
#
_plearnertype = config.StringParameter(
    'learner.type',
    """Type of learner to use.

    The following learners are included with pebl:
        * greedy.GreedyLearner
        * simanneal.SimulatedAnnealingLearner
        * exhaustive.ListLearner
    """,
    default='greedy.GreedyLearner'
)

_ptasks = config.IntParameter(
    'learner.numtasks',
    "Number of learner tasks to run.",
    config.atleast(0),
    default=1
)

class Learner(Task):
class XgridController(_BaseSubmittingController):
    #
    # Parameters
    #
    _params = (
        config.StringParameter(
            'xgrid.controller',
            'Hostname or IP of the Xgrid controller.',
            default=''),
        config.StringParameter(
            'xgrid.password',
            'Password for the Xgrid controller.',
            default=''),
        config.StringParameter(
            'xgrid.grid',
            'Id of the grid to use at the Xgrid controller.',
            default='0'),
        config.FloatParameter(
            'xgrid.pollinterval',
            'Time (in secs) to wait between polling the Xgrid controller.',
            default=60.0),
        config.StringParameter(
            'xgrid.peblpath',
            'Full path to the pebl script on Xgrid agents.',
            default='pebl')
    )

    def __init__(self, **options):
        """Create an XgridController.

        Any config param for 'xgrid' can be passed in via options.
        Use just the option part of the parameter name.

        """
        config.setparams(self, options)

    @property
    def _grid(self):
        if xg:
            cn = xg.Connection(self.controller, self.password)
            ct = xg.Controller(cn)
            return ct.grid(self.grid)
        return None

    #
    # Public interface
    #
    def submit(self, tasks):
        grid = self._grid

        drs = []
        for task in tasks:
            task.cwd = tempfile.mkdtemp()
            cPickle.dump(task, open(os.path.join(task.cwd, 'task.pebl'), 'w'))
            task.job = grid.submit(self.peblpath, 'runtask task.pebl',
                                   indir=task.cwd)
            drs.append(XgridDeferredResult(grid, task))
        return drs

    def retrieve(self, deferred_results):
        drs = deferred_results

        # poll for job results
        # i'd rather select() or wait() but xgrid doesn't offer that via the
        # xgrid command line app
        done = []
        while drs:
            for i, dr in enumerate(drs):
                if dr.finished:
                    done.append(drs.pop(i))
                    break  # modified drs, so break and re-iterate
            else:
                time.sleep(self.pollinterval)

        return [dr.result for dr in done]
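# A hedged usage sketch for XgridController; the controller address, password
# and 'tasks' list are hypothetical, and keyword options map onto the xgrid.*
# parameters declared above.
tc = XgridController(controller='xgrid.example.com', password='secret',
                     pollinterval=30.0)
deferred = tc.submit(tasks)
results = tc.retrieve(deferred)   # polls the grid until every job finishes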