Example #1
    def setUp(self):
        config.StringParameter('test.param0', 'a param', default='foo')
        config.StringParameter('test.param1', 'a param',
                               config.oneof('foo', 'bar'))
        config.IntParameter('test.param2', 'a param', default=20)
        config.IntParameter('test.param3', 'a param', config.atmost(100))
        config.IntParameter('test.param4', 'a param', config.atleast(100))
        config.IntParameter('test.param5', 'a param', config.between(10, 100))
        config.IntParameter('test.param6', 'a param', lambda x: x == 50)
        config.FloatParameter('test.param7', 'a param',
                              config.between(1.3, 2.7))
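
Each declaration above registers a named, typed parameter and an optional validator with pebl's global configuration store. A minimal sketch of reading and updating such a parameter afterwards (assuming pebl's config.get and config.set behave as their names suggest) might look like this:

    from pebl import config

    # 'test.param2' was declared above with default=20, so get() should
    # return 20 until an explicit value is set.
    print(config.get('test.param2'))     # 20
    config.set('test.param2', 42)
    print(config.get('test.param2'))     # 42

    # 'test.param3' was declared with config.atmost(100); a value above
    # 100 is expected to be rejected by that validator.
    config.set('test.param3', 99)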
Example #2
class SparseLearner(Learner):
    """docstring for SparseLearner"""
    _params = (config.StringParameter(
        'sparselearner.experiments',
        """List with variables covered by each experiment""",
        default=''))

    def __init__(self, data_=None, prior_=None, experiments=None):
        """Create a SparseLearner learner
		
		experiments should be a list of experiments, consisting of a tuple of (which variables are covered, experiment size)
		
		"""
        super(SparseLearner, self).__init__(data_, prior_)
        self.experiments = experiments

    def run(self):
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, prior_=self.prior)
        self.result.start_run()

        # algorithm here!
        print 'Yay! Nothing to see here...'
        # get prior!
        # complete datasets!
        # self.evaluator.score_network()

        self.result.stop_run()
        return self.result
Example #3
class EC2Controller(IPython1Controller):
    _params = (config.StringParameter('ec2.config',
                                      'EC2 config file',
                                      default=''),
               config.IntParameter(
                   'ec2.min_count',
                   'Minimum number of EC2 instances to create (default=1).',
                   default=1),
               config.IntParameter(
                   'ec2.max_count',
                   """Maximum number of EC2 instances to create 
            (default=0 means the same number as ec2.min_count).""",
                   default=0))

    def __init__(self, **options):
        config.setparams(self, options)
        self.ec2 = ec2ipy1.EC2Cluster(self.config)
        self.start()

    def __del__(self):
        self.stop()

    def start(self):
        self.ec2.create_instances(self.min_count, self.max_count)

        print "Updating pebl on worker nodes"
        self.ec2.remote_all(
            "cd /usr/local/src/pebl; svn update; python setup.py install")

        self.ec2.start_ipython1(engine_on_controller=True)
        self.ipy1taskcontroller = IPython1Controller(
            self.ec2.task_controller_url)

    def stop(self):
        self.ec2.terminate_instances()

    def submit(self, tasks):
        return self.ipy1taskcontroller.submit(tasks)

    def retrieve(self, deferred_results):
        return self.ipy1taskcontroller.retrieve(deferred_results)

    def run(self, tasks):
        return self.ipy1taskcontroller.run(tasks)
Example #4
class IPython1Controller(_BaseSubmittingController):
    _params = (config.StringParameter(
        'ipython1.controller',
        'IPython1 TaskController (default is 127.0.0.1:10113)',
        default='127.0.0.1:10113'))

    def __init__(self, tcserver=None):
        """Create a IPython1Controller instance.

        tcserver is the server and port of the Ipython1 TaskController. 
        It should be of the form <ip>:<port>. (default is "127.0.0.1:10113").
        
        """

        if not ipy1kernel:
            print "IPython1 not found."
            return None

        self.tcserver = tcserver or config.get('ipython1.controller')
        self.tc = ipy1kernel.TaskController(tuple(self.tcserver.split(':')))

    def submit(self, tasks):
        drs = []
        for task in tasks:
            # create an ipython1 task from pebl task
            ipy1task = ipy1kernel.Task(
                "from pebl.pebl_script import runtask_picklestr; result = runtask_picklestr(task)",
                resultNames=['result'],
                setupNS={'task': cPickle.dumps(task)})

            task.ipy1_taskid = self.tc.run(ipy1task)
            drs.append(IPython1DeferredResult(self.tc, task.ipy1_taskid))
        return drs

    def retrieve(self, deferred_results):
        # block/wait for all tasks
        taskids = [dr.taskid for dr in deferred_results]
        self.tc.barrier(taskids)
        return [dr.result for dr in deferred_results]
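
submit ships each pebl task to a remote engine by pickling it into the task's namespace; the remote expression unpickles it and runs it via pebl's runtask_picklestr. Stripped of IPython1, the same pickle round trip looks roughly like this (run_pickled is a hypothetical stand-in for runtask_picklestr):

    import cPickle

    def run_pickled(picklestr):
        # Hypothetical stand-in: unpickle a task and run it locally.
        task = cPickle.loads(picklestr)
        return task.run()

    # payload = cPickle.dumps(some_task)   # what setupNS={'task': ...} carries
    # result = run_pickled(payload)        # what the remote engine evaluates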
Example #5
class LearnerResult:
    """Class for storing any and all output of a learner.

    This is a mutable container for networks and scores. In the future, it will
    also be the place to collect statistics related to the learning task.

    """

    #
    # Parameters
    #
    _params = (
        config.StringParameter(
            'result.filename',
            'The name of the result output file',
            default='result.pebl'
        ),
        config.StringParameter(
            'result.format',
            'The format for the pebl result file (pickle or html)',
            config.oneof('pickle', 'html'),
            default='pickle'
        ),
        config.StringParameter(
            'result.outdir',
            'Directory for html report.',
            default='result'
        ),
        config.IntParameter(
            'result.size',
            """Number of top-scoring networks to save. Specify 0 to indicate that
            all scored networks should be saved.""",
            default=1000
        )
    )

    def __init__(self, learner_=None, size=None):
        self.data = learner_.data if learner_ else None
        self.nodes = self.data.variables if self.data else None
        self.size = size or config.get('result.size')
        self.networks = []
        self.nethashes = {}
        self.runs = []

    def start_run(self):
        """Indicates that the learner is starting a new run."""
        self.runs.append(LearnerRunStats(time.time()))

    def stop_run(self):
        """Indicates that the learner is stopping a run."""
        self.runs[-1].end = time.time()

    def add_network(self, net, score):
        """Add a network and score to the results."""
        nets = self.networks
        nethashes = self.nethashes
        nethash = hash(net.edges)

        if self.size == 0 or len(nets) < self.size:
            if nethash not in nethashes:
                snet = _ScoredNetwork(copy(net.edges), score)
                insort(nets, snet)
                nethashes[nethash] = 1
        elif score > nets[0].score and nethash not in nethashes:
            nethashes.pop(hash(nets[0].edges))
            nets.remove(nets[0])

            snet = _ScoredNetwork(copy(net.edges), score)
            insort(nets, snet)
            nethashes[nethash] = 1

    def tofile(self, filename=None):
        """Save the result to a python pickle file.

        The result can later be read using the result.fromfile function.
        """

        filename = filename or config.get('result.filename')
        with open(filename, 'w') as fp:
            cPickle.dump(self, fp)
    
    def tohtml(self, outdir=None):
        """Create a html report of the result.

        outdir is a directory to create html files inside.
        """

        if _can_create_html:
            HtmlFormatter().htmlreport(
                self, 
                outdir or config.get('result.outdir')
            )
        else:
            print "Cannot create html reports because some dependencies are missing."

    @property
    def posterior(self):
        """Returns a posterior object for this result."""
        return posterior.from_sorted_scored_networks(
                    self.nodes, 
                    list(reversed(self.networks))
        )
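
add_network above keeps a bounded, sorted list of the best-scoring networks: bisect.insort keeps the list ordered by score, a hash of the edge set filters duplicates, and once the list is full the lowest-scoring entry is evicted. The same bookkeeping in isolation (plain tuples instead of _ScoredNetwork, so this is only a sketch):

    from bisect import insort

    def add_scored(items, seen, item, score, maxsize):
        # items is kept sorted ascending by score; seen maps hash(item) -> 1
        key = hash(item)
        if maxsize == 0 or len(items) < maxsize:
            if key not in seen:
                insort(items, (score, item))
                seen[key] = 1
        elif score > items[0][0] and key not in seen:
            # evict the current worst entry, then insert the new one
            seen.pop(hash(items[0][1]))
            items.pop(0)
            insort(items, (score, item))
            seen[key] = 1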
Example #6
class SimulatedAnnealingLearner(Learner):
    #
    # Parameters
    #
    _params = (
        config.FloatParameter(
            'simanneal.start_temp',
            "Starting temperature for a run.",
            config.atleast(0.0),
            default=100.0
        ),
        config.FloatParameter(
            'simanneal.delta_temp',
            'Change in temp between steps.',
            config.atleast(0.0),
            default=0.5
        ),
        config.IntParameter(
            'simanneal.max_iters_at_temp',
            'Max iterations at any temperature.',
            config.atleast(0),
            default=100
        ),
        config.StringParameter(
            'simanneal.seed',
            'Starting network for the simulated annealing run.',
            default=''
        )
    )

    def __init__(self, data_=None, prior_=None, **options):
        """Create a Simulated Aneaaling learner.

        For more information about Simulated Annealing algorithms, consult:

            1. http://en.wikipedia.org/wiki/Simulated_annealing
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks. 
               Microsoft Technical Report MSR-TR-95-06, 1995. p.35-36.

        Any config param for 'simanneal' can be passed in via options.
        Use just the option part of the parameter name.
        
        """

        super(SimulatedAnnealingLearner,self).__init__(data_, prior_)
        config.setparams(self, options)
        if not isinstance(self.seed, network.Network):
            self.seed = network.Network(self.data.variables, self.seed)
        
    def run(self):
        """Run the learner."""

        self.stats = SALearnerStatistics(self.start_temp, self.delta_temp, 
                                         self.max_iters_at_temp)
        self.result =  result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, self.seed, self.prior)
        self.evaluator.score_network(self.seed.copy())

        self.result.start_run()
        curscore = self.evaluator.score_network()
        
        # temperature decays exponentially, so we'll never get to 0. 
        # So, we continue until temp < 1
        while self.stats.temp >= 1:
            try:
                newscore = self._alter_network_randomly_and_score()
            except CannotAlterNetworkException:
                return

            self.result.add_network(self.evaluator.network, newscore)

            if self._accept(newscore):
                # set current score
                self.stats.current_score = newscore
                if self.stats.current_score > self.stats.best_score:
                    self.stats.best_score = self.stats.current_score
            else:
                # undo network alteration
                self.evaluator.restore_network()

            # temp not updated EVERY iteration. just whenever criteria met.
            self.stats.update() 

        self.result.stop_run()
        return self.result

    def _accept(self, newscore):
        oldscore = self.stats.current_score

        if newscore >= oldscore:
            return True
        elif random.random() < exp((newscore - oldscore)/self.stats.temp):
            return True
        else:
            return False
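
_accept is the standard simulated-annealing (Metropolis) criterion: a better score is always accepted, and a worse one is accepted with probability exp((newscore - oldscore)/temp), so downhill moves become rare as the temperature falls. A standalone illustration of the acceptance probability (not pebl code):

    from math import exp

    def accept_probability(oldscore, newscore, temp):
        # probability that a move from oldscore to newscore is accepted
        if newscore >= oldscore:
            return 1.0
        return exp((newscore - oldscore) / temp)

    # dropping the score by 5 is likely early on and unlikely after cooling
    print(accept_probability(-100.0, -105.0, 100.0))   # ~0.95
    print(accept_probability(-100.0, -105.0, 2.0))     # ~0.08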
Example #7
from pebl import config

#
# Module Parameters
#
_pcontrollertype = config.StringParameter('taskcontroller.type',
                                          'The task controller to use.',
                                          default='serial.SerialController')


#TODO:test
def fromconfig():
    tctype = config.get('taskcontroller.type')
    tcmodule, tcclass = tctype.split('.')
    mymod = __import__("pebl.taskcontroller.%s" % tcmodule,
                       fromlist=['pebl.taskcontroller'])
    mytc = getattr(mymod, tcclass)
    return mytc()
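
fromconfig turns the 'taskcontroller.type' string, e.g. 'serial.SerialController', into a class by splitting it at the dot and importing the submodule dynamically; the fromlist argument makes __import__ return the submodule rather than the top-level package. The same lookup pattern, sketched against a hypothetical package name:

    def load_class(dotted, package='mypkg'):
        # resolve 'module.ClassName' inside the given package (illustrative only)
        modname, classname = dotted.split('.')
        module = __import__("%s.%s" % (package, modname), fromlist=[package])
        return getattr(module, classname)

    # load_class('serial.SerialController', 'pebl.taskcontroller')() would
    # mirror what fromconfig() does with the configured controller type.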
Example #8
import re
import copy
from itertools import groupby

import numpy as N

from pebl.util import *
from pebl import discretizer
from pebl import config

#
# Module parameters
#
_pfilename = config.StringParameter(
    'data.filename',
    'File to read data from.',
    config.fileexists(),
)

_ptext = config.StringParameter(
    'data.text', 'The text of a dataset included in config file.', default='')

_pdiscretize = config.IntParameter(
    'data.discretize',
    'Number of bins used to discretize data. Specify 0 to indicate that '+\
    'data should not be discretized.',
    default=0
)


#
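
These module-level parameters ('data.filename', 'data.text', 'data.discretize') control where the dataset comes from and whether it gets discretized; they are normally supplied in a pebl configuration file or set programmatically before the data module builds a dataset. A minimal sketch, assuming pebl's config.set API:

    from pebl import config

    config.set('data.filename', 'mydata.txt')   # hypothetical input file
    config.set('data.discretize', 3)            # bin continuous values into 3 bins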
Example #9
class ListLearner(Learner):
    #
    # Parameter
    #
    _params = (config.StringParameter(
        'listlearner.networks',
        """List of networks, one per line, in network.Network.as_string()
            format.""",
        default=''))

    def __init__(self, data_=None, prior_=None, networks=None):
        """Create a ListLearner learner.

        networks should be a list of networks (as network.Network instances). 

        """

        super(ListLearner, self).__init__(data_, prior_)
        self.networks = networks
        if not networks:
            variables = self.data.variables
            _net = lambda netstr: network.Network(variables, netstr)
            netstrings = config.get('listlearner.networks').splitlines()
            self.networks = (_net(s) for s in netstrings if s)

    def run(self):
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, prior_=self.prior)

        self.result.start_run()
        for net in self.networks:
            self.result.add_network(net, self.evaluator.score_network(net))
        self.result.stop_run()
        return self.result

    def split(self, count):
        """Split the learner into multiple learners.

        Splits self.networks into `count` parts. This is similar to MPI's
        scatter functionality.
    
        """

        nets = list(self.networks)
        numnets = len(nets)
        netspertask = numnets / count

        # divide list into parts
        indices = [[i, i + netspertask]
                   for i in xrange(0, numnets, netspertask)]
        if len(indices) > count:
            indices.pop(-1)
            indices[-1][1] = numnets - 1

        return [
            ListLearner(self.data, self.prior, nets[i:j]) for i, j in indices
        ]

    def __getstate__(self):
        # convert self.networks from iterators or generators to a list
        d = self.__dict__
        d['networks'] = list(d['networks'])
        return d
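
split carves self.networks into count roughly equal slices with integer index arithmetic; running the same computation on its own makes the slice boundaries concrete (a standalone sketch using // so it also works under Python 3):

    def split_indices(numnets, count):
        # reproduce the index arithmetic used by ListLearner.split
        netspertask = numnets // count
        indices = [[i, i + netspertask] for i in range(0, numnets, netspertask)]
        if len(indices) > count:
            indices.pop(-1)
            indices[-1][1] = numnets - 1
        return indices

    print(split_indices(9, 3))   # [[0, 3], [3, 6], [6, 9]]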
Example #10
class MissingDataNetworkEvaluator(SmartNetworkEvaluator):
    #
    # Parameters
    #
    _params = (config.IntParameter(
        'gibbs.burnin',
        """Burn-in period for the gibbs sampler (specified as a multiple of
            the number of missing values)""",
        default=10),
               config.StringParameter(
                   'gibbs.max_iterations',
                   """Stopping criteria for the gibbs sampler.
            
            The number of Gibbs sampler iterations to run. Should be a valid
            python expression using the variable n (number of missing values).
            Examples:

                * n**2  (for n-squared iterations)
                * 100   (for 100 iterations)
            """,
                   default="n**2"))

    def __init__(self,
                 data_,
                 network_,
                 prior_=None,
                 localscore_cache=None,
                 **options):
        """Create a network evaluator for use with missing values.

        This evaluator uses a Gibbs sampler for sampling over the space of
        possible completions for the missing values.

        For more information about Gibbs sampling, consult:

            1. http://en.wikipedia.org/wiki/Gibbs_sampling
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks. 
               Microsoft Technical Report MSR-TR-95-06, 1995. p.21-22.

       
        Any config param for 'gibbs' can be passed in via options.
        Use just the option part of the parameter name.

        """

        super(MissingDataNetworkEvaluator,
              self).__init__(data_, network_, prior_)
        self._localscore = None  # no cache w/ missing data
        config.setparams(self, options)

    def _init_state(self):
        parents = self.network.edges.parents

        self.cpds = [self._cpd(n, parents(n)) for n in self.datavars]
        self.localscores = N.array([cpd.loglikelihood() for cpd in self.cpds],
                                   dtype=float)
        self.data_dirtynodes = set(self.datavars)

    def _update_dirtynodes(self, add, remove):
        # With hidden nodes:
        #   1. dirtynode calculation is more expensive (need to look beyond
        #      markov blanket).
        #   2. time spent rescoring observed nodes is insignificant compared
        #      to scoring hidden/missing nodes.
        self.dirtynodes = set(self.datavars)

    def _score_network_with_tempdata(self):
        # update localscore for data_dirtynodes, then calculate globalscore.
        for n in self.data_dirtynodes:
            self.localscores[n] = self.cpds[n].loglikelihood()

        self.data_dirtynodes = set()
        self.score = self._globalscore(self.localscores)
        return self.score

    def _alter_data(self, row, col, value):
        oldrow = self.data.observations[row].copy()
        self.data.observations[row, col] = value

        # update data_dirtynodes
        affected_nodes = set(self.network.edges.children(col) + [col])
        self.data_dirtynodes.update(affected_nodes)

        # update cpds
        for node in affected_nodes:
            datacols = [node] + self.network.edges.parents(node)
            if not self.data.interventions[row, node]:
                self.cpds[node].replace_data(
                    oldrow[datacols], self.data.observations[row][datacols])

    def _alter_data_and_score(self, row, col, value):
        self._alter_data(row, col, value)
        return self._score_network_with_tempdata()

    def _calculate_score(self, chosenscores, gibbs_state):
        # discard the burnin period scores and average the rest
        burnin_period = self.burnin * \
                        self.data.missing[self.data.missing==True].size

        if gibbs_state:
            # resuming from a previous gibbs run. so, no burnin required.
            scoresum = logsum(
                N.concatenate((chosenscores, [gibbs_state.scoresum])))
            numscores = len(chosenscores) + gibbs_state.numscores
        elif len(chosenscores) > burnin_period:
            # remove scores from burnin period.
            nonburn_scores = chosenscores[burnin_period:]
            scoresum = logsum(nonburn_scores)
            numscores = len(nonburn_scores)
        else:
            # this occurs when gibbs iterations were less than burnin period.
            scoresum = chosenscores[-1]
            numscores = 1

        score = scoresum - log(numscores)
        return score, numscores

    def _assign_missingvals(self, indices, gibbs_state):
        if gibbs_state:
            assignedvals = gibbs_state.assignedvals
        else:
            arities = [v.arity for v in self.data.variables]
            assignedvals = [
                random.randint(0, arities[col] - 1) for row, col in indices
            ]

        self.data.observations[unzip(indices)] = assignedvals

    def score_network(self, net=None, gibbs_state=None):
        """Score a network.

        If net is provided, scores that. Otherwise, score network previously
        set.

        The default stopping criteria is to run for n**2 iterations.

        gibbs_state is the state of a previous run of the Gibbs sampler.  With
        this, one can do the following::
        
            myeval = evaluator.MissingDataNetworkEvaluator(...)
            myeval.score_network(...)
            gibbs_state = myeval.gibbs_state
            cPickle.dump(gibbs_state, 'gibbs_state.txt')

            # look at results, do other analysis, etc
            # If we decide that we need further Gibbs sampler iterations, we
            # don't need to restart
            gibbs_state = cPickle.load(open('gibbs_state.txt'))
            myeval = evaluator.MissingDataNetworkEvaluator(...)

            # continue with the previous run of the Gibbs sampler
            myeval.score_network(
                gibbs_state=gibbs_state,
                stopping_criteria=lambda i,N: i>200*N**2
            )

        """
        self.gibbs_state = gibbs_state
        return super(MissingDataNetworkEvaluator, self).score_network(net)

    def _score_network_core(self):
        # create some useful lists and local variables
        missing_indices = unzip(N.where(self.data.missing == True))
        num_missingvals = len(missing_indices)
        n = num_missingvals
        max_iterations = eval(self.max_iterations)
        arities = [v.arity for v in self.data.variables]
        chosenscores = []

        self._assign_missingvals(missing_indices, self.gibbs_state)
        self._init_state()

        # Gibbs Sampling:
        # For each missing value:
        #    1) score net with each possible value (based on node's arity)
        #    2) using a probability wheel, sample a value from the possible values
        iters = 0
        while iters < max_iterations:
            for row, col in missing_indices:
                scores = [self._alter_data_and_score(row, col, val) \
                             for val in xrange(arities[col])]
                chosenval = logscale_probwheel(range(len(scores)), scores)
                self._alter_data(row, col, chosenval)
                chosenscores.append(scores[chosenval])

            iters += num_missingvals

        self.chosenscores = N.array(chosenscores)
        self.score, numscores = self._calculate_score(self.chosenscores,
                                                      self.gibbs_state)

        # save state of gibbs sampler
        self.gibbs_state = GibbsSamplerState(
            avgscore=self.score,
            numscores=numscores,
            assignedvals=self.data.observations[unzip(
                missing_indices)].tolist())

        return self.score
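
_calculate_score averages the sampled scores in log space: logsum(scores) - log(numscores) is the log of the arithmetic mean of the exponentiated scores, which avoids underflow when the log-likelihoods are very negative. A standalone equivalent, assuming pebl's logsum behaves like numpy's logaddexp.reduce:

    import numpy as N

    def log_mean(logscores):
        # log of mean(exp(logscores)), computed stably in log space
        return N.logaddexp.reduce(logscores) - N.log(len(logscores))

    print(log_mean(N.array([-1000.0, -1001.0, -1002.0])))   # ~ -1000.69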
Example #11
            assignedvals=self.data.observations[N.where(
                self.data.missing == True)].tolist())

        return self.score


#
# Parameters
#
_pmissingdatahandler = config.StringParameter(
    'evaluator.missingdata_evaluator',
    """
    Evaluator to use for handling missing data. Choices include:
        * gibbs: Gibbs sampling
        * maxentropy_gibbs: Gibbs sampling over all completions of the
          missing values that result in maximum entropy discretization for the
          variables.
        * exact: exact enumeration of all possible missing values (only
          usable when there are few missing values)
    """,
    config.oneof('gibbs', 'exact', 'maxentropy_gibbs'),
    default='gibbs')

_missingdata_evaluators = {
    'gibbs': MissingDataNetworkEvaluator,
    'exact': MissingDataExactNetworkEvaluator,
    'maxentropy_gibbs': MissingDataMaximumEntropyNetworkEvaluator
}


def fromconfig(data_=None, network_=None, prior_=None):
Example #12
#
# Exceptions
#
class CannotAlterNetworkException(Exception):
    pass

#
# Module parameters
#
_plearnertype = config.StringParameter(
    'learner.type',
    """Type of learner to use. 

    The following learners are included with pebl:
        * greedy.GreedyLearner
        * simanneal.SimulatedAnnealingLearner
        * exhaustive.ListLearner
    """,
    default = 'greedy.GreedyLearner'
)

_ptasks = config.IntParameter(
    'learner.numtasks',
    "Number of learner tasks to run.",
    config.atleast(0),
    default=1
)


class Learner(Task):
Example #13
class XgridController(_BaseSubmittingController):
    #
    # Parameters
    #
    _params = (
        config.StringParameter('xgrid.controller',
                               'Hostname or IP of the Xgrid controller.',
                               default=''),
        config.StringParameter('xgrid.password',
                               'Password for the Xgrid controller.',
                               default=''),
        config.StringParameter(
            'xgrid.grid',
            'Id of the grid to use at the Xgrid controller.',
            default='0'),
        config.FloatParameter(
            'xgrid.pollinterval',
            'Time (in secs) to wait between polling the Xgrid controller.',
            default=60.0),
        config.StringParameter('xgrid.peblpath',
                               'Full path to the pebl script on Xgrid agents.',
                               default='pebl'))

    def __init__(self, **options):
        """Create a XGridController.

        Any config param for 'xgrid' can be passed in via options.
        Use just the option part of the parameter name.
        
        """
        config.setparams(self, options)

    @property
    def _grid(self):
        if xg:
            cn = xg.Connection(self.controller, self.password)
            ct = xg.Controller(cn)
            return ct.grid(self.grid)

        return None

    #
    # Public interface
    #
    def submit(self, tasks):
        grid = self._grid

        drs = []
        for task in tasks:
            task.cwd = tempfile.mkdtemp()
            cPickle.dump(task, open(os.path.join(task.cwd, 'task.pebl'), 'w'))
            task.job = grid.submit(self.peblpath,
                                   'runtask task.pebl',
                                   indir=task.cwd)
            drs.append(XgridDeferredResult(grid, task))
        return drs

    def retrieve(self, deferred_results):
        drs = deferred_results

        # poll for job results
        # i'd rather select() or wait() but xgrid doesn't offer that via the
        # xgrid command line app
        done = []
        while drs:
            for i, dr in enumerate(drs):
                if dr.finished:
                    done.append(drs.pop(i))
                    break  # modified drs, so break and re-iterate
            else:
                time.sleep(self.pollinterval)

        return [dr.result for dr in done]
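
The polling loop above relies on Python's for/else: the else branch runs only when the for loop completes without hitting break, i.e. when no outstanding result finished during that pass, and only then does the controller sleep. The same pattern in isolation (is_done is a hypothetical predicate):

    import time

    def poll_until_done(pending, is_done, interval=1.0):
        # collect items as they finish, sleeping between unproductive passes
        done = []
        while pending:
            for i, item in enumerate(pending):
                if is_done(item):
                    done.append(pending.pop(i))
                    break          # pending was modified; start a fresh pass
            else:
                time.sleep(interval)
        return done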