Esempio n. 1
0
 def setUp(self):
     # Register the fixture parameters test.param0 .. test.param7: two string
     # parameters, five int parameters and one float parameter, covering
     # defaults, the oneof/atmost/atleast/between validators and an ad-hoc
     # lambda validator.
     desc = 'a param'
     string_param = config.StringParameter
     int_param = config.IntParameter
     float_param = config.FloatParameter

     string_param('test.param0', desc, default='foo')
     string_param('test.param1', desc, config.oneof('foo', 'bar'))
     int_param('test.param2', desc, default=20)
     int_param('test.param3', desc, config.atmost(100))
     int_param('test.param4', desc, config.atleast(100))
     int_param('test.param5', desc, config.between(10, 100))
     int_param('test.param6', desc, lambda value: value == 50)
     float_param('test.param7', desc, config.between(1.3, 2.7))
Esempio n. 2
0
class EC2Controller(IPython1Controller):
    """Task controller that runs tasks on an EC2 cluster through IPython1.

    Constructing the controller creates the EC2 instances and starts
    IPython1 on them; the instances are terminated by stop(), which is also
    invoked when the controller object is destroyed. submit(), retrieve()
    and run() are forwarded to an IPython1Controller bound to the cluster's
    task controller url.

    """

    _params = (
        config.StringParameter(
            'ec2.config',
            'EC2 config file',
            default=''
        ),
        config.IntParameter(
            'ec2.min_count',
            'Minimum number of EC2 instances to create (default=1).',
            default=1
        ),
        config.IntParameter(
            'ec2.max_count',
            """Maximum number of EC2 instances to create 
            (default=0 means the same number as ec2.min_count).""",
            default=0
        )
    )

    def __init__(self, **options):
        """Create the EC2 cluster and start it.

        Any config param for 'ec2' can be passed in via options.
        """
        # presumably sets self.config, self.min_count and self.max_count
        # (they are read below but assigned nowhere else in this class).
        config.setparams(self, options)
        self.ec2 = ec2ipy1.EC2Cluster(self.config)
        self.start()

    def __del__(self):
        # __init__ can fail before self.ec2 is assigned (bad options, cluster
        # construction error). Without this guard the finalizer would raise
        # an AttributeError on top of the original failure.
        if hasattr(self, 'ec2'):
            self.stop()

    def start(self):
        """Create the instances, update pebl on them and start IPython1."""
        self.ec2.create_instances(self.min_count, self.max_count)

        print("Updating pebl on worker nodes")
        self.ec2.remote_all(
            "cd /usr/local/src/pebl; svn update; python setup.py install")

        self.ec2.start_ipython1(engine_on_controller=True)
        # All task operations are delegated to this controller.
        self.ipy1taskcontroller = IPython1Controller(
            self.ec2.task_controller_url)

    def stop(self):
        """Terminate the EC2 instances."""
        self.ec2.terminate_instances()

    def submit(self, tasks):
        """Submit tasks via the underlying IPython1 task controller."""
        return self.ipy1taskcontroller.submit(tasks)

    def retrieve(self, deferred_results):
        """Retrieve results via the underlying IPython1 task controller."""
        return self.ipy1taskcontroller.retrieve(deferred_results)

    def run(self, tasks):
        """Run tasks via the underlying IPython1 task controller."""
        return self.ipy1taskcontroller.run(tasks)
Esempio n. 3
0
    def test_configobj1(self):
        # Writing a configobj built from three parameters must produce one
        # section per parameter prefix, in the order checked below.
        config.IntParameter('test1.param1', 'a param', default=5)
        config.set('test.param1', 'foo')

        selected = []
        for name in ('test.param0', 'test.param1', 'test1.param1'):
            selected.append(config._parameters.get(name))

        expected = (
            "[test]\n"
            "param1 = foo\n"
            "param0 = foo\n"
            "\n"
            "[test1]\n"
            "param1 = 5\n"
            "\n"
        )

        tmpfile = NamedTemporaryFile(prefix="pebl.test")
        config.configobj(selected).write(tmpfile)

        # rewind the underlying file before reading back what was written
        tmpfile.file.seek(0)
        actual = tmpfile.read()
        assert actual == expected
Esempio n. 4
0
class LearnerResult:
    """Class for storing any and all output of a learner.

    This is a mutable container for networks and scores. Networks are kept
    sorted in self.networks (inserted with insort) and de-duplicated by the
    hash of their edges; the code treats networks[0] as the lowest-scoring
    entry. At most `size` networks are retained, where a size of 0 means
    that every scored network is kept. One LearnerRunStats entry per learner
    run is collected in self.runs.

    """

    #
    # Parameters
    #
    _params = (
        config.StringParameter(
            'result.filename',
            'The name of the result output file',
            default='result.pebl'
        ),
        config.StringParameter(
            'result.format',
            'The format for the pebl result file (pickle or html)',
            config.oneof('pickle', 'html'),
            default='pickle'
        ),
        config.StringParameter(
            'result.outdir',
            'Directory for html report.',
            default='result'
        ),
        config.IntParameter(
            'result.size',
            """Number of top-scoring networks to save. Specify 0 to indicate that
            all scored networks should be saved.""",
            default=1000
        )
    )

    def __init__(self, learner_=None, size=None):
        """Create an empty result.

        learner_ is the learner whose output is stored; its data (and the
        data's variables) are remembered when it is given.

        size is the maximum number of networks to keep. None means "use the
        'result.size' config parameter"; 0 means "keep all networks".
        """
        self.data = learner_.data if learner_ else None
        self.nodes = self.data.variables if self.data else None
        # Compare against None explicitly: `size or config.get(...)` would
        # silently replace an explicit size=0 (keep everything) with the
        # configured value.
        if size is None:
            size = config.get('result.size')
        self.size = size
        self.networks = []
        self.nethashes = {}
        self.runs = []

    def start_run(self):
        """Indicates that the learner is starting a new run."""
        self.runs.append(LearnerRunStats(time.time()))

    def stop_run(self):
        """Indicates that the learner is stopping a run."""
        self.runs[-1].end = time.time()

    def add_network(self, net, score):
        """Add a network and score to the results.

        A network whose edges hash is already stored is ignored. Once the
        result is full (size != 0 and len(networks) >= size), the new network
        replaces networks[0], but only when its score is strictly greater
        than that entry's score.
        """
        nets = self.networks
        nethashes = self.nethashes
        nethash = hash(net.edges)

        is_full = self.size != 0 and len(nets) >= self.size
        if is_full:
            # Same evaluation order as before: score check first, then the
            # duplicate check.
            if not (score > nets[0].score and nethash not in nethashes):
                return
            # make room by evicting the entry at the front of the sorted list
            nethashes.pop(hash(nets[0].edges))
            del nets[0]
        elif nethash in nethashes:
            return

        # store a copy of the edges so later changes to net don't leak in
        insort(nets, _ScoredNetwork(copy(net.edges), score))
        nethashes[nethash] = 1

    def tofile(self, filename=None):
        """Save the result to a python pickle file.

        filename defaults to the 'result.filename' config parameter.
        The result can be later read using the result.fromfile function.
        """

        filename = filename or config.get('result.filename')
        with open(filename, 'w') as fp:
            cPickle.dump(self, fp)

    def tohtml(self, outdir=None):
        """Create a html report of the result.

        outdir is a directory to create html files inside; it defaults to
        the 'result.outdir' config parameter. If the html dependencies are
        not available, a message is printed and nothing is written.
        """

        if _can_create_html:
            HtmlFormatter().htmlreport(
                self,
                outdir or config.get('result.outdir')
            )
        else:
            print("Cannot create html reports because some dependencies are missing.")

    @property
    def posterior(self):
        """Returns a posterior object for this result."""
        # self.networks is handed over in reverse of its stored order.
        return posterior.from_sorted_scored_networks(
            self.nodes,
            list(reversed(self.networks))
        )
Esempio n. 5
0
class SimulatedAnnealingLearner(Learner):
    """Learner that searches for networks using simulated annealing."""

    #
    # Parameters
    #
    _params = (
        config.FloatParameter(
            'simanneal.start_temp',
            "Starting temperature for a run.",
            config.atleast(0.0),
            default=100.0
        ),
        config.FloatParameter(
            'simanneal.delta_temp',
            'Change in temp between steps.',
            config.atleast(0.0),
            default=0.5
        ),
        config.IntParameter(
            'simanneal.max_iters_at_temp',
            'Max iterations at any temperature.',
            config.atleast(0),
            default=100
        ),
        config.StringParameter(
            'simanneal.seed',
            'Starting network for a greedy search.',
            default=''
        )
    )

    def __init__(self, data_=None, prior_=None, **options):
        """Create a Simulated Annealing learner.

        For more information about Simulated Annealing algorithms, consult:

            1. http://en.wikipedia.org/wiki/Simulated_annealing
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks. 
               Microsoft Technical Report MSR-TR-95-06, 1995. p.35-36.

        Any config param for 'simanneal' can be passed in via options.
        Use just the option part of the parameter name.

        The seed may be given either as a network.Network instance or as a
        value accepted by the network.Network constructor.

        """

        super(SimulatedAnnealingLearner, self).__init__(data_, prior_)
        config.setparams(self, options)
        # normalize the seed so the rest of the class can rely on a Network
        if not isinstance(self.seed, network.Network):
            self.seed = network.Network(self.data.variables, self.seed)

    def run(self):
        """Run the learner.

        Returns the LearnerResult holding every network scored during the
        run. The run ends when the temperature drops below 1 or when the
        network can no longer be altered.
        """

        self.stats = SALearnerStatistics(self.start_temp, self.delta_temp,
                                         self.max_iters_at_temp)
        self.result = result.LearnerResult(self)
        self.evaluator = evaluator.fromconfig(self.data, self.seed, self.prior)
        self.evaluator.score_network(self.seed.copy())

        self.result.start_run()
        # NOTE(review): the initial score is computed but never stored (the
        # original bound it to an unused local). Confirm whether
        # self.stats.current_score is meant to start from this value.
        self.evaluator.score_network()

        # temperature decays exponentially, so we'll never get to 0.
        # So, we continue until temp < 1
        while self.stats.temp >= 1:
            try:
                newscore = self._alter_network_randomly_and_score()
            except CannotAlterNetworkException:
                # Stop searching, but fall through so the run is closed and
                # the networks collected so far are still returned. A bare
                # `return` here handed None to the caller.
                break

            self.result.add_network(self.evaluator.network, newscore)

            if self._accept(newscore):
                # set current score
                self.stats.current_score = newscore
                if self.stats.current_score > self.stats.best_score:
                    self.stats.best_score = self.stats.current_score
            else:
                # undo network alteration
                self.evaluator.restore_network()

            # temp not updated EVERY iteration. just whenever criteria met.
            self.stats.update()

        self.result.stop_run()
        return self.result

    def _accept(self, newscore):
        """Decide whether to keep an altered network.

        A score at least as good as the current one is always accepted. A
        worse score is accepted with probability
        exp((newscore - oldscore) / temp).
        """
        oldscore = self.stats.current_score

        if newscore >= oldscore:
            return True
        return random.random() < exp((newscore - oldscore)/self.stats.temp)
Esempio n. 6
0
#
# Module parameters
#
# Path of the data file; the value is validated with config.fileexists().
_pfilename = config.StringParameter(
    'data.filename', 'File to read data from.', config.fileexists())

# Inline dataset text, as an alternative to a separate data file.
_ptext = config.StringParameter(
    'data.text',
    'The text of a dataset included in config file.',
    default='',
)

# Number of discretization bins (0 disables discretization).
_pdiscretize = config.IntParameter(
    'data.discretize',
    ('Number of bins used to discretize data. Specify 0 to indicate that '
     'data should not be discretized.'),
    default=0,
)


#
# Exceptions
#
class ParsingError(Exception):
    """Raised when a datafile is ill-formed and cannot be parsed."""


class IncorrectArityError(Exception):
    """Error encountered when the datafile speifies an incorrect variable arity.
Esempio n. 7
0
class MissingDataNetworkEvaluator(SmartNetworkEvaluator):
    """Network evaluator for datasets with missing values.

    Scores are obtained by Gibbs sampling over the possible completions of
    the missing values; the state of the sampler after a scoring run is kept
    in self.gibbs_state so that sampling can be resumed later.

    """

    #
    # Parameters
    #
    _params = (
        config.IntParameter(
            'gibbs.burnin',
            """Burn-in period for the gibbs sampler (specified as a multiple of
            the number of missing values)""",
            default=10
        ),
        config.StringParameter(
            'gibbs.max_iterations',
            """Stopping criteria for the gibbs sampler.
            
            The number of Gibb's sampler iterations to run. Should be a valid
            python expression using the variable n (number of missing values).
            Examples:

                * n**2  (for n-squared iterations)
                * 100   (for 100 iterations)
            """,
            default="n**2"
        )
    )

    def __init__(self,
                 data_,
                 network_,
                 prior_=None,
                 localscore_cache=None,
                 **options):
        """Create a network evaluator for use with missing values.

        This evaluator uses a Gibb's sampler for sampling over the space of
        possible completions for the missing values.

        For more information about Gibb's sampling, consult:

            1. http://en.wikipedia.org/wiki/Gibbs_sampling
            2. D. Heckerman. A Tutorial on Learning with Bayesian Networks. 
               Microsoft Technical Report MSR-TR-95-06, 1995. p.21-22.

        localscore_cache is accepted for signature compatibility but is not
        used: no localscore cache is kept when data has missing values.

        Any config param for 'gibbs' can be passed in via options.
        Use just the option part of the parameter name.

        """

        super(MissingDataNetworkEvaluator,
              self).__init__(data_, network_, prior_)
        self._localscore = None  # no cache w/ missing data
        config.setparams(self, options)

    def _init_state(self):
        """Build the cpds and local scores for the current data."""
        parents = self.network.edges.parents

        self.cpds = [self._cpd(n, parents(n)) for n in self.datavars]
        self.localscores = N.array([cpd.loglikelihood() for cpd in self.cpds],
                                   dtype=float)
        self.data_dirtynodes = set(self.datavars)

    # This was nested inside _init_state, where it was never called; a
    # function taking `self` and assigning self.dirtynodes is meant to be a
    # method, so it is defined at class level.
    def _update_dirtynodes(self, add, remove):
        """Mark every data variable as dirty, regardless of add/remove."""
        # With hidden nodes:
        #   1. dirtynode calculation is more expensive (need to look beyond
        #      markov blanket).
        #   2. time spent rescoring observed nodes is insignificant compared
        #      to scoring hidden/missing nodes.
        self.dirtynodes = set(self.datavars)

    def _score_network_with_tempdata(self):
        """Rescore the nodes touched by data changes; return the new score."""
        # update localscore for data_dirtynodes, then calculate globalscore.
        for n in self.data_dirtynodes:
            self.localscores[n] = self.cpds[n].loglikelihood()

        self.data_dirtynodes = set()
        self.score = self._globalscore(self.localscores)
        return self.score

    def _alter_data(self, row, col, value):
        """Set observations[row, col] to value and update the affected cpds."""
        # keep the old row so its counts can be swapped out of the cpds
        oldrow = self.data.observations[row].copy()
        self.data.observations[row, col] = value

        # update data_dirtynodes
        affected_nodes = set(self.network.edges.children(col) + [col])
        self.data_dirtynodes.update(affected_nodes)

        # update cpds
        for node in affected_nodes:
            datacols = [node] + self.network.edges.parents(node)
            # rows where the node was intervened upon are left out of its cpd
            if not self.data.interventions[row, node]:
                self.cpds[node].replace_data(
                    oldrow[datacols], self.data.observations[row][datacols])

    def _alter_data_and_score(self, row, col, value):
        """Apply a data change and return the resulting network score."""
        self._alter_data(row, col, value)
        return self._score_network_with_tempdata()

    def _calculate_score(self, chosenscores, gibbs_state):
        """Average the sampled scores in log space.

        Returns a (score, numscores) tuple, where numscores is the number of
        samples that went into the average.
        """
        # discard the burnin period scores and average the rest
        burnin_period = self.burnin * \
                        self.data.missing[self.data.missing==True].size

        if gibbs_state:
            # resuming from a previous gibbs run. so, no burnin required.
            scoresum = logsum(
                N.concatenate((chosenscores, [gibbs_state.scoresum])))
            numscores = len(chosenscores) + gibbs_state.numscores
        elif len(chosenscores) > burnin_period:
            # remove scores from burnin period.
            nonburn_scores = chosenscores[burnin_period:]
            scoresum = logsum(nonburn_scores)
            numscores = len(nonburn_scores)
        else:
            # this occurs when gibbs iterations were less than burnin period.
            scoresum = chosenscores[-1]
            numscores = 1

        score = scoresum - log(numscores)
        return score, numscores

    def _assign_missingvals(self, indices, gibbs_state):
        """Fill the missing cells: from gibbs_state if given, else randomly."""
        if gibbs_state:
            assignedvals = gibbs_state.assignedvals
        else:
            arities = [v.arity for v in self.data.variables]
            assignedvals = [
                random.randint(0, arities[col] - 1) for row, col in indices
            ]

        self.data.observations[unzip(indices)] = assignedvals

    def score_network(self, net=None, gibbs_state=None):
        """Score a network.

        If net is provided, scores that. Otherwise, score network previously
        set.

        The default stopping criteria is to run for n**2 iterations; the
        'gibbs.max_iterations' parameter changes it.

        gibbs_state is the state of a previous run of the Gibb's sampler.  With
        this, one can do the following::

            myeval = evaluator.MissingDataNetworkEvaluator(...)
            myeval.score_network(...)
            gibbs_state = myeval.gibbs_state
            cPickle.dump(gibbs_state, open('gibbs_state.txt', 'w'))

            # look at results, do other analysis, etc
            # If we decide that we need further Gibb's sampler iterations, we
            # don't need to restart
            gibbs_state = cPickle.load(open('gibbs_state.txt'))
            myeval = evaluator.MissingDataNetworkEvaluator(...)

            # continue with the previous run of the Gibb's sampler
            myeval.score_network(gibbs_state=gibbs_state)

        """
        self.gibbs_state = gibbs_state
        return super(MissingDataNetworkEvaluator, self).score_network(net)

    def _score_network_core(self):
        """Run the Gibbs sampler and return the averaged score."""
        # create some useful lists and local variables
        missing_indices = unzip(N.where(self.data.missing == True))
        num_missingvals = len(missing_indices)
        # `n` is not otherwise used: it is the variable that the
        # max_iterations expression is documented to refer to.
        n = num_missingvals
        # NOTE(security): eval() executes the 'gibbs.max_iterations' value as
        # arbitrary Python; only use configuration from a trusted source.
        max_iterations = eval(self.max_iterations)
        arities = [v.arity for v in self.data.variables]
        chosenscores = []

        self._assign_missingvals(missing_indices, self.gibbs_state)
        self._init_state()

        # Gibbs Sampling:
        # For each missing value:
        #    1) score net with each possible value (based on node's arity)
        #    2) using a probability wheel, sample a value from the possible values
        #
        # NOTE(review): iters advances by num_missingvals per sweep, so with
        # no missing values and a positive max_iterations (e.g. "100") this
        # loop would never terminate. Confirm callers only use this evaluator
        # on data that has missing values.
        iters = 0
        while iters < max_iterations:
            for row, col in missing_indices:
                scores = [self._alter_data_and_score(row, col, val) \
                             for val in xrange(arities[col])]
                chosenval = logscale_probwheel(range(len(scores)), scores)
                self._alter_data(row, col, chosenval)
                chosenscores.append(scores[chosenval])

            iters += num_missingvals

        self.chosenscores = N.array(chosenscores)
        self.score, numscores = self._calculate_score(self.chosenscores,
                                                      self.gibbs_state)

        # save state of gibbs sampler
        self.gibbs_state = GibbsSamplerState(
            avgscore=self.score,
            numscores=numscores,
            assignedvals=self.data.observations[unzip(
                missing_indices)].tolist())

        return self.score
Esempio n. 8
0
class LocalscoreCache(object):
    """Least-recently-used cache of local scores, keyed by (node, parents).

    Based on code from http://code.activestate.com/recipes/498245/
    """

    # NOTE(review): the parentheses carry no trailing comma, so this is the
    # parameter object itself rather than a one-element tuple.
    _params = (config.IntParameter(
        'localscore_cache.maxsize',
        "Max number of localscores to cache. Default=-1 means unlimited size.",
        default=-1))

    def __init__(self, evaluator, cachesize=None):
        # evaluator supplies _cpd(node, parents) for scores not yet cached
        self.neteval = evaluator
        self.cachesize = cachesize or config.get('localscore_cache.maxsize')

        # hit/miss counters
        self.hits = 0
        self.misses = 0

        # score store, access history, and per-key count of history entries
        self._cache = {}
        self._queue = deque()
        self._refcount = {}

    def __call__(self, node, parents):
        """Return the local score of node given parents, caching the result."""
        cache = self._cache
        key = tuple([node] + parents)

        # look the score up, computing and storing it on a miss
        if key in cache:
            score = cache[key]
            self.hits += 1
        else:
            score = cache[key] = self.neteval._cpd(node,
                                                   parents).loglikelihood()
            self.misses += 1

        limit = self.cachesize
        if not limit > 0:
            # unbounded cache: no access bookkeeping needed
            return score

        queue = self._queue
        refcount = self._refcount

        # note this access in the history
        queue.append(key)
        refcount[key] = refcount.get(key, 0) + 1

        # evict least recently used keys until the cache fits again
        while len(cache) > limit:
            oldest = queue.popleft()
            refcount[oldest] -= 1
            if not refcount[oldest]:
                del cache[oldest]
                del refcount[oldest]

        # once the history has grown well past the cache size, squeeze out
        # duplicate entries
        if len(queue) > limit * 4:
            for _ in xrange(len(queue)):
                entry = queue.popleft()
                if refcount[entry] != 1:
                    refcount[entry] -= 1
                    continue
                queue.append(entry)

        return score
Esempio n. 9
0
#
# Which learner implementation to use.
_plearnertype = config.StringParameter(
    'learner.type',
    """Type of learner to use. 

    The following learners are included with pebl:
        * greedy.GreedyLearner
        * simanneal.SimulatedAnnealingLearner
        * exhaustive.ListLearner
    """,
    default='greedy.GreedyLearner',
)

# How many learner tasks to run (validated with config.atleast(0)).
_ptasks = config.IntParameter(
    'learner.numtasks', "Number of learner tasks to run.", config.atleast(0),
    default=1)


class Learner(Task):
    """Base learner task holding the data, the prior and run settings."""

    def __init__(self, data_=None, prior_=None, **kw):
        # fall back to the configured dataset / prior when none is supplied
        if not data_:
            data_ = data.fromconfig()
        self.data = data_

        if not prior_:
            prior_ = prior.fromconfig()
        self.prior = prior_

        # extra keyword arguments become instance attributes as-is
        vars(self).update(kw)

        # parameters
        self.numtasks = config.get('learner.numtasks')

        # stats
        self.reverse = 0
Esempio n. 10
0
class MultiProcessController(_BaseController):
    """Task controller that runs each task in a separate process."""

    #
    # Parameters
    #
    # NOTE(review): the parentheses carry no trailing comma, so this is the
    # parameter object itself rather than a one-element tuple.
    _params = (
            config.IntParameter(
            'multiprocess.poolsize',
            'Number of processes to run concurrently (0 means no limit)',
            default=0
        )
    )

    def __init__(self, poolsize=None):
        """Creates a task controller that runs tasks on multiple processes.

        This task controller uses a pool of processes rather than spawning all
        processes concurrently. poolsize is the size of this pool and by
        default it is big enough to run all processes concurrently.

        poolsize=None means "use the 'multiprocess.poolsize' config
        parameter"; poolsize=0 means "no limit".

        """
        # Compare against None explicitly: `poolsize or config.get(...)`
        # would replace an explicit 0 (no limit) with the configured value.
        if poolsize is None:
            poolsize = config.get('multiprocess.poolsize')
        self.poolsize = poolsize

    def run(self, tasks):
        """Run tasks by creating multiple processes.

        If poolsize was specified when creating this controller, additional
        tasks will be queued.

        Each task is pickled into its own temporary directory and run by a
        child process. Returns the list of results read back from those
        directories, which are removed afterwards.

        """
        tasks = copy(tasks) # because we do tasks.pop() below..
        numtasks = len(tasks)
        poolsize = self.poolsize or numtasks
        running = {}
        done = []
        opjoin = os.path.join

        while len(done) < numtasks:
            # submit tasks (if below poolsize and tasks remain)
            for _ in xrange(min(poolsize-len(running), len(tasks))):
                task = tasks.pop()
                task.cwd = tempfile.mkdtemp()
                taskfile = opjoin(task.cwd, 'task.pebl')

                # close the file explicitly so the pickle is fully written
                # before the child process is spawned to read it
                fp = open(taskfile, 'w')
                try:
                    cPickle.dump(task, fp)
                finally:
                    fp.close()

                pid = os.spawnlp(os.P_NOWAIT, PEBL, PEBL, "runtask", taskfile)
                running[pid] = task

            # wait for any child process to finish
            pid, status = os.wait()
            finished = running.pop(pid, None)
            # os.wait() reports any child of this process. Ignore children
            # that were not spawned here instead of recording None, which
            # would crash when the results are collected below.
            if finished is not None:
                done.append(finished)

        results = [result.fromfile(opjoin(t.cwd, 'result.pebl')) for t in done]

        # to make the results look like deferred results
        for r in results:
            r.taskid = 0

        # clean up
        for t in done:
            shutil.rmtree(t.cwd)

        return results