Ejemplo n.º 1
0
    def run(self, r, niters=10000):
        """Run the specified mixturemodel kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        Raises
        ------
        ValueError
            If an unrecognized kernel name is encountered; this should be
            unreachable if the kernel config was validated at construction.

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        # Bind the latent state once per domain; each bound model is a view
        # over a single domain of the same underlying latent.
        inds = xrange(len(self._defn.domains()))
        models = [bind(self._latent, i, self._views) for i in inds]
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'assign':
                    for idx in config.keys():
                        gibbs.assign(models[idx], r)
                elif name == 'assign_resample':
                    for idx, v in config.iteritems():
                        gibbs.assign_resample(models[idx], v['m'], r)
                elif name == 'slice_cluster_hp':
                    for idx, v in config.iteritems():
                        slice.hp(models[idx], r, cparam=v['cparam'])
                elif name == 'grid_relation_hp':
                    gibbs.hp(models[0], config, r)
                elif name == 'slice_relation_hp':
                    slice.hp(models[0], r, hparams=config['hparams'])
                elif name == 'theta':
                    slice.theta(models[0], r, tparams=config['tparams'])
                else:
                    # Raise instead of `assert False`: asserts are stripped
                    # under `python -O`, which would silently skip unknown
                    # kernels. Matches the lda runner's error style.
                    raise ValueError(
                        "Bad kernel specification {}".format(name))
Ejemplo n.º 2
0
    def run(self, r, niters=10000):
        """Run the lda kernel for `niters`, in a single thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        # Both direct-assignment DP hyperparameter kernels share the same
        # (hp1, hp2) configuration shape; dispatch them through one table.
        hp_samplers = {
            'direct_base_dp_hp': sample_gamma,
            'direct_second_dp_hp': sample_alpha,
        }
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'crf':
                    lda_crp_gibbs(self._latent, r)
                elif name in hp_samplers:
                    sampler = hp_samplers[name]
                    sampler(self._latent, r, config['hp1'], config['hp2'])
                elif name == 'direct_vocab_hp':
                    raise NotImplementedError(
                        'direct_vocab_hp not yet implemented')
                else:
                    raise ValueError(
                        "Bad kernel specification {}".format(name))
Ejemplo n.º 3
0
    def run(self, r, niters=10000):
        """Run the specified mixturemodel kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        doc_model = bind(self._latent, data=self._view)
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name != 'assign':
                    assert False, 'should not be reached'
                # Resample document-level assignments first, then bind one
                # model per document and sweep its table assignments.
                assign2(doc_model, r)
                table_models = [
                    bind(self._latent, document=did)
                    for did in xrange(self._latent.nentities())
                ]
                for tm in table_models:
                    assign(tm, r)
Ejemplo n.º 4
0
    def run(self, r, niters=10000):
        """Run the specified mixturemodel kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        Raises
        ------
        ValueError
            If an unrecognized kernel name is encountered; this should be
            unreachable if the kernel config was validated at construction.

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        model = bind(self._latent, self._view)
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'assign':
                    gibbs.assign(model, r)
                elif name == 'assign_resample':
                    gibbs.assign_resample(model, config['m'], r)
                elif name == 'grid_feature_hp':
                    gibbs.hp(model, config, r)
                elif name == 'slice_feature_hp':
                    slice.hp(model, r, hparams=config['hparams'])
                elif name == 'slice_cluster_hp':
                    slice.hp(model, r, cparam=config['cparam'])
                elif name == 'theta':
                    slice.theta(model, r, tparams=config['tparams'])
                else:
                    # Was `assert False, "should not be reach"` (typo, and
                    # asserts vanish under `python -O`); raise instead so an
                    # unknown kernel always fails loudly.
                    raise ValueError(
                        "Bad kernel specification {}".format(name))
Ejemplo n.º 5
0
    def run(self, r, niters=10000):
        """Run the specified mixturemodel kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        doc_model = bind(self._latent, data=self._view)
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name != 'assign':
                    assert False, 'should not be reached'
                # Document-level sweep, then per-document table sweeps.
                assign2(doc_model, r)
                table_models = [
                    bind(self._latent, document=did)
                    for did in xrange(self._latent.nentities())
                ]
                for tm in table_models:
                    assign(tm, r)
Ejemplo n.º 6
0
    def run(self, r, niters=10000):
        """Run each runner for `niters`, using the backend supplied in the
        constructor for parallelism.

        Parameters
        ----------
        r : rng
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        if self._backend == 'multiprocessing':
            pool = mp.Pool(processes=self._processes)
            # Each worker gets its own seed drawn from `r` so chains run
            # with independent randomness.
            args = [(runner, niters, r.next(), None)
                    for runner in self._runners]
            # map_async() + get() allows us to workaround a bug where
            # control-C doesn't kill multiprocessing workers
            self._runners = pool.map_async(_mp_work, args).get(10000000)
            pool.close()
            pool.join()
        elif self._backend == 'multyvac':

            # XXX(stephentu): the only parallelism strategy thus far is every
            # runner gets a dedicated core (multicore=1) on a machine
            jids = []
            has_volume = bool(self._volume)
            zipped = zip(self._runners, self._digests)
            expensive_states = []
            for i, (runner, digest) in enumerate(zipped):
                if has_volume:
                    # State was uploaded to the volume (keyed by digest) at
                    # construction; strip it from the pickled runner and
                    # point the worker at the volume copy instead.
                    statearg = (self._volume, 'state-{}'.format(digest))
                    expensive_states.append(runner.expensive_state)
                    runner.expensive_state = None
                else:
                    statearg = None
                args = (runner, niters, r.next(), statearg)
                jids.append(
                    multyvac.submit(
                        _mp_work,
                        args,
                        _ignore_module_dependencies=True,
                        _layer=self._layer,
                        _vol=self._volume,
                        _env=dict(self._env),  # submit() mutates the env
                        _core=self._core,
                        _name='kernels-parallel-runner-{}'.format(i)))
            # Block until all jobs finish; returned runners replace ours.
            self._runners = [multyvac.get(jid).get_result() for jid in jids]
            if not expensive_states:
                return
            # Re-attach the expensive state stripped before submission.
            for runner, state in zip(self._runners, expensive_states):
                runner.expensive_state = state
        else:
            assert False, 'should not be reached'
Ejemplo n.º 7
0
    def run(self, r, niters=10000):
        """Run each runner for `niters`, using the backend supplied in the
        constructor for parallelism.

        Parameters
        ----------
        r : rng
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        if self._backend == 'multiprocessing':
            pool = mp.Pool(processes=self._processes)
            # Each worker gets its own seed drawn from `r` so chains run
            # with independent randomness.
            args = [(runner, niters, r.next(), None)
                    for runner in self._runners]
            # map_async() + get() allows us to workaround a bug where
            # control-C doesn't kill multiprocessing workers
            self._runners = pool.map_async(_mp_work, args).get(10000000)
            pool.close()
            pool.join()
        elif self._backend == 'multyvac':

            # XXX(stephentu): the only parallelism strategy thus far is every
            # runner gets a dedicated core (multicore=1) on a machine
            jids = []
            has_volume = bool(self._volume)
            zipped = zip(self._runners, self._digests)
            expensive_states = []
            for i, (runner, digest) in enumerate(zipped):
                if has_volume:
                    # State was uploaded to the volume (keyed by digest) at
                    # construction; strip it from the pickled runner and
                    # point the worker at the volume copy instead.
                    statearg = (self._volume, 'state-{}'.format(digest))
                    expensive_states.append(runner.expensive_state)
                    runner.expensive_state = None
                else:
                    statearg = None
                args = (runner, niters, r.next(), statearg)
                jids.append(
                    multyvac.submit(
                        _mp_work,
                        args,
                        _ignore_module_dependencies=True,
                        _layer=self._layer,
                        _vol=self._volume,
                        _env=dict(self._env),  # submit() mutates the env
                        _core=self._core,
                        _name='kernels-parallel-runner-{}'.format(i)))
            # Block until all jobs finish; returned runners replace ours.
            self._runners = [multyvac.get(jid).get_result() for jid in jids]
            if not expensive_states:
                return
            # Re-attach the expensive state stripped before submission.
            for runner, state in zip(self._runners, expensive_states):
                runner.expensive_state = state
        else:
            assert False, 'should not be reached'
Ejemplo n.º 8
0
    def run(self, r, niters=10000):
        """Run the lda kernel for `niters`, in a single thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        # Each iteration performs one full CRP Gibbs sweep over the latent.
        for _sweep in xrange(niters):
            lda_crp_gibbs(self._latent, r)
Ejemplo n.º 9
0
    def run(self, r, niters=10000):
        """Run the lda kernel for `niters`, in a single thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        # Each iteration performs one full CRP Gibbs sweep over the latent.
        for _sweep in xrange(niters):
            lda_crp_gibbs(self._latent, r)
Ejemplo n.º 10
0
def posterior_predictive(q, latents, r, samples_per_chain=1):
    """Generate a bag of samples from the posterior distribution of each
    mixturemodel state object.

    Parameters
    ----------
    q : (N,) masked recarray
        The query object
    latents : list of mixturemodel latent objects
    r : random state
    samples_per_chain : int, optional
        Default is 1.

    Returns
    -------
    samples : (N, M) recarray
        where ``M = len(latents) * samples_per_chain``

    Notes
    -----
    If ``N=1``, the resulting `samples` will *not* be collapsed into a
    (M,) shape recarray, for consistency purposes.

    """
    if len(q.shape) != 1:
        raise ValueError("1d masked recarrays only")
    if not len(latents):
        raise ValueError("no latents given")
    validator.validate_positive(
        samples_per_chain, param_name='samples_per_chain')

    def draw_row(query):
        # One output row: `samples_per_chain` posterior-predictive draws
        # from each latent chain, concatenated horizontally.
        draws = [
            latent.sample_post_pred(query, r)[1]
            for latent in latents
            for _ in xrange(samples_per_chain)
        ]
        return np.hstack(draws)

    return np.array(map(draw_row, q))
Ejemplo n.º 11
0
    def run(self, r, niters=10000):
        """Run the specified kernel for `niters`, in a single
        thread.

        Parameters
        ----------
        r : random state
        niters : int

        Notes
        -----
        `r` is validated but never passed to the sweep below; the wrapped
        C++ state object appears to manage its own randomness.

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')
        for _ in xrange(niters):
            # This goes against every object-oriented bone in my body, but the interface must be satisfied
            # And actually Python won't even let me do this because I'm accessing a method in a C++ class...
            # I'd have to write this whole thing in Cython or change the state interface to expose all these
            # functions separately...which might actually be worth doing.
            # One sweep: auxiliary vars, state assignments, prune empty
            # states, hyperparameters, then pi and phi.
            self._latent._thisptr.get()[0].sample_aux()
            self._latent._thisptr.get()[0].sample_state()
            self._latent._thisptr.get()[0].clear_empty_states()
            # NOTE(review): the meaning of the magic constant 20 is not
            # visible here (presumably inner iterations) — confirm upstream.
            self._latent._thisptr.get()[0].sample_hypers(20)
            self._latent._thisptr.get()[0].sample_pi()
            self._latent._thisptr.get()[0].sample_phi()
Ejemplo n.º 12
0
    def run(self, r, niters=10000):
        """Run the lda kernel for `niters`, in a single thread.

        Parameters
        ----------
        r : random state
        niters : int

        """
        validator.validate_type(r, rng, param_name='r')
        validator.validate_positive(niters, param_name='niters')

        # Both direct-assignment DP hyperparameter kernels take the same
        # (hp1, hp2) configuration; dispatch them through one table.
        hp_samplers = {
            'direct_base_dp_hp': sample_gamma,
            'direct_second_dp_hp': sample_alpha,
        }
        for _ in xrange(niters):
            for name, config in self._kernel_config:
                if name == 'crf':
                    lda_crp_gibbs(self._latent, r)
                elif name in hp_samplers:
                    sampler = hp_samplers[name]
                    sampler(self._latent, r, config['hp1'], config['hp2'])
                elif name == 'direct_vocab_hp':
                    raise NotImplementedError('direct_vocab_hp not yet implemented')
                else:
                    raise ValueError("Bad kernel specification {}".format(name))
Ejemplo n.º 13
0
    def __init__(self, defn, view, latent, kernel_config):
        """Validate and normalize a mixturemodel runner's kernel config.

        Parameters
        ----------
        defn : model definition
        view : abstract_dataview
        latent : state
        kernel_config : iterable
            Each element is either a kernel name or a (name, config) pair.

        Raises
        ------
        ValueError
            If the latent is not a `state`, a kernel name is unknown, or
            a kernel's configuration is malformed.
        """
        defn = _validate_definition(defn)
        validator.validate_type(view, abstract_dataview, param_name='view')
        if not isinstance(latent, state):
            raise ValueError("bad latent given")
        validator.validate_len(view, defn.n())

        def require_feature_indices(v):
            # Every key of `v` must be a valid index into defn.models().
            nfeatures = len(defn.models())
            valid_keys = set(xrange(nfeatures))
            if not set(v.keys()).issubset(valid_keys):
                msg = "bad config found: {}".format(v)
                raise ValueError(msg)

        self._defn = defn
        self._view = view
        # Deep copy so kernel sweeps never mutate the caller's latent.
        self._latent = copy.deepcopy(latent)

        self._kernel_config = []
        for kernel in kernel_config:

            # A kernel is either a bare name or a (name, config) pair.
            if hasattr(kernel, '__iter__'):
                name, config = kernel
            else:
                name, config = kernel, {}
            validator.validate_dict_like(config)

            if name == 'assign':
                if config:
                    raise ValueError("assign has no parameters")

            elif name == 'assign_resample':
                # NOTE: comparing keys() to a list relies on Python 2
                # dict.keys() returning a list; fine for one allowed key.
                if config.keys() != ['m']:
                    raise ValueError("bad config found: {}".format(config))
                validator.validate_positive(config['m'])

            elif name == 'grid_feature_hp':
                require_feature_indices(config)
                for fi, ps in config.iteritems():
                    if set(ps.keys()) != set(('hpdf', 'hgrid',)):
                        raise ValueError("bad config found: {}".format(ps))
                    # Expand each partial grid point into a full hp dict
                    # seeded from the latent's current values; this rewrites
                    # the caller-supplied config in place.
                    full = []
                    for partial in ps['hgrid']:
                        hp = latent.get_feature_hp(fi)
                        hp.update(partial)
                        full.append(hp)
                    ps['hgrid'] = full

            elif name == 'slice_feature_hp':
                if config.keys() != ['hparams']:
                    raise ValueError("bad config found: {}".format(config))
                require_feature_indices(config['hparams'])

            elif name == 'slice_cluster_hp':
                if config.keys() != ['cparam']:
                    raise ValueError("bad config found: {}".format(config))
                if config['cparam'].keys() != ['alpha']:
                    msg = "bad config found: {}".format(config['cparam'])
                    raise ValueError(msg)

            elif name == 'theta':
                if config.keys() != ['tparams']:
                    raise ValueError("bad config found: {}".format(config))
                require_feature_indices(config['tparams'])

            else:
                raise ValueError("bad kernel found: {}".format(name))

            self._kernel_config.append((name, config))
0
    def __init__(self, runners, backend='multiprocessing', **kwargs):
        """Set up a parallel runner over `runners` for the given backend.

        Parameters
        ----------
        runners : list of runner objects
        backend : {'multiprocessing', 'multyvac'}, optional
        **kwargs :
            'multiprocessing' accepts `processes` (default: cpu_count()).
            'multyvac' accepts `layer` (required), `core`, `volume`.

        Raises
        ------
        ValueError
            On an unknown backend, a missing multyvac install/credentials,
            a nonexistent volume, or a missing layer.
        """
        self._runners = runners
        if backend not in (
                'multiprocessing',
                'multyvac',
        ):
            raise ValueError("invalid backend: {}".format(backend))
        self._backend = backend
        if backend == 'multiprocessing':
            validator.validate_kwargs(kwargs, ('processes', ))
            if 'processes' not in kwargs:
                kwargs['processes'] = mp.cpu_count()
            validator.validate_positive(kwargs['processes'], 'processes')
            self._processes = kwargs['processes']
        elif backend == 'multyvac':
            if not _has_multyvac:
                raise ValueError("multyvac module not installed on machine")
            validator.validate_kwargs(kwargs, (
                'layer',
                'core',
                'volume',
            ))
            if 'layer' not in kwargs:
                msg = ('multyvac support requires setting up a layer.'
                       'see scripts in bin')
                raise ValueError(msg)
            self._volume = kwargs.get('volume', None)
            if self._volume is None:
                msg = "use of a volume is highly recommended"
                warnings.warn(msg)
            else:
                volume = multyvac.volume.get(self._volume)
                if not volume:
                    raise ValueError("no such volume: {}".format(self._volume))

            self._layer = kwargs['layer']
            if (not multyvac.config.api_key
                    or not multyvac.config.api_secret_key):
                raise ValueError("multyvac is not auth-ed")
            # XXX(stephentu): currently defaults to the good stuff
            self._core = kwargs.get('core', 'f2')
            self._env = {}
            # XXX(stephentu): assumes you used the setup multyvac scripts we
            # provide
            self._env['PATH'] = '{}:{}'.format(
                '/home/multyvac/miniconda/envs/build/bin', _MULTYVAC_PATH)
            self._env['CONDA_DEFAULT_ENV'] = 'build'
            # this is needed for multyvacinit.pybootstrap
            self._env['PYTHONPATH'] = '/usr/local/lib/python2.7/dist-packages'

            # XXX(stephentu): multyvac post requests are limited in size
            # (don't know what the hard limit is). so to avoid the limits,
            # we explicitly serialize the expensive state to a file

            if not self._volume:
                # no volume provided for uploads
                self._digests = [None for _ in xrange(len(self._runners))]
                return

            # XXX(stephentu): we shouldn't reach in there like this
            # Digests identify identical expensive states, keyed by object
            # identity, so shared state is hashed and uploaded only once.
            self._digests = []
            digest_cache = {}
            for runner in self._runners:
                cache_key = id(runner.expensive_state)
                if cache_key in digest_cache:
                    digest = digest_cache[cache_key]
                else:
                    h = hashlib.sha1()
                    runner.expensive_state_digest(h)
                    digest = h.hexdigest()
                    digest_cache[cache_key] = digest
                self._digests.append(digest)

            # Skip uploading any state file already present on the volume.
            uploaded = set(_mvac_list_files_in_dir(volume, ""))
            _logger.info("starting state uploads")
            start = time.time()
            for runner, digest in zip(self._runners, self._digests):
                if digest in uploaded:
                    continue
                _logger.info("uploaded state-%s since not found", digest)
                f = tempfile.NamedTemporaryFile()
                pickle.dump(runner.expensive_state, f)
                f.flush()
                # XXX(stephentu) this seems to fail for large files
                #volume.put_file(f.name, 'state-{}'.format(digest))
                volume.sync_up(f.name, 'state-{}'.format(digest))
                f.close()
                uploaded.add(digest)
            _logger.info("state upload took %f seconds", (time.time() - start))

        else:
            assert False, 'should not be reached'
Ejemplo n.º 15
0
    def __init__(self, runners, backend='multiprocessing', **kwargs):
        """Set up a parallel runner over `runners` for the given backend.

        Parameters
        ----------
        runners : list of runner objects
        backend : {'multiprocessing', 'multyvac'}, optional
        **kwargs :
            'multiprocessing' accepts `processes` (default: cpu_count()).
            'multyvac' accepts `layer` (required), `core`, `volume`.

        Raises
        ------
        ValueError
            On an unknown backend, a missing multyvac install/credentials,
            a nonexistent volume, or a missing layer.
        """
        self._runners = runners
        if backend not in ('multiprocessing', 'multyvac',):
            raise ValueError("invalid backend: {}".format(backend))
        self._backend = backend
        if backend == 'multiprocessing':
            validator.validate_kwargs(kwargs, ('processes',))
            if 'processes' not in kwargs:
                kwargs['processes'] = mp.cpu_count()
            validator.validate_positive(kwargs['processes'], 'processes')
            self._processes = kwargs['processes']
        elif backend == 'multyvac':
            if not _has_multyvac:
                raise ValueError("multyvac module not installed on machine")
            validator.validate_kwargs(kwargs, ('layer', 'core', 'volume',))
            if 'layer' not in kwargs:
                msg = ('multyvac support requires setting up a layer.'
                       'see scripts in bin')
                raise ValueError(msg)
            self._volume = kwargs.get('volume', None)
            if self._volume is None:
                msg = "use of a volume is highly recommended"
                warnings.warn(msg)
            else:
                volume = multyvac.volume.get(self._volume)
                if not volume:
                    raise ValueError(
                        "no such volume: {}".format(self._volume))

            self._layer = kwargs['layer']
            if (not multyvac.config.api_key or
                    not multyvac.config.api_secret_key):
                raise ValueError("multyvac is not auth-ed")
            # XXX(stephentu): currently defaults to the good stuff
            self._core = kwargs.get('core', 'f2')
            self._env = {}
            # XXX(stephentu): assumes you used the setup multyvac scripts we
            # provide
            self._env['PATH'] = '{}:{}'.format(
                '/home/multyvac/miniconda/envs/build/bin', _MULTYVAC_PATH)
            self._env['CONDA_DEFAULT_ENV'] = 'build'
            # this is needed for multyvacinit.pybootstrap
            self._env['PYTHONPATH'] = '/usr/local/lib/python2.7/dist-packages'

            # XXX(stephentu): multyvac post requests are limited in size
            # (don't know what the hard limit is). so to avoid the limits,
            # we explicitly serialize the expensive state to a file

            if not self._volume:
                # no volume provided for uploads
                self._digests = [None for _ in xrange(len(self._runners))]
                return

            # XXX(stephentu): we shouldn't reach in there like this
            # Digests identify identical expensive states, keyed by object
            # identity, so shared state is hashed and uploaded only once.
            self._digests = []
            digest_cache = {}
            for runner in self._runners:
                cache_key = id(runner.expensive_state)
                if cache_key in digest_cache:
                    digest = digest_cache[cache_key]
                else:
                    h = hashlib.sha1()
                    runner.expensive_state_digest(h)
                    digest = h.hexdigest()
                    digest_cache[cache_key] = digest
                self._digests.append(digest)

            # Skip uploading any state file already present on the volume.
            uploaded = set(_mvac_list_files_in_dir(volume, ""))
            _logger.info("starting state uploads")
            start = time.time()
            for runner, digest in zip(self._runners, self._digests):
                if digest in uploaded:
                    continue
                _logger.info("uploaded state-%s since not found", digest)
                f = tempfile.NamedTemporaryFile()
                pickle.dump(runner.expensive_state, f)
                f.flush()
                # XXX(stephentu) this seems to fail for large files
                #volume.put_file(f.name, 'state-{}'.format(digest))
                volume.sync_up(f.name, 'state-{}'.format(digest))
                f.close()
                uploaded.add(digest)
            _logger.info("state upload took %f seconds", (time.time() - start))

        else:
            assert False, 'should not be reached'