Code example #1
    def __init__(self, bandit):
        TheanoBanditAlgo.__init__(self, bandit)
        self.numpy_rng = numpy.random.RandomState(234)
        self.s_prior = IdxsValsList.fromlists(self.s_idxs, self.s_vals)
        self.s_n_train = tensor.lscalar('n_train')
        self.s_n_test = tensor.lscalar('n_test')
        self.y_obs = tensor.vector('y_obs')
        self.y_obs_var = tensor.vector('y_obs_var')
        self.x_obs_IVL = self.s_prior.new_like_self()

        self.cand_x = self.s_prior.new_like_self()
        self.cand_EI_thresh = tensor.scalar()

        self.init_kernels()
        self.init_gram_weights()
        self.params.extend(self.convex_coefficient_params)
        self.param_bounds.extend(self.convex_coefficient_params_bounds)

        self.s_big_param_vec = tensor.vector()
        ### assumes all variables are refinable
        ### assumes all variables are vectors
        n_elements_used = 0
        for k, iv in zip(self.kernels, self.cand_x):
            if self.is_refinable[k]:
                n_elements_in_v = iv.idxs.shape[0]
                start = n_elements_used
                stop = n_elements_used + n_elements_in_v
                iv.vals = self.s_big_param_vec[start:stop]
                n_elements_used += n_elements_in_v

        self.gprmath = GPR_math(self.x_obs_IVL,
                                self.y_obs,
                                self.y_obs_var,
                                picklable_instancemethod(self, 'K_fn'),
                                N=self.s_n_train,
                                min_variance=self.y_minvar)

        self.nll_obs = self.gprmath.s_nll()

        self.cand_EI = tensor.log(
            self.gprmath.s_expectation_lt_thresh(self.cand_x,
                                                 self.cand_EI_thresh))

        # self.gm_algo is used to draw candidates for subsequent refinement
        # It is also entirely responsible for choosing categorical variables.
        self.gm_algo = AdaptiveParzenGM(self.bandit)
        self.gm_algo.n_EI_candidates = self.n_candidates_to_draw_in_GM
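
The loop over self.cand_x above rewires each refinable candidate variable to
read its values from a slice of a single flat vector, s_big_param_vec;
GP_EI_optimize (code example #3) later optimizes all refinable values at once
with fmin_l_bfgs_b over that vector. Below is a minimal numpy sketch of the
pack/unpack idea only; the names pack and unpack and the toy value lists are
illustrative, not part of hyperopt.

import numpy

def pack(vals_list, refinable):
    # concatenate only the refinable value vectors into one flat float vector
    return numpy.concatenate(
        [v for v, r in zip(vals_list, refinable) if r]).astype('float64')

def unpack(flat, vals_list, refinable):
    # slice the flat vector back into per-variable vectors, in the same order
    out, used = [], 0
    for v, r in zip(vals_list, refinable):
        if r:
            out.append(flat[used:used + len(v)])
            used += len(v)
        else:
            out.append(numpy.asarray(v))
    assert used == len(flat)
    return out

vals = [[0.1, 0.2, 0.3], [2, 0, 1], [5.0, 7.5]]  # per-variable candidate values
refinable = [True, False, True]                  # e.g. categorical vars are not refined
flat = pack(vals, refinable)                     # [0.1, 0.2, 0.3, 5.0, 7.5]
restored = unpack(flat, vals, refinable)
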
Code example #3
File: theano_gp.py  Project: cyip/hyperopt
class GP_BanditAlgo(TheanoBanditAlgo):
    """
    Gaussian process-based BanditAlgo
    """
    params_l2_penalty = 0
    # fitting penalty on the lengthscales of kernels
    # might make sense to make this negative to blur out the ML solution.

    mode = None          # None to use theano's default compilation mode

    n_startup_jobs = 30  # enough to estimate mean and variance in Y | prior(X)
                         # should be bandit-agnostic

    y_minvar = 1e-6      # minimum variance to permit for observations

    EI_ambition = 0.75
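    # Used by GP_EI_thresh: the EI improvement threshold is
    # min(y - EI_ambition * sqrt(y_var)) over the normalized observed losses.
    # Illustrative numbers: y = [0.0, 0.5], y_var = [0.04, 0.04] gives
    # 0.0 - 0.75 * 0.2 = -0.15, i.e. a target slightly below the best loss seen.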

    n_candidates_to_draw = 50
    # number of candidates returned by GM, and refined with gradient EI

    n_candidates_to_draw_in_GM = 200
    # number of candidates drawn within GM

    trace_on = False

    local_improvement_patience = 20
    # For this many iterations after the suggestion of a new best point, this
    # algorithm will use the GP (and not the GM).
    # N.B. that in parallel search, this number must be overestimated because
    # several time-steps will have elapsed by the time the best point switches
    # to status 'ok'.

    p_GP_during_exploration = .5
    # probability of using the GP when more than `local_improvement_patience`
    # iterations have elapsed since the last winning point was found.

    liar_percentile = .2
    # Attribute to jobs in progress the mean and variance of this quantile of
    # finished jobs.  0 would be most optimistic, 1 would be least.

    def trace(self, msg, obj):
        """Keep a trace of actions and results, useful for debugging"""
        if self.trace_on:
            try:
                _trace = self._trace
            except AttributeError:
                _trace = self._trace = []
            _trace.append((msg, copy.deepcopy(obj)))

    def theano_trace_mode(self):
        print >> sys.stderr, "WARNING: theano_trace_mode breaks pickling"
        class PrintEverythingMode(theano.Mode):
            def __init__(sss):
                def print_eval(i, node, fn):
                    for j, ij in enumerate(fn.inputs):
                        self.trace(('linker in', j, node.op), ij[0])
                    fn()
                    for j, ij in enumerate(fn.outputs):
                        self.trace(('linker out', j, node.op), ij[0])
                wrap_linker = theano.gof.WrapLinkerMany([theano.gof.OpWiseCLinker()], [print_eval])
                super(PrintEverythingMode, sss).__init__(wrap_linker, optimizer='fast_run')
        return PrintEverythingMode()

    def qln_cleanup(self, prior_vals, kern, candidate_vals):
        """
        Undo the smooth relaxation applied to quantized log-normal variables
        """
        round = tensor.get_constant_value(
                mt_dist.quantized_lognormal_get_round(
                    prior_vals))
        intlike = numpy.ceil(candidate_vals / float(round))
        assert intlike.ndim >= 1
        # in test problems, it seems possible to get stuck in a mode
        # where the EI optimum always gets rounded up to 3
        # and so 2 is never tried, even though it is actually the best point.
        intlike = numpy.maximum(1,
                intlike - self.numpy_rng.randint(2, size=len(intlike)))
        assert intlike.ndim >= 1
        rval = intlike * float(round)
        rval = rval.astype(prior_vals.dtype)
        return rval

    def post_refinement(self, candidates):
        # Coercing candidates from the form that was good for optimizing
        # to the form that is required by the configuration grammar
        for i, (iv, k, c) in enumerate(
                zip(self.s_prior, self.kernels, candidates)):
            if k in self.post_refinement_cleanup:
                f = self.post_refinement_cleanup[k]
                cvals = f(iv.vals, k, c.vals)
                assert cvals.shape == c.vals.shape
                assert str(cvals.dtype) == iv.vals.dtype
                assert cvals.ndim == iv.vals.ndim
                c.vals = cvals

    def __init__(self, bandit):
        TheanoBanditAlgo.__init__(self, bandit)
        self.numpy_rng = numpy.random.RandomState(234)
        self.s_prior = IdxsValsList.fromlists(self.s_idxs, self.s_vals)
        self.s_n_train = tensor.lscalar('n_train')
        self.s_n_test = tensor.lscalar('n_test')
        self.y_obs = tensor.vector('y_obs')
        self.y_obs_var = tensor.vector('y_obs_var')
        self.x_obs_IVL = self.s_prior.new_like_self()

        self.cand_x = self.s_prior.new_like_self()
        self.cand_EI_thresh = tensor.scalar()

        self.init_kernels()
        self.init_gram_weights()
        self.params.extend(self.convex_coefficient_params)
        self.param_bounds.extend(self.convex_coefficient_params_bounds)

        self.s_big_param_vec = tensor.vector()
        ### assumes all variables are refinable
        ### assumes all variables are vectors
        n_elements_used = 0
        for k, iv in zip(self.kernels, self.cand_x):
            if self.is_refinable[k]:
                n_elements_in_v = iv.idxs.shape[0]
                start = n_elements_used
                stop = n_elements_used + n_elements_in_v
                iv.vals = self.s_big_param_vec[start:stop]
                n_elements_used += n_elements_in_v

        self.gprmath = GPR_math(self.x_obs_IVL,
                self.y_obs,
                self.y_obs_var,
                picklable_instancemethod(self, 'K_fn'),
                N=self.s_n_train,
                min_variance=self.y_minvar)

        self.nll_obs = self.gprmath.s_nll()
            
        self.cand_EI = tensor.log(self.gprmath.s_expectation_lt_thresh(
                    self.cand_x,
                    self.cand_EI_thresh))

        # self.gm_algo is used to draw candidates for subsequent refinement
        # It is also entirely responsible for choosing categorical variables.
        self.gm_algo = AdaptiveParzenGM(self.bandit)
        self.gm_algo.n_EI_candidates = self.n_candidates_to_draw_in_GM

    def __getstate__(self):
        rval = dict(self.__dict__)
        todel = [k for k, v in rval.items()
                if isinstance(v, theano.compile.Function)]
        for name in todel:
            del rval[name]
        return rval

    def init_kernels(self):
        self.kernels = []
        self.is_refinable = {}
        self.bounds = {}
        self.params = []
        self.param_bounds = []
        self.idxs_mulsets = {}
        self.post_refinement_cleanup = {}

        for iv in self.s_prior:
            dist_name = montetheano.rstreams.rv_dist_name(iv.vals)
            if dist_name == 'normal':
                k = SquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                self.bounds[k] = (None, None)
            elif dist_name == 'uniform':
                k = SquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                if self.is_refinable[k]:
                    low = tensor.get_constant_value(
                            mt_dist.uniform_get_low(iv.vals))
                    high = tensor.get_constant_value(
                            mt_dist.uniform_get_high(iv.vals))
                    self.bounds[k] = (low, high)
            elif dist_name == 'lognormal':
                k = LogSquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                self.bounds[k] = (1e-8, None)
            elif dist_name == 'quantized_lognormal':
                k = LogSquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                if self.is_refinable[k]:
                    lbound = tensor.get_constant_value(
                            mt_dist.quantized_lognormal_get_round(
                                iv.vals))
                    self.bounds[k] = (lbound, None)
                    ff = picklable_instancemethod(self, 'qln_cleanup')
                    self.post_refinement_cleanup[k] = ff
            elif dist_name == 'categorical':
                # XXX: a better CategoryKernel would have different
                # similarities for different choices
                k = CategoryKernel()
                self.is_refinable[k] = False
                # refinable is false, so not setting bounds
            else:
                raise TypeError("unsupported distribution", dist_name)

            self.kernels.append(k)
            self.params.extend(k.params())
            self.param_bounds.extend(k.param_bounds())
            # XXX : to be more robust, it would be nice to build an Env with
            # the idxs as outputs, and then run the MergeOptimizer on it.
            self.idxs_mulsets.setdefault(iv.idxs, []).append(k)

    def init_gram_weights_helper(self, idxs, parent_weight, cparent):
        if parent_weight.ndim != 0:
            raise TypeError(parent_weight.type)
        kerns = self.idxs_mulsets[idxs]
        cat_kerns = [k for k, iv in zip(self.kernels, self.s_prior) if (
            isinstance(k, CategoryKernel)
            and k in kerns
            and iv.vals in cparent.values())]
        if len(cat_kerns) == 0:
            self.gram_weights[idxs] = parent_weight
        elif len(cat_kerns) == 1:
            # We have a mulset with one categorical variable in it.
            param = theano.shared(numpy.asarray(0.0))
            self.convex_coefficient_params.append(param)
            self.convex_coefficient_params_bounds.append((-5, 5))
            weight = tensor.nnet.sigmoid(param)
            # call recursively for each mulset
            # that corresponds to a slice out of idxs
            cat_vals = self.s_prior[self.kernels.index(cat_kerns[0])].vals
            self.weight_to_children[cat_vals] = parent_weight * weight
            sub_idxs_list = [sub_idxs for sub_idxs in self.idxs_mulsets
                    if cparent[sub_idxs] == cat_vals]
            assert all(si.owner.inputs[0] == idxs for si in sub_idxs_list)
            for sub_idxs in sub_idxs_list:
                    self.init_gram_weights_helper(
                            sub_idxs,
                            parent_weight=self.weight_to_children[cat_vals],
                            cparent=cparent)
            #print 'adding gram_weight', idxs
            #theano.printing.debugprint(parent_weight * (1 - weight))
            self.gram_weights[idxs] = parent_weight * (1 - weight)
        else:
            # We have a mulset with multiple categorical variables in it.
            # in this case the parent_weight must be divided among
            # this mulset itself, and each of the contained mulsets
            # (corresponding to the choices within each categorical variable)
            n_terms = len(cat_kerns) + 1
            params = theano.shared(numpy.zeros(n_terms))
            self.convex_coefficient_params.append(params)
            self.convex_coefficient_params_bounds.extend([(-5, 5)] * n_terms)
            weights = tensor.nnet.softmax(params) * parent_weight
            if weights.ndim == 2:
                # dimshuffle gets rid of the extra dimension inserted by the
                # stupid softmax implementation.  Get rid of this once
                # Theano's softmax vector branch is merged to master.
                weights = weights.dimshuffle(1)
            for i, k in enumerate(cat_kerns):
                # we're looking for sub_idxs that are formed by
                # advanced-indexing into `idxs` at positions determined
                # by the random choices of the variable corresponding to
                # kernel k
                weights_i = weights[i]
                cat_vals = self.s_prior[self.kernels.index(k)].vals
                self.weight_to_children[cat_vals] = weights_i
                sub_idxs_list = [sub_idxs for sub_idxs in self.idxs_mulsets
                        if cparent[sub_idxs] == cat_vals]
                assert all(si.owner.inputs[0] == idxs for si in sub_idxs_list)
                for sub_idxs in sub_idxs_list:
                    self.init_gram_weights_helper(
                            sub_idxs,
                            parent_weight=weights_i,
                            cparent=cparent)
            self.gram_weights[idxs] = weights[len(cat_kerns)]

    def init_gram_weights(self):
        """ Initialize mixture component weights of the hierarchical kernel.
        """
        try:
            self.gram_weights
            raise Exception('already initialized weights')
        except AttributeError:
            self.convex_coefficient_params = []
            self.convex_coefficient_params_bounds = []
            self.gram_weights = {}
            self.weight_to_children = {}

        # XXX : to be more robust, it would be better to build an Env
        # with the idxs as outputs, and then run the MergeOptimizer on
        # it.

        # Precondition: all idxs are either the root ARange or an
        # AdvancedSubtensor1 of some other idxs variable
        root_idxs = None
        cparent = {}
        for ii in self.idxs_mulsets:
            assert ii.owner
            if isinstance(ii.owner.op, tensor.ARange):
                assert root_idxs in (ii, None)
                root_idxs = ii
                cparent[ii] = None
            else:
                if isinstance(ii.owner.op, tensor.AdvancedSubtensor1):
                    assert ii.owner.inputs[0] in self.idxs_mulsets
                    cparent[ii] = categorical_parent(ii)
                else:
                    raise Exception('WHAT IS', ii)

        self.categorical_parent_of_idxs = cparent
        self.init_gram_weights_helper(root_idxs, as_tensor_variable(1.0), cparent)

    def K_fn(self, x0, x1):
        """
        :param x0: an IdxsValsList of symbolic variables
        :param x1: an IdxsValsList of symbolic variables

        :returns: symbolic gram matrix
        """

        gram_matrices = {}
        gram_matrices_idxs = {}
        for k, iv_prior, iv0, iv1 in zip(self.kernels, self.s_prior, x0, x1):
            gram = k.K(iv0.vals, iv1.vals)
            gram_matrices.setdefault(iv_prior.idxs, []).append(gram)
            gram_matrices_idxs.setdefault(iv_prior.idxs, [iv0.idxs, iv1.idxs])

        nx1 = self.s_n_train if x1 is x0 else self.s_n_test
        # N.B. the asarray works around mysterious Theano casting rules...
        base = tensor.alloc(numpy.asarray(0.0), self.s_n_train, nx1)
        for idxs, grams in gram_matrices.items():
            prod = self.gram_weights[idxs] * tensor.mul(*grams)
            base = sparse_gram_inc(base, prod, *gram_matrices_idxs[idxs])

        # we need to top up the gram matrix with weighted blocks of 1s
        # wherever a categorical variable's choice slices the index set
        # (sliced categoricals)
        if 1:
            sliced_vals = set(self.categorical_parent_of_idxs.values())
            sliced_vals.remove(None)
            if 0:
                print sliced_vals
                for v in sliced_vals:
                    print v, [iv for iv in self.s_prior if iv.vals is v]
            # assert there are no dups
            assert len(sliced_vals) == len(set(sliced_vals))

            cparent = self.categorical_parent_of_idxs

            for prior_vals in sliced_vals:
                weight = self.weight_to_children[prior_vals]
                pos_of_child_idxs = [i for i, iv in enumerate(self.s_prior)
                        if cparent[iv.idxs] == prior_vals]
                child_idxs0 = [x0[i].idxs for i in pos_of_child_idxs]
                child_idxs1 = [x1[i].idxs for i in pos_of_child_idxs]
                iii = self.s_prior.valslist().index(prior_vals)
                assert iii >= 0
                base = sparse_gram_inc(base, weight,
                        set_difference(x0[iii].idxs, *child_idxs0),
                        set_difference(x1[iii].idxs, *child_idxs1))
        assert base.dtype == 'float64'
        return base

    def prepare_GP_training_data(self, ivls):
        # The mean and std should be estimated only from
        # the initial jobs that were sampled randomly.
        ok_idxs = ivls['losses']['ok'].idxs
        ok_vals = ivls['losses']['ok'].vals
        if (max(ok_idxs[:self.n_startup_jobs])
                < min([sys.maxint] + ok_idxs[self.n_startup_jobs:])):
            y_mean = numpy.mean(ok_vals[:self.n_startup_jobs])
            y_std = numpy.std(ok_vals[:self.n_startup_jobs])
        else:
            # TODO: extract the elements of losses['ok'] corresponding to
            # initial random jobs, and use them to estimate y_mean, y_std
            raise NotImplementedError()
        y_std = numpy.maximum(y_std, numpy.sqrt(self.y_minvar))
        del ok_idxs, ok_vals

        x_all = ivls['x_IVLs']['ok'].as_list()
        y_all_iv = ivls['losses']['ok'].as_list()
        y_var_iv = ivls['losses_variance']['ok'].as_list()

        # -- HEURISTIC: assign running jobs the same performance as some
        #    percentile of the observed losses.
        liar_y_pos = numpy.argsort(ivls['losses']['ok'].vals)[
                int(self.liar_percentile * len(ivls['losses']['ok'].vals))]
        liar_y_mean = ivls['losses']['ok'].vals[liar_y_pos]
        liar_y_var = ivls['losses_variance']['ok'].vals[liar_y_pos]

        for pseudo_bad_status in 'new', 'running':
            logger.info('GP_BanditAlgo assigning bad scores to %i new jobs'
                    % len(ivls['losses'][pseudo_bad_status].idxs))
            x_all.stack(ivls['x_IVLs'][pseudo_bad_status])
            y_all_iv.stack(IdxsVals(
                ivls['losses'][pseudo_bad_status].idxs,
                [liar_y_mean] * len(ivls['losses'][pseudo_bad_status].idxs)))
            y_var_iv.stack(IdxsVals(
                ivls['losses_variance'][pseudo_bad_status].idxs,
                [liar_y_var] * len(ivls['losses'][pseudo_bad_status].idxs)))

        # renumber the configurations in x_all to be 0 .. (n_train - 1)
        idmap = y_all_iv.reindex()
        idmap = y_var_iv.reindex(idmap)
        idmap = x_all.reindex(idmap)

        assert y_all_iv.idxset() == y_var_iv.idxset() == x_all.idxset()

        assert numpy.all(y_all_iv.idxs == numpy.arange(len(y_all_iv.idxs)))
        assert numpy.all(y_var_iv.idxs == numpy.arange(len(y_all_iv.idxs)))

        y_all = y_all_iv.as_numpy(vdtype=theano.config.floatX).vals
        y_var = y_var_iv.as_numpy(vdtype=theano.config.floatX).vals
        x_all = x_all.as_numpy_floatX()

        y_all = (y_all - y_mean) / (1e-8 + y_std)
        y_var /= (1e-8 + y_std) ** 2

        assert y_all.shape == y_var.shape
        if y_var.min() < -1e-6:
            raise ValueError('negative variance encountered in results')
        y_var = numpy.maximum(y_var, self.y_minvar)
        return x_all, y_all, y_mean, y_var, y_std

    def fit_GP(self, x_all, y_all, y_mean, y_var, y_std, maxiter=1000):
        """
        Fit GPR kernel parameters by minimizing the marginal negative
        log-likelihood (NLL).

        Returns: the best (L2-penalized) NLL value found.

        Side effect: chooses optimal kernel parameters.
        """
        if y_std <= 0:
            raise ValueError('y_std must be positive', y_std)

        if list(sorted(x_all.idxset())) != range(len(x_all.idxset())):
            raise NotImplementedError('need contiguous 0-based indexes on x')
        n_train = len(y_all)


        #TODO: optimize this function by making theano include the get_pt and
        #      set_pt, and theano function returns gradient and function value
        #      at once.

        self._GP_n_train = n_train
        self._GP_x_all = x_all
        self._GP_y_all = y_all
        self._GP_y_var = y_var
        self._GP_y_mean = y_mean
        self._GP_y_std = y_std

        if hasattr(self, 'nll_fn'):
            nll_fn = self.nll_fn
            dnll_dparams = self.dnll_dparams
        else:
            cost = (self.nll_obs
                + self.params_l2_penalty * sum(
                    [(p ** 2).sum() for p in self.params]))
            nll_fn = self.nll_fn = theano.function(
                    [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var]
                        + self.x_obs_IVL.flatten(),
                    cost,
                    allow_input_downcast=True,
                    mode=self.mode,
                    )
            dnll_dparams = self.dnll_dparams = theano.function(
                    [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var]
                        + self.x_obs_IVL.flatten(),
                    tensor.grad(cost, self.params),
                    allow_input_downcast=True,
                    mode=self.mode)
            print('Compiled nll_fn with %i thunks' %
                    len(nll_fn.maker.env.toposort()))
            print('Compiled dnll_fn with %i thunks' %
                    len(dnll_dparams.maker.env.toposort()))

        lbounds = []
        ubounds = []
        for lb, ub in self.param_bounds:
            lbounds.extend(numpy.asarray(value(lb)).flatten())
            ubounds.extend(numpy.asarray(value(ub)).flatten())
        bounds = numpy.asarray([lbounds, ubounds]).T

        # re-initialize params to eliminate warm-start bias
        for k in self.kernels:
            k.random_reset(self.numpy_rng)

        # re-initialize coefficients to even weights
        for p in self.convex_coefficient_params:
            p.set_value(0 * p.get_value())

        def get_pt():
            rval = []
            for p in self.params:
                v = p.get_value().flatten()
                rval.extend(v)
            return numpy.asarray(rval)

        def set_pt(pt):
            i = 0
            self.trace('fit_GP set_pt', pt)
            for p in self.params:
                assert p.dtype == 'float64'
                shape = p.get_value(borrow=True).shape
                size = int(numpy.prod(shape))
                p.set_value(pt[i:i + size].reshape(shape))
                i += size
            assert i == len(pt)

        n_calls = [0]
        def f(pt):
            n_calls[0] += 1
            set_pt(pt)
            rval = nll_fn(self._GP_n_train,
                    self._GP_n_train,
                    self._GP_y_all,
                    self._GP_y_var,
                    *self._GP_x_all.flatten())
            self.trace('fit_GP f', rval)
            return rval

        def df(pt):
            n_calls[0] += 1
            set_pt(pt)
            dparams = dnll_dparams(self._GP_n_train,
                    self._GP_n_train,
                    self._GP_y_all,
                    self._GP_y_var,
                    *self._GP_x_all.flatten())
            rval = []
            for dp in dparams:
                rval.extend(dp.flatten())

            rval = numpy.asarray(rval)
            self.trace('fit_GP df', rval)
            return rval

        self.trace('fit_GP start_pt', get_pt())

        best_pt, best_value, best_d = fmin_l_bfgs_b(f,
                get_pt(),
                df,
                maxfun=maxiter,
                bounds=bounds,
                iprint=-1)
        logger.info('fit_GP best value: %f' % best_value)
        set_pt(best_pt)
        self.trace('fit_GP best_pt', best_pt)
        return best_value

    def GP_mean(self, x):
        """
        Compute mean at points in x
        """
        return self.GP_mean_variance(x)[0]

    def GP_variance(self, x):
        """
        Compute variance at points in x
        """
        return self.GP_mean_variance(x)[1]

    def GP_mean_variance(self, x, ret_K=False):
        """
        Compute mean and variance at points in x
        """
        try:
            self._mean_variance
        except AttributeError:
            s_x = self.s_prior.new_like_self()
            self._mean_variance = theano.function(
                    [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var]
                        + self.x_obs_IVL.flatten()
                        + s_x.flatten(),
                    [self.gprmath.s_mean(s_x),
                        self.gprmath.s_variance(s_x),
                        #self.K_fn(self.x_obs_IVL, self.x_obs_IVL),
                        self.K_fn(self.x_obs_IVL, s_x),
                        ],
                    allow_input_downcast=True)
            #theano.printing.debugprint(self._mean_variance)
        if len(x) != len(self._GP_x_all):
            raise ValueError('x has wrong len',
                    (len(x), len(self._GP_x_all)))
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')
        rval_mean, rval_var, rval_K = self._mean_variance(
                self._GP_n_train,
                len(x_idxset),
                self._GP_y_all,
                self._GP_y_var,
                *(self._GP_x_all.flatten() + x.flatten()))

        if ret_K:
            return rval_K

        rval_var_min = rval_var.min()
        assert rval_var_min > -1e-4, rval_var_min
        rval_var = numpy.maximum(rval_var, 0)
        return (rval_mean * self._GP_y_std + self._GP_y_mean,
                rval_var * self._GP_y_std ** 2)

    def GP_train_K(self):
        return self.GP_mean_variance(self._GP_x_all, ret_K=True)

    def GP_EI_thresh(self):
        thresh = (self._GP_y_all
                - self.EI_ambition * numpy.sqrt(self._GP_y_var)).min()
        return thresh

    def GP_EI(self, x):
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')

        try:
            self._EI_fn
        except AttributeError:
            self._EI_fn = theano.function(
                    [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var]
                        + self.x_obs_IVL.flatten()
                        + [self.cand_EI_thresh]
                        + self.cand_x.flatten(),
                    self.cand_EI,
                    allow_input_downcast=True)

        thresh = self.GP_EI_thresh()

        rval = self._EI_fn(self._GP_n_train,
                len(x_idxset),
                self._GP_y_all,
                self._GP_y_var,
                *(self._GP_x_all.flatten()
                    + [thresh]
                    + x.flatten()))
        assert rval.shape == (len(x_idxset),)
        return rval

    def GP_EI_optimize(self, x, maxiter=1000):
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')

        if len(x) != len(self.kernels):
            raise ValueError('len(x) == %i but len(self.kernels)==%i' % (
                len(x), len(self.kernels)))

        n_refinable = len([k for k in self.kernels if self.is_refinable[k]])
        if n_refinable == 0:
            return x

        try:
            EI_fn_g = self._EI_fn_g
        except AttributeError:
            criterion = -self.cand_EI.sum()
            EI_fn_g = self._EI_fn_g = theano.function(
                    [self.s_big_param_vec] +
                    [self.s_n_train, self.s_n_test,
                        self.y_obs,
                        self.y_obs_var]
                        + self.x_obs_IVL.flatten()
                        + [self.cand_EI_thresh]
                        + self.cand_x.idxslist()
                        + [v for (k, v) in zip(self.kernels,
                            self.cand_x.valslist())
                            if not self.is_refinable[k]],
                    [criterion, tensor.grad(criterion,
                            self.s_big_param_vec)],
                    allow_input_downcast=True,
                    mode=self.mode)
            print('Compiled EI_fn_g with %i thunks' %
                    len(EI_fn_g.maker.env.toposort()))

        thresh = self.GP_EI_thresh()

        start_pt = numpy.asarray(
                numpy.concatenate(
                    [xk for k, xk in zip(self.kernels, x.valslist())
                        if self.is_refinable[k]]),
                dtype='float64')

        args = ((self._GP_n_train,
            len(x_idxset),
            self._GP_y_all,
            self._GP_y_var)
            + tuple(self._GP_x_all.flatten())
            + (thresh,)
            + tuple(x.idxslist())
            + tuple([v
                for (k, v) in zip(self.kernels, x.valslist())
                if not self.is_refinable[k]]))

        bounds = []
        for (k, xk) in zip(self.kernels, x.valslist()):
            if self.is_refinable[k]:
                bounds.extend([self.bounds[k]] * len(xk))

        if self.trace_on:
            def fff(*vvv):
                for i, v in enumerate(vvv):
                    self.trace(('vvv', i), numpy.asarray(v))
                f, df = EI_fn_g(*vvv)
                self.trace('f', f)
                self.trace('df', df)
                return f, df
            self.trace('start_pt', start_pt)
            for i, v in enumerate(args):
                self.trace(('args', i), numpy.asarray(v))
            self.trace('bounds', numpy.asarray(bounds))
            self.trace('maxiter', numpy.asarray(maxiter))
        else:
            fff = EI_fn_g

        best_pt, best_value, best_d = fmin_l_bfgs_b(fff,
                start_pt,
                None,
                args=args,
                maxfun=maxiter,
                bounds=bounds,
                iprint=-1)
        self.trace('best_pt', best_pt)

        # print 'BEST_PT', best_pt
        rval = x.copy()
        initial = 0
        for (_ind, iv) in enumerate(x):
            if self.is_refinable[self.kernels[_ind]]:
                diff = len(iv.vals)
                # XXX: assumes vector-valued vals (scalar elements)
                rval[_ind].vals = best_pt[initial:initial + diff]
                initial += diff
        # -- assert that all elements of best_pt have been used
        assert initial == len(best_pt)

        # -- apply any quantization required by the distributions
        self.post_refinement(rval)
        return rval

    def suggest_from_gp(self, trials, results, N):
        logger.info('suggest_from_gp')
        if N != 1:
            raise NotImplementedError('only N==1 is supported')
        ivls = self.idxs_vals_by_status(trials, results)

        prepared_data = self.prepare_GP_training_data(ivls)
        self.fit_GP(*prepared_data)

        # -- add the best previous trials as candidates
        n_trials_to_opt = self.n_candidates_to_draw // 2
        best_idxs = numpy.asarray(ivls['losses']['ok'].idxs)[
                numpy.argsort(ivls['losses']['ok'].vals)[:n_trials_to_opt]]
        best_IVLs = ivls['x_IVLs']['ok'].numeric_take(best_idxs)
        best_idxset = best_IVLs.idxset()

        # -- draw the remainder as random candidates
        candidates = self.gm_algo.suggest_from_model(ivls,
                self.n_candidates_to_draw - len(best_idxset))

        # -- re-index the best_IVLs to ensure no collision during stack
        cand_idxset = candidates.idxset()
        assert (len(cand_idxset) + len(best_idxset)
                == self.n_candidates_to_draw)
        idmap = {}
        for i in best_idxset:
            if i in cand_idxset:
                idmap[i] = (max(cand_idxset) + max(best_idxset) +
                        len(idmap) + 1)
            else:
                idmap[i] = i
            assert idmap[i] not in cand_idxset
        assert (len(cand_idxset.union(idmap.values()))
                == self.n_candidates_to_draw)
        best_IVLs.reindex(idmap)
        candidates = candidates.as_list()
        candidates.stack(best_IVLs)
        assert len(candidates.idxset()) == self.n_candidates_to_draw
        # XXX: rather than reindex here, take advantage of fact that random
        #      candidates were already contiguously indexed and stack
        #      appropriately reindexed trials on top of them.
        candidates.reindex()
        candidates = candidates.as_numpy()

        candidates_opt = self.GP_EI_optimize(candidates)

        EI_opt = self.GP_EI(candidates_opt)
        best_idx = numpy.argmax(EI_opt)
        if 1:
            # for DEBUGGING
            EI = self.GP_EI(candidates)
            if EI.max() - 1e-4 > EI_opt.max():
                logger.warn(
                    'Optimization actually *decreased* EI!? %.3f -> %.3f' % (
                        EI.max(), EI_opt.max()))
        rval = candidates_opt.numeric_take([best_idx])
        return rval

    def suggest_from_gm(self, trials, results, N):
        logger.info('suggest_from_gm')
        ivls = self.idxs_vals_by_status(trials, results)
        rval = self.gm_algo.suggest_from_model(ivls, N)
        return rval

    def suggest_from_prior(self, trials, results, N):
        logger.info('suggest_from_prior')
        if not hasattr(self, '_prior_sampler'):
            self._prior_sampler = theano.function(
                    [self.s_N],
                    self.s_prior.flatten(),
                    mode=self.mode)
        rvals = self._prior_sampler(N)
        return IdxsValsList.fromflattened(rvals)

    def suggest(self, trials, results, N):
        ivls = self.idxs_vals_by_status(trials, results)
        t0 = time.time()
        n_ok = len(ivls['losses']['ok'].idxs)

        # -- choose the suggestion strategy (heuristic)
        if n_ok < self.n_startup_jobs:
            fn = self.suggest_from_prior
        else:
            # -- figure out how long (in iterations) it has been since picking a
            #    winner: `winner_age`
            assert (list(ivls['losses']['ok'].idxs)
                    == list(sorted(ivls['losses']['ok'].idxs)))
            t_winner = numpy.asarray(ivls['losses']['ok'].vals).argmin()
            winner_age = n_ok - t_winner
            if winner_age < self.local_improvement_patience:
                fn = self.suggest_from_gp
            else:
                if self.numpy_rng.rand() < self.p_GP_during_exploration:
                    fn = self.suggest_from_gp
                else:
                    fn = self.suggest_from_gm
        try:
            rval = self.suggest_ivl(fn(trials, results, N))
        finally:
            logger.info('suggest %i took %.2f seconds' % (
                    len(ivls['losses']['ok'].idxs),
                    time.time() - t0))
        return rval
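
suggest() above picks one of three proposal strategies: random sampling from
the prior until n_startup_jobs results are available, the GP for up to
local_improvement_patience iterations after each new best point, and otherwise
a coin flip with probability p_GP_during_exploration between the GP and the
cheaper Parzen (GM) proposer. Below is a small standalone restatement of just
that decision rule; choose_strategy and the toy loss lists are illustrative,
not part of hyperopt.

import numpy

def choose_strategy(ok_losses, rng,
                    n_startup_jobs=30,
                    local_improvement_patience=20,
                    p_GP_during_exploration=0.5):
    """Return 'prior', 'gp' or 'gm', following the heuristic in suggest()."""
    n_ok = len(ok_losses)
    if n_ok < n_startup_jobs:
        return 'prior'
    # iterations elapsed since the best loss so far was observed
    winner_age = n_ok - int(numpy.argmin(ok_losses))
    if winner_age < local_improvement_patience:
        return 'gp'
    return 'gp' if rng.rand() < p_GP_during_exploration else 'gm'

rng = numpy.random.RandomState(0)
print(choose_strategy([0.9, 0.5, 0.7], rng))     # 'prior': fewer than 30 results
print(choose_strategy([0.5] * 40 + [0.1], rng))  # 'gp': new best found 1 step ago
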
Code example #4
class GP_BanditAlgo(TheanoBanditAlgo):
    """
    Gaussian process-based BanditAlgo
    """
    params_l2_penalty = 0
    # fitting penalty on the lengthscales of kernels
    # might make sense to make this negative to blur out the ML solution.

    mode = None  # None to use theano's default compilation mode

    n_startup_jobs = 30  # enough to estimate mean and variance in Y | prior(X)
    # should be bandit-agnostic

    y_minvar = 1e-6  # minimum variance to permit for observations

    EI_ambition = 0.75
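    # Used by GP_EI_thresh: the EI improvement threshold is
    # min(y - EI_ambition * sqrt(y_var)) over the normalized observed losses.
    # Illustrative numbers: y = [0.0, 0.5], y_var = [0.04, 0.04] gives
    # 0.0 - 0.75 * 0.2 = -0.15, i.e. a target slightly below the best loss seen.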

    n_candidates_to_draw = 50
    # number of candidates returned by GM, and refined with gradient EI

    n_candidates_to_draw_in_GM = 200
    # number of candidates drawn within GM

    trace_on = False

    local_improvement_patience = 20
    # For this many iterations after the suggestion of a new best point, this
    # algorithm will use the GP (and not the GM).
    # N.B. that in parallel search, this number must be overestimated because
    # several time-steps will have elapsed by the time the best point switches
    # to status 'ok'.

    p_GP_during_exploration = .5
    # probability of using the GP when more than `local_improvement_patience`
    # iterations have elapsed since the last winning point was found.

    liar_percentile = .2

    # Attribute to jobs in progress the mean and variance of this quantile of
    # finished jobs.  0 would be most optimistic, 1 would be least.

    def trace(self, msg, obj):
        """Keep a trace of actions and results, useful for debugging"""
        if self.trace_on:
            try:
                _trace = self._trace
            except AttributeError:
                _trace = self._trace = []
            _trace.append((msg, copy.deepcopy(obj)))

    def theano_trace_mode(self):
        print >> sys.stderr, "WARNING: theano_trace_mode breaks pickling"

        class PrintEverythingMode(theano.Mode):
            def __init__(sss):
                def print_eval(i, node, fn):
                    for j, ij in enumerate(fn.inputs):
                        self.trace(('linker in', j, node.op), ij[0])
                    fn()
                    for j, ij in enumerate(fn.outputs):
                        self.trace(('linker out', j, node.op), ij[0])

                wrap_linker = theano.gof.WrapLinkerMany(
                    [theano.gof.OpWiseCLinker()], [print_eval])
                super(PrintEverythingMode, sss).__init__(wrap_linker,
                                                         optimizer='fast_run')

        return PrintEverythingMode()

    def qln_cleanup(self, prior_vals, kern, candidate_vals):
        """
        Undo the smooth relaxation applied to quantized log-normal variables
        """
        round = tensor.get_constant_value(
            mt_dist.quantized_lognormal_get_round(prior_vals))
        intlike = numpy.ceil(candidate_vals / float(round))
        assert intlike.ndim >= 1
        # in test problems, it seems possible to get stuck in a mode
        # where the EI optimum always gets rounded up to 3
        # and so 2 is never tried, even though it is actually the best point.
        intlike = numpy.maximum(
            1, intlike - self.numpy_rng.randint(2, size=len(intlike)))
        assert intlike.ndim >= 1
        rval = intlike * float(round)
        rval = rval.astype(prior_vals.dtype)
        return rval

    def post_refinement(self, candidates):
        # Coercing candidates from the form that was good for optimizing
        # to the form that is required by the configuration grammar
        for i, (iv, k,
                c) in enumerate(zip(self.s_prior, self.kernels, candidates)):
            if k in self.post_refinement_cleanup:
                f = self.post_refinement_cleanup[k]
                cvals = f(iv.vals, k, c.vals)
                assert cvals.shape == c.vals.shape
                assert str(cvals.dtype) == iv.vals.dtype
                assert cvals.ndim == iv.vals.ndim
                c.vals = cvals

    def __init__(self, bandit):
        TheanoBanditAlgo.__init__(self, bandit)
        self.numpy_rng = numpy.random.RandomState(234)
        self.s_prior = IdxsValsList.fromlists(self.s_idxs, self.s_vals)
        self.s_n_train = tensor.lscalar('n_train')
        self.s_n_test = tensor.lscalar('n_test')
        self.y_obs = tensor.vector('y_obs')
        self.y_obs_var = tensor.vector('y_obs_var')
        self.x_obs_IVL = self.s_prior.new_like_self()

        self.cand_x = self.s_prior.new_like_self()
        self.cand_EI_thresh = tensor.scalar()

        self.init_kernels()
        self.init_gram_weights()
        self.params.extend(self.convex_coefficient_params)
        self.param_bounds.extend(self.convex_coefficient_params_bounds)

        self.s_big_param_vec = tensor.vector()
        ### assumes all variables are refinable
        ### assumes all variables are vectors
        n_elements_used = 0
        for k, iv in zip(self.kernels, self.cand_x):
            if self.is_refinable[k]:
                n_elements_in_v = iv.idxs.shape[0]
                start = n_elements_used
                stop = n_elements_used + n_elements_in_v
                iv.vals = self.s_big_param_vec[start:stop]
                n_elements_used += n_elements_in_v

        self.gprmath = GPR_math(self.x_obs_IVL,
                                self.y_obs,
                                self.y_obs_var,
                                picklable_instancemethod(self, 'K_fn'),
                                N=self.s_n_train,
                                min_variance=self.y_minvar)

        self.nll_obs = self.gprmath.s_nll()

        self.cand_EI = tensor.log(
            self.gprmath.s_expectation_lt_thresh(self.cand_x,
                                                 self.cand_EI_thresh))

        # self.gm_algo is used to draw candidates for subsequent refinement
        # It is also entirely responsible for choosing categorical variables.
        self.gm_algo = AdaptiveParzenGM(self.bandit)
        self.gm_algo.n_EI_candidates = self.n_candidates_to_draw_in_GM

    def __getstate__(self):
        rval = dict(self.__dict__)
        todel = [
            k for k, v in rval.items()
            if isinstance(v, theano.compile.Function)
        ]
        for name in todel:
            del rval[name]
        return rval

    def init_kernels(self):
        self.kernels = []
        self.is_refinable = {}
        self.bounds = {}
        self.params = []
        self.param_bounds = []
        self.idxs_mulsets = {}
        self.post_refinement_cleanup = {}

        for iv in self.s_prior:
            dist_name = montetheano.rstreams.rv_dist_name(iv.vals)
            if dist_name == 'normal':
                k = SquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                self.bounds[k] = (None, None)
            elif dist_name == 'uniform':
                k = SquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                if self.is_refinable[k]:
                    low = tensor.get_constant_value(
                        mt_dist.uniform_get_low(iv.vals))
                    high = tensor.get_constant_value(
                        mt_dist.uniform_get_high(iv.vals))
                    self.bounds[k] = (low, high)
            elif dist_name == 'lognormal':
                k = LogSquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                self.bounds[k] = (1e-8, None)
            elif dist_name == 'quantized_lognormal':
                k = LogSquaredExponentialKernel()
                self.is_refinable[k] = get_refinability(iv, dist_name)
                if self.is_refinable[k]:
                    lbound = tensor.get_constant_value(
                        mt_dist.quantized_lognormal_get_round(iv.vals))
                    self.bounds[k] = (lbound, None)
                    ff = picklable_instancemethod(self, 'qln_cleanup')
                    self.post_refinement_cleanup[k] = ff
            elif dist_name == 'categorical':
                # XXX: a better CategoryKernel would have different
                # similarities for different choices
                k = CategoryKernel()
                self.is_refinable[k] = False
                # refinable is false, so not setting bounds
            else:
                raise TypeError("unsupported distribution", dist_name)

            self.kernels.append(k)
            self.params.extend(k.params())
            self.param_bounds.extend(k.param_bounds())
            # XXX : to be more robust, it would be nice to build an Env with
            # the idxs as outputs, and then run the MergeOptimizer on it.
            self.idxs_mulsets.setdefault(iv.idxs, []).append(k)

    def init_gram_weights_helper(self, idxs, parent_weight, cparent):
        if parent_weight.ndim != 0:
            raise TypeError(parent_weight.type)
        kerns = self.idxs_mulsets[idxs]
        cat_kerns = [
            k for k, iv in zip(self.kernels, self.s_prior)
            if (isinstance(k, CategoryKernel) and k in kerns
                and iv.vals in cparent.values())
        ]
        if len(cat_kerns) == 0:
            self.gram_weights[idxs] = parent_weight
        elif len(cat_kerns) == 1:
            # We have a mulset with one categorical variable in it.
            param = theano.shared(numpy.asarray(0.0))
            self.convex_coefficient_params.append(param)
            self.convex_coefficient_params_bounds.append((-5, 5))
            weight = tensor.nnet.sigmoid(param)
            # call recursively for each mulset
            # that corresponds to a slice out of idxs
            cat_vals = self.s_prior[self.kernels.index(cat_kerns[0])].vals
            self.weight_to_children[cat_vals] = parent_weight * weight
            sub_idxs_list = [
                sub_idxs for sub_idxs in self.idxs_mulsets
                if cparent[sub_idxs] == cat_vals
            ]
            assert all(si.owner.inputs[0] == idxs for si in sub_idxs_list)
            for sub_idxs in sub_idxs_list:
                self.init_gram_weights_helper(
                    sub_idxs,
                    parent_weight=self.weight_to_children[cat_vals],
                    cparent=cparent)
            #print 'adding gram_weight', idxs
            #theano.printing.debugprint(parent_weight * (1 - weight))
            self.gram_weights[idxs] = parent_weight * (1 - weight)
        else:
            # We have a mulset with multiple categorical variables in it.
            # in this case the parent_weight must be divided among
            # this mulset itself, and each of the contained mulsets
            # (corresponding to the choices within each categorical variable)
            n_terms = len(cat_kerns) + 1
            params = theano.shared(numpy.zeros(n_terms))
            self.convex_coefficient_params.append(params)
            self.convex_coefficient_params_bounds.extend([(-5, 5)] * n_terms)
            weights = tensor.nnet.softmax(params) * parent_weight
            if weights.ndim == 2:
                # dimshuffle gets rid of the extra dimension inserted by the
                # stupid softmax implementation.  Get rid of this once
                # Theano's softmax vector branch is merged to master.
                weights = weights.dimshuffle(1)
            for i, k in enumerate(cat_kerns):
                # we're looking for sub_idxs that are formed by
                # advanced-indexing into `idxs` at positions determined
                # by the random choices of the variable corresponding to
                # kernel k
                weights_i = weights[i]
                cat_vals = self.s_prior[self.kernels.index(k)].vals
                self.weight_to_children[cat_vals] = weights_i
                sub_idxs_list = [
                    sub_idxs for sub_idxs in self.idxs_mulsets
                    if cparent[sub_idxs] == cat_vals
                ]
                assert all(si.owner.inputs[0] == idxs for si in sub_idxs_list)
                for sub_idxs in sub_idxs_list:
                    self.init_gram_weights_helper(sub_idxs,
                                                  parent_weight=weights_i,
                                                  cparent=cparent)
            self.gram_weights[idxs] = weights[len(cat_kerns)]

    def init_gram_weights(self):
        """ Initialize mixture component weights of the hierarchical kernel.
        """
        try:
            self.gram_weights
            raise Exception('already initialized weights')
        except AttributeError:
            self.convex_coefficient_params = []
            self.convex_coefficient_params_bounds = []
            self.gram_weights = {}
            self.weight_to_children = {}

        # XXX : to be more robust, it would be better to build an Env
        # with the idxs as outputs, and then run the MergeOptimizer on
        # it.

        # Precondition: all idxs are either the root ARange or an
        # AdvancedSubtensor1 of some other idxs variable
        root_idxs = None
        cparent = {}
        for ii in self.idxs_mulsets:
            assert ii.owner
            if isinstance(ii.owner.op, tensor.ARange):
                assert root_idxs in (ii, None)
                root_idxs = ii
                cparent[ii] = None
            else:
                if isinstance(ii.owner.op, tensor.AdvancedSubtensor1):
                    assert ii.owner.inputs[0] in self.idxs_mulsets
                    cparent[ii] = categorical_parent(ii)
                else:
                    raise Exception('WHAT IS', ii)

        self.categorical_parent_of_idxs = cparent
        self.init_gram_weights_helper(root_idxs, as_tensor_variable(1.0),
                                      cparent)

    def K_fn(self, x0, x1):
        """
        :param x0: an IdxsValsList of symbolic variables
        :param x1: an IdxsValsList of symbolic variables

        :returns: symbolic gram matrix
        """

        gram_matrices = {}
        gram_matrices_idxs = {}
        for k, iv_prior, iv0, iv1 in zip(self.kernels, self.s_prior, x0, x1):
            gram = k.K(iv0.vals, iv1.vals)
            gram_matrices.setdefault(iv_prior.idxs, []).append(gram)
            gram_matrices_idxs.setdefault(iv_prior.idxs, [iv0.idxs, iv1.idxs])

        nx1 = self.s_n_train if x1 is x0 else self.s_n_test
        # N.B. the asarray works around mysterious Theano casting rules...
        base = tensor.alloc(numpy.asarray(0.0), self.s_n_train, nx1)
        for idxs, grams in gram_matrices.items():
            prod = self.gram_weights[idxs] * tensor.mul(*grams)
            base = sparse_gram_inc(base, prod, *gram_matrices_idxs[idxs])

        # we need to top up the gram matrix with weighted blocks of 1s
        # wherever a categorical variable's choice slices the index set
        # (sliced categoricals)
        if 1:
            sliced_vals = set(self.categorical_parent_of_idxs.values())
            sliced_vals.remove(None)
            if 0:
                print sliced_vals
                for v in sliced_vals:
                    print v, [iv for iv in self.s_prior if iv.vals is v]
            # assert there are no dups
            assert len(sliced_vals) == len(set(sliced_vals))

            cparent = self.categorical_parent_of_idxs

            for prior_vals in sliced_vals:
                weight = self.weight_to_children[prior_vals]
                pos_of_child_idxs = [
                    i for i, iv in enumerate(self.s_prior)
                    if cparent[iv.idxs] == prior_vals
                ]
                child_idxs0 = [x0[i].idxs for i in pos_of_child_idxs]
                child_idxs1 = [x1[i].idxs for i in pos_of_child_idxs]
                iii = self.s_prior.valslist().index(prior_vals)
                assert iii >= 0
                base = sparse_gram_inc(
                    base, weight, set_difference(x0[iii].idxs, *child_idxs0),
                    set_difference(x1[iii].idxs, *child_idxs1))
        assert base.dtype == 'float64'
        return base

    def prepare_GP_training_data(self, ivls):
        # The mean and std should be estimated only from
        # the initial jobs that were sampled randomly.
        ok_idxs = ivls['losses']['ok'].idxs
        ok_vals = ivls['losses']['ok'].vals
        if (max(ok_idxs[:self.n_startup_jobs]) <
                min([sys.maxint] + ok_idxs[self.n_startup_jobs:])):
            y_mean = numpy.mean(ok_vals[:self.n_startup_jobs])
            y_std = numpy.std(ok_vals[:self.n_startup_jobs])
        else:
            # TODO: extract the elements of losses['ok'] corresponding to
            # initial random jobs, and use them to estimate y_mean, y_std
            raise NotImplementedError()
        y_std = numpy.maximum(y_std, numpy.sqrt(self.y_minvar))
        del ok_idxs, ok_vals

        x_all = ivls['x_IVLs']['ok'].as_list()
        y_all_iv = ivls['losses']['ok'].as_list()
        y_var_iv = ivls['losses_variance']['ok'].as_list()

        # -- HEURISTIC: assign running jobs the same performance as some
        #    percentile of the observed losses.
        liar_y_pos = numpy.argsort(ivls['losses']['ok'].vals)[int(
            self.liar_percentile * len(ivls['losses']['ok'].vals))]
        liar_y_mean = ivls['losses']['ok'].vals[liar_y_pos]
        liar_y_var = ivls['losses_variance']['ok'].vals[liar_y_pos]

        for pseudo_bad_status in 'new', 'running':
            logger.info('GP_BanditAlgo assigning bad scores to %i new jobs' %
                        len(ivls['losses'][pseudo_bad_status].idxs))
            x_all.stack(ivls['x_IVLs'][pseudo_bad_status])
            y_all_iv.stack(
                IdxsVals(ivls['losses'][pseudo_bad_status].idxs,
                         [liar_y_mean] *
                         len(ivls['losses'][pseudo_bad_status].idxs)))
            y_var_iv.stack(
                IdxsVals(ivls['losses_variance'][pseudo_bad_status].idxs,
                         [liar_y_var] *
                         len(ivls['losses'][pseudo_bad_status].idxs)))

        # renumber the configurations in x_all to be 0 .. (n_train - 1)
        idmap = y_all_iv.reindex()
        idmap = y_var_iv.reindex(idmap)
        idmap = x_all.reindex(idmap)

        assert y_all_iv.idxset() == y_var_iv.idxset() == x_all.idxset()

        assert numpy.all(y_all_iv.idxs == numpy.arange(len(y_all_iv.idxs)))
        assert numpy.all(y_var_iv.idxs == numpy.arange(len(y_all_iv.idxs)))

        y_all = y_all_iv.as_numpy(vdtype=theano.config.floatX).vals
        y_var = y_var_iv.as_numpy(vdtype=theano.config.floatX).vals
        x_all = x_all.as_numpy_floatX()

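        # -- standardize the losses with the startup-job statistics; the 1e-8
        #    guards against division by zero when y_std is tiny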
        y_all = (y_all - y_mean) / (1e-8 + y_std)
        y_var /= (1e-8 + y_std)**2

        assert y_all.shape == y_var.shape
        if y_var.min() < -1e-6:
            raise ValueError('negative variance encountered in results')
        y_var = numpy.maximum(y_var, self.y_minvar)
        return x_all, y_all, y_mean, y_var, y_std

    def fit_GP(self, x_all, y_all, y_mean, y_var, y_std, maxiter=1000):
        """
        Fit GPR kernel parameters by minimizing magininal nll.

        Returns: None

        Side effect: chooses optimal kernel parameters.
        """
        if y_std <= 0:
            raise ValueError('y_std must be positive', y_std)

        if list(sorted(x_all.idxset())) != range(len(x_all.idxset())):
            raise NotImplementedError('need contiguous 0-based indexes on x')
        n_train = len(y_all)

        #TODO: optimize this function by making theano include the get_pt and
        #      set_pt, and theano function returns gradient and function value
        #      at once.

        self._GP_n_train = n_train
        self._GP_x_all = x_all
        self._GP_y_all = y_all
        self._GP_y_var = y_var
        self._GP_y_mean = y_mean
        self._GP_y_std = y_std

        if hasattr(self, 'nll_fn'):
            nll_fn = self.nll_fn
            dnll_dparams = self.dnll_dparams
        else:
            cost = (self.nll_obs +
                    self.params_l2_penalty * sum([(p**2).sum()
                                                  for p in self.params]))
            nll_fn = self.nll_fn = theano.function(
                [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var] +
                self.x_obs_IVL.flatten(),
                cost,
                allow_input_downcast=True,
                mode=self.mode,
            )
            dnll_dparams = self.dnll_dparams = theano.function(
                [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var] +
                self.x_obs_IVL.flatten(),
                tensor.grad(cost, self.params),
                allow_input_downcast=True,
                mode=self.mode)
            print('Compiled nll_fn with %i thunks' %
                  len(nll_fn.maker.env.toposort()))
            print('Compiled dnll_fn with %i thunks' %
                  len(dnll_dparams.maker.env.toposort()))

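        # -- flatten the per-parameter (lower, upper) bounds into the
        #    (n_params, 2) array format expected by fmin_l_bfgs_b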
        lbounds = []
        ubounds = []
        for lb, ub in self.param_bounds:
            lbounds.extend(numpy.asarray(value(lb)).flatten())
            ubounds.extend(numpy.asarray(value(ub)).flatten())
        bounds = numpy.asarray([lbounds, ubounds]).T

        # re-initialize params to eliminate warm-start bias
        for k in self.kernels:
            k.random_reset(self.numpy_rng)

        # re-initialize coefficients to even weights
        for p in self.convex_coefficient_params:
            p.set_value(0 * p.get_value())

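        # -- get_pt / set_pt shuttle all kernel parameters between the Theano
        #    shared variables and the single flat float64 vector that the
        #    scipy optimizer works with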
        def get_pt():
            rval = []
            for p in self.params:
                v = p.get_value().flatten()
                rval.extend(v)
            return numpy.asarray(rval)

        def set_pt(pt):
            i = 0
            self.trace('fit_GP set_pt', pt)
            for p in self.params:
                assert p.dtype == 'float64'
                shape = p.get_value(borrow=True).shape
                size = int(numpy.prod(shape))
                p.set_value(pt[i:i + size].reshape(shape))
                i += size
            assert i == len(pt)

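        # -- f and df load a flat parameter vector into the shared variables
        #    and return the penalized nll and its gradient, respectively;
        #    n_calls just counts objective/gradient evaluations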
        n_calls = [0]

        def f(pt):
            n_calls[0] += 1
            set_pt(pt)
            rval = nll_fn(self._GP_n_train, self._GP_n_train, self._GP_y_all,
                          self._GP_y_var, *self._GP_x_all.flatten())
            self.trace('fit_GP f', rval)
            return rval

        def df(pt):
            n_calls[0] += 1
            set_pt(pt)
            dparams = dnll_dparams(self._GP_n_train, self._GP_n_train,
                                   self._GP_y_all, self._GP_y_var,
                                   *self._GP_x_all.flatten())
            rval = []
            for dp in dparams:
                rval.extend(dp.flatten())

            rval = numpy.asarray(rval)
            self.trace('fit_GP df', rval)
            return rval

        self.trace('fit_GP start_pt', get_pt())

        best_pt, best_value, best_d = fmin_l_bfgs_b(f,
                                                    get_pt(),
                                                    df,
                                                    maxfun=maxiter,
                                                    bounds=bounds,
                                                    iprint=-1)
        logger.info('fit_GP best value: %f' % best_value)
        set_pt(best_pt)
        self.trace('fit_GP best_pt', best_pt)
        return best_value

    def GP_mean(self, x):
        """
        Compute mean at points in x
        """
        return self.GP_mean_variance(x)[0]

    def GP_variance(self, x):
        """
        Compute variance at points in x
        """
        return self.GP_mean_variance(x)[1]

    def GP_mean_variance(self, x, ret_K=False):
        """
        Compute mean and variance at points in x
        """
        try:
            self._mean_variance
        except AttributeError:
            s_x = self.s_prior.new_like_self()
            self._mean_variance = theano.function(
                [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var] +
                self.x_obs_IVL.flatten() + s_x.flatten(),
                [
                    self.gprmath.s_mean(s_x),
                    self.gprmath.s_variance(s_x),
                    #self.K_fn(self.x_obs_IVL, self.x_obs_IVL),
                    self.K_fn(self.x_obs_IVL, s_x),
                ],
                allow_input_downcast=True)
            #theano.printing.debugprint(self._mean_variance)
        if len(x) != len(self._GP_x_all):
            raise ValueError('x has wrong len', (len(x), len(self._GP_x_all)))
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')
        rval_mean, rval_var, rval_K = self._mean_variance(
            self._GP_n_train, len(x_idxset), self._GP_y_all, self._GP_y_var,
            *(self._GP_x_all.flatten() + x.flatten()))

        if ret_K:
            return rval_K

        rval_var_min = rval_var.min()
        assert rval_var_min > -1e-4, rval_var_min
        rval_var = numpy.maximum(rval_var, 0)
        return (rval_mean * self._GP_y_std + self._GP_y_mean,
                rval_var * self._GP_y_std**2)

    def GP_train_K(self):
        return self.GP_mean_variance(self._GP_x_all, ret_K=True)

    def GP_EI_thresh(self):
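        # The EI threshold is the most optimistic standardized observation:
        # the minimum over training points of
        # (observed loss - EI_ambition * observation std).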
        thresh = (self._GP_y_all -
                  self.EI_ambition * numpy.sqrt(self._GP_y_var)).min()
        return thresh

    def GP_EI(self, x):
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')

        try:
            self._EI_fn
        except AttributeError:
            self._EI_fn = theano.function(
                [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var] +
                self.x_obs_IVL.flatten() + [self.cand_EI_thresh] +
                self.cand_x.flatten(),
                self.cand_EI,
                allow_input_downcast=True)

        thresh = self.GP_EI_thresh()

        rval = self._EI_fn(
            self._GP_n_train, len(x_idxset), self._GP_y_all, self._GP_y_var,
            *(self._GP_x_all.flatten() + [thresh] + x.flatten()))
        assert rval.shape == (len(x_idxset), )
        return rval

    def GP_EI_optimize(self, x, maxiter=1000):
        x_idxset = x.idxset()
        if list(sorted(x_idxset)) != range(len(x_idxset)):
            raise ValueError('x needs re-indexing')

        if len(x) != len(self.kernels):
            raise ValueError('len(x) == %i but len(self.kernels)==%i' %
                             (len(x), len(self.kernels)))

        n_refinable = len([k for k in self.kernels if self.is_refinable[k]])
        if n_refinable == 0:
            return x

        try:
            EI_fn_g = self._EI_fn_g
        except AttributeError:
            criterion = -self.cand_EI.sum()
            EI_fn_g = self._EI_fn_g = theano.function(
                [self.s_big_param_vec] +
                [self.s_n_train, self.s_n_test, self.y_obs, self.y_obs_var] +
                self.x_obs_IVL.flatten() + [self.cand_EI_thresh] +
                self.cand_x.idxslist() + [
                    v for (k, v) in zip(self.kernels, self.cand_x.valslist())
                    if not self.is_refinable[k]
                ], [criterion,
                    tensor.grad(criterion, self.s_big_param_vec)],
                allow_input_downcast=True,
                mode=self.mode)
            print('Compiled EI_fn_g with %i thunks' %
                  len(EI_fn_g.maker.env.toposort()))

        thresh = self.GP_EI_thresh()

        start_pt = numpy.asarray(
            numpy.concatenate([
                xk for k, xk in zip(self.kernels, x.valslist())
                if self.is_refinable[k]
            ]),
            dtype='float64')

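        # -- args are the inputs that stay fixed during refinement (training
        #    data, EI threshold, candidate idxs, and the values of the
        #    non-refinable kernels); fmin_l_bfgs_b passes them through to
        #    EI_fn_g after the flat vector of refinable values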
        args = (
            (self._GP_n_train, len(x_idxset), self._GP_y_all, self._GP_y_var) +
            tuple(self._GP_x_all.flatten()) + (thresh, ) +
            tuple(x.idxslist()) + tuple([
                v for (k, v) in zip(self.kernels, x.valslist())
                if not self.is_refinable[k]
            ]))

        bounds = []
        for (k, xk) in zip(self.kernels, x.valslist()):
            if self.is_refinable[k]:
                bounds.extend([self.bounds[k]] * len(xk))

        if self.trace_on:

            def fff(*vvv):
                for i, v in enumerate(vvv):
                    self.trace(('vvv', i), numpy.asarray(v))
                f, df = EI_fn_g(*vvv)
                self.trace('f', f)
                self.trace('df', df)
                return f, df

            self.trace('start_pt', start_pt)
            for i, v in enumerate(args):
                self.trace(('args', i), numpy.asarray(v))
            self.trace('bounds', numpy.asarray(bounds))
            self.trace('maxiter', numpy.asarray(maxiter))
        else:
            fff = EI_fn_g

        best_pt, best_value, best_d = fmin_l_bfgs_b(fff,
                                                    start_pt,
                                                    None,
                                                    args=args,
                                                    maxfun=maxiter,
                                                    bounds=bounds,
                                                    iprint=-1)
        self.trace('best_pt', best_pt)

        # print 'BEST_PT', best_pt
        rval = x.copy()
        initial = 0
        for (_ind, iv) in enumerate(x):
            if self.is_refinable[self.kernels[_ind]]:
                diff = len(iv.vals)
                # XXX: assumes vector-valued vals (scalar elements)
                rval[_ind].vals = best_pt[initial:initial + diff]
                initial += diff
        # -- assert that all elements of best_pt have been used
        assert initial == len(best_pt)

        # -- apply any quantization required by the distributions
        self.post_refinement(rval)
        return rval

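    # -- suggest_from_gp: fit the GP to all results so far, build a candidate
    #    set from the best previous trials plus fresh draws from the Parzen
    #    model, refine the continuous coordinates by maximizing EI, and return
    #    the single candidate with the highest EI.
    #
    #    Rough usage sketch of the pieces it ties together (`algo`, `trials`,
    #    `results`, and `candidates` are placeholders; `candidates` is assumed
    #    to be a contiguously indexed IdxsValsList, e.g. from the GM sampler):
    #
    #        ivls = algo.idxs_vals_by_status(trials, results)
    #        algo.fit_GP(*algo.prepare_GP_training_data(ivls))
    #        refined = algo.GP_EI_optimize(candidates)
    #        scores = algo.GP_EI(refined)
    #        best = refined.numeric_take([int(scores.argmax())])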
    def suggest_from_gp(self, trials, results, N):
        logger.info('suggest_from_gp')
        if N != 1:
            raise NotImplementedError('only N==1 is supported')
        ivls = self.idxs_vals_by_status(trials, results)

        prepared_data = self.prepare_GP_training_data(ivls)
        self.fit_GP(*prepared_data)

        # -- add the best previous trials as candidates
        n_trials_to_opt = self.n_candidates_to_draw // 2
        best_idxs = numpy.asarray(ivls['losses']['ok'].idxs)[numpy.argsort(
            ivls['losses']['ok'].vals)[:n_trials_to_opt]]
        best_IVLs = ivls['x_IVLs']['ok'].numeric_take(best_idxs)
        best_idxset = best_IVLs.idxset()

        # -- draw the remainder as random candidates
        candidates = self.gm_algo.suggest_from_model(
            ivls, self.n_candidates_to_draw - len(best_idxset))

        # -- re-index the best_IVLs to ensure no collision during stack
        cand_idxset = candidates.idxset()
        assert (len(cand_idxset) +
                len(best_idxset) == self.n_candidates_to_draw)
        idmap = {}
        for i in best_idxset:
            if i in cand_idxset:
                idmap[i] = (max(cand_idxset) + max(best_idxset) + len(idmap) +
                            1)
            else:
                idmap[i] = i
            assert idmap[i] not in cand_idxset
        assert (len(cand_idxset.union(
            idmap.values())) == self.n_candidates_to_draw)
        best_IVLs.reindex(idmap)
        candidates = candidates.as_list()
        candidates.stack(best_IVLs)
        assert len(candidates.idxset()) == self.n_candidates_to_draw
        # XXX: rather than reindex here, take advantage of fact that random
        #      candidates were already contiguously indexed and stack
        #      appropriately reindexed trials on top of them.
        candidates.reindex()
        candidates = candidates.as_numpy()

        candidates_opt = self.GP_EI_optimize(candidates)

        EI_opt = self.GP_EI(candidates_opt)
        best_idx = numpy.argmax(EI_opt)
        if 1:
            # for DEBUGGING
            EI = self.GP_EI(candidates)
            if EI.max() - 1e-4 > EI_opt.max():
                logger.warn(
                    'Optimization actually *decreased* EI!? %.3f -> %.3f' %
                    (EI.max(), EI_opt.max()))
        rval = candidates_opt.numeric_take([best_idx])
        return rval

    def suggest_from_gm(self, trials, results, N):
        logger.info('suggest_from_gm')
        ivls = self.idxs_vals_by_status(trials, results)
        rval = self.gm_algo.suggest_from_model(ivls, N)
        return rval

    def suggest_from_prior(self, trials, results, N):
        logger.info('suggest_from_prior')
        if not hasattr(self, '_prior_sampler'):
            self._prior_sampler = theano.function([self.s_N],
                                                  self.s_prior.flatten(),
                                                  mode=self.mode)
        rvals = self._prior_sampler(N)
        return IdxsValsList.fromflattened(rvals)

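    # -- suggest dispatches between the three strategies above: random draws
    #    from the prior until n_startup_jobs results are in, then the GP while
    #    the best trial is recent, and otherwise the GP only with probability
    #    p_GP_during_exploration (falling back to the Parzen model).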
    def suggest(self, trials, results, N):
        ivls = self.idxs_vals_by_status(trials, results)
        t0 = time.time()
        n_ok = len(ivls['losses']['ok'].idxs)

        # -- choose the suggestion strategy (heuristic)
        if n_ok < self.n_startup_jobs:
            fn = self.suggest_from_prior
        else:
            # -- figure out how long (in iterations) it has been since picking a
            #    winner: `winner_age`
            assert (list(ivls['losses']['ok'].idxs) == list(
                sorted(ivls['losses']['ok'].idxs)))
            t_winner = numpy.asarray(ivls['losses']['ok'].vals).argmin()
            winner_age = n_ok - t_winner
            if winner_age < self.local_improvement_patience:
                fn = self.suggest_from_gp
            else:
                if self.numpy_rng.rand() < self.p_GP_during_exploration:
                    fn = self.suggest_from_gp
                else:
                    fn = self.suggest_from_gm
        try:
            rval = self.suggest_ivl(fn(trials, results, N))
        finally:
            logger.info('suggest %i took %.2f seconds' %
                        (len(ivls['losses']['ok'].idxs), time.time() - t0))
        return rval