Ejemplo n.º 1
0
    def test_kernels(self):
        """Run the parameter, Z and qX checks over a set of ARD kernels.

        Each kernel is randomized once; the three checks are then run with
        their default arguments, followed by a second pass with
        ``psi2n=True``.
        """
        from GPy.kern import RBF, Linear, MLP, Bias, White

        # Kernel input dimension is taken from the width of the Z array.
        Q = self.Z.shape[1]

        kernels = [
            RBF(Q, ARD=True),
            Linear(Q, ARD=True),
            MLP(Q, ARD=True),
            RBF(Q, ARD=True) + Linear(Q, ARD=True) + Bias(Q) + White(Q),
            RBF(Q, ARD=True) + Bias(Q) + White(Q),
            Linear(Q, ARD=True) + Bias(Q) + White(Q),
        ]

        checks = (self._test_kernel_param, self._test_Z, self._test_qX)
        for kern in kernels:
            kern.randomize()
            # First pass: each check with its default arguments.
            for check in checks:
                check(kern)
            # Second pass: same checks, same order, with psi2n enabled.
            for check in checks:
                check(kern, psi2n=True)
Ejemplo n.º 2
0
 def _create_kernel(self, V):
     """Assemble the model covariance and keep its parts on the instance.

     The kernel is the sum of one single-input RBF per dimension, a
     ``Fixed`` kernel built from ``tdot(V)`` and a ``Bias`` kernel.  The
     parts are stored as ``_kerns``, ``_kernf`` and ``_kernb``; the sum is
     stored as ``kernel``.
     """
     per_dimension = []
     for dim in range(self.n_dims):
         # Each RBF only looks at its own input column.
         per_dimension.append(RBF(1, ARD=True, active_dims=[dim]))
     self._kerns = per_dimension

     self._kernf = Fixed(self.n_dims, tdot(V))
     self._kernb = Bias(self.n_dims)

     rbf_sum = np.sum(self._kerns)
     self.kernel = rbf_sum + self._kernf + self._kernb
Ejemplo n.º 3
0
class GP(BaseStrategy):
    """Search strategy that proposes new points with a Gaussian-process model.

    Successful trials from the history are used to fit a ``GPRegression``
    model; the next suggestion is the point of the box ``[0, 1]^n_dims``
    that maximizes the configured acquisition function.
    """
    short_name = 'gp'

    def __init__(self, acquisition=None, seed=None, seeds=1, max_feval=5E4, max_iter=1E5):
        """Store the strategy settings and bind the acquisition function.

        Parameters
        ----------
        acquisition : dict, optional
            Mapping with exactly the keys ``'name'`` (one of ``'ei'``,
            ``'ucb'``, ``'osprey'``) and ``'params'`` (extra keyword
            arguments for that function).  Defaults to ``'osprey'`` with no
            parameters.
        seed : optional
            Stored on the instance; not used by the code in this class.
        seeds : int
            ``suggest`` defers to ``RandomSearch`` while the history holds
            fewer than this many entries.
        max_feval : number
            Passed as ``max_f_eval`` when optimizing the regression model.
        max_iter : number
            Passed as ``maxiter`` to the acquisition optimizer.

        Raises
        ------
        RuntimeError
            If ``acquisition`` is malformed (see ``_set_acquisition``).
        """
        self.seed = seed
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None
        if acquisition is None:
            acquisition = {'name': 'osprey', 'params': {}}
        self.acquisition_function = acquisition
        self._acquisition_function = None
        self._set_acquisition()

    def _create_kernel(self, V):
        """Build the covariance: one RBF per dimension + fixed term + bias.

        ``tdot(V)`` supplies the matrix for the ``Fixed`` kernel; ``suggest``
        passes the column of per-trial score variances as ``V``.
        """
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        """Fit a ``GPRegression`` model on ``(X, Y)`` and store it."""
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        """Return a uniformly random point in ``[0, 1)^n_dims``."""
        return np.random.uniform(low=0., high=1., size=self.n_dims)

    def _is_var_positive(self, var):
        """Return True, or raise ``RuntimeError`` if any variance is negative."""
        if np.any(var < 0):
            # RuntimeError may be overkill
            raise RuntimeError('Negative variance predicted from regression model.')
        return True

    def _ei(self, x, y_mean, y_var):
        """Expected improvement over the best observed ``Y`` of the model."""
        y_std = np.sqrt(y_var)
        y_best = self.model.Y.max(axis=0)
        z = (y_mean - y_best)/y_std
        return y_std*(z*norm.cdf(z) + norm.pdf(z))

    def _ucb(self, x, y_mean, y_var, kappa=1.0):
        """Upper confidence bound: mean plus ``kappa`` standard deviations."""
        return y_mean + kappa*np.sqrt(y_var)

    def _osprey(self, x, y_mean, y_var):
        """Original acquisition: predicted mean plus predicted variance."""
        return (y_mean+y_var).flatten()

    def _optimize(self, init=None):
        """Maximize the acquisition function over the unit box.

        Parameters
        ----------
        init : array-like, optional
            Starting point for the optimizer; a random point is drawn when
            it is ``None``.

        Returns
        -------
        numpy.ndarray
            The optimizer's final point (``res.x``).
        """
        # TODO start minimization from a range of points and take minimum
        # ``not init`` raises ValueError for a multi-element array, so test
        # for None explicitly.
        if init is None:
            init = self._get_random_point()

        def z(x):
            # TODO make spread of points around x and take mean value.
            x = x.copy().reshape(-1, self.n_dims)
            # Predict with the RBF and bias parts only; the Fixed kernel is
            # left out of the predictive covariance.
            y_mean, y_var = self.model.predict(x, kern=(np.sum(self._kerns).copy() +
                                                        self._kernb.copy()))
            # This code is for debug/testing phase only.
            # Ideally we should test for negative variance regardless of the AF.
            # However, we want to recover the original functionality of Osprey,
            # hence the check is skipped for the 'osprey' function.
            # TODO remove this.
            if self.acquisition_function['name'] in ('ei', 'ucb'):
                self._is_var_positive(y_var)
            af = self._acquisition_function(x, y_mean=y_mean, y_var=y_var)
            # ``minimize`` minimizes, so negate to maximize the acquisition.
            return (-1)*af

        res = minimize(z, init, bounds=self.n_dims*[(0., 1.)],
                       options={'maxiter': self.max_iter, 'disp': 0})
        return res.x

    def _set_acquisition(self):
        """Validate ``acquisition_function`` and bind the matching method.

        Raises
        ------
        RuntimeError
            If a list was given, if the keys are not exactly ``'name'`` and
            ``'params'``, or if the name is not a known function.
        """
        if isinstance(self.acquisition_function, list):
            raise RuntimeError('Must specify only one acquisition function')
        if sorted(self.acquisition_function.keys()) != ['name', 'params']:
            raise RuntimeError('strategy/params/acquisition must contain keys '
                               '"name" and "params"')
        if self.acquisition_function['name'] not in ['ei', 'ucb', 'osprey']:
            raise RuntimeError('strategy/params/acquisition name must be one of '
                               '"ei", "ucb", "osprey"')

        # Plain attribute lookup; no need to evaluate a code string.
        f = getattr(self, '_' + self.acquisition_function['name'])

        def g(x, y_mean, y_var):
            return f(x, y_mean, y_var, **self.acquisition_function['params'])

        self._acquisition_function = g

    def _get_data(self, history, searchspace):
        """Split the history into training data and pending points.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X, Y, V, ignore)``: points of succeeded trials with the mean
            and variance of their scores, and points of pending trials.
            Failed trials are skipped.

        Raises
        ------
        RuntimeError
            On a status other than FAILED, SUCCEEDED or PENDING.
        """
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue

            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))

            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)

        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1),
                np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        """Map a GP-domain point back to a ``{variable name: value}`` dict."""
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        return {var.name: var.point_from_gp(float(gpvalue))
                for gpvalue, var in zip(result, searchspace)}

    def _is_within(self, point, X, tol=1E-2):
        """Return True if ``point`` lies within ``tol`` of any row of ``X``."""
        rows = np.atleast_2d(X)
        # Sum over the coordinate axis so each entry is the Euclidean
        # distance from ``point`` to one row.
        distances = np.sqrt(((point - rows)**2).sum(axis=1))
        return bool(np.any(distances <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        """Propose the next parameter set.

        Falls back to ``RandomSearch`` when the history is shorter than
        ``seeds``, when no trial has succeeded yet, or when the GP proposal
        coincides with a pending point or lies within tolerance of an
        already evaluated one.  ``max_tries`` is accepted but not used.

        Raises
        ------
        ImportError
            If ``GPRegression`` or ``minimize`` is falsy at module level.
        """
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')

        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)

        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        if X.shape[0] == 0:
            # No succeeded trial yet, so there is nothing to fit a model to.
            return RandomSearch().suggest(history, searchspace)

        # TODO make _create_kernel accept optional args.
        self._create_kernel(V)
        self._fit_model(X, Y)
        suggestion = self._optimize()

        # Compare whole rows: ``suggestion in ignore`` is element-wise and
        # would match as soon as one coordinate coincides anywhere.
        is_pending = bool(np.all(ignore == suggestion, axis=1).any())
        if is_pending or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)

        return self._from_gp(suggestion, searchspace)
Ejemplo n.º 4
0
class GP(BaseStrategy):
    """Search strategy that proposes new points with a Gaussian-process model.

    Successful trials from the history are used to fit a ``GPRegression``
    model; the next suggestion is the point of the box ``[0, 1]^n_dims``
    that maximizes predicted mean plus predicted variance.
    """
    short_name = 'gp'

    def __init__(self, seed=None, seeds=1, max_feval=5E4, max_iter=1E5):
        """Store the strategy settings.

        Parameters
        ----------
        seed : optional
            Stored on the instance; not used by the code in this class.
        seeds : int
            ``suggest`` defers to ``RandomSearch`` while the history holds
            fewer than this many entries.
        max_feval : number
            Passed as ``max_f_eval`` when optimizing the regression model.
        max_iter : number
            Passed as ``maxiter`` to the acquisition optimizer.
        """
        self.seed = seed
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None

    def _create_kernel(self, V):
        """Build the covariance: one RBF per dimension + fixed term + bias.

        ``tdot(V)`` supplies the matrix for the ``Fixed`` kernel; ``suggest``
        passes the column of per-trial score variances as ``V``.
        """
        self._kerns = [
            RBF(1, ARD=True, active_dims=[i]) for i in range(self.n_dims)
        ]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        """Fit a ``GPRegression`` model on ``(X, Y)`` and store it."""
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        """Return a uniformly random point in ``[0, 1)^n_dims``."""
        return np.random.uniform(low=0., high=1., size=self.n_dims)

    def _optimize(self, init=None):
        """Maximize predicted mean + variance over the unit box.

        Parameters
        ----------
        init : array-like, optional
            Starting point for the optimizer; a random point is drawn when
            it is ``None``.

        Returns
        -------
        numpy.ndarray
            The optimizer's final point.
        """
        # ``not init`` raises ValueError for a multi-element array, so test
        # for None explicitly.
        if init is None:
            init = self._get_random_point()

        def z(x):
            y = x.copy().reshape(-1, self.n_dims)
            # Predict with the RBF and bias parts only; the Fixed kernel is
            # left out of the predictive covariance.
            s, v = self.model.predict(y,
                                      kern=(np.sum(self._kerns).copy() +
                                            self._kernb.copy()))
            # Negated because ``minimize`` minimizes.
            return -(s + v).flatten()

        return minimize(z,
                        init,
                        bounds=self.n_dims * [(0., 1.)],
                        options={
                            'maxiter': self.max_iter,
                            'disp': 0
                        }).x

    def _get_data(self, history, searchspace):
        """Split the history into training data and pending points.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X, Y, V, ignore)``: points of succeeded trials with the mean
            and variance of their scores, and points of pending trials.
            Failed trials are skipped.

        Raises
        ------
        RuntimeError
            On a status other than FAILED, SUCCEEDED or PENDING.
        """
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue

            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))

            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)

        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1), np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        """Map a GP-domain point back to a ``{variable name: value}`` dict."""
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        return {
            var.name: var.point_from_gp(float(gpvalue))
            for gpvalue, var in zip(result, searchspace)
        }

    def _is_within(self, point, X, tol=1E-2):
        """Return True if ``point`` lies within ``tol`` of any row of ``X``."""
        rows = np.atleast_2d(X)
        # Sum over the coordinate axis so each entry is the Euclidean
        # distance from ``point`` to one row.
        distances = np.sqrt(((point - rows)**2).sum(axis=1))
        return bool(np.any(distances <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        """Propose the next parameter set.

        Falls back to ``RandomSearch`` when the history is shorter than
        ``seeds``, when no trial has succeeded yet, or when the GP proposal
        coincides with a pending point or lies within tolerance of an
        already evaluated one.  ``max_tries`` is accepted but not used.

        Raises
        ------
        ImportError
            If ``GPRegression`` or ``minimize`` is falsy at module level.
        """
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')

        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)

        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        if X.shape[0] == 0:
            # No succeeded trial yet, so there is nothing to fit a model to.
            return RandomSearch().suggest(history, searchspace)

        self._create_kernel(V)
        self._fit_model(X, Y)

        suggestion = self._optimize()

        # Compare whole rows: ``suggestion in ignore`` is element-wise and
        # would match as soon as one coordinate coincides anywhere.
        is_pending = bool(np.all(ignore == suggestion, axis=1).any())
        if is_pending or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)

        return self._from_gp(suggestion, searchspace)
Ejemplo n.º 5
0
class GP(BaseStrategy):
    """Search strategy that proposes new points with a Gaussian-process model.

    Successful trials from the history are used to fit a ``GPRegression``
    model; the next suggestion is the point of the box ``[0, 1]^n_dims``
    that maximizes the configured acquisition function.
    """
    short_name = 'gp'

    def __init__(self,
                 acquisition=None,
                 seed=None,
                 seeds=1,
                 max_feval=5E4,
                 max_iter=1E5):
        """Store the strategy settings and bind the acquisition function.

        Parameters
        ----------
        acquisition : dict, optional
            Mapping with exactly the keys ``'name'`` (one of ``'ei'``,
            ``'ucb'``, ``'osprey'``) and ``'params'`` (extra keyword
            arguments for that function).  Defaults to ``'osprey'`` with no
            parameters.
        seed : optional
            Stored on the instance; not used by the code in this class.
        seeds : int
            ``suggest`` defers to ``RandomSearch`` while the history holds
            fewer than this many entries.
        max_feval : number
            Passed as ``max_f_eval`` when optimizing the regression model.
        max_iter : number
            Passed as ``maxiter`` to the acquisition optimizer.

        Raises
        ------
        RuntimeError
            If ``acquisition`` is malformed (see ``_set_acquisition``).
        """
        self.seed = seed
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None
        if acquisition is None:
            acquisition = {'name': 'osprey', 'params': {}}
        self.acquisition_function = acquisition
        self._acquisition_function = None
        self._set_acquisition()

    def _create_kernel(self, V):
        """Build the covariance: one RBF per dimension + fixed term + bias.

        ``tdot(V)`` supplies the matrix for the ``Fixed`` kernel; ``suggest``
        passes the column of per-trial score variances as ``V``.
        """
        self._kerns = [
            RBF(1, ARD=True, active_dims=[i]) for i in range(self.n_dims)
        ]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        """Fit a ``GPRegression`` model on ``(X, Y)`` and store it."""
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        """Return a uniformly random point in ``[0, 1)^n_dims``."""
        return np.random.uniform(low=0., high=1., size=self.n_dims)

    def _is_var_positive(self, var):
        """Return True, or raise ``RuntimeError`` if any variance is negative."""
        if np.any(var < 0):
            # RuntimeError may be overkill
            raise RuntimeError(
                'Negative variance predicted from regression model.')
        return True

    def _ei(self, x, y_mean, y_var):
        """Expected improvement over the best observed ``Y`` of the model."""
        y_std = np.sqrt(y_var)
        y_best = self.model.Y.max(axis=0)
        z = (y_mean - y_best) / y_std
        return y_std * (z * norm.cdf(z) + norm.pdf(z))

    def _ucb(self, x, y_mean, y_var, kappa=1.0):
        """Upper confidence bound: mean plus ``kappa`` standard deviations."""
        return y_mean + kappa * np.sqrt(y_var)

    def _osprey(self, x, y_mean, y_var):
        """Original acquisition: predicted mean plus predicted variance."""
        return (y_mean + y_var).flatten()

    def _optimize(self, init=None):
        """Maximize the acquisition function over the unit box.

        Parameters
        ----------
        init : array-like, optional
            Starting point for the optimizer; a random point is drawn when
            it is ``None``.

        Returns
        -------
        numpy.ndarray
            The optimizer's final point (``res.x``).
        """
        # TODO start minimization from a range of points and take minimum
        # ``not init`` raises ValueError for a multi-element array, so test
        # for None explicitly.
        if init is None:
            init = self._get_random_point()

        def z(x):
            # TODO make spread of points around x and take mean value.
            x = x.copy().reshape(-1, self.n_dims)
            # Predict with the RBF and bias parts only; the Fixed kernel is
            # left out of the predictive covariance.
            y_mean, y_var = self.model.predict(
                x, kern=(np.sum(self._kerns).copy() + self._kernb.copy()))
            # This code is for debug/testing phase only.
            # Ideally we should test for negative variance regardless of the AF.
            # However, we want to recover the original functionality of Osprey,
            # hence the check is skipped for the 'osprey' function.
            # TODO remove this.
            if self.acquisition_function['name'] in ('ei', 'ucb'):
                self._is_var_positive(y_var)
            af = self._acquisition_function(x, y_mean=y_mean, y_var=y_var)
            # ``minimize`` minimizes, so negate to maximize the acquisition.
            return (-1) * af

        res = minimize(z,
                       init,
                       bounds=self.n_dims * [(0., 1.)],
                       options={
                           'maxiter': self.max_iter,
                           'disp': 0
                       })
        return res.x

    def _set_acquisition(self):
        """Validate ``acquisition_function`` and bind the matching method.

        Raises
        ------
        RuntimeError
            If a list was given, if the keys are not exactly ``'name'`` and
            ``'params'``, or if the name is not a known function.
        """
        if isinstance(self.acquisition_function, list):
            raise RuntimeError('Must specify only one acquisition function')
        if sorted(self.acquisition_function.keys()) != ['name', 'params']:
            raise RuntimeError('strategy/params/acquisition must contain keys '
                               '"name" and "params"')
        if self.acquisition_function['name'] not in ['ei', 'ucb', 'osprey']:
            raise RuntimeError(
                'strategy/params/acquisition name must be one of '
                '"ei", "ucb", "osprey"')

        # Plain attribute lookup; no need to evaluate a code string.
        f = getattr(self, '_' + self.acquisition_function['name'])

        def g(x, y_mean, y_var):
            return f(x, y_mean, y_var, **self.acquisition_function['params'])

        self._acquisition_function = g

    def _get_data(self, history, searchspace):
        """Split the history into training data and pending points.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X, Y, V, ignore)``: points of succeeded trials with the mean
            and variance of their scores, and points of pending trials.
            Failed trials are skipped.

        Raises
        ------
        RuntimeError
            On a status other than FAILED, SUCCEEDED or PENDING.
        """
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue

            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))

            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)

        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1), np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        """Map a GP-domain point back to a ``{variable name: value}`` dict."""
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        return {
            var.name: var.point_from_gp(float(gpvalue))
            for gpvalue, var in zip(result, searchspace)
        }

    def _is_within(self, point, X, tol=1E-2):
        """Return True if ``point`` lies within ``tol`` of any row of ``X``."""
        rows = np.atleast_2d(X)
        # Sum over the coordinate axis so each entry is the Euclidean
        # distance from ``point`` to one row.
        distances = np.sqrt(((point - rows)**2).sum(axis=1))
        return bool(np.any(distances <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        """Propose the next parameter set.

        Falls back to ``RandomSearch`` when the history is shorter than
        ``seeds``, when no trial has succeeded yet, or when the GP proposal
        coincides with a pending point or lies within tolerance of an
        already evaluated one.  ``max_tries`` is accepted but not used.

        Raises
        ------
        ImportError
            If ``GPRegression`` or ``minimize`` is falsy at module level.
        """
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')

        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)

        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        if X.shape[0] == 0:
            # No succeeded trial yet, so there is nothing to fit a model to.
            return RandomSearch().suggest(history, searchspace)

        # TODO make _create_kernel accept optional args.
        self._create_kernel(V)
        self._fit_model(X, Y)
        suggestion = self._optimize()

        # Compare whole rows: ``suggestion in ignore`` is element-wise and
        # would match as soon as one coordinate coincides anywhere.
        is_pending = bool(np.all(ignore == suggestion, axis=1).any())
        if is_pending or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)

        return self._from_gp(suggestion, searchspace)
Ejemplo n.º 6
0
class GP(BaseStrategy):
    """Search strategy that proposes new points with a Gaussian-process model.

    Successful trials from the history are used to fit a ``GPRegression``
    model; the next suggestion is the point of the box ``[0, 1]^n_dims``
    that maximizes predicted mean plus predicted variance.
    """
    short_name = 'gp'

    def __init__(self, seeds=1, max_feval=5E4, max_iter=1E5):
        """Store the strategy settings.

        Parameters
        ----------
        seeds : int
            ``suggest`` defers to ``RandomSearch`` while the history holds
            fewer than this many entries.
        max_feval : number
            Passed as ``max_f_eval`` when optimizing the regression model.
        max_iter : number
            Passed as ``maxiter`` to the acquisition optimizer.
        """
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None

    def _create_kernel(self, V):
        """Build the covariance: one RBF per dimension + fixed term + bias.

        ``tdot(V)`` supplies the matrix for the ``Fixed`` kernel; ``suggest``
        passes the column of per-trial score variances as ``V``.
        """
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        """Fit a ``GPRegression`` model on ``(X, Y)`` and store it."""
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        """Return a uniformly random point in ``[0, 1)^n_dims``."""
        return np.random.uniform(low=0., high=1., size=self.n_dims)

    def _optimize(self, init=None):
        """Maximize predicted mean + variance over the unit box.

        Parameters
        ----------
        init : array-like, optional
            Starting point for the optimizer; a random point is drawn when
            it is ``None``.

        Returns
        -------
        numpy.ndarray
            The optimizer's final point.
        """
        # ``not init`` raises ValueError for a multi-element array, so test
        # for None explicitly.
        if init is None:
            init = self._get_random_point()

        def z(x):
            y = x.copy().reshape(-1, self.n_dims)
            # Predict with the RBF and bias parts only; the Fixed kernel is
            # left out of the predictive covariance.
            s, v = self.model.predict(y, kern=(np.sum(self._kerns).copy() +
                                               self._kernb.copy()))
            # Negated because ``minimize`` minimizes.
            return -(s+v).flatten()

        return minimize(z, init, bounds=self.n_dims*[(0., 1.)],
                        options={'maxiter': self.max_iter, 'disp': 0}).x

    def _get_data(self, history, searchspace):
        """Split the history into training data and pending points.

        Returns
        -------
        tuple of numpy.ndarray
            ``(X, Y, V, ignore)``: points of succeeded trials with the mean
            and variance of their scores, and points of pending trials.
            Failed trials are skipped.

        Raises
        ------
        RuntimeError
            On a status other than FAILED, SUCCEEDED or PENDING.
        """
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue

            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))

            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)

        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1),
                np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        """Map a GP-domain point back to a ``{variable name: value}`` dict."""
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        return {var.name: var.point_from_gp(float(gpvalue))
                for gpvalue, var in zip(result, searchspace)}

    def _is_within(self, point, X, tol=1E-2):
        """Return True if ``point`` lies within ``tol`` of any row of ``X``."""
        rows = np.atleast_2d(X)
        # Sum over the coordinate axis so each entry is the Euclidean
        # distance from ``point`` to one row.
        distances = np.sqrt(((point - rows)**2).sum(axis=1))
        return bool(np.any(distances <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        """Propose the next parameter set.

        Falls back to ``RandomSearch`` when the history is shorter than
        ``seeds``, when no trial has succeeded yet, or when the GP proposal
        coincides with a pending point or lies within tolerance of an
        already evaluated one.  ``max_tries`` is accepted but not used.

        Raises
        ------
        ImportError
            If ``GPRegression`` or ``minimize`` is falsy at module level.
        """
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')

        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)

        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        if X.shape[0] == 0:
            # No succeeded trial yet, so there is nothing to fit a model to.
            return RandomSearch().suggest(history, searchspace)

        self._create_kernel(V)
        self._fit_model(X, Y)

        suggestion = self._optimize()

        # Compare whole rows: ``suggestion in ignore`` is element-wise and
        # would match as soon as one coordinate coincides anywhere.
        is_pending = bool(np.all(ignore == suggestion, axis=1).any())
        if is_pending or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)

        return self._from_gp(suggestion, searchspace)
Ejemplo n.º 7
0
def gp_on_fold(feature_sets, train, test, y, y_all, learn_options):
    """Fit a GP on the training rows of one fold and predict the test rows.

    The kernel is a ``WeightedDegree`` kernel on column 0 plus one additive
    kernel for every recognized feature set present in ``feature_sets``,
    plus a final ``Bias`` term.  The design matrix ``X`` is assembled in the
    same order, so each kernel's ``active_dims`` address its own columns.

    Parameters
    ----------
    feature_sets : mapping
        Feature-set name -> object whose ``.values`` is a 2-D array
        (presumably pandas DataFrames -- confirm against the caller).
    train, test
        Row selectors applied to ``X`` and ``y`` (presumably boolean masks
        or index arrays -- confirm against the caller).
    y
        Targets, indexed with ``train``.
    y_all
        Object whose first index level provides the sequences.
    learn_options : mapping
        Reads ``"kernel degree"`` and ``"warpedGP"``.

    Returns
    -------
    tuple
        ``(y_pred, m[:])``: predictions for the test rows and the result of
        slicing the fitted model with ``[:]``.

    Raises
    ------
    AssertionError
        If the width of ``X`` disagrees with the running column count after
        a feature set is appended.
    """
    sequences = np.array([str(x) for x in y_all.index.get_level_values(0).tolist()])

    kern = WeightedDegree(
        1, sequences, d=learn_options["kernel degree"], active_dims=[0]
    )
    # Column 0 holds a running row index, the only input WeightedDegree sees.
    X = np.arange(len(train))[:, None]

    # (feature-set key, kernel class, kernel name, extra kwargs, fixed width).
    # A fixed width of None means "use the number of feature columns".
    # Order matters: it determines the column layout of X.
    kernel_specs = (
        ("gc_count", RBF, "GC_rbf", {}, 1),
        ("drug", Linear, "drug_lin", {}, None),
        ("gene effect", Linear, "gene_lin", {}, None),
        ("Percent Peptide", RBF, "percent_pept", {}, None),
        ("Nucleotide cut position", RBF, "nucleo_cut", {}, None),
        ("Strand effect", Linear, "strand", {}, None),
        ("NGGX", Linear, "NGGX", {}, None),
        ("TM", RBF, "TM", {"ARD": True}, None),
        ("gene features", Linear, "genefeat", {"ARD": True}, None),
    )

    current_dim = 1
    for key, kernel_cls, kernel_name, extra, fixed_width in kernel_specs:
        if key not in feature_sets:
            continue

        values = feature_sets[key].values
        width = values.shape[1] if fixed_width is None else fixed_width

        # Pass a concrete list rather than a lazy range object.
        dims = list(range(current_dim, current_dim + width))
        kern += kernel_cls(width, active_dims=dims, name=kernel_name, **extra)

        X = np.concatenate((X, values), axis=1)
        current_dim += width
        # Guards against a feature set wider than its kernel expects.
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    kern += Bias(X.shape[1])

    if learn_options["warpedGP"]:
        m = WarpedGP(X[train], y[train], kernel=kern)
    else:
        m = GPRegression(X[train], y[train], kernel=kern)

    m.optimize_restarts(3)
    y_pred, _ = m.predict(X[test])

    # TODO add offset such that low scores are around 0 (not -4 or so)

    return y_pred, m[:]