Example #1
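These snippets reference names they never import; a plausible common preamble, inferred from the calls used (not part of the original examples), is:

import numpy as np
from scipy.signal import argrelmin
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity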
    def _density_est(self, xpca):
        """
        input: 
            xpca cluster projected onto its first principal component
        output:
            True/False flag, if x has local minima True, False o/w            
        """
        if xpca.shape[0] < 3:
            return False

        kde = KernelDensity()
        h = np.std(xpca) * (4 / 3 / len(xpca)) ** (1 / 5)
        kde.set_params(bandwidth=h).fit(xpca)

        mmin, mmax = np.percentile(xpca, [20, 80])
        xdensity = np.linspace(mmin, mmax, 1000)[:, np.newaxis]  # restrict to the 0.2-0.8 quantile range
        ydensity = np.exp(kde.score_samples(xdensity))

        local_minimas_idx = argrelmin(ydensity)[0]

        return local_minimas_idx.size > 0
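The bandwidth rule used here, h = std(x) * (4 / (3 * n)) ** (1 / 5), is Silverman's rule of thumb for a Gaussian kernel in one dimension; the leading constant (4/3) ** (1/5) is the same 1.06 that appears in Example #3 below. A quick check:

import numpy as np
print((4 / 3) ** (1 / 5))  # 1.0592..., so h ≈ 1.06 * std(x) * n ** (-1/5)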
Example #2
    def _density_est(self, x):
        """
        input: 
            x n_objects x n_features ndarray
            
        output:
            xpca cluster projected onto its first principal component
            xmin x value of minimal minima of produced kernel density
            ymin y value of minimal minima of produced kernel density
        """
        if x.shape[0] < 3:
            return np.nan, np.nan, np.nan

        pca = PCA(n_components=1, random_state=self.random_state)
        kde = KernelDensity()

        xpca = pca.fit_transform(x)
        h = np.std(xpca) * (4 / 3 / len(xpca)) ** (1 / 5)
        kde.set_params(bandwidth=h).fit(xpca)

        mmin, mmax = np.percentile(xpca, [5, 95])
        xdensity = np.linspace(mmin, mmax, 1000)[:, np.newaxis]
        ydensity = np.exp(kde.score_samples(xdensity))
        # NOTE: the grid resolution of 1000 points is an arbitrary choice
        local_minimas_idx = argrelmin(ydensity)[0]
        if local_minimas_idx.size == 0:
            return xpca, np.nan, np.nan
        else:
            idx = ydensity[local_minimas_idx].argmin()
            xmin = xdensity[local_minimas_idx[idx]]
            ymin = ydensity[local_minimas_idx[idx]]

        return xpca, xmin, ymin
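The recipe shared by Examples #1, #2, and #4 can be exercised on its own: project onto the first principal component, fit a KDE with the rule-of-thumb bandwidth, evaluate it on a quantile-trimmed grid, and look for interior minima with argrelmin. A minimal sketch on synthetic bimodal data (all names below are illustrative):

import numpy as np
from scipy.signal import argrelmin
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-3, 1, (100, 2)),
                    rng.normal(+3, 1, (100, 2))])  # two well-separated blobs

xpca = PCA(n_components=1).fit_transform(x)        # project onto first PC
h = np.std(xpca) * (4 / 3 / len(xpca)) ** (1 / 5)  # rule-of-thumb bandwidth
kde = KernelDensity(bandwidth=h).fit(xpca)

mmin, mmax = np.percentile(xpca, [5, 95])          # trim the tails
xdensity = np.linspace(mmin, mmax, 1000)[:, np.newaxis]
ydensity = np.exp(kde.score_samples(xdensity))

print("has local minimum:", argrelmin(ydensity)[0].size > 0)  # True here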
Example #3
class TwoClassKDE(object):
    """Class for Kernel Density Estimator on two labels. Likelihood ratio at a point is ratio of class-1 likelihood estimate to class-0 likelihood estimate, times the class odds, where this is calculated as the posterior mean estimate under Beta(1, 1) prior, given the observations. If no points are observed for one of the classes, a default (improper) uniform prior is assumed for that class. """
    def __init__(self, **kwargs):
        """Takes same parameters as KernelDensity estimator."""
        self.kde0 = KernelDensity(**kwargs)
        self.kde1 = KernelDensity(**kwargs)
    def fit(self, X, y):
        """Fits KDE models on the data. X is array of data points, y is array of 0-1 labels."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be a vector of 0s and 1s."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            self.kde0.fit(X0)
        if (self.n1 > 0):
            self.kde1.fit(X1)
    def fit_with_optimal_bandwidth(self, X, y, gridsize = 101, dynamic_range = 100, cv = 10, verbose = 0, n_jobs = 1):
        """Determines optimal bandwidth using the following strategy: For each subset (0 or 1) of the dataset, 1) set b = 1.06 * sigma * n^(-1/5), the Silverman's rule of thumb estimate for the optimal bandwidth. sigma is the sample standard deviation of the samples after zero-centering the columns (note: ideally each column will have comparable variance), 2) set up a grid (of size gridsize) of bandwidth values to try, ranging from b / alpha to b * alpha in geometric progression, where alpha = sqrt(dynamic_range), 3) compute average likelihood of the estimator on the data using cv-fold cross-validation, 4) select the bandwidth with the highest likelihood."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be a vector of 0s and 1s."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            log_b0 = np.log(1.06) + np.log((X0 - X0.mean(axis = 0)).std()) - 0.2 * np.log(self.n0)
            bandwidths0 = np.exp(np.linspace(log_b0 - 0.5 * np.log(dynamic_range),
                                             log_b0 + 0.5 * np.log(dynamic_range), gridsize))
            grid0 = GridSearchCV(self.kde0, {'bandwidth' : bandwidths0}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid0.fit(X0)
            self.kde0 = grid0.best_estimator_
        if (self.n1 > 0):
            log_b1 = np.log(1.06) + np.log((X1 - X1.mean(axis = 0)).std()) - 0.2 * np.log(self.n1)
            bandwidths1 = np.exp(np.linspace(log_b1 - 0.5 * np.log(dynamic_range),
                                             log_b1 + 0.5 * np.log(dynamic_range), gridsize))
            grid1 = GridSearchCV(self.kde1, {'bandwidth' : bandwidths1}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid1.fit(X1)
            self.kde1 = grid1.best_estimator_
    def get_params(self, **kwargs):
        return self.kde0.get_params(**kwargs)
    def set_params(self, **params):
        self.kde0.set_params(**params)
        self.kde1.set_params(**params)
        return self
    def score_samples(self, X):
        """Evaluate the density model on the data. Returns vector of log-likelihood ratios of class 1 over class 0."""
        p1_est = (self.n1 + 1) / (self.n0 + self.n1 + 2)
        class_log_odds = np.log(p1_est) - np.log(1 - p1_est)
        scores0 = self.kde0.score_samples(X) if (self.n0 > 0) else np.zeros(len(X), dtype = float)
        scores1 = self.kde1.score_samples(X) if (self.n1 > 0) else np.zeros(len(X), dtype = float)
        return scores1 - scores0 + class_log_odds
    def score(self, X, y = None):
        """Compute the overall log-likelihood ratio under the model."""
        return self.score_samples(X).sum()
    def predict_proba(self, X):
        """Probability estimates."""
        scores = self.score_samples(X)
        p0s = 1 / (1 + np.exp(scores))
        return np.array([p0s, 1 - p0s]).transpose()
    def predict_log_proba(self, X):
        """Log of probability estimates."""
        return np.log(self.predict_proba(X))
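A minimal usage sketch on synthetic data (the constructor arguments are passed straight through to KernelDensity, so any of its keyword arguments work here):

import numpy as np

rng = np.random.default_rng(0)
X = np.concatenate([rng.normal(-2, 1, (50, 1)),   # class 0
                    rng.normal(+2, 1, (50, 1))])  # class 1
y = np.array([0] * 50 + [1] * 50)

clf = TwoClassKDE(kernel = 'gaussian')
clf.fit_with_optimal_bandwidth(X, y, gridsize = 21, cv = 5)
print(clf.predict_proba(np.array([[0.0], [3.0]])))  # rows of [P(y=0), P(y=1)]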
Example #4
    def _density_est(self, x):
        """
        input: 
            x n_objects x n_features ndarray

        output:
            xpca cluster projected onto its first principal component
            xmin x value of minimal minima of produced kernel density
            ymin y value of minimal minima of produced kernel density
        """
        if x.shape[0] < 3:
            return np.nan, np.nan, np.nan, np.nan

        xidx = 0
        xfeature = []
        xmin = np.inf
        ymin = np.inf

        kde = KernelDensity()
        for i, feature in enumerate(x.T):
            h = np.std(feature) * (4 / 3 / len(feature)) ** (1 / 5)
            kde.set_params(bandwidth=h).fit(feature[:, np.newaxis])

            mmin, mmax = np.percentile(feature, [5, 95])
            xdensity = np.linspace(mmin, mmax, 1000)[:, np.newaxis]
            ydensity = np.exp(kde.score_samples(xdensity))
            # NOTE: the grid resolution of 1000 points is an arbitrary choice
            local_minimas_idx = argrelmin(ydensity)[0]

            if local_minimas_idx.size != 0:
                _idx = ydensity[local_minimas_idx].argmin()
                _xmin = xdensity[local_minimas_idx[_idx]]
                _ymin = ydensity[local_minimas_idx[_idx]]

                if _ymin < ymin:
                    xfeature = feature
                    xmin = _xmin
                    ymin = _ymin
                    xidx = i

        if np.isinf(ymin):  # no feature produced a local minimum
            return np.nan, np.nan, np.nan, np.nan

        return xfeature[:, np.newaxis], xidx, xmin, ymin
Example #5
    def _fit(self, fit_data: np.ndarray):
        """Fit the scikit-learn KDE.

        :param fit_data: Data to fit the KDE to.
        :return: fitted KDE object
        """
        bw = 1 if self.bw is None else self.bw  # bandwidth
        fit_kws = {
            "bandwidth": bw,
            "algorithm": "auto",  # picks kd_tree or ball_tree automatically
            "kernel": self.kernel_type,
            "metric": "euclidean",  # default
            "atol": 1e-4,  # absolute tolerance of the density result
            "rtol": 0,  # relative tolerance of the density result
            "breadth_first": True,  # breadth-first tree traversal
            "leaf_size": 40,
            "metric_params": None,
        }  # kernel density estimator parameters
        kde = KernelDensity(**fit_kws)  # instantiate the estimator
        if self.grid_search and self.bw is None:
            # GridSearchCV maximizes the total log probability density under the model.
            # The data is divided into train-test splits according to the cv parameter;
            # for each bandwidth in param_grid, the model is trained on the train part
            # and scored on the test part. Scores are averaged across folds, and the
            # highest-scoring bandwidth is selected.
            grid = GridSearchCV(kde, {"bandwidth": self.bandwidth_space})  # grid search on bandwidth
            grid.fit(fit_data)
            self.bw = grid.best_params_["bandwidth"]  # keep the best bandwidth
            fit_kws["bandwidth"] = self.bw  # keep fit_kws in sync
            kde.set_params(bandwidth=self.bw)  # update the scikit-learn model

        kde.fit(fit_data)  # Fit the KDE

        return kde
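The same selection can be reproduced outside the class; a minimal sketch with an illustrative log-spaced bandwidth grid (KernelDensity.score returns the total log-likelihood, which is what GridSearchCV maximizes here):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
fit_data = rng.normal(0, 1, (200, 1))

bandwidth_space = np.logspace(-1, 1, 20)  # candidate bandwidths, 0.1 to 10
grid = GridSearchCV(KernelDensity(kernel="gaussian"),
                    {"bandwidth": bandwidth_space}, cv=5)
grid.fit(fit_data)

kde = grid.best_estimator_  # refit on the full data by default
print("best bandwidth:", grid.best_params_["bandwidth"])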