Example #1
def positive_definite_kernel(kernel_config, data=None):
    """Select a positive-definite kernel from a configuration sequence.

    cf. sklearn.metrics - Pairwise metrics, Affinities and Kernels
    cf. scipy.spatial.distance

    Parameters
    ----------
    kernel_config : sequence
        ``kernel_config[0]`` is the kernel type: ``'Linear'``, ``'Cos'``,
        ``'ReluCos'``, ``'Gaussian'`` or ``'Laplacian'``.

        * ``'Gaussian'``: ``kernel_config[1]`` is ``'scott'``,
          ``'silverman'`` or a value convertible with ``float`` that is
          used as sigma.  With ``'scott'``, an optional
          ``kernel_config[2]`` is a factor multiplied into the estimated
          bandwidth.
        * ``'Laplacian'``: ``kernel_config[1]`` is gamma (converted with
          ``float``).
    data : optional
        Samples handed to ``bandwidths.bw_scott`` / ``bw_silverman``; only
        read for the ``'scott'`` and ``'silverman'`` settings.

    Returns
    -------
    (kernel, kernel_batch)
        The pair produced for the requested kernel type.  ``kernel_batch``
        is ``None`` for ``'Linear'``, ``'Cos'`` and ``'ReluCos'``.

    Raises
    ------
    ValueError
        If ``kernel_config[0]`` is not one of the kernel types above.
    """
    kernel_type = kernel_config[0]

    if kernel_type == 'Linear':
        return inner_product, None
    if kernel_type == 'Cos':
        return cosine, None
    if kernel_type == 'ReluCos':
        return relu_cosine, None

    if kernel_type == 'Gaussian':
        bw_param = kernel_config[1]

        if bw_param == 'scott':
            sigma_vec = bandwidths.bw_scott(data)
            # The optional third entry rescales the rule-of-thumb bandwidth;
            # without it the plain estimate is used.
            if len(kernel_config) > 2:
                sigma_vec = sigma_vec * float(kernel_config[2])
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)

        if bw_param == 'silverman':
            sigma_vec = bandwidths.bw_silverman(data)
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)

        # Anything else is taken as an explicit sigma.
        sigma = float(bw_param)
        gamma = 1.0 / (2.0 * sigma ** 2)
        return gaussian(sigma), functools.partial(
            sklearn.metrics.pairwise.rbf_kernel, gamma=gamma)

    if kernel_type == 'Laplacian':
        gamma = float(kernel_config[1])
        return laplacian(gamma), functools.partial(
            sklearn.metrics.pairwise.laplacian_kernel, gamma=gamma)

    # Fail loudly instead of returning None, which callers would only
    # notice later when unpacking the result.
    raise ValueError('Unsupported kernel type: {!r}'.format(kernel_type))
Example #2
def _univariate_kdeplot(data, scale=None, shade=False, kernel="gaussian",
        bw="scott", gridsize=100, cut=3, clip=None, legend=True,
        ax=None, orientation = "vertical", **kwargs):
    """Plot a one-dimensional kernel density estimate of ``data``.

    Parameters
    ----------
    data : array
        Values to estimate the density of.
    scale : callable
        Called as ``scale(data)`` to transform the data before the density
        is estimated; ``scale.inverse`` maps the support back for plotting.
        NOTE(review): the default of ``None`` is not usable, because the
        function calls ``scale`` unconditionally.
    shade : bool
        If true, fill the area between the curve and (almost) zero density.
    kernel : str
        One of ``'gaussian'``, ``'tophat'``, ``'epanechnikov'``,
        ``'exponential'``, ``'linear'``, ``'cosine'``.
    bw : {'scott', 'silverman'} or float
        Bandwidth rule or explicit bandwidth.
    gridsize, cut, clip
        Forwarded to ``_kde_support``; ``clip`` defaults to
        ``(-inf, inf)``.
    legend : bool
        Not used by this function.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to ``plt.gca()``.
    orientation : str
        ``"vertical"`` puts the density on the y axis; any other value
        puts the density on the x axis.
    **kwargs
        ``label``, ``color`` and ``alpha`` (default 0.25, used for the
        shading) are consumed here; the rest goes to ``ax.plot``.

    Returns
    -------
    The axes that were drawn on.

    Raises
    ------
    util.CytoflowOpError
        If ``kernel`` is not one of the supported names.
    util.CytoflowViewError
        If ``bw`` is neither ``'scott'``, ``'silverman'`` nor a float.
    """
    if ax is None:
        ax = plt.gca()
    if clip is None:
        clip = (-np.inf, np.inf)

    scaled_data = scale(data)
    # mask out the data that's not in the scale domain
    scaled_data = scaled_data[~np.isnan(scaled_data)]

    if kernel not in ['gaussian','tophat','epanechnikov','exponential','linear','cosine']:
        raise util.CytoflowOpError(None,
                                   "kernel must be one of ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']")

    if bw == 'scott':
        bw = bw_scott(scaled_data)
    elif bw == 'silverman':
        bw = bw_silverman(scaled_data)
    elif not isinstance(bw, float):
        raise util.CytoflowViewError(None,
                                     "Bandwith must be 'scott', 'silverman' or a float")

    # Evaluate the density on a grid in scaled space ...
    support = _kde_support(scaled_data, bw, gridsize, cut, clip)[:, np.newaxis]
    kde = KernelDensity(kernel = kernel, bandwidth = bw).fit(scaled_data[:, np.newaxis])
    log_density = kde.score_samples(support)

    # ... and map the grid back through the inverse scale for plotting.
    x = scale.inverse(support[:, 0])
    y = np.exp(log_density)

    # Check if a label was specified in the call
    label = kwargs.pop("label", None)
    color = kwargs.pop("color", None)
    alpha = kwargs.pop("alpha", 0.25)

    # Draw the KDE plot and, optionally, shade.  Exactly one orientation is
    # drawn per call.
    if orientation == "vertical":
        ax.plot(x, y, color=color, label=label, **kwargs)
        if shade:
            ax.fill_between(x, 1e-12, y, facecolor=color, alpha=alpha)
    else:
        ax.plot(y, x, color=color, label=label, **kwargs)
        if shade:
            # With the density on the x axis the shaded band runs
            # horizontally from ~0 to the density at each data value.
            ax.fill_betweenx(x, 1e-12, y, facecolor=color, alpha=alpha)

    return ax
Example #3
from scipy import stats
import numpy as np
from statsmodels.sandbox.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import (kdensity, kdensityfft)
import matplotlib.pyplot as plt

# Draw a sample from a two-component normal mixture (weights .25 / .75,
# centred at -1 and 1).  ``size`` sets the number of draws.
obs_dist = mixture_rvs([.25, .75],
                       size=10000,
                       dist=[stats.norm, stats.norm],
                       kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))
#.. obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta],
#..            kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))

# FFT-based density estimate with Scott's bandwidth rule.
# NOTE(review): confirm that the installed statsmodels accepts "gauss" as a
# kernel name here.
f_hat, grid, bw = kdensityfft(obs_dist, kernel="gauss", bw="scott")

# Check the plot

# ``density=True`` normalises the histogram so it is comparable to f_hat
# (it replaces the ``normed`` keyword that matplotlib removed).
plt.hist(obs_dist, bins=50, density=True, color='red')
plt.plot(grid, f_hat, lw=2, color='black')

# do some timings
# get bw first because they're not streamlined
from statsmodels.nonparametric import bandwidths
bw = bandwidths.bw_scott(obs_dist)

#.. timeit kdensity(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
#.. timeit kdensityfft(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
Example #4
def hsic_test_gamma(X, Y, bw_method='mdbs'):
    """Get the HSIC statistic and its p-value from a gamma approximation.

    Parameters
    ----------
    X, Y : array-like, shape (n_samples, n_features)
        Training data, where ``n_samples`` is the number of samples
        and ``n_features`` is the number of features.  One-dimensional
        input is reshaped to a single column.

    bw_method : str, optional (default=``mdbs``)
        The method used to calculate the bandwidth of the HSIC.

        * ``mdbs`` : Median distance between samples.
        * ``scott`` : Scott's Rule of Thumb.
        * ``silverman`` : Silverman's Rule of Thumb.

        Any value other than ``scott`` or ``silverman`` selects the
        median-distance rule.

    Returns
    -------
    test_stat : float
        the HSIC statistic.

    p : float
        the HSIC p-value.
    """
    X = X.reshape(-1, 1) if X.ndim == 1 else X
    Y = Y.reshape(-1, 1) if Y.ndim == 1 else Y

    if bw_method == 'scott':
        width_x = bandwidths.bw_scott(X)
        width_y = bandwidths.bw_scott(Y)
    elif bw_method == 'silverman':
        width_x = bandwidths.bw_silverman(X)
        width_y = bandwidths.bw_silverman(Y)
    else:
        # Set kernel width to median distance between points
        width_x = get_kernel_width(X)
        width_y = get_kernel_width(Y)

    # these are slightly biased estimates of centered gram matrices
    K, Kc = get_gram_matrix(X, width_x)
    L, Lc = get_gram_matrix(Y, width_y)

    # test statistic m*HSICb under H1
    n = X.shape[0]
    bone = np.ones((n, 1))
    test_stat = hsic_teststat(Kc, Lc, n)

    var = (1 / 6 * Kc * Lc)**2
    # second subtracted term is bias correction
    var = 1 / n / (n - 1) * (np.sum(var) - np.sum(np.diag(var)))
    # variance under H0
    var = 72 * (n - 4) * (n - 5) / n / (n - 1) / (n - 2) / (n - 3) * var

    # Drop the diagonals before averaging the off-diagonal kernel entries.
    K = K - np.diag(np.diag(K))
    L = L - np.diag(np.diag(L))
    mu_X = 1 / n / (n - 1) * np.dot(bone.T, np.dot(K, bone))
    mu_Y = 1 / n / (n - 1) * np.dot(bone.T, np.dot(L, bone))
    # mean under H0
    mean = 1 / n * (1 + mu_X * mu_Y - mu_X - mu_Y)

    # Gamma shape (alpha) and scale (beta) matched to the mean and variance
    # computed above.
    alpha = mean**2 / var
    # threshold for hsicArr*m
    beta = np.dot(var, n) / mean
    # mu_X / mu_Y are 1x1 arrays, so the cdf result is indexed down to a
    # scalar.
    p = 1 - gamma.cdf(test_stat, alpha, scale=beta)[0][0]

    return test_stat, p
Example #5
def _bivariate_kdeplot(x, y, xscale=None, yscale=None, shade=False,
                       bw="scott", gridsize=50, cut=3, clip=None, legend=True,
                       legend_data = None, **kwargs):
    """Draw a two-dimensional Gaussian KDE of ``(x, y)`` as contours.

    Parameters
    ----------
    x, y : arrays
        Paired observations.  Pairs where either transformed value is NaN
        are dropped.
    xscale, yscale : callables
        Called on ``x`` / ``y`` before the density is estimated; their
        ``inverse`` maps the evaluation grid back for plotting.
    shade : bool
        Use filled contours (``contourf``) instead of contour lines.
    bw : {'scott', 'silverman'} or float
        Bandwidth rule or explicit bandwidth.  For the two rules the
        per-axis estimates are averaged for the joint KDE.
    gridsize, cut
        Forwarded to ``_kde_support`` for each axis.
    clip
        NOTE(review): accepted but ignored; the support is never clipped.
    legend : bool
        Whether to label the axes from ``x.name`` / ``y.name`` when those
        attributes exist.
    legend_data : dict, optional
        If given and a ``label`` keyword is present, receives a colour
        patch keyed by that label.
    **kwargs
        ``color`` is required.  ``n_levels`` (default 10), ``min_alpha``
        (default 0.2, forced to 0 when shading) and ``max_alpha``
        (default 0.9) are consumed here; the rest goes to the contour
        call.

    Returns
    -------
    The current axes (``plt.gca()``).

    Raises
    ------
    util.CytoflowViewError
        If ``bw`` is invalid or the contour call raises ``ValueError``.
    """
    ax = plt.gca()

    # Determine the clipping
    clip = [(-np.inf, np.inf), (-np.inf, np.inf)]

    x = xscale(x)
    y = yscale(y)

    # Keep only the pairs that are valid on both scales.
    keep = ~(np.isnan(x) | np.isnan(y))
    x = x[keep]
    y = y[keep]

    if bw == 'scott':
        bw_x = bw_scott(x)
        bw_y = bw_scott(y)
        bw = (bw_x + bw_y) / 2
    elif bw == 'silverman':
        bw_x = bw_silverman(x)
        bw_y = bw_silverman(y)
        bw = (bw_x + bw_y) / 2
    elif isinstance(bw, float):
        bw_x = bw_y = bw
    else:
        raise util.CytoflowViewError(None,
                                     "Bandwith must be 'scott', 'silverman' or a float")

    kde = KernelDensity(bandwidth = bw, kernel = 'gaussian').fit(np.column_stack((x, y)))

    # Evaluate the density on a grid in scaled space.
    x_support = _kde_support(x, bw_x, gridsize, cut, clip[0])
    y_support = _kde_support(y, bw_y, gridsize, cut, clip[1])
    xx, yy = np.meshgrid(x_support, y_support)
    z = kde.score_samples(np.column_stack((xx.ravel(), yy.ravel())))
    z = np.exp(z.reshape(xx.shape))

    # Consume every option meant for this function before forwarding the
    # remainder to matplotlib, so the contour call never sees them.
    n_levels = kwargs.pop("n_levels", 10)
    color = kwargs.pop("color")
    min_alpha = kwargs.pop("min_alpha", 0.2)
    if shade:
        min_alpha = 0
    max_alpha = kwargs.pop("max_alpha", 0.9)
    kwargs['colors'] = (color, )

    # Plot in data space: map the grid back through the inverse scales.
    x_support = xscale.inverse(x_support)
    y_support = yscale.inverse(y_support)
    xx, yy = np.meshgrid(x_support, y_support)

    contour_func = ax.contourf if shade else ax.contour
    try:
        cset = contour_func(xx, yy, z, n_levels, **kwargs)
    except ValueError as e:
        raise util.CytoflowViewError(None,
                                     "Something went wrong in {}, bandwidth = {}.  "
                                     .format(contour_func.__name__, bw)) from e

    # Fade the contour levels from min_alpha to max_alpha.
    num_collections = len(cset.collections)
    alpha = np.linspace(min_alpha, max_alpha, num = num_collections)
    for el in range(num_collections):
        cset.collections[el].set_alpha(alpha[el])

    # Label the axes
    if hasattr(x, "name") and legend:
        ax.set_xlabel(x.name)
    if hasattr(y, "name") and legend:
        ax.set_ylabel(y.name)

    # Add legend data
    if legend_data is not None and 'label' in kwargs:
        legend_data[kwargs['label']] = plt.Rectangle((0, 0), 1, 1, fc = color)

    return ax
Example #6
from scipy import stats
import numpy as np
from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import (kdensity, kdensityfft)
import matplotlib.pyplot as plt

# Draw 10000 samples from a two-component normal mixture (weights .25 / .75,
# centred at -1 and 1).
obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.norm],
                kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))
#.. obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta],
#..            kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))

# FFT-based density estimate with Scott's bandwidth rule.
# NOTE(review): confirm that the installed statsmodels accepts "gauss" as a
# kernel name here.
f_hat, grid, bw = kdensityfft(obs_dist, kernel="gauss", bw="scott")

# Check the plot

# ``density=True`` normalises the histogram so it is comparable to f_hat
# (it replaces the ``normed`` keyword that matplotlib removed).
plt.hist(obs_dist, bins=50, density=True, color='red')
plt.plot(grid, f_hat, lw=2, color='black')

# do some timings
# get bw first because they're not streamlined
from statsmodels.nonparametric import bandwidths
bw = bandwidths.bw_scott(obs_dist)

#.. timeit kdensity(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
#.. timeit kdensityfft(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
Example #7
    def model(self):
        """Fit a kernel regression of implied volatility and report its fit.

        Reads ``self.df`` (columns ``impl_volatility``, ``strike_price``,
        ``stock``, ``T``, ``riskfree``) and ``self.bandwidth``.  The
        bandwidth may be ``None`` (switch to least-squares cross-validation
        with the efficient estimator), ``'bw_scott'`` / ``'bw_silverman'``
        (rule of thumb computed from the regressors), or a user-supplied
        value that is passed through unchanged.  ``self.bandwidth`` (and,
        for ``None``, ``self.efficient``) is updated in place.

        Prints the RMSE and out-of-sample R^2 on a held-out split, plus the
        elapsed time.

        Returns
        -------
        The fitted ``KernelReg`` object.
        """
        # Time the modelling.  ``time.clock`` no longer exists on current
        # Python versions, so use the monotonic performance counter.
        start_time = time.perf_counter()

        # Extract dependent and independent variables
        y = self.df['impl_volatility'].values
        x = self.df[['strike_price', 'stock', 'T', 'riskfree']].values

        # Only compare against the rule names when the setting is a string:
        # a user-supplied bandwidth array must not be compared elementwise.
        rule = self.bandwidth if isinstance(self.bandwidth, str) else None

        # Activate efficient bandwidth selection
        if self.bandwidth is None:
            self.efficient = True
            self.bandwidth = 'cv_ls'
            print(
                'No predetermined bandwidth selected. Looking for optimizng the bandwidth'
            )

        # Bandwidth defined by Scott D.W.
        elif rule == 'bw_scott':
            self.bandwidth = bw_scott(x)
            print('Selected bandwidth: ', self.bandwidth)

        # Bandwidth defined by Silverman B.W.
        elif rule == 'bw_silverman':
            self.bandwidth = bw_silverman(x)
            print('Selected bandwidth: ', self.bandwidth)

        # Or else keep the user's own bandwidth for the array.

        # NOTE(review): the rule-of-thumb bandwidths above are computed on
        # the raw regressors, but the regression below runs on normalised
        # ones - confirm this is intended.

        # Optimize the bandwidth selection if no other bandwidth selection
        # method is defined.
        # NOTE(review): only the ``efficient`` setting is visible in this
        # file; all other estimator settings are left at their defaults.
        defaults = EstimatorSettings(
            efficient=getattr(self, 'efficient', False))

        # Preprocess the data for faster computation
        x = preprocessing.normalize(x)

        # Split the data into training and testing data for in and out of
        # sample testing
        xtrain, xtest, ytrain, ytest = train_test_split(x, y)

        # Define the regressor, with continuous variables and the bandwidth
        # selection.
        # NOTE(review): arguments after ``endog`` are reconstructed from
        # the surrounding code (four continuous regressors, the bandwidth
        # chosen above, the settings built above) - confirm.
        reg = KernelReg(endog=ytrain,
                        exog=xtrain,
                        var_type='cccc',
                        bw=self.bandwidth,
                        defaults=defaults)

        # Fit the data onto the test data to get an out of sample prediction
        pred = reg.fit(xtest)[0]

        # Get the results from the test in the form of RMSE and out of
        # sample R^2
        print('RMSE: ', np.sqrt(mean_squared_error(ytest, pred)))
        print('Out of Sample  R^2 :', r2_score(ytest, pred))
        #print ('In sample ' , reg.r_squared())

        # Print the computing time
        print('Estimation time: ', time.perf_counter() - start_time, "seconds")

        return reg
Example #8
# Computes a bandwidth ``h`` for ``dataFrame`` according to ``method`` and
# returns it together with ``H_diag`` and ``H`` (see the final return).
# NOTE(review): the descriptive text directly below the signature is not
# enclosed in docstring quotes in this file, and two calls further down are
# cut off, so this function does not parse as it stands.
# NOTE(review): ``maxjobs`` is not referenced in any visible line; presumably
# it belonged to the truncated getCrossValidationLeastSquares calls - confirm.
def getGlobalBandwidth(method, dataFrame, maxjobs=None):
    Get Rule of thumb, Cross validation or Plug-in Bandwidth
    Returns estimated bandwidth as covariance matrix.
    We have no plug-in methods since statsmodels has droped plug-in 
    bandwidth selection methods because of their lack of robustness in a 
    multivariate setting.
    method (str): 
        - cv_ml: cross validation maximum likelihood (statsmodels)
        - cv_ls: cross validation least squares (statsmodels)
        - normal_reference: Scott's normal reference rule of thumb (statsmodels)
        - silverman: Silverman's rule of thumb (scipy)
        - scott: Scott's rule of thumb (scipy)
        - over: oversmoothed upper bound [1]_
        - rule-of-thumb: multivariate rule-of-thumb [2]_
    (h, H_diag, H) (ndarray, ndarray, ndarray):
        - h: is the bandwidth
        - H_diag: is the diagonal covariance matrix ie. h^2*I
        - H: is the full covariance matrix
    dataFrame = pd.DataFrame(np.random.normal(size=(300,2)))
    for method in ['cv_ml','cv_ls','silverman','scott']:
        print(method, getGlobalBandwidth(method, dataFrame))
    .. [1] Hansen, B.E., 2009. Lecture notes on nonparametrics. Lecture notes.
    .. [2] Terrell, G.R., 1990. The maximal smoothing principle in density estimation. Journal of the American Statistical Association, 85(410), pp.470-477. http://www.jstor.org/stable/pdf/2289786.pdf?_=1465902314892
    # n rows and d columns of the input (presumably samples x dimensions).
    n, d = dataFrame.shape
    if method == 'cv_ls':
        # NOTE(review): this call is cut off after its first argument; the
        # remaining arguments are not present in this file.
        h = getCrossValidationLeastSquares(dataFrame,
    # NOTE(review): 'cv_ls_ndim' is handled here but is not listed among the
    # methods described above.
    elif method == 'cv_ls_ndim':
        # Same formula for h as the 'rule-of-thumb' branch below; H0 is
        # built from it and the column correlations.
        h = dataFrame.std().values * C_2_gaussian(d) * n**(-1 /
                                                           (2.0 * 2.0 + d))
        H_diag = h**2
        H0 = outer(h, h) * dataFrame.corr()
        # NOTE(review): this call is also cut off after its first argument;
        # H0 is not used in any visible line, so it presumably was one of
        # the missing arguments - confirm.
        H = getCrossValidationLeastSquares(dataFrame,
    elif method in ['cv_ml', 'normal_reference']:
        # One 'c' per column is passed as var_type.
        var_type = 'c' * d
        dens_u = KDEMultivariate(data=dataFrame, var_type=var_type, bw=method)
        h = dens_u.bw
    elif method == 'silverman':
        h = bw_silverman(dataFrame).values
    elif method == 'scott':
        h = bw_scott(dataFrame).values
    elif method == 'over':
        h = dataFrame.std().values * (((d + 8.)**(
            (d + 6.) / 2.) * pi**(d / 2.) * R_k_gaussian) / (16 * n * gamma(
                (d + 8.) / 2.) * (d + 2.)))**(1. / (d + 4.))
    elif method == 'rule-of-thumb':
        h = dataFrame.std().values * C_2_gaussian(d) * n**(-1 /
                                                           (2.0 * 2.0 + d))
        # NOTE(review): as written this raise runs right after the
        # assignment above, so 'rule-of-thumb' always fails; an ``else:``
        # line before it appears to be missing.
        raise NotImplementedError(method)
    # Every method except 'cv_ls_ndim' derives H_diag and H from h here.
    if method != 'cv_ls_ndim':
        H_diag = h**2
        H = outer(h, h) * dataFrame.corr().values
    return h, H_diag, H
Example #9
def _distance_sigma(X, reducer, percent):
    """Reduce the pairwise distances between the rows of ``X`` to one value.

    With ``percent`` set, each row's distances are sorted and the column at
    index ``int(percent * n_rows)`` is reduced; otherwise all pairwise
    distances are reduced directly.
    """
    if percent is None:
        return reducer(pdist(X))
    n_rows = X.shape[0]
    # Index into the data actually used, and stay inside the matrix even
    # for percent == 1.0.
    kth_sample = min(int(percent * n_rows), n_rows - 1)
    return reducer(np.sort(squareform(pdist(X)))[:, kth_sample])


def estimate_sigma(
    X: np.ndarray,
    subsample: Optional[int] = None,
    method: str = "median",
    percent: Optional[float] = 0.15,
    scale: float = 1.0,
    random_state: Optional[int] = None,
    per_dimension: bool = False,
) -> float:
    """A function to provide a reasonable estimate of the sigma values
    for the RBF kernel using different methods.

    Parameters
    ----------
    X : array, (n_samples, d_dimensions)
        The data matrix to be estimated.
    subsample : int, optional
        If given, only this many randomly permuted rows of ``X`` are used.
    method : str, default: 'median'
        different methods used to estimate the sigma for the rbf kernel
        * mean
        * median
        * silverman
        * scott - very common for density estimation
    percent : float, default=0.15
        The kth percentage of distance chosen (``mean`` / ``median`` only).
        ``None`` uses all pairwise distances instead.
    scale : float, default=1.0
        NOTE(review): accepted but not used by this function.
    random_state : int, (default: None)
        controls the seed for the subsamples drawn to represent
        the data distribution
    per_dimension : bool, default=False
        If True, estimate one sigma per feature.  Otherwise a single
        estimate is computed (for ``silverman`` / ``scott`` on the
        flattened data).

    Returns
    -------
    sigma : float
        The estimated sigma value; one value per feature when
        ``per_dimension`` is True.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.

    Resources
    ---------
    - Original MATLAB function: https://goo.gl/xYoJce

    Information
    -----------
    Author : J. Emmanuel Johnson
    Email  : [email protected]
           : [email protected]
    Date   : 6 - July - 2018
    """
    X = check_array(X, ensure_2d=True)

    rng = check_random_state(random_state)

    # subsample data
    if subsample is not None:
        X = rng.permutation(X)[:subsample, :]

    n_features = X.shape[1]

    if method in ("silverman", "scott"):
        # Without per-dimension estimates the rule of thumb is applied to
        # all values at once.
        if per_dimension is not True:
            X = X.flatten()
        if method == "silverman":
            sigma = bandwidths.bw_silverman(X)
        else:
            sigma = bandwidths.bw_scott(X)

    elif method in ("mean", "median"):
        reducer = np.mean if method == "mean" else np.median
        if per_dimension is True:
            # Treat every feature as its own single-column data set.
            sigma = [
                _distance_sigma(ifeature[:, None], reducer, percent)
                for ifeature in X.T
            ]
        else:
            sigma = _distance_sigma(X, reducer, percent)

    else:
        raise ValueError('Unrecognized mode "{}".'.format(method))

    if per_dimension is True:
        msg = f"the number of features doesn't match the number of sigmas: {len(sigma)} =/= {n_features}"
        assert len(sigma) == n_features, msg

    return sigma