def positive_definite_kernel(kernel_config, data=None):
    """
    Return (kernel, kernel_batch).

    kernel_batch:
        cf. sklearn.metrics - Pairwise metrics, Affinities and Kernels
        <http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise>
        <http://scikit-learn.org/stable/modules/metrics.html>
        cf. scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>
    """
    kernel_type = kernel_config[0]

    if kernel_type == 'Linear':
        return inner_product, None
    elif kernel_type == 'Cos':
        return cosine, None
    elif kernel_type == 'ReluCos':
        return relu_cosine, None
    elif kernel_type == 'Gaussian':
        bw_param = kernel_config[1]
        if bw_param == 'scott':
            if len(kernel_config) > 2:
                scale = float(kernel_config[2])
                sigma_vec = bandwidths.bw_scott(data) * scale
            else:
                sigma_vec = bandwidths.bw_scott(data)
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)
        elif bw_param == 'silverman':
            sigma_vec = bandwidths.bw_silverman(data)
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)
        else:
            sigma = float(kernel_config[1])
            gamma = 1.0 / (2.0 * sigma ** 2)
            return gaussian(sigma), functools.partial(
                sklearn.metrics.pairwise.rbf_kernel, gamma=gamma)
    elif kernel_type == 'Laplacian':
        gamma = float(kernel_config[1])
        return laplacian(gamma), functools.partial(
            sklearn.metrics.pairwise.laplacian_kernel, gamma=gamma)
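A hedged usage sketch, not from the original module: the config tuples below are inferred from the branches above, and the helper kernels (`inner_product`, `gaussian`, `gaussian_pairwise`, etc.) are assumed to be defined elsewhere in the same module.

# Hypothetical usage of positive_definite_kernel, assuming the module-level helpers exist.
import numpy as np

X = np.random.randn(200, 3)

k_lin, _ = positive_definite_kernel(('Linear',))                        # fixed linear kernel
k_rbf, k_rbf_batch = positive_definite_kernel(('Gaussian', '0.5'))      # explicit sigma
k_sct, k_sct_batch = positive_definite_kernel(('Gaussian', 'scott'), data=X)  # Scott's rule

# k_rbf_batch(X, X) would return the full (200, 200) Gram matrix via sklearn's rbf_kernel.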
def _univariate_kdeplot(data, scale=None, shade=False, kernel="gaussian",
                        bw="scott", gridsize=100, cut=3, clip=None, legend=True,
                        ax=None, orientation="vertical", **kwargs):
    if ax is None:
        ax = plt.gca()

    if clip is None:
        clip = (-np.inf, np.inf)

    scaled_data = scale(data)

    # mask out the data that's not in the scale domain
    scaled_data = scaled_data[~np.isnan(scaled_data)]

    if kernel not in ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']:
        raise util.CytoflowOpError(None,
                                   "kernel must be one of ['gaussian'|'tophat'|'epanechnikov'|"
                                   "'exponential'|'linear'|'cosine']")

    if bw == 'scott':
        bw = bw_scott(scaled_data)
    elif bw == 'silverman':
        bw = bw_silverman(scaled_data)
    elif not isinstance(bw, float):
        raise util.CytoflowViewError(None,
                                     "Bandwidth must be 'scott', 'silverman' or a float")

    support = _kde_support(scaled_data, bw, gridsize, cut, clip)[:, np.newaxis]
    kde = KernelDensity(kernel=kernel, bandwidth=bw).fit(scaled_data[:, np.newaxis])
    log_density = kde.score_samples(support)

    x = scale.inverse(support[:, 0])
    y = np.exp(log_density)

    # Check if a label was specified in the call
    label = kwargs.pop("label", None)
    color = kwargs.pop("color", None)
    alpha = kwargs.pop("alpha", 0.25)

    # Draw the KDE plot and, optionally, shade under the curve
    if orientation == "vertical":
        ax.plot(x, y, color=color, label=label, **kwargs)
        if shade:
            ax.fill_between(x, 1e-12, y, facecolor=color, alpha=alpha)
    else:
        ax.plot(y, x, color=color, label=label, **kwargs)
        if shade:
            ax.fill_between(y, 1e-12, x, facecolor=color, alpha=alpha)

    return ax
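A hedged usage sketch, not from the cytoflow source: `IdentityScale` below is a hypothetical stand-in supplying the `__call__`/`inverse` interface the function expects, and `_kde_support` plus `util` are assumed to be importable from the same module.

# Hypothetical driver: exercise the helper with a trivial identity scale.
import numpy as np
import matplotlib.pyplot as plt

class IdentityScale:
    def __call__(self, data):
        return np.asarray(data, dtype=float)
    def inverse(self, data):
        return data

data = np.random.normal(size=1000)
ax = _univariate_kdeplot(data, scale=IdentityScale(), shade=True, color='steelblue')
plt.show()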
from scipy import stats
import numpy as np
from statsmodels.sandbox.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import (kdensity, kdensityfft)
import matplotlib.pyplot as plt

np.random.seed(12345)
obs_dist = mixture_rvs([.25, .75], size=10000, dist=[stats.norm, stats.norm],
                       kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))
#.. obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta],
#..            kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))

f_hat, grid, bw = kdensityfft(obs_dist, kernel="gauss", bw="scott")

# Check the plot
plt.figure()
plt.hist(obs_dist, bins=50, density=True, color='red')  # 'normed' was removed in matplotlib 3.x
plt.plot(grid, f_hat, lw=2, color='black')
plt.show()

# do some timings
# get bw first because they're not streamlined
from statsmodels.nonparametric import bandwidths
bw = bandwidths.bw_scott(obs_dist)

#.. timeit kdensity(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
#.. timeit kdensityfft(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
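The `#.. timeit` lines above are IPython magics. A plain-Python equivalent using the standard `timeit` module might look like the sketch below, reusing the same `obs_dist` and `bw` from the script above.

# Sketch: compare the direct and FFT-based density estimators outside IPython.
import timeit

n_runs = 10
t_direct = timeit.timeit(
    lambda: kdensity(obs_dist, kernel="gauss", bw=bw, gridsize=2**10), number=n_runs)
t_fft = timeit.timeit(
    lambda: kdensityfft(obs_dist, kernel="gauss", bw=bw, gridsize=2**10), number=n_runs)
print(f"kdensity:    {t_direct / n_runs:.4f} s per run")
print(f"kdensityfft: {t_fft / n_runs:.4f} s per run")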
def hsic_test_gamma(X, Y, bw_method='mdbs'):
    """Get the HSIC statistic and its p-value under a gamma approximation.

    Parameters
    ----------
    X, Y : array-like, shape (n_samples, n_features)
        Training data, where ``n_samples`` is the number of samples
        and ``n_features`` is the number of features.
    bw_method : str, optional (default=``mdbs``)
        The method used to calculate the bandwidth of the HSIC.

        * ``mdbs`` : Median distance between samples.
        * ``scott`` : Scott's Rule of Thumb.
        * ``silverman`` : Silverman's Rule of Thumb.

    Returns
    -------
    test_stat : float
        The HSIC statistic.
    p : float
        The HSIC p-value.
    """
    X = X.reshape(-1, 1) if X.ndim == 1 else X
    Y = Y.reshape(-1, 1) if Y.ndim == 1 else Y

    if bw_method == 'scott':
        width_x = bandwidths.bw_scott(X)
        width_y = bandwidths.bw_scott(Y)
    elif bw_method == 'silverman':
        width_x = bandwidths.bw_silverman(X)
        width_y = bandwidths.bw_silverman(Y)
    # Get kernel width as the median distance between points
    else:
        width_x = get_kernel_width(X)
        width_y = get_kernel_width(Y)

    # these are slightly biased estimates of centered Gram matrices
    K, Kc = get_gram_matrix(X, width_x)
    L, Lc = get_gram_matrix(Y, width_y)

    # test statistic m*HSICb under H1
    n = X.shape[0]
    bone = np.ones((n, 1))
    test_stat = hsic_teststat(Kc, Lc, n)

    var = (1 / 6 * Kc * Lc)**2
    # second subtracted term is bias correction
    var = 1 / n / (n - 1) * (np.sum(np.sum(var)) - np.sum(np.diag(var)))
    # variance under H0
    var = 72 * (n - 4) * (n - 5) / n / (n - 1) / (n - 2) / (n - 3) * var

    K = K - np.diag(np.diag(K))
    L = L - np.diag(np.diag(L))
    mu_X = 1 / n / (n - 1) * np.dot(bone.T, np.dot(K, bone))
    mu_Y = 1 / n / (n - 1) * np.dot(bone.T, np.dot(L, bone))
    # mean under H0
    mean = 1 / n * (1 + mu_X * mu_Y - mu_X - mu_Y)
    alpha = mean**2 / var
    # threshold for hsicArr*m
    beta = np.dot(var, n) / mean
    p = 1 - gamma.cdf(test_stat, alpha, scale=beta)[0][0]

    return test_stat, p
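A hedged usage sketch, not from the original package: it assumes the helpers referenced above (`get_kernel_width`, `get_gram_matrix`, `hsic_teststat`, `bandwidths`, and scipy's `gamma`) are available in the same namespace.

# Hypothetical call pattern: a dependent pair should give a small p-value,
# an independent pair a larger one.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2.0 * x + rng.normal(size=500)   # dependent pair
z = rng.normal(size=500)             # independent pair

stat_xy, p_xy = hsic_test_gamma(x, y, bw_method='scott')
stat_xz, p_xz = hsic_test_gamma(x, z, bw_method='mdbs')
print(stat_xy, p_xy)
print(stat_xz, p_xz)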
def _bivariate_kdeplot(x, y, xscale=None, yscale=None, shade=False,
                       bw="scott", gridsize=50, cut=3, clip=None, legend=True,
                       legend_data=None, **kwargs):
    ax = plt.gca()

    # Determine the clipping
    clip = [(-np.inf, np.inf), (-np.inf, np.inf)]

    x = xscale(x)
    y = yscale(y)

    # drop points that fall outside either scale's domain
    x_nan = np.isnan(x)
    y_nan = np.isnan(y)
    x = x[~(x_nan | y_nan)]
    y = y[~(x_nan | y_nan)]

    if bw == 'scott':
        bw_x = bw_scott(x)
        bw_y = bw_scott(y)
        bw = (bw_x + bw_y) / 2
    elif bw == 'silverman':
        bw_x = bw_silverman(x)
        bw_y = bw_silverman(y)
        bw = (bw_x + bw_y) / 2
    elif isinstance(bw, float):
        bw_x = bw_y = bw
    else:
        raise util.CytoflowViewError(None,
                                     "Bandwidth must be 'scott', 'silverman' or a float")

    kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(np.column_stack((x, y)))

    x_support = _kde_support(x, bw_x, gridsize, cut, clip[0])
    y_support = _kde_support(y, bw_y, gridsize, cut, clip[1])

    xx, yy = np.meshgrid(x_support, y_support)
    z = kde.score_samples(np.column_stack((xx.ravel(), yy.ravel())))
    z = z.reshape(xx.shape)
    z = np.exp(z)

    n_levels = kwargs.pop("n_levels", 10)
    color = kwargs.pop("color")
    kwargs['colors'] = (color, )

    x_support = xscale.inverse(x_support)
    y_support = yscale.inverse(y_support)
    xx, yy = np.meshgrid(x_support, y_support)

    contour_func = ax.contourf if shade else ax.contour
    try:
        cset = contour_func(xx, yy, z, n_levels, **kwargs)
    except ValueError as e:
        raise util.CytoflowViewError(None,
                                     "Something went wrong in {}, bandwidth = {}."
                                     .format(contour_func.__name__, bw)) from e

    num_collections = len(cset.collections)

    min_alpha = kwargs.pop("min_alpha", 0.2)
    if shade:
        min_alpha = 0
    max_alpha = kwargs.pop("max_alpha", 0.9)

    alpha = np.linspace(min_alpha, max_alpha, num=num_collections)
    for el in range(num_collections):
        cset.collections[el].set_alpha(alpha[el])

    # Label the axes
    if hasattr(x, "name") and legend:
        ax.set_xlabel(x.name)
    if hasattr(y, "name") and legend:
        ax.set_ylabel(y.name)

    # Add legend data
    if 'label' in kwargs:
        legend_data[kwargs['label']] = plt.Rectangle((0, 0), 1, 1, fc=color)

    return ax
from scipy import stats
import numpy as np
from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import (kdensity, kdensityfft)
import matplotlib.pyplot as plt

np.random.seed(12345)
obs_dist = mixture_rvs([.25, .75], size=10000, dist=[stats.norm, stats.norm],
                       kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))
#.. obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta],
#..            kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))

f_hat, grid, bw = kdensityfft(obs_dist, kernel="gauss", bw="scott")

# Check the plot
plt.figure()
plt.hist(obs_dist, bins=50, density=True, color='red')  # 'normed' was removed in matplotlib 3.x
plt.plot(grid, f_hat, lw=2, color='black')
plt.show()

# do some timings
# get bw first because they're not streamlined
from statsmodels.nonparametric import bandwidths
bw = bandwidths.bw_scott(obs_dist)

#.. timeit kdensity(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
#.. timeit kdensityfft(obs_dist, kernel="gauss", bw=bw, gridsize=2**10)
def model(self):
    # Time the modelling
    start_time = time.perf_counter()   # time.clock() was removed in Python 3.8

    # Extract dependent and independent variables
    y = self.df['impl_volatility'].values
    x = self.df[['strike_price', 'stock', 'T', 'riskfree']].values

    # Activate efficient bandwidth selection
    if self.bandwidth is None:
        self.efficient = True
        self.bandwidth = 'cv_ls'
        print('No predetermined bandwidth selected. Optimizing the bandwidth.')

    # Bandwidth defined by Scott D.W.
    elif self.bandwidth == 'bw_scott':
        self.bandwidth = bw_scott(x)
        print('Selected bandwidth: ', self.bandwidth)

    # Bandwidth defined by Silverman B.W.
    elif self.bandwidth == 'bw_silverman':
        self.bandwidth = bw_silverman(x)
        print('Selected bandwidth: ', self.bandwidth)

    # Or else use the bandwidth array supplied by the caller
    else:
        pass

    # Optimize the bandwidth selection if no other bandwidth selection method is defined.
    # See more on the statsmodels GitHub page:
    # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/nonparametric/_kernel_base.py
    defaults = EstimatorSettings(efficient=self.efficient,
                                 randomize=False,
                                 n_sub=50,
                                 n_res=50,
                                 n_jobs=0,
                                 return_only_bw=True)

    # Preprocess the data for faster computation
    x = preprocessing.normalize(x)

    # Split the data into training and testing sets for in- and out-of-sample testing
    xtrain, xtest, ytrain, ytest = train_test_split(x, y)

    # Define the regressor with continuous variables and the bandwidth selection
    reg = KernelReg(endog=ytrain,
                    exog=xtrain,
                    var_type='cccc',
                    bw=self.bandwidth,
                    defaults=defaults)

    # Fit on the test data to get an out-of-sample prediction
    pred = reg.fit(xtest)[0]

    # Report the results as RMSE and out-of-sample R^2
    print('RMSE: ', np.sqrt(mean_squared_error(ytest, pred)))
    print('Out of Sample R^2 :', r2_score(ytest, pred))
    # print('In sample ', reg.r_squared())

    # Print the computing time
    print('Estimation time: ', time.perf_counter() - start_time, "seconds")

    return reg
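A hedged usage sketch only: the class this method belongs to is not shown above, so the class name `ImpliedVolModel`, its constructor arguments, and the file name below are assumptions made for illustration. The method only requires that the object expose `df`, `bandwidth`, `efficient`, and this `model()` method.

# Hypothetical driver for the method above.
import pandas as pd

# The frame must contain impl_volatility, strike_price, stock, T and riskfree columns.
df = pd.read_csv('options.csv')                                   # hypothetical file
m = ImpliedVolModel(df=df, bandwidth='bw_scott', efficient=False)  # hypothetical class/constructor
reg = m.model()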
def getGlobalBandwidth(method, dataFrame, maxjobs=None):
    r"""
    Get a rule-of-thumb or cross-validation bandwidth.

    Returns the estimated bandwidth as a covariance matrix.

    We have no plug-in methods since statsmodels has dropped plug-in bandwidth
    selection methods because of their lack of robustness in a multivariate setting.

    Parameters
    ----------
    method (str):
        - cv_ml: cross validation maximum likelihood (statsmodels)
        - cv_ls: cross validation least squares (statsmodels)
        - cv_ls_ndim: cross validation least squares over a full bandwidth matrix
        - normal_reference: Scott's normal reference rule of thumb (statsmodels)
        - silverman: Silverman's rule of thumb (scipy)
        - scott: Scott's rule of thumb (scipy)
        - over: oversmoothed upper bound [1]_
        - rule-of-thumb: multivariate rule-of-thumb [2]_

    Returns
    -------
    (h, H_diag, H) (ndarray, ndarray, ndarray):
        - h: is the bandwidth
        - H_diag: is the diagonal covariance matrix, i.e. h^2*I
        - H: is the full covariance matrix

    Examples
    --------
    dataFrame = pd.DataFrame(np.random.normal(size=(300,2)))
    for method in ['cv_ml','cv_ls','silverman','scott']:
        print(method, getGlobalBandwidth(method, dataFrame))

    References
    ----------
    .. [1] Hansen, B.E., 2009. Lecture notes on nonparametrics. Lecture notes.
    .. [2] Terrell, G.R., 1990. The maximal smoothing principle in density estimation.
           Journal of the American Statistical Association, 85(410), pp.470-477.
           http://www.jstor.org/stable/pdf/2289786.pdf?_=1465902314892
    """
    n, d = dataFrame.shape

    if method == 'cv_ls':
        h = getCrossValidationLeastSquares(dataFrame, 1.0,
                                           bw_silverman(dataFrame).values,
                                           maxjobs=maxjobs)**0.5
    elif method == 'cv_ls_ndim':
        # rule-of-thumb initial bandwidth, then cross-validate the full matrix
        h = dataFrame.std().values * C_2_gaussian(d) * n**(-1 / (2.0 * 2.0 + d))
        H_diag = h**2
        H0 = outer(h, h) * dataFrame.corr()
        H = getCrossValidationLeastSquares(dataFrame, 1.0, H0.values,
                                           maxjobs=maxjobs)**0.5
    elif method in ['cv_ml', 'normal_reference']:
        var_type = 'c' * d
        dens_u = KDEMultivariate(data=dataFrame, var_type=var_type, bw=method)
        h = dens_u.bw
    elif method == 'silverman':
        h = bw_silverman(dataFrame).values
    elif method == 'scott':
        h = bw_scott(dataFrame).values
    elif method == 'over':
        h = dataFrame.std().values * (((d + 8.)**((d + 6.) / 2.) * pi**(d / 2.) * R_k_gaussian)
                                      / (16 * n * gamma((d + 8.) / 2.) * (d + 2.)))**(1. / (d + 4.))
    elif method == 'rule-of-thumb':
        h = dataFrame.std().values * C_2_gaussian(d) * n**(-1 / (2.0 * 2.0 + d))
    else:
        raise NotImplementedError(method)

    if method != 'cv_ls_ndim':
        H_diag = h**2
        H = outer(h, h) * dataFrame.corr().values

    return h, H_diag, H
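A hedged usage sketch based on the docstring example above: it assumes the module-level helpers and imports this function relies on (`bw_scott`, `outer`, etc.) are available.

# Hypothetical call: unpack the scalar bandwidths and the two covariance forms.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(300, 2)), columns=['x0', 'x1'])

h, H_diag, H = getGlobalBandwidth('scott', df)
print('per-dimension bandwidth h:', h)
print('diagonal covariance H_diag:', H_diag)   # h**2
print('full covariance H:\n', H)               # outer(h, h) * correlation matrix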
def estimate_sigma(
    X: np.ndarray,
    subsample: Optional[int] = None,
    method: str = "median",
    percent: Optional[float] = 0.15,
    scale: float = 1.0,
    random_state: Optional[int] = None,
    per_dimension: bool = False,
) -> float:
    """A function to provide a reasonable estimate of the sigma values for the
    RBF kernel using different methods.

    Parameters
    ----------
    X : array, (n_samples, d_dimensions)
        The data matrix to be estimated.

    subsample : int, optional (default: None)
        If given, the number of rows randomly drawn from X before estimation.

    method : str, default: 'median'
        Different methods used to estimate the sigma for the RBF kernel matrix.

        * Mean
        * Median
        * Silverman
        * Scott - very common for density estimation

    percent : float, default=0.15
        The fraction of samples used to pick the kth smallest pairwise distance;
        if None, all pairwise distances are used.

    random_state : int, (default: None)
        Controls the seed for the subsamples drawn to represent
        the data distribution.

    per_dimension : bool, default=False
        If True, estimate one sigma per feature instead of a single value.

    Returns
    -------
    sigma : float or list of float
        The estimated sigma value(s); one per feature when ``per_dimension=True``.

    Resources
    ---------
    - Original MATLAB function: https://goo.gl/xYoJce

    Information
    -----------
    Author : J. Emmanuel Johnson
    Email  : [email protected] : [email protected]
    Date   : 6 - July - 2018
    """
    X = check_array(X, ensure_2d=True)

    rng = check_random_state(random_state)

    n_samples, n_features = X.shape

    # subsample data if requested
    if subsample is not None:
        X = rng.permutation(X)[:subsample, :]

    # SILVERMAN
    if method == "silverman":
        if per_dimension is not True:
            X = X.flatten()
        sigma = bandwidths.bw_silverman(X)

    # SCOTT
    elif method == "scott":
        if per_dimension is not True:
            X = X.flatten()
        sigma = bandwidths.bw_scott(X)

    # MEAN
    elif method == "mean":
        if per_dimension is True:
            if percent is None:
                sigma = [np.mean(pdist(ifeature[:, None])) for ifeature in X.T]
            else:
                kth_sample = int(percent * n_samples)
                sigma = [
                    np.mean(np.sort(squareform(pdist(ifeature[:, None])))[:, kth_sample])
                    for ifeature in X.T
                ]
        else:
            if percent is None:
                sigma = np.mean(pdist(X))
            else:
                kth_sample = int(percent * n_samples)
                sigma = np.mean(np.sort(squareform(pdist(X)))[:, kth_sample])

    # MEDIAN
    elif method == "median":
        if per_dimension is True:
            if percent is None:
                sigma = [np.median(pdist(ifeature[:, None])) for ifeature in X.T]
            else:
                kth_sample = int(percent * n_samples)
                sigma = [
                    np.median(np.sort(squareform(pdist(ifeature[:, None])))[:, kth_sample])
                    for ifeature in X.T
                ]
        else:
            if percent is None:
                sigma = np.median(pdist(X))
            else:
                kth_sample = int(percent * n_samples)
                sigma = np.median(np.sort(squareform(pdist(X)))[:, kth_sample])

    else:
        raise ValueError('Unrecognized mode "{}".'.format(method))

    if per_dimension is True:
        msg = (f"the number of features doesn't match the number of sigmas: "
               f"{len(sigma)} =/= {n_features}")
        assert len(sigma) == n_features, msg

    return sigma
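A hedged usage sketch for the estimator above; the data and parameter choices are illustrative only, and the function's own dependencies (`check_array`, `check_random_state`, `pdist`, `squareform`, `bandwidths`) are assumed to be imported as in its source module.

# Hypothetical calls showing the scalar and the per-dimension modes.
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 3))

sigma_median = estimate_sigma(X, method="median")                    # single scalar
sigma_scott = estimate_sigma(X, method="scott", per_dimension=True)  # one value per feature
print(sigma_median)
print(sigma_scott)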