Example #1
def get_univariate_dist(
    data, kernel="gau", fft=True, bw="scott", gridsize=100, cut=3, clip=None
):
    kde = smnp.KDEUnivariate(data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    grid, y = kde.support, kde.density
    return grid, y
Example #2
def _statsmodels_univariate_kde(data,
                                kernel,
                                bw,
                                gridsize,
                                cut,
                                clip,
                                cumulative=False):
    """Compute a univariate kernel density estimate using statsmodels."""
    # statsmodels 0.8 fails on int type data
    data = data.astype(np.float64)

    fft = kernel == "gau"
    kde = smnp.KDEUnivariate(data)

    try:
        kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    except RuntimeError as err:  # GH#1990
        if stats.iqr(data) > 0:
            raise err
        msg = "Default bandwidth for data is 0; skipping density estimation."
        warnings.warn(msg, UserWarning)
        return np.array([]), np.array([])

    if cumulative:
        grid, y = kde.support, kde.cdf
    else:
        grid, y = kde.support, kde.density
    return grid, y
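A minimal usage sketch for the helper above, assuming the imports the excerpt relies on (warnings, numpy, scipy.stats, and statsmodels.nonparametric.api as smnp):

import warnings
import numpy as np
from scipy import stats
import statsmodels.nonparametric.api as smnp

# Draw a sample and evaluate the density on the default Scott-bandwidth grid.
rng = np.random.default_rng(0)
sample = rng.normal(size=500)
grid, density = _statsmodels_univariate_kde(
    sample, kernel="gau", bw="scott", gridsize=100, cut=3, clip=(-np.inf, np.inf)
)
print(grid.shape, density.shape)  # equal lengths; the FFT path may round gridsize up to a power of two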
Example #3
def kernel_density_estimation(univariate_dataset, k="gau", bw=1):
    kernel = k
    bandwidth = bw
    fft = kernel == "gau"
    kde = smnp.KDEUnivariate(univariate_dataset)
    kde.fit(kernel, bandwidth, fft)
    x, y = kde.support, kde.density
    return x, y
Example #4
def mode(data):
    """Compute a kernel density estimate and return the mode"""
    if len(np.unique(data)) == 1:
        return data[0]
    else:
        kde = smnp.KDEUnivariate(data.astype('double'))
        kde.fit(cut=0)
        grid, y = kde.support, kde.density
        return grid[y == y.max()][0]
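A quick check of the mode helper on synthetic data (a sketch; assumes numpy and smnp are imported as in the surrounding examples):

import numpy as np
import statsmodels.nonparametric.api as smnp

rng = np.random.default_rng(1)
# 80% of the mass near 0, 20% near 5: the KDE peak should sit near 0.
bimodal = np.concatenate([rng.normal(0, 1, 800), rng.normal(5, 0.5, 200)])
print(mode(bimodal))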
Example #5
def statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip):
    """Compute a univariate kernel density estimate using statsmodels."""
    if clip is None:
        clip = (-np.inf, np.inf)
    fft = kernel == "gau"
    kde = smnp.KDEUnivariate(data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    grid, y = kde.support, kde.density
    return grid, y
Example #6
    def get_ymax(self, data):
        if np.isnan(data).all():
            return 0

        kde = smnp.KDEUnivariate(data)
        kde.fit()

        maxval = np.nanmax(kde.density)
        if math.isnan(maxval):
            maxval = 0
        return maxval
Example #7
def _statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip,
                                cumulative=False):
    """Compute a univariate kernel density estimate using statsmodels."""
    fft = kernel == "gau"
    kde = smnp.KDEUnivariate(data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    if cumulative:
        grid, y = kde.support, kde.cdf
    else:
        grid, y = kde.support, kde.density
    return grid, y
Example #8
def Kde(data,
        bw=args.bw,     # assumes a module-level argparse namespace `args`
        kernel="gau",
        gridsize=100,   # gridsize should be an int, not the float 100.
        cut=args.cut,
        clip=(-np.inf, np.inf),
        cumulative=False):
    """Compute a univariate kernel density estimate using statsmodels."""
    fft = kernel == "gau"
    kde = smnp.KDEUnivariate(np.array([float(el) for el in data]))
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    return kde
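Note that the argparse-backed defaults are evaluated when the def statement runs, so a module-level `args` object must already exist when Kde is defined; a hypothetical sketch:

import numpy as np
import statsmodels.nonparametric.api as smnp
from types import SimpleNamespace

# Hypothetical stand-in for the argparse namespace this snippet assumes.
args = SimpleNamespace(bw="scott", cut=3)

# ... define Kde as above, then:
data = np.random.default_rng(2).normal(size=300)
kde = Kde(data)
print(kde.density.max())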
Example #9
    def get_ymax(self, data):
        if np.isnan(data).all():
            return 0

        # if there's just one value in the data, fit breaks
        uniq_values = data.unique()
        if len(uniq_values) == 1:
            return uniq_values[0]

        kde = smnp.KDEUnivariate(data)
        kde.fit()

        maxval = np.nanmax(kde.density)
        if math.isnan(maxval):
            maxval = 0
        return maxval
Example #10
def _univariate_kdeplot(data,
                        scale=None,
                        shade=False,
                        kernel="gau",
                        bw="scott",
                        gridsize=100,
                        cut=3,
                        clip=None,
                        legend=True,
                        cumulative=False,
                        shade_lowest=True,
                        ax=None,
                        **kwargs):

    if ax is None:
        ax = plt.gca()

    if clip is None:
        clip = (-np.inf, np.inf)

    # `scale` is expected to be a callable transform object exposing an
    # inverse() method; it must not be left as None.
    scaled_data = scale(data)

    # mask out the data that's not in the scale domain
    scaled_data = scaled_data[~np.isnan(scaled_data)]

    # Calculate the KDE
    fft = (kernel == "gau")
    kde = smnp.KDEUnivariate(scaled_data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)

    x, y = scale.inverse(kde.support), kde.density

    # Make sure the density is nonnegative
    y = np.amax(np.c_[np.zeros_like(y), y], axis=1)

    # Check if a label was specified in the call
    label = kwargs.pop("label", None)

    color = kwargs.pop("color", None)

    # Draw the KDE plot and, optionally, shade
    ax.plot(x, y, color=color, label=label, **kwargs)
    alpha = kwargs.get("alpha", 0.25)
    if shade:
        ax.fill_between(x, 1e-12, y, facecolor=color, alpha=alpha)

    return ax
Example #11
def get_kde_threshold(array):
    dens = smnp.KDEUnivariate(array)
    dens.fit(gridsize=np.max(array).astype(int), bw=2000)
    x, y = dens.support, dens.density
    peaks = find_peaks(y)
    peaks = peaks[0]
    # indices of the two highest peaks, tallest first
    highest_peaks = peaks[y[peaks].argsort()[-2:][::-1]]
    try:
        thresh = (x[highest_peaks[0]] -
                  x[highest_peaks[1]]) / 4 + x[highest_peaks[1]]
    except IndexError:  # fewer than two peaks were found
        thresh = np.min(array)
    # The threshold sits a quarter of the way from the lower peak toward the
    # higher one; this assumes the distribution of rotated-rectangle areas has
    # a small peak followed by a large peak.

    return thresh
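A sketch of calling it (assumes numpy, scipy.signal.find_peaks, and smnp are imported; the bimodal sample stands in for rotated-rectangle areas):

import numpy as np
from scipy.signal import find_peaks
import statsmodels.nonparametric.api as smnp

rng = np.random.default_rng(4)
areas = np.concatenate([rng.normal(5_000, 500, 200),
                        rng.normal(20_000, 2_000, 800)])
print(get_kde_threshold(areas))  # falls between the small and the large peak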
Example #12
def kde_sm(data,
           kernel='gau',
           bw='scott',
           gridsize=None,
           cut=3,
           clip=(-np.inf, np.inf),
           cumulative=False):
    import statsmodels.nonparametric.api as smnp

    fft = kernel == 'gau'
    kde = smnp.KDEUnivariate(data)
    # noinspection PyTypeChecker
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    if cumulative:
        grid, y = kde.support, kde.cdf
    else:
        grid, y = kde.support, kde.density

    return pd.Series(y, index=grid)
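Returning a Series keyed by the grid makes downstream use convenient; a usage sketch (assumes numpy and pandas are imported):

import numpy as np
import pandas as pd

s = kde_sm(np.random.default_rng(5).normal(size=400))
s.plot()           # density curve with the grid on the x-axis
print(s.idxmax())  # grid point where the density peaks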
Example #13
def plot_kde(array, path: str, animal_name):
    dens = smnp.KDEUnivariate(array)
    dens.fit(gridsize=2000, bw=2000)
    x, y = dens.support, dens.density

    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    thresh = utils.get_kde_threshold(array)
    sns.set_style('whitegrid')

    fig = plt.figure(figsize=(10, 10))
    plt.plot(x, y)
    plt.axvline(thresh,
                linestyle='-.',
                color='red',
                label=f'threshold={thresh // 1}')
    plt.xlabel('Rotated Rectangle Area')
    plt.ylabel('Kernel Density')
    plt.title('Rotated Rectangle Area KDE')
    plt.legend()
    fig.savefig(path + '/' + animal_name + '_eyelid_density.pdf')
Example #14
    def kde(
            self,
            data,
            gridsize=10,
            fft=True,
            kernel="gau",
            bw="scott",
            cut=3,
            clip=(-np.inf, np.inf),
    ):
        if bw == "scott":
            bw = stats.gaussian_kde(data).scotts_factor() * data.std(ddof=1)

        kde = smnp.KDEUnivariate(data)

        # Create the grid for the estimate.  Note: the min(..., 0) clamp on
        # the lower bound can desynchronize this grid from kde.support, so x
        # and y may not correspond point-for-point.
        support_min = min(max(data.min() - bw * cut, clip[0]), 0)
        support_max = min(data.max() + bw * cut, clip[1])
        x = np.linspace(support_min, support_max, gridsize)

        kde.fit("gau", bw, fft, gridsize=gridsize, cut=cut, clip=clip)
        y = kde.density

        return x, y
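If exact alignment between the grid and the density matters, one variant is to return the estimator's own support instead of a hand-built grid (a sketch, not the author's method):

import numpy as np
import statsmodels.nonparametric.api as smnp

def kde_on_support(data, bw="scott", gridsize=100, cut=3):
    # Let statsmodels build the grid so that x and y always align.
    est = smnp.KDEUnivariate(data)
    est.fit("gau", bw, fft=True, gridsize=gridsize, cut=cut)
    return est.support, est.density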
Example #15
def DataTransformation(Numerical_data, Time_name, Outlier_ratio=50, bw=1,
                       kernel="gau", threshold_of_the_number_of_categories=5):
    """
    Inputs:
    Numerical_data: Numerical data stored in a DataFrame.
    Time_name: Name of the time variable in the numerical data.
    Outlier_ratio: Scale factor of the outlier threshold.
    bw: Bandwidth.
    kernel: Kernel function.
    threshold_of_the_number_of_categories: Threshold of the number of categories.
    
    Outputs:
    Categorical_data: Categorical data.
    """
    
    Features = list(Numerical_data.keys())
    Features.remove(Time_name)
    Categorical_data = copy.deepcopy(Numerical_data)
    for feature in Features:
#####################Kernel density estimation#####################
        fft = "True"
        kde = smnp.KDEUnivariate([float(x) for x in Numerical_data[feature]])
        kde.fit(kernel, bw, fft)
        x, y = kde.support, kde.density

#####################Initial data classification#####################
        outlier_threshold = max(y)/Outlier_ratio #Threshold of the probability density of outliers
        
        #Obtain valley values
        valley_values = []
        for i in range(len(y)):
            if i == 0:
                if y[i] >= outlier_threshold and y[i] < y[i+1]:
                    valley_values.append(i)
            elif i == len(y)-1:
                if y[i] >= outlier_threshold and y[i] < y[i-1]:
                    valley_values.append(i)
            else:
                if y[i] >= outlier_threshold and y[i] < y[i-1] and y[i] < y[i+1]:
                    valley_values.append(i)
                if y[i] >= outlier_threshold and y[i] < y[i-1] and y[i+1] < outlier_threshold:
                    valley_values.append(i)
                if y[i] >= outlier_threshold and y[i] < y[i+1] and y[i-1] < outlier_threshold:
                    valley_values.append(i)
        
        #Obtain peak values
        peak_values = []
        for i in range(len(y)):
            if i == 0:
                if y[i] >= outlier_threshold and y[i] > y[i+1]:
                    peak_values.append(i)
            elif i == len(y)-1:
                if y[i] >= outlier_threshold and y[i] > y[i-1]:
                    peak_values.append(i)
            else:
                if y[i] >= outlier_threshold and y[i] > y[i-1] and y[i] > y[i+1]:
                    peak_values.append(i)
        
        #Obtain intervals of categories
        Intervals_of_categories = []
        for i in peak_values:
            if i == 0:
                valley = [x for x in valley_values if x > i]
                Intervals_of_categories.append([i, i, valley[0]])
            elif i == len(y)-1:
                valley = [x for x in valley_values if x < i]
                Intervals_of_categories.append([valley[-1], i, i])
            else:
                left_valley = [x for x in valley_values if x < i]
                right_valley = [x for x in valley_values if x > i]
                Intervals_of_categories.append([left_valley[-1], i, right_valley[0]])

#####################Merge categories if it is necessary#####################
        while(len(Intervals_of_categories) > threshold_of_the_number_of_categories):
            number_of_categories_old = len(Intervals_of_categories)
            minimum_interval_size = np.inf
            category_to_be_merged = None  # stays None if no mergeable interval exists
            for i in range(len(Intervals_of_categories)):
                if x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]] < minimum_interval_size:
                    if i == 0 and Intervals_of_categories[i][2] == Intervals_of_categories[i+1][0]:
                        minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                        category_to_be_merged = i
                    if i == len(Intervals_of_categories)-1 and Intervals_of_categories[i-1][2] == Intervals_of_categories[i][0]:
                        minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                        category_to_be_merged = i
                    if i != 0 and i != len(Intervals_of_categories)-1:
                        if Intervals_of_categories[i-1][2] == Intervals_of_categories[i][0] or Intervals_of_categories[i][2] == Intervals_of_categories[i+1][0]:
                            minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                            category_to_be_merged = i
            
            if category_to_be_merged is None:
                print("Error: This variable cannot be merged")
                break
            if category_to_be_merged == 0:
                if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged+1][1]]:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged+1][2]]
                else:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged+1][1], Intervals_of_categories[category_to_be_merged+1][2]]
                del Intervals_of_categories[category_to_be_merged+1]
            elif category_to_be_merged == len(Intervals_of_categories)-1:
                if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged-1][1]]:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged][2]]
                else:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged-1][1], Intervals_of_categories[category_to_be_merged][2]]
                del Intervals_of_categories[category_to_be_merged-1]     
            else:
                left_consistency_index = y[Intervals_of_categories[category_to_be_merged][1]] - y[Intervals_of_categories[category_to_be_merged][0]]
                right_consistency_index = y[Intervals_of_categories[category_to_be_merged][2]] - y[Intervals_of_categories[category_to_be_merged][1]]
                if left_consistency_index < right_consistency_index:
                    if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged-1][1]]:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged][2]]
                    else:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged-1][1], Intervals_of_categories[category_to_be_merged][2]]
                    del Intervals_of_categories[category_to_be_merged-1]  
                if left_consistency_index > right_consistency_index:
                    if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged+1][1]]:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged+1][2]]
                    else:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged+1][1], Intervals_of_categories[category_to_be_merged+1][2]]
                    del Intervals_of_categories[category_to_be_merged+1]         
            
            if len(Intervals_of_categories) == number_of_categories_old:
                print("Error: This variable cannot be merged")
                break  # avoid an infinite loop when no merge occurred
        
#####################Data transformation according to categories#####################
        Variable_ = []
        for i in range(len(Numerical_data[feature])):
            flag = 0
            for j in range(len(Intervals_of_categories)):
                if x[Intervals_of_categories[j][0]] <= Numerical_data[feature][i] and x[Intervals_of_categories[j][2]] >= Numerical_data[feature][i]:
                    Variable_.append(feature + ": " + str(round(x[Intervals_of_categories[j][0]],2))+ "-" + str(round(x[Intervals_of_categories[j][2]],2)))
                    flag = 1
                    break
            if flag == 0:
                Variable_.append(np.nan)
        Categorical_data[feature] = Variable_
    
    return Categorical_data
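A toy run of the transformation (a sketch; assumes the copy, numpy, pandas, and smnp imports, and the column names are hypothetical):

import copy
import numpy as np
import pandas as pd
import statsmodels.nonparametric.api as smnp

rng = np.random.default_rng(6)
df = pd.DataFrame({
    "Time": range(300),
    "Load": np.concatenate([rng.normal(10, 1, 150), rng.normal(30, 2, 150)]),
})
cat = DataTransformation(df, "Time")
print(cat["Load"].value_counts())  # two interval labels, e.g. "Load: 6.5-20.3"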
Example #16
def scaled_1d_kde_plot(data, shade, bandwidth='scott',
                       vertical=False, legend=False, ax=None,
                       density_scale=None, **kwargs):
    """Plot a univariate kernel density estimate on one of the axes.

    Adapted from _univariate_kdeplot in seaborn, but lets the user
    scale density estimates via density_scale.
    """
    if ax is None:
        ax = plt.gca()

    # Calculate the KDE
    kde = smnp.KDEUnivariate(data.astype('double'))
    kde.fit(bw=bandwidth)
    x, y = kde.support, kde.density

    if density_scale:
        y = density_scale * y / np.max(y)

    # Make sure the density is nonnegative
    y = np.amax(np.c_[np.zeros_like(y), y], axis=1)

    # Flip the data if the plot should be on the y axis
    if vertical:
        x, y = y, x

    # Check if a label was specified in the call
    label = kwargs.pop("label", None)

    # Otherwise check if the data object has a name
    if label is None and hasattr(data, "name"):
        label = data.name

    # Decide if we're going to add a legend
    legend = label is not None and legend
    label = "_nolegend_" if label is None else label

    # Use the active color cycle to find the plot color
    facecolor = kwargs.pop("facecolor", None)
    line, = ax.plot(x, y, **kwargs)
    color = line.get_color()
    line.remove()
    kwargs.pop("color", None)
    facecolor = color if facecolor is None else facecolor

    # Draw the KDE plot and, optionally, shade
    ax.plot(x, y, color=color, label=label, **kwargs)
    shade_kws = dict(
        facecolor=facecolor,
        alpha=kwargs.get("alpha", 0.25),
        clip_on=kwargs.get("clip_on", True),
        zorder=kwargs.get("zorder", 1),
    )
    if shade:
        if vertical:
            ax.fill_betweenx(y, 0, x, **shade_kws)
        else:
            ax.fill_between(x, 0, y, **shade_kws)

    # Set the density axis minimum to 0
    ax.set_ylim(0, auto=None)

    # Gather legend entries (note: the legend itself is never drawn here)
    handles, labels = ax.get_legend_handles_labels()

    return ax, x, y
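A usage sketch (assumes numpy, pandas, matplotlib.pyplot as plt, and smnp are imported):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

s = pd.Series(np.random.default_rng(8).normal(size=500), name="signal")
ax, x, y = scaled_1d_kde_plot(s, shade=True, density_scale=1.0)
plt.show()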
Example #17
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.25, .75],
                size=200,
                dist=[stats.norm, stats.norm],
                kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))

x.sort()  # not needed

kde = npar.KDEUnivariate(x)
kde.fit('gau')
ci = kde.kernel.density_confint(kde.density, len(x))

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.hist(x, bins=15, density=True, alpha=0.25)

ax.plot(kde.support, kde.density, lw=2, color='red')
ax.fill_between(kde.support, ci[:, 0], ci[:, 1], color='grey', alpha=0.7)
ax.set_title('Kernel Density Gaussian (bw = %4.2f)' % kde.bw)

# use all kernels directly

x_grid = np.linspace(np.min(x), np.max(x), 51)
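The excerpt stops after building x_grid. One hedged way to continue the kernel comparison without the sandbox kernel API is to refit KDEUnivariate per kernel and plot each fit (a sketch, not the original script; non-Gaussian kernels require fft=False):

for kn in ["gau", "epa", "biw", "tri"]:
    k = npar.KDEUnivariate(x)
    k.fit(kernel=kn, fft=(kn == "gau"), gridsize=len(x_grid))
    ax.plot(k.support, k.density, lw=1, label=kn)
ax.legend()
plt.show()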
Example #18
def plotDistribution(dist):
    r"""Plots the fitted PDF, KDE and CDF as well as the PDF differences between
  fits, binning and KDE.
  The figure contains additional information, such as:
    * Kolmogorov-Smirnov test statistics and P-values
    * The KDE difference defined by
      $$
        \Delta PDF(x)
        = 2*[PDF_{KDE}(x) - PDF_{FIT}(x)]/[PDF_{KDE}(x) + PDF_{FIT}(x)]
      $$
      and the integrated KDE difference is given by
      $$ \sqrt{ \int dx [\Delta PDF(x)]^2 } $$

  Parameters
  ----------
    dist : array or list, one dimensional

  Returns
  -------
    fig : 'matplotlib.figure'

  Note
  ----
    Abbreviations:
    * KDE : Kernel Density Estimate
    * PDF : Probability Density Function
    * CDF : Cumulative Distribution Function

    This routine uses seaborn to estimate the bins and KDE, scipy for the
    Kolmogorov-Smirnov test
    (https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test)
    and 'statsmodels' for estimating the KDE
    ```python
    >>> import statsmodels.nonparametric.api as smnp
    >>> kde = smnp.KDEUnivariate(data)
    >>> kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    ```
    'seaborn' itself uses 'numpy' for binning where the number of bins is
    determined by the Freedman Diaconis Estimator
    (https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html).
  """
    # Create the figure
    fig, axs = plt.subplots(dpi=400,
                            figsize=(3, 3),
                            nrows=3,
                            sharex=True,
                            gridspec_kw={'height_ratios': [1, 3, 1]})

    # Set up plot styles
    baseLineStyle = {"color": "gray", "lw": 0.5, "ls": "--", "zorder": -1}
    fitLineStyle = {"lw": 0.9, "color": "red", "label": "Fit"}
    kdeLineStyle = {
        "lw": 0.0,
        "marker": ".",
        "ms": 3,
        "color": "green",
        "label": "KDE",
    }
    histLineStyle = {
        "rwidth": 0.9,
        "label": "Bins",
    }
    styles = {
        "Base": baseLineStyle,
        "Fit": fitLineStyle,
        "KDE": kdeLineStyle,
        "Bins": histLineStyle,
    }

    # Compute distribution fits
    mean, sdev = np.mean(dist), np.std(dist, ddof=1)

    # Kolmogorov-Smirnov test
    ksRes = stats.kstest(dist, 'norm', args=(mean, sdev))

    # Estimate KDE and compare to normal
    kde = smnp.KDEUnivariate(dist)
    kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    ## Get infinitesimal step size
    deltaX = kde.support[1] - kde.support[0]
    ## Compute fitted PDF
    normal = stats.norm.pdf(kde.support, loc=mean, scale=sdev)
    ## Compute difference
    kdeDiff = (2 * (kde.density - normal) / (kde.density + normal))
    normKDEDiff = np.sqrt(np.sum(kdeDiff**2) * deltaX)

    # Set title
    axs[0].set_title(
        "KS Test result: Statistic = {stat:1.3f}, P-Value = {pvalue:1.3f}".
        format(stat=ksRes.statistic, pvalue=ksRes.pvalue) +
        ",\nintegrated KDE difference = {normKDEDiff:1.3f}".format(
            normKDEDiff=normKDEDiff))

    # Compute fits
    yb, xb = np.histogram(dist, bins="fd")

    #Plot PDF
    ax = axs[0]
    sns.distplot(dist,
                 hist_kws=styles["Bins"],
                 kde_kws=styles["KDE"],
                 fit_kws=styles["Fit"],
                 ax=ax,
                 norm_hist=True,
                 fit=stats.norm)
    ## Axis styling
    ax.axvline(mean, label=r"$\mu$", **baseLineStyle)
    ax.set_ylabel("PDF")
    ax.set_yticks([])
    ax.legend([])

    # CDFs
    ax = axs[1]
    styles["KDE"].update({"cumulative": True})
    styles["Bins"].update({"cumulative": True})
    ## Plot CDFs
    ecdf = sns.distplot(dist,
                        hist_kws=styles["Bins"],
                        kde_kws=styles["KDE"],
                        ax=ax,
                        norm_hist=True)
    ## Get the x-range
    lines = ecdf.get_lines()[0]
    xl = lines.get_xdata()
    ## Compute the fitted CDF
    cdf = stats.norm.cdf(xl, loc=mean, scale=sdev)
    ax.plot(xl, cdf, **fitLineStyle)
    ## Styling
    ax.set_ylabel("CDF")
    ax.axvline(mean, label=r"$\mu$", **baseLineStyle)
    ax.axhline(0.5, **baseLineStyle)
    ax.set_yticks(np.linspace(0.25, 1, 4))
    ax.legend(loc="upper left", frameon=True)

    # Difference plot
    ax = axs[2]
    for key in ["KDE", "Bins"]:
        styles[key].pop("cumulative")
        styles[key].pop("label")

    # Plot KDE difference
    ax.plot(kde.support, kdeDiff, **styles["KDE"])

    # Plot bin difference
    rwidth = styles["Bins"].pop("rwidth")
    styles["Bins"].pop("normed")
    midBin = (xb[1:] + xb[:-1]) / 2
    yb = yb / np.sum(yb * (xb[1:] - xb[:-1]))
    pdf = stats.norm.pdf(midBin, loc=mean, scale=sdev)
    diff = 2 * (yb - pdf) / (yb + pdf)
    ax.bar(xb[:-1] + deltaX / 2,
           diff,
           width=(xb[1:] - xb[:-1]) * rwidth,
           align='edge',
           **styles["Bins"])
    ax.set_ylabel(r"$\Delta$PDF")

    ax.set_ylim(min(-0.1, diff.min()) * 1.5, max(diff.max(), 0.1) * 1.5)

    ## Styling
    ax.axvline(mean, **baseLineStyle)
    baseLineStyle["color"] = "black"
    baseLineStyle["ls"] = "-"
    ax.axhline(0, **baseLineStyle)

    # General styling
    for nax, ax in enumerate(axs):
        # Labels right
        ax.yaxis.set_label_position("right")
        # Ticks styling
        ax.tick_params(axis="both",
                       direction='inout',
                       width=0.5,
                       length=2.5,
                       top=(nax != 0))
        # set line width
        for val in ax.spines.values():
            val.set_linewidth(0.5)

    # Remove line width for PDF plot
    for pos in ["left", "top", "right"]:
        axs[0].spines[pos].set_linewidth(0)

    ax.set_xlim(dist.min(), dist.max())

    # Adjust internal plot spacings
    plt.subplots_adjust(hspace=0.0)

    return fig
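Calling it on a normal sample (a sketch; assumes numpy, scipy.stats, seaborn as sns, matplotlib.pyplot as plt, and smnp are imported at module level):

import numpy as np

fig = plotDistribution(np.random.default_rng(9).normal(size=1000))
fig.savefig("distribution_check.pdf")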
Example #19
"""
Created on Sun Jul 26 11:23:09 2020

@author: Chaobo Zhang
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as smnp

#Draw a probability density plot of a variable

Data_for_density_estimation = pd.read_csv('Test data for probability density plot.csv')  # used to estimate the kernel density curve
fft = "True"
Feature = "Variable 1"
kde = smnp.KDEUnivariate([float(x) for x in Data_for_density_estimation[Feature]])
outlier_ratio = 50 #Scale factor of the outlier threshold
bw = 1 #Bandwidth 
threshold_of_the_number_of_categories = 5 #Threshold of the number of categories
kde.fit("gau", bw, fft)
x, y = kde.support, kde.density
outlier_threshold = max(y)/outlier_ratio #Threshold of the probability density of outliers

plt.figure(figsize=(13, 6))
plt.xticks(fontproperties='Times New Roman',fontsize=24)
plt.yticks(fontproperties='Times New Roman',fontsize=24)
plt.plot(x, y, 'k', linewidth=1.5)
plt.ylim(-0.01, max(y)*1.1)
plt.xlim(min(x), max(x))

outlier_interval = [[-2.0, -1.0], [11.7, 53.0]]
Example #20
def density(data):
    x = np.array(data, dtype=np.float64)
    kde = smnp.KDEUnivariate(x)
    kde.fit("gau", bw=.5, fft=True)
    x, y = kde.support, kde.density
    return x, y
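A quick call (assumes numpy and smnp are imported):

import numpy as np

grid, dens = density(np.random.default_rng(10).normal(size=250))
print(grid[dens.argmax()])  # location of the density peak, near 0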
Example #21
def getStatisticsFrame(samples, nXStart=0, nXStep=1, obsTitles=None):
    r"""
  Computes a statistic frame for a given correlator bootstrap ensemble.

  This routine takes statistical data 'samples' (see parameters) as input.
  For each individual distribution within the sample data,
  this routine fits a Gaussian Probability Density Function (PDF) and computes
  Kernel Density Estimate (KDE).
  The output of this routine is a data frame, which contains the
  following information for each individual distribution of data within
  the samples array:
    * 'mean': the mean value of the distribution
    * 'sDev': the standard deviation of the individual distribution
    * 'kdeDiff': the relative vector norm of the KDE and the fitted PDF
        $$
          \sqrt{
            \int dx [ 2*(PDF_{KDE} - PDF_{FIT})/(PDF_{KDE} + PDF_{FIT}) ]^2
          }
        $$
    * 'Dn' and 'pValue': the statistic and the p-value of the
          Kolmogorov-Smirnov test against a normal distribution with the
          fitted parameters
          (https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test).
  The data is classified by 'nX' and the observable name
  ('obsTitles' if present).

  Parameters
  ----------
    samples : array, shape = (nObservables, nXrange, nSamples)
      The statistical HMC data.

    nXStart : int
      Starting index along the nX dimension of the samples array.

    nXStep : int
      Step size along the nX dimension of the samples array; only every
      'nXStep'-th slice is used.

    obsTitles : None or list, length = nObservables
      Row titles for figure.

  Returns
  -------
    df : 'pandas.DataFrame'

  Note
  ----
    For the Kolmogorov-Smirnov test see 'scipy.stats.kstest' and for the KDE see
    ```python
    >>> import statsmodels.nonparametric.api as smnp
    >>> kde = smnp.KDEUnivariate(dist)
    >>> kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    ```
  """
    # Allocate temp variables
    nObs, nXSize, _ = samples.shape
    if obsTitles is None:
        obsTitles = [r"O{0}".format(no) for no in range(nObs)]

    nXRange = np.arange(nXStart, nXSize, nXStep)

    data = []
    # Iterate correlators
    for nO, corrSample in enumerate(samples):
        # Iterate time steps
        for nX, dist in zip(nXRange, corrSample[nXStart::nXStep]):
            # Execute KS test
            mean, sDev = np.mean(dist), np.std(dist, ddof=1)
            ksRes = stats.kstest(dist, "norm", args=(mean, sDev))

            # Estimate KDE
            kde = smnp.KDEUnivariate(dist)
            kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)

            # Compute integral difference between normal dist and KDE
            deltaX = kde.support[1] - kde.support[0]
            normal = stats.norm.pdf(kde.support, loc=mean, scale=sDev)
            kdeDiff = 2 * (kde.density - normal) / (kde.density + normal)
            kdeDiffnorm = np.sqrt(np.sum(kdeDiff**2) * deltaX)

            # Store data
            data += [{
                "observable": obsTitles[nO],
                "nX": nX,
                "mean": mean,
                "sDev": sDev,
                "Dn": ksRes.statistic,
                "pValue": ksRes.pvalue,
                "kdeDiff": kdeDiffnorm,
            }]

    # Return frame
    return pd.DataFrame(data,
                        columns=[
                            "observable", "nX", "mean", "sDev", "Dn", "pValue",
                            "kdeDiff"
                        ])
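A sketch of building the frame from synthetic bootstrap samples (assumes numpy, pandas, scipy.stats, and smnp are imported):

import numpy as np

# shape = (nObservables, nXrange, nSamples)
samples = np.random.default_rng(11).normal(size=(2, 8, 500))
df = getStatisticsFrame(samples, obsTitles=["corr0", "corr1"])
print(df.head())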