Ejemplo n.º 1
0
def _kde_plot(
    values: ndarray, grid: ndarray, axes: Axes, bw: Union[float, str] = "scott"
) -> None:
    """Fit a Gaussian KDE to the observed spacings and draw it on `axes`.

    Parameters
    ----------
    values: ndarray
        the values used to compute (fit) the kernel density estimate; only
        the strictly positive entries are used

    grid: ndarray
        the grid of values over which to evaluate the computed KDE curve

    axes: pyplot.Axes
        the current axes object to be modified

    bw: Union[float, str]
        The `bw` (bandwidth) argument for statsmodels KDEUnivariate .fit

    Notes
    -----
    we are doing this manually because we want to ensure consistency of the KDE
    calculation and remove Seaborn control over the process, while also avoiding
    inconsistent behaviours like https://github.com/mwaskom/seaborn/issues/938
    and https://github.com/mwaskom/seaborn/issues/796
    """
    values = values[values > 0]  # prevent floating-point bad behaviour
    kde = KDE(values)
    kde.fit(kernel="gau", bw=bw, cut=0)
    # Evaluate the whole grid in one call.  The previous element-by-element
    # loop stored length-1 arrays into scalar slots (deprecated by NumPy) and
    # inherited the dtype of `grid`, truncating densities for integer grids.
    evaluated = np.asarray(kde.evaluate(grid), dtype=float)
    kde_curve = axes.plot(grid, evaluated, label="Kernel Density Estimate")
    plt.setp(kde_curve, color="black")
Ejemplo n.º 2
0
    def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
        """Evaluate a statsmodels univariate KDE of ``self.data`` on ``x_grid``.

        ``bandwidth`` is forwarded to ``KDEUnivariate.fit`` as ``bw``; any
        extra keyword arguments are passed to ``fit`` unchanged.
        """
        from statsmodels.nonparametric.kde import KDEUnivariate

        estimator = KDEUnivariate(self.data)
        fit_options = dict(bw=bandwidth, **kwargs)
        estimator.fit(**fit_options)
        density = estimator.evaluate(x_grid)
        return density
Ejemplo n.º 3
0
def compute_entropy(U):
    """Return, per column of `U`, the KDE entropy minus a Gaussian entropy.

    For every column a KDE is fitted to the absolute values of that column.
    The entropy of the estimated density is computed on the KDE support grid
    and the entropy of a Gaussian with the same standard deviation is
    subtracted from it.

    Returns a float64 array with one entry per column of `U`.
    """
    # Entropy of a unit-variance Gaussian: 0.5 * (1 + log(2 * pi)).
    unit_gauss_entropy = 0.5 + 0.5 * np.log(2 * np.pi)

    n_columns = U.shape[1]
    H = np.empty(n_columns, dtype='float64')

    for col in range(n_columns):
        kde = KDE(np.abs(U[:, col]))
        kde.fit(gridsize=1000)

        density, support = kde.density, kde.support
        step = support[1] - support[0]

        # Mean and standard deviation of the estimated density (Riemann sums).
        mean = nansum(support * density) * step
        with np.errstate(invalid='ignore'):
            sigma = np.sqrt(nansum(((support - mean)**2) * density) * step)
        gauss_entropy = unit_gauss_entropy + np.log(sigma)

        # Entropy of the estimated density, restricted to where it is positive.
        positive = density[density > 0]
        kde_entropy = -np.sum(xlogy(positive, positive)) * step

        H[col] = kde_entropy - gauss_entropy

    return H
Ejemplo n.º 4
0
def draw_hist_and_kde(sample, grid, true_pdf):
    """Plot a histogram, a KDE and the true density of `sample` and show it.

    Parameters
    ----------
    sample : array-like
        Observations used for the histogram and to fit the KDE.
    grid : numpy.ndarray
        Points at which the KDE and `true_pdf` are evaluated; its min/max
        also define the histogram range.
    true_pdf : callable
        Function returning the true density at the points of `grid`.
    """
    # Histogram, normalised to integrate to one.  `density=True` replaces the
    # `normed=True` keyword that newer matplotlib releases no longer accept.
    plt.hist(sample,
             20,
             range=(grid.min(), grid.max()),
             density=True,
             label='histogram')

    # Kernel density estimate
    kernel_density = KDEUnivariate(sample)
    kernel_density.fit()
    plt.plot(grid,
             kernel_density.evaluate(grid),
             color='green',
             linewidth=2,
             label='kde')

    # True density
    plt.plot(grid,
             true_pdf(grid),
             color='red',
             linewidth=2,
             alpha=0.3,
             label='true pdf')

    plt.legend()
    plt.show()
Ejemplo n.º 5
0
class EstimatedKDE(object):
    """Univariate distribution estimated from data with ``KDEUnivariate``.

    Call :meth:`fit` first; the other methods read the state it stores.
    """
    eps = 0.05
    # Number of grid points used by `mode` to search for the density maximum.
    points = 10000

    def __init__(self):
        super(EstimatedKDE, self).__init__()
        self.dist = None

    def fit(self, data):
        """Store summary statistics of `data`, fit the KDE and return self."""
        self.min = np.min(data)
        self.max = np.max(data)
        self.mean = np.mean(data)
        self.std = np.std(data)
        self.dist = KDEUnivariate(data)
        self.dist.fit()
        return self

    def mode(self):
        """Return the grid point in [min, max] with the highest density."""
        x = np.linspace(self.min, self.max, self.points)
        y = self.dist.evaluate(x)
        return x[np.argmax(y)]

    def median(self):
        """Return the 0.5 quantile read from the fitted KDE's ``icdf``.

        statsmodels documents ``icdf`` as quantiles taken at evenly spaced
        probability levels between 0 and 1, one per density grid point.  The
        number of levels depends on the fit, so a fixed index such as
        ``icdf[50]`` is the median only when there are exactly 101 levels;
        the value is therefore interpolated at level 0.5.
        """
        quantiles = self.dist.icdf
        levels = np.linspace(0, 1, len(quantiles))
        return np.interp(0.5, levels, quantiles)

    def pdf(self, x):
        """Evaluate the fitted density at `x`."""
        return self.dist.evaluate(x)
Ejemplo n.º 6
0
def gen_kde_pdf(distribution, bounds=None, kde_width=None):
    """Build a boundary-corrected KDE density function for `distribution`.

    The sample is mirrored about both boundaries before the KDE is fitted,
    and the result is rescaled so that it integrates to one between the
    boundaries.

    Parameters
    ----------
    distribution : numpy.ndarray
        Sample values.
    bounds : sequence of two numbers, optional
        Lower and upper boundary.  When given, only values strictly inside
        the bounds are used.  Defaults to the sample minimum and maximum.
    kde_width : float, optional
        KDE bandwidth.  Defaults to ``S_MAD(distribution) / 2``.

    Returns
    -------
    callable
        Function mapping points ``X`` to the rescaled density.
    """
    # `is None` rather than `== None`: comparing an array-like `bounds` with
    # `==` is element-wise and cannot be used as an `if` condition.
    if bounds is None:
        print("\t setting bounds to max value")
        var_min, var_max = min(distribution), max(distribution)
    else:
        inside = (distribution > bounds[0]) & (distribution < bounds[1])
        distribution = distribution[np.where(inside)]
        var_min, var_max = bounds[0], bounds[1]

    # Reflect the sample about each boundary (boundary correction for KDE).
    lower = var_min - abs(distribution - var_min)
    upper = var_max + abs(distribution - var_max)
    merge = np.concatenate([lower, upper, distribution])

    if kde_width is None:
        print("... setting kde_width")
        kde_width = S_MAD(distribution) / 2.

    KDE_MERGE = KDEUnivariate(merge)
    KDE_MERGE.fit(bw=kde_width)

    # Renormalise so the density integrates to one on [var_min, var_max].
    SCALE = np.divide(1.,
                      integrate.quad(KDE_MERGE.evaluate, var_min, var_max)[0])

    return lambda X: SCALE * KDE_MERGE.evaluate(X)
Ejemplo n.º 7
0
Archivo: core.py Proyecto: wtak23/pytak
def kde_1d(signal, x_grid=None):
    """Return a 1-d KDE of `signal` evaluated on `x_grid`, scaled to sum to 1.

    Parameters
    ----------
    signal : array-like
        Samples the KDE is fitted to.
    x_grid : numpy.ndarray, optional
        Evaluation points; defaults to ``np.linspace(0, 1, 401)``.

    Returns
    -------
    (kde_est, x_grid)
        The KDE values divided by their sum, and the grid that was used.

    Notes
    -----
    The estimate is normalised by its sum, not by the grid spacing, so it is
    a discrete probability vector rather than a density.

    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    http://glowingpython.blogspot.com/2012/08/kernel-density-estimation-with-scipy.html

    Usage
    -----
    >>> x = np.linspace(0,1,401)
    >>> kde, x = tw.kde_1d(signal, x)
    >>> plt.plot(x, kde)
    >>> plt.grid('on')
    """
    from statsmodels.nonparametric.kde import KDEUnivariate

    if x_grid is None:
        x_grid = np.linspace(0, 1, 401)

    estimator = KDEUnivariate(signal)
    estimator.fit()

    density = estimator.evaluate(x_grid)
    density /= density.sum()

    return density, x_grid
Ejemplo n.º 8
0
def gaussian_density_estimation(samples, weights, grid, bw=0.1):
    """
    Evaluate a (optionally weighted) kernel density estimate on a grid.

    The estimator is fitted with ``fft=False`` and the kernel left at the
    ``KDEUnivariate.fit`` default.

    Parameters
    ----------
    samples : np.ndarray
        Array of sample values.
    weights : np.ndarray
        Array of sample weights, forwarded to ``KDEUnivariate.fit``.
    grid : np.ndarray
        Grid points at which the KDE function should be evaluated.
    bw : float
        Bandwidth parameter for kernel density estimation.

    Returns
    -------
    np.ndarray
        The density values at the supplied grid points.
    """
    estimator = KDEUnivariate(samples)
    estimator.fit(weights=weights, bw=bw, fft=False)

    # TODO: area under curve between points instead of pdf at point
    density_on_grid = estimator.evaluate(grid)
    return density_on_grid
Ejemplo n.º 9
0
def reweight(rc, metad_traj, cv_columns, v_minus_c_col, rc_bins=20, kt=2.5):
    """
    Reweight a biased MD trajectory to a probability along a reaction
    coordinate, using the bias column from COLVAR (Tiwary and Parrinello).

    Parameters
    ----------
    rc : array-like
        Coefficients multiplied with the collective-variable columns.
    metad_traj : pandas.DataFrame
        Trajectory table containing `cv_columns` and `v_minus_c_col`.
    cv_columns : list
        Names of the collective-variable columns.
    v_minus_c_col : str
        Name of the column holding V - c(t).
    rc_bins : int
        Number of grid points on which the density is evaluated.
    kt : float
        Thermal energy used in the Boltzmann factor.

    Returns
    -------
    (pdf, grid)
        KDE values on the grid, normalised to sum to one, and the grid.
    """
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # reaction-coordinate value of every frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    # weighted KDE with statsmodels
    kde = KDEUnivariate(colvar_rc)
    kde.fit(weights=norm_weights, bw=0.05, fft=False)

    # The grid must span the reaction coordinate itself.  The upper limit
    # previously came from the raw collective variables (`colvar.max()`),
    # which live on a different scale whenever `rc` is not the identity.
    grid = np.linspace(colvar_rc.min(), colvar_rc.max(), num=rc_bins)
    pdf = kde.evaluate(grid)
    pdf = pdf / pdf.sum()

    return pdf, grid
Ejemplo n.º 10
0
 def setupClass(cls):
     """Fit a weighted, non-FFT KDE on the reference data for this kernel."""
     sample = KDEWResults['x']
     cls.x = sample
     sample_weights = KDEWResults['weights']
     fitted = KDE(sample)
     fitted.fit(kernel=cls.kernel_name, weights=sample_weights, fft=False)
     cls.res1 = fitted
     # Reference density the fitted result is compared against.
     cls.res_density = KDEWResults[cls.res_kernel_name]
def fit_kde(x, grid):
    """Fit a default ``KDEUnivariate`` to `x` and return it evaluated on `grid`."""
    # The former `resol` / `np.zeros(resol)` locals were dead code: the
    # zero array was overwritten before ever being read.
    kde = KDEUnivariate(x)
    kde.fit()
    return kde.evaluate(grid)
    def pdf(self, token, years, bw=5, *args, **kwargs):

        """
        Estimate a density function from a token's ratio series.

        The series keys are the KDE observations and the series values are
        used as their weights.

        Args:
            token (str)
            years (iter)
            bw (int)

        Returns: OrderedDict {year: density}
        """

        series = self.clean_series(token, *args, **kwargs)

        observations = list(series.keys())
        ratio_weights = np.array(list(series.values()))

        estimator = KDEUnivariate(observations)
        estimator.fit(fft=False, weights=ratio_weights, bw=bw)

        # One evaluation per requested year, keeping the iteration order.
        return OrderedDict(
            (year, estimator.evaluate(year)[0]) for year in years
        )
Ejemplo n.º 13
0
 def setup_class(cls):
     """Fit an FFT-based Gaussian KDE and load the stored reference density."""
     cls.decimal_density = 2  # low accuracy because binning is different
     res1 = KDE(Xi)
     res1.fit(kernel="gau", fft=True, bw="silverman")
     cls.res1 = res1
     rfname2 = os.path.join(curdir, 'results', 'results_kde_fft.csv')
     # Close the results file deterministically instead of leaking the handle.
     with open(rfname2, 'rb') as results_file:
         cls.res_density = np.genfromtxt(results_file)
Ejemplo n.º 14
0
def _mode(data):
    """Return, for each row of `data`, the support point of maximal KDE density."""
    n_rows = data.shape[0]
    modes = np.zeros([n_rows])
    for row in range(n_rows):
        estimator = KDE(data[row, :])
        estimator.fit(gridsize=2000)
        peak = np.argmax(estimator.density)
        modes[row] = estimator.support[peak]
    return modes
Ejemplo n.º 15
0
def _reduce_mode(x):
    """Return the KDE mode of `x`, or NaN when `x` is empty.

    The mode is the point of the KDE support grid (2000 points) at which the
    estimated density is largest.
    """
    if len(x) == 0:
        # `np.nan` is the canonical spelling; the `np.NaN` alias is no longer
        # available in NumPy 2.0.
        return np.nan
    x = np.asarray(x, dtype=np.float64)
    kde = KDE(x)
    kde.fit(gridsize=2000)
    return kde.support[np.argmax(kde.density)]
Ejemplo n.º 16
0
def reduce_mode(x):
    """Return the support point at which the KDE of `x` is largest."""
    estimator = KDE(x)
    estimator.fit(gridsize=2000)

    peak_index = np.argmax(estimator.density)
    return estimator.support[peak_index]
Ejemplo n.º 17
0
def compute_kde(data, test_x):
    """Evaluate a Gaussian KDE (Silverman bandwidth) of `data` at `test_x`.

    Both inputs are flattened first.  Returns ``(density, None)``.
    """
    samples = data.flatten()
    query_points = test_x.flatten()
    estimator = KDEUnivariate(samples)
    estimator.fit(kernel="gau", bw="silverman")
    return estimator.evaluate(query_points), None
Ejemplo n.º 18
0
 def setup_class(cls):
     """Fit an FFT-based Gaussian KDE and load the stored reference density."""
     cls.decimal_density = 2 # low accuracy because binning is different
     res1 = KDE(Xi)
     res1.fit(kernel="gau", fft=True, bw="silverman")
     cls.res1 = res1
     rfname2 = os.path.join(curdir,'results','results_kde_fft.csv')
     # Close the results file deterministically instead of leaking the handle.
     with open(rfname2, 'rb') as results_file:
         cls.res_density = np.genfromtxt(results_file)
Ejemplo n.º 19
0
 def setup_class(cls):
     """Fit a weighted, non-FFT KDE with Scott bandwidth on the reference data."""
     sample = KDEWResults['x']
     cls.x = sample
     sample_weights = KDEWResults['weights']
     fitted = KDE(sample)
     # The reference values were computed when "scott" was the default
     # bandwidth, so it is requested explicitly.
     fitted.fit(kernel=cls.kernel_name, weights=sample_weights, fft=False,
                bw="scott")
     cls.res1 = fitted
     cls.res_density = KDEWResults[cls.res_kernel_name]
Ejemplo n.º 20
0
def weighted_kernel_density_1d(values, weights, bw='silverman', plot=False):
    """Fit and return a weighted ``KDEUnivariate`` for `values`.

    The estimator is fitted with ``fft=False``.  When `plot` is true the
    density is additionally drawn, evaluated point by point on the support.
    """
    from statsmodels.nonparametric.kde import KDEUnivariate
    estimator = KDEUnivariate(values)
    estimator.fit(weights=weights, bw=bw, fft=False)
    if not plot:
        return estimator

    import matplotlib.pyplot as plt
    support = estimator.support
    densities = [estimator.evaluate(point) for point in support]
    plt.plot(support, densities, 'o-')
    return estimator
Ejemplo n.º 21
0
 def setup_class(cls):
     """Prepare the weighted-KDE fixture and its reference density."""
     observations = KDEWResults['x']
     cls.x = observations
     obs_weights = KDEWResults['weights']
     estimator = KDE(observations)
     # "scott" was the default bandwidth when the reference values were
     # computed, hence the explicit argument.
     fit_options = dict(kernel=cls.kernel_name, weights=obs_weights,
                        fft=False, bw="scott")
     estimator.fit(**fit_options)
     cls.res1 = estimator
     cls.res_density = KDEWResults[cls.res_kernel_name]
Ejemplo n.º 22
0
 def setup_class(cls):
     """Fit a weighted Gaussian KDE and load the stored reference density."""
     res1 = KDE(Xi)
     weights = np.linspace(1,100,200)
     res1.fit(kernel="gau", gridsize=50, weights=weights, fft=False,
                 bw="silverman")
     cls.res1 = res1
     rfname = os.path.join(curdir,'results','results_kde_weights.csv')
     # Close the results file deterministically instead of leaking the handle.
     with open(rfname, 'rb') as results_file:
         cls.res_density = np.genfromtxt(results_file, skip_header=1)
Ejemplo n.º 23
0
 def setup_class(cls):
     """Fit a weighted Gaussian KDE and load the stored reference density."""
     res1 = KDE(Xi)
     weights = np.linspace(1,100,200)
     res1.fit(kernel="gau", gridsize=50, weights=weights, fft=False,
                 bw="silverman")
     cls.res1 = res1
     rfname = os.path.join(curdir,'results','results_kde_weights.csv')
     # Use a context manager so the file is closed even if parsing fails.
     with open(rfname, 'rb') as results_file:
         cls.res_density = np.genfromtxt(results_file, skip_header=1)
Ejemplo n.º 24
0
def calcKDE(kd_bw=0.1):
    """Evaluate a KDE of the module-level `nao_rn` on the module-level `x_kde`.

    `kd_bw` is passed to ``KDEUnivariate.fit`` as the bandwidth.
    """
    estimator = KDEUnivariate(nao_rn)
    estimator.fit(bw=kd_bw)
    density = estimator.evaluate(x_kde)
    return density
Ejemplo n.º 25
0
 def fit(self, data):
     """Record summary statistics of `data`, fit the KDE and return self."""
     self.min, self.max = np.min(data), np.max(data)
     self.mean, self.std = np.mean(data), np.std(data)
     estimator = KDEUnivariate(data)
     self.dist = estimator
     estimator.fit()
     return self
def test_kde_bw_positive():
    """Regression test for GH 6679: the fitted bandwidth must be positive.

    The sample is dominated by one repeated value.
    """
    repeated = [4.59511985] * 10
    tail = [5.67332327, 6.19847872, 7.43189192]
    x = np.array(repeated + tail)
    estimator = KDE(x)
    estimator.fit()
    assert estimator.bw > 0
Ejemplo n.º 27
0
def find_outiers_kde(x):
    """Flag the points of `x` whose KDE density is below 0.5.

    `x` is converted to floats and passed through `scale` before a KDE
    (Scott bandwidth, FFT) is fitted and evaluated at the scaled points.

    Returns ``(outlierindices, outliervalue)``: the indices of the
    lowest-density points, and the matching original values.
    """
    x_scaled = scale([float(v) for v in x])
    estimator = KDEUnivariate(x_scaled)
    estimator.fit(bw="scott", fft=True)
    density = estimator.evaluate(x_scaled)

    # Count the low-density points, then take that many from the bottom of
    # the density ranking.
    n_low = sum(density < 0.5)
    ranking = np.asarray(density).argsort()
    outlierindices = ranking[:n_low]
    outliervalue = np.asarray(x)[outlierindices]
    return outlierindices, outliervalue
Ejemplo n.º 28
0
def find_outiers_kde(x):
    """Return indices and values of points with KDE density under 0.5.

    The KDE (Scott bandwidth, FFT) is fitted to, and evaluated at, the
    output of `scale` applied to the float-converted input.
    """
    as_floats = list(map(float, x))
    x_scaled = scale(as_floats)

    kde = KDEUnivariate(x_scaled)
    kde.fit(bw="scott", fft=True)
    pred = kde.evaluate(x_scaled)

    low_density = pred < 0.5
    n = sum(low_density)
    # The n smallest densities are the outliers.
    order = np.asarray(pred).argsort()
    outlierindices = order[:n]
    return outlierindices, np.asarray(x)[outlierindices]
Ejemplo n.º 29
0
def mag_dist(dval):
    """
    Function to plot magnitude distribution for targets

    One filled, peak-normalised KDE curve is drawn per cadence in
    ``dval.cadences`` and the figure is saved as ``mag_dist`` in
    ``dval.outfolder``. The figure is closed afterwards unless ``dval.show``
    is true.

    .. codeauthor:: Mikkel N. Lund <*****@*****.**>
    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """

    logger = logging.getLogger('dataval')
    logger.info('Plotting Magnitude distribution...')

    fig, ax = plt.subplots(figsize=plt.figaspect(0.5))
    fig.subplots_adjust(left=0.14,
                        wspace=0.3,
                        top=0.94,
                        bottom=0.155,
                        right=0.96)

    # Colours are reused cyclically so that more than three cadences no
    # longer raise an IndexError.
    colors = ['r', 'b', 'g']
    for k, cadence in enumerate(dval.cadences):

        star_vals = dval.search_database(select='todolist.tmag',
                                         search=f'cadence={cadence:d}')

        if star_vals:
            tmags = np.array([star['tmag'] for star in star_vals])

            kde = KDE(tmags)
            kde.fit(gridsize=1000)

            ax.fill_between(kde.support,
                            0,
                            kde.density / np.max(kde.density),
                            color=colors[k % len(colors)],
                            alpha=0.3,
                            label=f'{cadence:d}s cadence')

    # kde_all = KDE(tmags)
    # kde_all.fit(gridsize=1000)
    # ax.plot(kde_all.support, kde_all.density/np.max(kde_all.density), 'k-', lw=1.5, label='All')

    ax.set_ylim(bottom=0)
    ax.set_xlabel('TESS magnitude')
    ax.set_ylabel('Normalised Density')
    ax.xaxis.set_major_locator(MultipleLocator(2))
    ax.xaxis.set_minor_locator(MultipleLocator(1))
    ax.legend(frameon=False,
              loc='upper left',
              borderaxespad=0,
              handlelength=2.5,
              handletextpad=0.4)

    fig.savefig(os.path.join(dval.outfolder, 'mag_dist'))
    if not dval.show:
        plt.close(fig)
Ejemplo n.º 30
0
def normalize_data(img, contrast='T1'):
    '''
    Normalizes 3D images via KDE and clamping
    Params:
        - img: 3D image (numpy array)
        - contrast: 'T1' selects the T1 rule (scale by the last KDE peak,
          clamp at 1.25); any other value selects the T2 rule (scale by the
          highest KDE peak, clamp at 3.5)
    Returns:
        - normalized image, rescaled so that its maximum is 1.  An image
          without any non-zero voxel is returned as an all-zero float array.
    '''
    # An all-zero image has nothing to normalise.  Return zeros instead of
    # dividing the caller's array by its own zero maximum in place, which
    # produced NaNs for float input and a casting error for integer input.
    if np.count_nonzero(img) == 0:
        return np.zeros(np.shape(img), dtype=float)

    is_t1 = (contrast == 'T1')

    # NOTE(review): np.nonzero returns the *indices* of the non-zero voxels,
    # so the percentile and the KDE below are computed on flat indices rather
    # than on intensities.  Confirm whether the non-zero intensity values
    # were intended; the behaviour is left unchanged here.
    tmp = np.asarray(np.nonzero(img.flatten()))
    q = np.percentile(tmp, 99.)
    tmp = tmp[tmp <= q]
    tmp = np.asarray(tmp, dtype=float).reshape(-1, 1)

    GRID_SIZE = 80
    bw = float(q) / GRID_SIZE

    kde = KDEUnivariate(tmp)
    kde.fit(kernel='gau', bw=bw, gridsize=GRID_SIZE, fft=True)
    density = 100. * kde.density
    support = kde.support

    # Local maxima of the estimated density: their heights and locations.
    peak_idx = np.asarray(argrelextrema(density, np.greater), dtype=int)
    peak_heights = density[peak_idx][0]
    peak_locations = support[peak_idx][0]

    if is_t1:
        clamp_value = 1.25
        scale = peak_locations[-1]
    else:
        clamp_value = 3.5
        highest = np.where(peak_heights == np.amax(peak_heights))
        scale = peak_locations[highest]
        if len(scale) > 1:
            scale = scale[0]

    normalized_img = img / scale
    normalized_img[normalized_img > clamp_value] = clamp_value

    normalized_img /= normalized_img.max()
    return normalized_img
Ejemplo n.º 31
0
def bootstrap_stats(
        args: Dict[str, Any],
        out_q: Optional[mp.Queue] = None) -> Union[None, Dict[str, Any]]:
    r'''
    Computes statistics and KDEs of data via sampling with replacement

    Arguments:
        args: dictionary of arguments. It is only read, never modified.
            Possible keys are:
            data - data to resample (required)
            name - name prepended to returned keys in result dict (default '')
            weights - array of weights matching length of data to use for weighted resampling
            n - number of times to resample data (default 100)
            x - points at which to compute the kde values of resample data
            kde - whether to compute the kde values at x-points for resampled data
            mean - whether to compute the means of the resampled data
            std - whether to compute standard deviation of resampled data
            c68 - whether to compute the width of the absolute central 68.2 percentile of the resampled data

        out_q: if using multiprocessing can place result dictionary in provided queue

    Returns:
        Result dictionary if `out_q` is `None` else `None`.
    '''

    # Read options with defaults instead of writing the defaults back into
    # the caller's dictionary.
    name = args.get('name', '')
    weights = args.get('weights')
    n_resamples = args.get('n', 100)
    want_kde = args.get('kde', False)
    want_mean = args.get('mean', False)
    want_std = args.get('std', False)
    want_c68 = args.get('c68', False)

    # The KDE is fitted on float64 data, so convert once up front if needed.
    if want_kde and args['data'].dtype != 'float64':
        data = np.array(args['data'], dtype='float64')
    else:
        data = args['data']
    len_d = len(data)

    out_dict, mean, std, c68, boot = {}, [], [], [], []

    # Re-seed without an explicit seed before drawing the resamples.
    np.random.seed()
    for _ in range(n_resamples):
        points = np.random.choice(data, len_d, replace=True, p=weights)
        if want_kde:
            kde = KDEUnivariate(points)
            kde.fit()
            boot.append([kde.evaluate(x) for x in args['x']])
        if want_mean: mean.append(np.mean(points))
        if want_std: std.append(np.std(points, ddof=1))
        if want_c68: c68.append(np.percentile(np.abs(points), 68.2))

    if want_kde: out_dict[f'{name}_kde'] = boot
    if want_mean: out_dict[f'{name}_mean'] = mean
    if want_std: out_dict[f'{name}_std'] = std
    if want_c68: out_dict[f'{name}_c68'] = c68

    if out_q is not None:
        out_q.put(out_dict)
        return None
    return out_dict
Ejemplo n.º 32
0
def kde_param(distribution, x0):
    """Locate a maximum of the KDE of `distribution`, starting from `x0`.

    The KDE bandwidth is a third of the sample standard deviation and the
    negative density is minimised with Powell's method, which tries to
    ensure correct handling of multimodal distributions.

    Returns a dict with the location found (``'result'``, a float) and the
    fitted estimator (``'kde'``).
    """
    kde = KDEUnivariate(distribution)
    kde.fit(bw=np.std(distribution) / 3.0)

    result = scipy.optimize.minimize(lambda x: -1 * kde.evaluate(x),
                                     x0=x0, method='Powell')  # Powell has been working pretty well.

    # `result['x']` is an ndarray; take its single element explicitly, since
    # calling float() on a 1-element array is deprecated in recent NumPy.
    best_x = float(np.ravel(result['x'])[0])
    return {'result': best_x, 'kde': kde}
Ejemplo n.º 33
0
def kde_param(distribution, x0):
    """Locate a maximum of the KDE of `distribution`, starting from `x0`.

    The KDE bandwidth is a third of the sample standard deviation and the
    negative density is minimised with Powell's method.

    Returns a dict with the location found (``'result'``, a float) and the
    fitted estimator (``'kde'``).
    """
    kde = KDEUnivariate(distribution)
    kde.fit(bw=np.std(distribution) / 3.0)

    result = scipy.optimize.minimize(lambda x: -1 * kde.evaluate(x),
                                     x0=x0, method='Powell')

    # Extract the single optimum explicitly: float() applied directly to a
    # 1-element ndarray is deprecated in recent NumPy releases.
    optimum = np.ravel(result['x'])[0]
    return {'result': float(optimum), 'kde': kde}
Ejemplo n.º 34
0
def calc_bayes_factor(prior_samples, posterior_samples, x=0):
    '''Return the ratio of the prior to the posterior KDE density at `x`.

    A default ``KDEUnivariate`` is fitted to each sample set and evaluated
    at `x`.  The returned value is ``prior_density / posterior_density``, so
    values > 1 mean the estimated density at `x` is higher under the prior
    than under the posterior.
    '''
    def _density_at_x(samples):
        estimator = KDEUnivariate(samples)
        estimator.fit()
        return estimator.evaluate([x])

    prior_density = _density_at_x(prior_samples)
    posterior_density = _density_at_x(posterior_samples)

    ratio = prior_density / posterior_density
    return ratio[0]
Ejemplo n.º 35
0
def sample_pdf(catalog, parameter, pdf_fun, params, bounds):
    """Rejection-sample `catalog` rows so `parameter` follows `pdf_fun`.

    Parameters
    ----------
    catalog : pandas.DataFrame
        Input catalog with an arbitrary distribution of `parameter`.
    parameter : str
        Column of `catalog` to sample on.
    pdf_fun : callable
        Desired density, called as ``pdf_fun(values, *params)``.
    params : sequence
        Extra positional arguments for `pdf_fun`.
    bounds : sequence of two numbers
        Lower and upper limit (both inclusive) of the returned sample.

    Returns
    -------
    pandas.DataFrame
        Copy of the accepted rows whose `parameter` lies within `bounds`.
    """
    values = catalog[parameter]
    param_span = np.linspace(min(values), max(values), 100)

    print("... determine master KDE")

    kde = KDEUnivariate(values)
    kde.fit(bw=np.std(values) / 3)

    KDE_FUN = interp1d(param_span, kde.evaluate(param_span))

    # Normalise the KDE to integrate to one inside the bounds; the span
    # points inside the bounds are handed to quad as break points.
    inside = np.where((param_span > bounds[0]) & (param_span < bounds[1]))
    NORM = np.divide(
        1.,
        integrate.quad(kde.evaluate,
                       bounds[0],
                       bounds[1],
                       points=param_span[inside],
                       limit=200)[0])

    # The scale comes from the sibling helper; its second return value is
    # not needed here.
    result, _ = determine_scale(catalog,
                                parameter,
                                pdf_fun,
                                params,
                                bounds=bounds)

    sample = (np.random.uniform(0.0, 1.0, len(catalog))
              * len(catalog) * NORM * KDE_FUN(values))

    boo_array = sample < result['x'] * pdf_fun(values, *params)

    # `between` includes both end points by default.  The former
    # `inclusive=True` is rejected by newer pandas releases, which only
    # accept string values for that argument.
    return catalog[boo_array & values.between(bounds[0], bounds[1])].copy()
Ejemplo n.º 36
0
def reweight(rc,
             metad_traj,
             cv_columns,
             v_minus_c_col,
             rc_bins=20,
             kt=2.5,
             kde=False):
    """
    Reweight a biased MD trajectory to a probability along a reaction
    coordinate, using the bias column from COLVAR (Tiwary and Parrinello).

    Parameters
    ----------
    rc : array-like
        Coefficients multiplied with the collective-variable columns.
    metad_traj : pandas.DataFrame
        Trajectory table containing `cv_columns` and `v_minus_c_col`.
    cv_columns : list
        Names of the collective-variable columns.
    v_minus_c_col : str
        Name of the column holding V - c(t).
    rc_bins : int
        Number of grid points (KDE) or histogram bins.
    kt : float
        Thermal energy used in the Boltzmann factor.
    kde : bool
        If true, use a weighted KDE evaluated on an evenly spaced grid;
        otherwise use a weighted, density-normalised histogram.

    Returns
    -------
    (pdf, grid)
        Density values and the points they refer to.  For the histogram the
        points are the bin centres.
    """
    colvar = metad_traj[cv_columns].values
    v_minus_c = metad_traj[v_minus_c_col].values

    # reaction-coordinate value of every frame
    colvar_rc = np.sum(colvar * rc, axis=1)

    # frame weights, per Tiwary and Parrinello, JPCB 2015 (c(t) method)
    weights = np.exp(v_minus_c / kt)
    norm_weights = weights / weights.sum()

    rc_min, rc_max = colvar_rc.min(), colvar_rc.max()

    if kde:
        # KDE for fine-grained optimization
        estimator = KDEUnivariate(colvar_rc)
        estimator.fit(weights=norm_weights, bw=0.1, fft=False)

        # TODO: area under curve between points instead of pdf at point
        grid = np.linspace(rc_min, rc_max, num=rc_bins)
        return estimator.evaluate(grid), grid

    # histogram density for coarse optimization
    hist, bin_edges = np.histogram(colvar_rc,
                                   weights=norm_weights,
                                   bins=rc_bins,
                                   density=True,
                                   range=(rc_min, rc_max))

    # Grid points are the bin centres.  Adding a full bin width to the left
    # edges, as before, produced the right edges instead.
    grid = (bin_edges[:-1] + bin_edges[1:]) / 2

    return hist, grid
Ejemplo n.º 37
0
def empiricalPDF(data):
    """
    Evaluate a probability density function using kernel density
    estimation for input data.

    The data are sorted first; the KDE is fitted to, and evaluated at, the
    sorted values. If the evaluation runs out of memory an array of zeros of
    the same length is returned instead.

    :param data: :class:`numpy.ndarray` of data values.

    :returns: PDF values at the (sorted) data points.
    """
    LOG.debug("Calculating empirical PDF")
    ordered = np.sort(data)
    estimator = KDEUnivariate(ordered)
    estimator.fit()
    try:
        return estimator.evaluate(ordered)
    except MemoryError:
        return np.zeros(len(ordered))
Ejemplo n.º 38
0
def kde_statsmodels_u(data, grid, **kwargs):
    """
    Univariate Kernel Density Estimation with Statsmodels

    Parameters
    ----------
    data : numpy.array
        Data points used to fit the density estimator.
    grid : numpy.array
        Points at which the density will be evaluated.
    **kwargs
        Forwarded unchanged to ``KDEUnivariate.fit``.

    Returns
    -------
    out : numpy.array
        Density estimate at the points of `grid`.
    """
    estimator = KDEUnivariate(data)
    estimator.fit(**kwargs)
    out = estimator.evaluate(grid)
    return out
Ejemplo n.º 39
0
 def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
     """Return the statsmodels univariate KDE of ``self.data`` on ``x_grid``.

     ``bandwidth`` becomes the ``bw`` argument of ``KDEUnivariate.fit``;
     remaining keyword arguments are forwarded to ``fit`` as they are.
     """
     from statsmodels.nonparametric.kde import KDEUnivariate
     estimator = KDEUnivariate(self.data)
     estimator.fit(bw=bandwidth, **kwargs)
     density = estimator.evaluate(x_grid)
     return density
Ejemplo n.º 40
0
# Bootstrap fits of three candidate distributions to the residuals.
# NOTE(review): `bootstrap_fit` is defined elsewhere; the three return values
# are presumably the fitted parameters and their lower/upper quantile bounds
# -- confirm against its definition.
ln_par, ln_lo, ln_up = bootstrap_fit(
    stats.lognorm, resid, n_iter=n_bs, quant=q
)
hc_par, hc_lo, hc_up = bootstrap_fit(
    stats.halfcauchy, resid, n_iter=n_bs, quant=q
)
gam_par, gam_lo, gam_up = bootstrap_fit(
    stats.gamma, resid, n_iter=n_bs, quant=q
)

##################################################################

# Frozen distributions built from a direct fit to the residuals, plus a
# kernel density estimate and the empirical CDF of the same data.
hc = stats.halfcauchy(*stats.halfcauchy.fit(resid))
lg = stats.lognorm(*stats.lognorm.fit(resid))
dens = KDEUnivariate(resid)
dens.fit()
ecdf = ECDF(resid)

##################################################################
# prepare X axes for plotting

# `ex` holds the ECDF's x values; `x` is an evenly spaced grid of 2000
# points spanning the range of the residuals.
ex = ecdf.x
x = np.linspace(min(resid), max(resid), 2000)

##################################################################
# Fit a Landau distribution with ROOT

# Only when ROOT is available: fill a 100-bin histogram on [0, pi] with
# the residuals.
if HAS_ROOT:
    root_hist = rootpy.plotting.Hist(100, 0, np.pi)
    root_hist.fill_array(resid)
Ejemplo n.º 41
0
plt.title('Logit Residuals')


# Hey I've got an idea, let's just make more plots...

fig = plt.figure(figsize=(18, 9), dpi=1600)
a = .2

# `facecolor` replaces the `axisbg` keyword, which current matplotlib
# releases no longer accept.
fig.add_subplot(221, facecolor="#DBDBDB")

# This is the "kernel density estimator", just like was used above,
# to create a nice smoothed density plot of the predictions.
# The y-values look incorrect, but I'm guessing the shape is right.
kde_res = KDEUnivariate(res.predict())
kde_res.fit()

# I think the "support" is simply the domain in which the
# density is greater than 0.
plt.plot(kde_res.support, kde_res.density)
plt.fill_between(kde_res.support, kde_res.density, alpha=a)
plt.title("Distribution of our Predictions")

# show that predicted survival probabilities are much lower
# for males than females
fig.add_subplot(222, facecolor="#DBDBDB")
plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
# The on/off flag is passed positionally because its keyword name (`b`)
# was renamed in newer matplotlib releases.
plt.grid(True, which='major', axis='x')
plt.xlabel("Predicted chance of survival")
plt.ylabel("Gender Bool")
Ejemplo n.º 42
0
 def setup_class(cls):
     """Fit a non-FFT Gaussian KDE (Silverman bandwidth) on ``Xi``."""
     estimator = KDE(Xi)
     estimator.fit(kernel="gau", fft=False, bw="silverman")
     cls.res1 = estimator
     # Reference density for the Gaussian kernel.
     cls.res_density = KDEResults["gau_d"]
def kde_statsmodels_u(x, x_grid, bandwidth=0.2, **kwargs):
    """Evaluate a statsmodels univariate KDE of `x` on `x_grid`.

    `bandwidth` is passed to ``KDEUnivariate.fit`` as ``bw``; any other
    keyword arguments are forwarded to ``fit`` unchanged.
    """
    estimator = KDEUnivariate(x)
    estimator.fit(bw=bandwidth, **kwargs)
    density = estimator.evaluate(x_grid)
    return density
Ejemplo n.º 44
0
def draw_logit_regression(df, kind):
    """Fit a logit survival model on `df`, save its summary and plot it.

    The model summary is written to ``logit_result.txt``.  When `kind`
    equals 1 the function returns right after that; otherwise diagnostic
    figures are drawn and saved as ``1.eps``, ``2.eps`` and
    ``prediction.eps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in the model formula.
    kind : int
        1 to only fit the model and return the results.

    Returns
    -------
    dict or None
        ``{'Logit': [fit_result, formula]}`` when `kind` equals 1, otherwise
        ``None``.
    """
    # here the ~ sign is an = sign, and the features of our dataset
    formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp  + C(Embarked)'
    # results dictionary to hold our regression results for easy analysis later
    results = {}
    y, x = dmatrices(formula, data=df, return_type='dataframe')
    model = sm.Logit(y, x)
    res = model.fit()
    results['Logit'] = [res, formula]

    # Write the summary through a context manager so the file is always
    # closed.  This replaces the Python 2 only `print >> w, ...` statement.
    with open("logit_result.txt", "w") as w:
        w.write(str(res.summary()) + "\n")

    # `==` rather than `is`: identity comparison with an int literal only
    # works by accident of small-integer caching.
    if kind == 1:
        return results

    # `facecolor` replaces the `axisbg` keyword throughout, which current
    # matplotlib releases no longer accept.

    # Plot Predictions Vs Actual
    plt.figure(figsize=(18, 4))
    plt.subplot(121, facecolor="#DBDBDB")
    # generate predictions from our fitted model
    ypred = res.predict(x)
    plt.plot(x.index, ypred, 'bo', x.index, y, 'mo', alpha=.25)
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit predictions, Blue: \nFitted/predicted values: Red')
    plt.savefig("1.eps")

    # Residuals
    plt.subplot(122, facecolor="#DBDBDB")
    plt.plot(res.resid, 'r-')
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit Residuals')
    plt.savefig("2.eps")

    fig = plt.figure(figsize=(18, 9), dpi=1600)
    a = .2

    # Smoothed density of the predicted probabilities.
    fig.add_subplot(221, facecolor="#DBDBDB")
    kde_res = KDEUnivariate(res.predict())
    kde_res.fit()
    plt.plot(kde_res.support, kde_res.density)
    plt.fill_between(kde_res.support, kde_res.density, alpha=a)
    plt.title("Distribution of our Predictions")

    # The grid on/off flag is passed positionally because its keyword name
    # (`b`) was renamed in newer matplotlib releases.
    fig.add_subplot(222, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
    plt.grid(True, which='major', axis='x')
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Gender Bool")
    plt.title("The Change of Survival Probability by Gender (1 = Male)")

    fig.add_subplot(223, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x['C(Pclass)[T.3]'], alpha=a)
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Class Bool")
    plt.grid(True, which='major', axis='x')
    plt.title("The Change of Survival Probability by Lower Class (1 = 3rd Class)")

    fig.add_subplot(224, facecolor="#DBDBDB")
    plt.scatter(res.predict(), x.Age, alpha=a)
    plt.grid(True, linewidth=0.15)
    plt.title("The Change of Survival Probability by Age")
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Age")
    plt.savefig("prediction.eps")
Ejemplo n.º 45
0
def lfdr(p_values, pi0, trunc = True, monotone = True, transf = "probit", adj = 1.5, eps = np.power(10.0,-8)):
    """ Estimate local FDR / posterior error probability from p-values according to bioconductor/qvalue

    Parameters
    ----------
    p_values : array-like
        p-values; non-finite entries are passed through unchanged.
    pi0 : float
        Value in [0, 1] that scales the numerator of the local FDR.
    trunc : bool
        Cap local FDR values at 1.
    monotone : bool
        Make the local FDR non-decreasing in the p-value.
    transf : str
        "probit" or "logit" transformation applied before density estimation.
    adj : float
        Multiplier applied to the `bw_nrd0` bandwidth.
    eps : float
        Small offset keeping the transformations finite.

    Returns
    -------
    numpy.ndarray
        Array of the same length as `p_values`; finite positions hold the
        local FDR, the others keep their input value.

    Raises
    ------
    click.ClickException
        If a p-value or `pi0` is outside [0, 1], or `transf` is unknown.
    """
    p = np.array(p_values)

    # `lfdr_out` keeps the full-length array; its finite positions are
    # overwritten with the estimates at the end.
    lfdr_out = p
    finite = np.isfinite(p)
    p = p[finite]

    if min(p) < 0 or max(p) > 1:
        raise click.ClickException("p-values not in valid range [0,1].")
    if pi0 < 0 or pi0 > 1:
        raise click.ClickException("pi0 not in valid range [0,1].")

    def _spline_density(points):
        # KDE with bandwidth adj * bw_nrd0 on a 512-point grid, then a spline
        # through (support, density) evaluated back at `points`.
        bandwidth = bw_nrd0(points)
        estimator = KDEUnivariate(points)
        estimator.fit(bw=adj*bandwidth, gridsize = 512)
        spline = sp.interpolate.splrep(estimator.support, estimator.density)
        return sp.interpolate.splev(points, spline)

    if transf == "probit":
        # Clip into [eps, 1 - eps] so the normal quantile stays finite.
        p = np.minimum(np.maximum(p, eps), 1-eps)
        x = scipy.stats.norm.ppf(p, loc=0, scale=1)
        y = _spline_density(x)
        estimate = pi0 * scipy.stats.norm.pdf(x) / y
    elif transf == "logit":
        x = np.log((p + eps) / (1 - p + eps))
        y = _spline_density(x)
        # Derivative of the inverse logit, used as the numerator density.
        dx = np.exp(x) / np.power((1 + np.exp(x)),2)
        estimate = (pi0 * dx) / y
    else:
        raise click.ClickException("Invalid local FDR method.")

    if trunc:
        estimate[estimate > 1] = 1
    if monotone:
        # Sort by p-value, carry the running maximum forward, then map the
        # values back to the input order through the "min" ranks.
        estimate = estimate[p.ravel().argsort()]
        for pos in range(1,len(x)):
            previous = estimate[pos - 1]
            if estimate[pos] < previous:
                estimate[pos] = previous
        estimate = estimate[scipy.stats.rankdata(p,"min")-1]

    lfdr_out[finite] = estimate
    return lfdr_out