def pdf(self, token, years, bw=5, *args, **kwargs):
    """
    Evaluate a weighted kernel density estimate built from a token's series.

    The cleaned series maps sample points to ratio values; the keys are the
    points the estimator is built on and the values are used as weights.

    Args:
        token (str): Token whose series is fetched via ``clean_series``.
        years (iter): Points at which the fitted density is evaluated.
        bw (int): Bandwidth handed to the estimator's ``fit``.
        *args, **kwargs: Forwarded unchanged to ``clean_series``.

    Returns: OrderedDict {year: density}
    """
    cleaned = self.clean_series(token, *args, **kwargs)

    # Keys are the sample points, values the per-point weights.
    ratio_weights = np.array(list(cleaned.values()))
    sample_points = list(cleaned.keys())

    # Weighted fit; the FFT path is explicitly disabled.
    estimator = KDEUnivariate(sample_points)
    estimator.fit(fft=False, weights=ratio_weights, bw=bw)

    # evaluate() returns an array even for a single point; keep its first item.
    return OrderedDict(
        (year, estimator.evaluate(year)[0]) for year in years
    )
Esempio n. 2
0
 def setup_class(cls):
     """Fit the FFT-based Gaussian KDE on ``Xi`` and load its reference density."""
     # Low accuracy because binning is different.
     cls.decimal_density = 2
     res1 = KDE(Xi)
     res1.fit(kernel="gau", fft=True, bw="silverman")
     cls.res1 = res1
     rfname2 = os.path.join(curdir, 'results', 'results_kde_fft.csv')
     # Use a context manager so the reference file is closed as soon as it
     # has been parsed instead of leaking the handle.
     with open(rfname2, 'rb') as fh:
         cls.res_density = np.genfromtxt(fh)
Esempio n. 3
0
 def setupClass(cls):
     """Fit a weighted, non-FFT KDE with the class's kernel and store the reference density."""
     sample = KDEWResults['x']
     cls.x = sample
     estimator = KDE(sample)
     # Kernel name and reference-result key are supplied by the concrete class.
     estimator.fit(
         kernel=cls.kernel_name,
         weights=KDEWResults['weights'],
         fft=False,
     )
     cls.res1 = estimator
     cls.res_density = KDEWResults[cls.res_kernel_name]
Esempio n. 4
0
 def setup_class(cls):
     """Fit a weighted Gaussian KDE on ``Xi`` and load its reference density."""
     res1 = KDE(Xi)
     weights = np.linspace(1, 100, 200)
     res1.fit(kernel="gau", gridsize=50, weights=weights, fft=False,
              bw="silverman")
     cls.res1 = res1
     rfname = os.path.join(curdir, 'results', 'results_kde_weights.csv')
     # Use a context manager so the reference file is closed as soon as it
     # has been parsed instead of leaking the handle.
     with open(rfname, 'rb') as fh:
         cls.res_density = np.genfromtxt(fh, skip_header=1)
Esempio n. 5
0
 def setup_class(cls):
     """Fit a weighted, non-FFT KDE with an explicit "scott" bandwidth and store the reference density."""
     sample = KDEWResults['x']
     cls.x = sample
     estimator = KDE(sample)
     # "scott" is requested explicitly because (per the original note) it was
     # the default when the reference values were computed.
     estimator.fit(
         kernel=cls.kernel_name,
         weights=KDEWResults['weights'],
         fft=False,
         bw="scott",
     )
     cls.res1 = estimator
     cls.res_density = KDEWResults[cls.res_kernel_name]
Esempio n. 6
0
def find_outiers_kde(x):
    """
    Flag low-density points of ``x`` as outliers using a KDE.

    The values are converted to float, passed through ``scale`` and a KDE
    with the "scott" bandwidth is fitted to them. Every point whose estimated
    density is below 0.5 counts as an outlier.

    Returns:
        tuple: ``(outlierindices, outliervalue)`` - the indices of the
        lowest-density points (ascending density) and the matching entries
        of the original ``x``.
    """
    standardized = scale([float(value) for value in x])

    estimator = KDEUnivariate(standardized)
    estimator.fit(bw="scott", fft=True)
    densities = estimator.evaluate(standardized)

    # Number of points whose estimated density falls below the 0.5 cut-off.
    n_low = sum(densities < 0.5)

    # Sorting by density puts those n_low points first.
    density_order = np.asarray(densities).argsort()
    outlierindices = density_order[:n_low]
    outliervalue = np.asarray(x)[outlierindices]
    return outlierindices, outliervalue
Esempio n. 7
0
def empiricalPDF(data):
    """
    Estimate the probability density of ``data`` with a kernel density
    estimator and evaluate it at the sorted data points.

    :param data: :class:`numpy.ndarray` of data values.

    :returns: PDF values at the sorted data points; an array of zeros of the
              same length if the evaluation runs out of memory.
    """
    LOG.debug("Calculating empirical PDF")
    ordered = np.sort(data)
    estimator = KDEUnivariate(ordered)
    estimator.fit()
    try:
        return estimator.evaluate(ordered)
    except MemoryError:
        # Best-effort fallback: report a flat zero density rather than fail.
        return np.zeros(len(ordered))
Esempio n. 8
0
def kde_statsmodels_u(data, grid, **kwargs):
    """
    Univariate Kernel Density Estimation with Statsmodels

    Parameters
    ----------
    data : numpy.array
        Data points used to build the density estimator.
    grid : numpy.array
        Points at which the density is evaluated.
    **kwargs
        Forwarded unchanged to ``KDEUnivariate.fit``.

    Returns
    -------
    out : numpy.array
        Density estimate evaluated at ``grid``.
    """
    estimator = KDEUnivariate(data)
    estimator.fit(**kwargs)
    density = estimator.evaluate(grid)
    return density
Esempio n. 9
0
def k2p2FixFromSum(SumImage,
                   thresh=1,
                   output_folder=None,
                   plot_folder=None,
                   show_plot=True,
                   min_no_pixels_in_mask=8,
                   min_for_cluster=4,
                   cluster_radius=np.sqrt(2),
                   segmentation=True,
                   ws_alg='flux',
                   ws_blur=0.5,
                   ws_thres=0.05,
                   ws_footprint=3,
                   extend_overflow=True,
                   catalog=None):
    """
	Create pixel masks from Sum-image.

	Pixels with flux above ``MODE + thresh*MAD`` are regarded as significant, where MODE is the
	peak of a Gaussian KDE of the trimmed flux distribution and MAD is the (scaled) median absolute
	deviation of the fluxes below MODE. The significant pixels are clustered, optionally segmented,
	filtered on size, have their holes filled and are optionally extended along saturated columns.

	Parameters:
		SumImage (ndarray): 2D Sum-image. NaN pixels are excluded from the flux statistics.
		thresh (float, optional): Threshold for significant flux. The threshold is calculated as MODE+thresh*MAD. Default=1.
		output_folder (string, optional): Not referenced in the body of this function. Default=None.
		plot_folder (string, optional): Path to directory where plots should be saved. Default=None.
		show_plot (boolean, optional): Only used when ``plot_folder`` is given: show the summary figure if True, otherwise close all figures. Default=True.
		min_no_pixels_in_mask (integer, optional): Minimum number of pixels to constitute a mask.
		min_for_cluster (integer, optional): Minimum number of pixels to be considered a cluster in DBSCAN clustering.
		cluster_radius (float, optional): Radius around points to consider cluster in DBSCAN clustering.
		segmentation (boolean, optional): Perform segmentation of clusters using Watershed segmentation.
		ws_alg (string, optional): Watershed method to use. Also used in the name of the summary figure. Default='flux'.
		ws_blur (float, optional): Passed to the watershed segmentation as ``ws_blur``. Default=0.5.
		ws_thres (float, optional): Threshold for watershed segmentation.
		ws_footprint (integer, optional): Footprint to use in watershed segmentation.
		extend_overflow (boolean, optional): Enable extension of overflow columns for bright stars.
		catalog (ndarray, optional): Catalog of stars as an array with three columns (column, row and magnitude).
			It is passed on to the watershed segmentation and, when ``extend_overflow`` is enabled,
			used to decide which masks may be extended along saturated columns.

	Returns:
		tuple: Tuple with two elements: ``MASKS`` and a float indicating the bandwidth used for the estimation of background-levels.
			``MASKS`` is a float ndarray of 0/1 values with shape (number of masks, NY, NX), ordered by
			decreasing number of pixels, or ``None`` if no cluster passed the selection.

	Raises:
		K2P2NoFlux: If the sum-image contains no positive, non-NaN flux.
		K2P2NoStars: If no pixel lies above the significance cut.

	.. codeauthor:: Rasmus Handberg <*****@*****.**>
	.. codeauthor:: Mikkel Lund <*****@*****.**>
	"""

    # Get logger for printing messages:
    logger = logging.getLogger(__name__)
    logger.info("Creating masks from sum-image...")

    NY, NX = np.shape(SumImage)
    ori_mask = ~np.isnan(SumImage)
    X, Y = np.meshgrid(np.arange(NX), np.arange(NY))

    # Cut out pixels from sum image which were collected and contains flux
    # and flatten the 2D image to 1D array:
    Flux = SumImage[ori_mask].flatten()
    Flux = Flux[Flux > 0]

    # Check if there was actually any flux measured:
    if len(Flux) == 0:
        raise K2P2NoFlux("No measured flux in sum-image")

    # Cut away the top 15% of the fluxes:
    flux_cut = stats.trim1(np.sort(Flux), 0.15)
    # Also do a cut on the absolute values of pixel - This helps in cases where
    # the image is dominated by saturated pixels. The exact value is of course
    # in principle dependent on the CCD, but we have found this value to be
    # reasonable in TESS simulated data:
    flux_cut = flux_cut[flux_cut < 70000]

    # Estimate the bandwidth we are going to use for the background:
    background_bandwidth = select_bandwidth(flux_cut, bw='scott', kernel='gau')
    logger.debug("  Sum-image KDE bandwidth: %f", background_bandwidth)

    # Make the Kernel Density Estimation of the fluxes:
    kernel = KDE(flux_cut)
    kernel.fit(kernel='gau', bw=background_bandwidth, fft=True, gridsize=100)

    # MODE
    # The mode is found by minimizing the negated density, starting from the
    # highest point of the gridded density:
    def kernel_opt(x):
        return -1 * kernel.evaluate(x)

    max_guess = kernel.support[np.argmax(kernel.density)]
    MODE = minimize(kernel_opt, max_guess, method='Powell').x

    # MAD (around mode)
    # Only fluxes below the mode are used:
    MAD1 = mad_to_sigma * nanmedian(np.abs(Flux[(Flux < MODE)] - MODE))

    # Define the cutoff above which pixels are regarded significant:
    CUT = MODE + thresh * MAD1

    logger.debug("  Threshold used: %f", thresh)
    logger.debug("  Flux cut is: %f", CUT)
    # Diagnostic plot of the flux distribution with the mode (black) and the cut (red):
    if logger.isEnabledFor(logging.DEBUG) and plot_folder is not None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.fill_between(kernel.support, kernel.density, alpha=0.3)
        ax.axvline(MODE, color='k')
        ax.axvline(CUT, color='r')
        ax.set_xlabel('Flux')
        ax.set_ylabel('Distribution')
        save_figure(os.path.join(plot_folder, 'flux_distribution'))
        plt.close(fig)

    #==========================================================================
    # Find and separate clusters of pixels
    #==========================================================================

    # Cut out pixels of sum image with flux above the cut-off:
    # The following two lines are identical to "idx = (SumImage > CUT)",
    # but in this way we avoid an RuntimeWarning when SumImage contains NaNs.
    idx = np.zeros_like(SumImage, dtype='bool')
    np.greater(SumImage, CUT, out=idx, where=~np.isnan(SumImage))
    X2 = X[idx]
    Y2 = Y[idx]

    if np.all(~idx):
        raise K2P2NoStars("No flux above threshold")

    logger.debug("  Min for cluster is: %f", min_for_cluster)
    logger.debug("  Cluster radius is: %f", cluster_radius)

    # Run clustering algorithm
    XX, labels_ini, core_samples_mask = run_DBSCAN(X2, Y2, cluster_radius,
                                                   min_for_cluster)

    # Run watershed segmentation algorithm:
    # Demand that there was any non-noise clusters found.
    if segmentation and any(labels_ini != -1):
        # Create a set of dummy-masks that are made up of the clusters
        # that were found by DBSCAN, meaning that there could be masks
        # with several stars in them:
        DUMMY_MASKS = np.zeros((0, NY, NX), dtype='bool')
        DUMMY_MASKS_LABELS = []
        m = np.zeros_like(SumImage, dtype='bool')
        for lab in set(labels_ini):
            if lab == -1: continue
            # Create "image" of this mask:
            m[:, :] = False
            for x, y in XX[labels_ini == lab]:
                m[y, x] = True
            # Append them to lists:
            DUMMY_MASKS = np.append(DUMMY_MASKS, [m], axis=0)
            DUMMY_MASKS_LABELS.append(lab)

        # Run the dummy masks through the detection of saturated columns:
        logger.debug("Detecting saturated columns in non-segmentated masks...")
        smask, _ = k2p2_saturated(SumImage, DUMMY_MASKS, idx)

        # Create dictionary that will map a label to the mask of saturated pixels:
        if np.any(smask):
            saturated_masks = {}
            for u, sm in enumerate(smask):
                saturated_masks[DUMMY_MASKS_LABELS[u]] = sm
        else:
            saturated_masks = None

        # Run the mask segmentation algorithm on the found clusters:
        # NOTE(review): plot_folder (not output_folder) is passed as
        # output_folder here - confirm this is intended.
        labels, unique_labels, NoCluster = k2p2WS(
            X,
            Y,
            X2,
            Y2,
            SumImage,
            XX,
            labels_ini,
            core_samples_mask,
            saturated_masks=saturated_masks,
            ws_thres=ws_thres,
            ws_footprint=ws_footprint,
            ws_blur=ws_blur,
            ws_alg=ws_alg,
            output_folder=plot_folder,
            catalog=catalog)
    else:
        # No segmentation: use the clusters from DBSCAN directly.
        labels = labels_ini
        unique_labels = set(labels)
        #NoCluster = len(unique_labels) - (1 if -1 in labels else 0)

    # Make sure it is a tuple and not a set - much easier to work with:
    unique_labels = tuple(unique_labels)

    # Create list of clusters and their number of pixels:
    # Column 0 holds the number of pixels, column 1 the label.
    No_pix_sort = np.zeros([len(unique_labels), 2])
    for u, lab in enumerate(unique_labels):
        No_pix_sort[u, 0] = np.sum(labels == lab)
        No_pix_sort[u, 1] = lab

    # Only select the clusters that have enough pixels and are not noise:
    cluster_select = (No_pix_sort[:, 0] >=
                      min_no_pixels_in_mask) & (No_pix_sort[:, 1] != -1)
    no_masks = sum(cluster_select)
    No_pix_sort = No_pix_sort[cluster_select, :]

    # No masks were found, so return None:
    if no_masks == 0:
        MASKS = None

    else:
        # Sort the clusters by the number of pixels:
        # (largest cluster first)
        cluster_sort = np.argsort(No_pix_sort[:, 0])
        No_pix_sort = No_pix_sort[cluster_sort[::-1], :]

        # Create 3D array that will hold masks for each target:
        MASKS = np.zeros((no_masks, NY, NX))
        for u in range(no_masks):
            lab = No_pix_sort[u, 1]
            class_member_mask = (labels == lab)
            xy = XX[class_member_mask, :]
            MASKS[u, xy[:, 1], xy[:, 0]] = 1

        #==========================================================================
        # Fill holes in masks
        #==========================================================================
        # A pixel is a hole if it is not in the mask itself, but all four of
        # its direct neighbours are (each neighbour contributes 0.25):
        pattern = np.array([[[0, 0.25, 0], [0.25, 0, 0.25],
                             [0, 0.25, 0]]])  # 3D array - shape=(1, 3, 3)
        mask_holes_indx = ndimage.convolve(MASKS,
                                           pattern,
                                           mode='constant',
                                           cval=0.0)
        mask_holes_indx = (mask_holes_indx > 0.95) & (
            MASKS == 0
        )  # Should be exactly 1.0, but let's assume some round-off errors
        if np.any(mask_holes_indx):
            logger.info("Filling %d holes in the masks",
                        np.sum(mask_holes_indx))
            MASKS[mask_holes_indx] = 1

            if plot_folder is not None:
                # Create image showing all masks at different levels:
                img = np.zeros((NY, NX))
                for r in np.transpose(np.where(MASKS > 0)):
                    img[r[1], r[2]] = r[0] + 1

                # Plot everything together:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                plot_image(img,
                           ax=ax,
                           scale='linear',
                           percentile=100,
                           cmap='nipy_spectral',
                           title='Holes in mask filled')

                # Create outline of filled holes:
                for hole in np.transpose(np.where(mask_holes_indx)):
                    cen = (hole[2] - 0.5, hole[1] - 0.5)
                    ax.add_patch(
                        mpl.patches.Rectangle(cen,
                                              1,
                                              1,
                                              color='k',
                                              lw=2,
                                              fill=False,
                                              hatch='//'))

                #fig.savefig(os.path.join(plot_folder, 'mask_filled_holes.png'), format='png', bbox_inches='tight')
                save_figure(os.path.join(plot_folder, 'mask_filled_holes'))
                plt.close(fig)

        #==========================================================================
        # Extend overflow lanes
        #==========================================================================
        if extend_overflow:
            logger.debug("Detecting saturated columns in masks...")

            # Find pixels that are saturated in each mask and find out if they should
            # be added to the mask:
            saturated_mask, pixels_added = k2p2_saturated(SumImage, MASKS, idx)
            logger.info("Overflow will add %d pixels in total to the masks.",
                        pixels_added)

            # If we have a catalog of stars, we will only allow stars above the saturation
            # limit to get their masks extended:
            if catalog is not None:
                # Filter that catalog, only keeping stars actually inside current image:
                c = np.asarray(np.round(catalog[:, 0]), dtype='int32')
                r = np.asarray(np.round(catalog[:, 1]), dtype='int32')
                tmag = catalog[:, 2]
                indx = (c >= 0) & (c < SumImage.shape[1]) & (r >= 0) & (
                    r < SumImage.shape[0])
                c = c[indx]
                r = r[indx]
                tmag = tmag[indx]
                # Loop through the masks:
                for u in range(no_masks):
                    if np.any(saturated_mask[u, :, :]):
                        # Find out which stars fall inside this mask:
                        which_stars = np.asarray(MASKS[u, :, :][r, c],
                                                 dtype='bool')
                        if np.any(which_stars):
                            # Only allow extension of columns if the combined light of
                            # the targets in the mask exceeds the saturation limit:
                            mags_in_mask = tmag[which_stars]
                            mags_total = -2.5 * np.log10(
                                np.nansum(10**(-0.4 * mags_in_mask)))
                            if mags_total > saturation_limit:
                                # The combined magnitude of the targets is
                                # numerically above the saturation limit
                                # (i.e. fainter), so drop the extension:
                                saturated_mask[u, :, :] = False
                        else:
                            # Do not add saturation columns if no stars were found:
                            saturated_mask[u, :, :] = False

            # If we are going to plot later on, make a note
            # of how the outline of the masks looked before
            # changing anything:
            if plot_folder is not None and logger.isEnabledFor(logging.DEBUG):
                outline_before = []
                for u in range(no_masks):
                    outline_before.append(k2p2maks(MASKS[u, :, :], 1, 0.5))

            # Add the saturated pixels to the masks:
            MASKS[saturated_mask] = 1

            # If we are running as DEBUG, output some plots as well:
            # (same condition as above, so outline_before is defined here)
            if plot_folder is not None and logger.isEnabledFor(logging.DEBUG):
                logger.debug("Plotting overflow figures...")
                Ypixel = np.arange(NY)
                for u in range(no_masks):
                    mask = np.asarray(MASKS[u, :, :], dtype='bool')
                    mask_rows, mask_columns = np.where(mask)
                    mask_max = np.nanmax(SumImage[mask])

                    # The outline of the mask after saturated columns have been
                    # corrected for:
                    outline = k2p2maks(mask, 1, 0.5)

                    # One PDF per mask, with one page per column of the mask:
                    with PdfPages(
                            os.path.join(plot_folder, 'overflow_mask' +
                                         str(u) + '.pdf')) as pdf:
                        for c in sorted(set(mask_columns)):

                            column_rows = mask_rows[mask_columns == c]

                            title = "Mask %d - Column %d" % (u, c)
                            if np.any(saturated_mask[u, :, c]):
                                title += " - Saturated"

                            fig = plt.figure(figsize=(14, 6))
                            ax1 = fig.add_subplot(121)
                            ax1.axvspan(np.min(column_rows) - 0.5,
                                        np.max(column_rows) + 0.5,
                                        color='0.7')
                            ax1.plot(Ypixel,
                                     SumImage[:, c],
                                     'ro-',
                                     drawstyle='steps-mid')
                            ax1.set_title(title)
                            ax1.set_xlabel('Y pixels')
                            ax1.set_ylabel('Sum-image counts')
                            ax1.set_ylim(0, mask_max)
                            ax1.set_xlim(-0.5, NY - 0.5)

                            ax2 = fig.add_subplot(122)
                            plot_image(SumImage, ax=ax2, scale='log')
                            ax2.plot(outline_before[u][:, 0],
                                     outline_before[u][:, 1], 'r:')
                            ax2.plot(outline[:, 0], outline[:, 1], 'r-')
                            ax2.axvline(c, color='r', ls='--')

                            pdf.savefig(fig)
                            plt.close(fig)

    #==============================================================================
    # Create plots
    #==============================================================================
    if plot_folder is not None:
        # Colors to use for each cluster label:
        colors = plt.cm.gist_rainbow(np.linspace(0, 1, len(unique_labels)))

        # Colormap to use for clusters:
        # https://stackoverflow.com/questions/9707676/defining-a-discrete-colormap-for-imshow-in-matplotlib/9708079#9708079
        #cmap = mpl.colors.ListedColormap(np.append([[1, 1, 1, 1]], colors, axis=0))
        #cmap_norm = mpl.colors.BoundaryNorm(np.arange(-1, len(unique_labels)-1)+0.5, cmap.N)

        # Set up figure to hold subplots:
        if NY / NX > 5:
            aspect = 0.5
        else:
            aspect = 0.2

        fig0 = plt.figure(figsize=(2 * plt.figaspect(aspect)))
        fig0.subplots_adjust(wspace=0.12)

        # ---------------
        # PLOT 1
        ax0 = fig0.add_subplot(151)
        plot_image(SumImage, ax=ax0, scale='log', title='Sum-image')

        # ---------------
        # PLOT 2
        # Recompute the significant-flux map:
        # 0 = not collected (NaN), 1 = below the cut, 2 = above the cut.
        idx = np.zeros_like(SumImage, dtype='bool')
        np.greater(SumImage, CUT, out=idx, where=~np.isnan(SumImage))
        Flux_mat2 = np.zeros_like(SumImage)
        Flux_mat2[~idx] = 1
        Flux_mat2[idx] = 2
        Flux_mat2[ori_mask == 0] = 0

        ax2 = fig0.add_subplot(152)
        plot_image(Flux_mat2,
                   ax=ax2,
                   scale='linear',
                   percentile=100,
                   cmap='nipy_spectral',
                   title='Significant flux')

        # ---------------
        # PLOT 3
        ax2 = fig0.add_subplot(153)

        Flux_mat4 = np.zeros_like(SumImage)
        for u, lab in enumerate(unique_labels):
            class_member_mask = (labels == lab)
            xy = XX[class_member_mask, :]
            if lab == -1:
                # Black used for noise.
                ax2.plot(xy[:, 0],
                         xy[:, 1],
                         '+',
                         markerfacecolor='k',
                         markeredgecolor='k',
                         markersize=5)

            else:
                Flux_mat4[xy[:, 1], xy[:, 0]] = u + 1
                ax2.plot(xy[:, 0],
                         xy[:, 1],
                         'o',
                         markerfacecolor=tuple(colors[u]),
                         markeredgecolor='k',
                         markersize=5)

        ax2.set_title("Clustering + Watershed")
        ax2.set_xlim([-0.5, SumImage.shape[1] - 0.5])
        ax2.set_ylim([-0.5, SumImage.shape[0] - 0.5])
        ax2.set_aspect('equal')

        # ---------------
        # PLOT 4
        ax4 = fig0.add_subplot(154)
        plot_image(Flux_mat4,
                   ax=ax4,
                   scale='linear',
                   percentile=100,
                   cmap='nipy_spectral',
                   title='Extracted clusters')

        # ---------------
        # PLOT 5
        ax5 = fig0.add_subplot(155)
        plot_image(SumImage, ax=ax5, scale='log', title='Final masks')

        # Plot outlines of selected masks:
        for u in range(no_masks):
            # Get the color associated with this label:
            # NOTE(review): unique_labels is a tuple here, so this lookup
            # presumably relies on NumPy broadcasting the comparison against
            # the NumPy scalar - confirm.
            col = colors[int(np.where(unique_labels == No_pix_sort[u, 1])[0])]
            # Make mask outline:
            outline = k2p2maks(MASKS[u, :, :], 1, threshold=0.5)
            # Plot outlines:
            ax5.plot(outline[:, 0],
                     outline[:, 1],
                     color=col,
                     zorder=10,
                     lw=2.5)
            ax4.plot(outline[:, 0],
                     outline[:, 1],
                     color='k',
                     zorder=10,
                     lw=1.5)

        # Save the figure and close it:
        save_figure(os.path.join(plot_folder, 'masks_' + ws_alg))
        if show_plot:
            plt.show()
        else:
            plt.close('all')

    return MASKS, background_bandwidth
Esempio n. 10
0
# Bootstrap fits of three candidate distributions to the residuals.
# presumably each call returns (parameters, lower bound, upper bound) - verify
# against bootstrap_fit.
ln_par, ln_lo, ln_up = bootstrap_fit(
    stats.lognorm, resid, n_iter=n_bs, quant=q
)
hc_par, hc_lo, hc_up = bootstrap_fit(
    stats.halfcauchy, resid, n_iter=n_bs, quant=q
)
gam_par, gam_lo, gam_up = bootstrap_fit(
    stats.gamma, resid, n_iter=n_bs, quant=q
)

##################################################################
# Direct fits on the full sample: frozen half-Cauchy and log-normal
# distributions, a kernel density estimate and the empirical CDF.

hc = stats.halfcauchy(*stats.halfcauchy.fit(resid))
lg = stats.lognorm(*stats.lognorm.fit(resid))
dens = KDEUnivariate(resid)
dens.fit()
ecdf = ECDF(resid)

##################################################################
# prepare X axes for plotting

# x-values of the empirical CDF and a regular 2000-point grid spanning the
# range of the residuals.
ex = ecdf.x
x = np.linspace(min(resid), max(resid), 2000)

##################################################################
# Fit a Landau distribution with ROOT

if HAS_ROOT:
    # 100-bin histogram of the residuals on [0, pi], normalised by its integral.
    root_hist = rootpy.plotting.Hist(100, 0, np.pi)
    root_hist.fill_array(resid)
    root_hist /= root_hist.Integral()
Esempio n. 11
0
def lfdr(p_values, pi0, trunc=True, monotone=True, transf="probit", adj=1.5, eps=np.power(10.0, -8)):
    """
    Estimate local FDR / posterior error probability from p-values according to bioconductor/qvalue.

    Args:
        p_values: Array-like of p-values. Non-finite entries are passed
            through unchanged in the output.
        pi0: Estimated proportion of true null hypotheses, in [0, 1].
        trunc: If True, cap the local FDR values at 1.
        monotone: If True, force the local FDR to be non-decreasing in the
            p-value.
        transf: Transformation applied to the p-values before the density
            estimate, either "probit" or "logit".
        adj: Multiplier applied to the ``bw_nrd0`` bandwidth.
        eps: Small offset keeping the transformed p-values finite.

    Returns:
        numpy.ndarray: Float array shaped like ``p_values`` holding the local
        FDR for every finite p-value.

    Raises:
        click.ClickException: If a p-value or ``pi0`` lies outside [0, 1] or
            ``transf`` is not a known method.
    """
    def smoothed_density(points):
        # Mirrors the R reference: density(x, adjust=adj), then a spline
        # through the gridded density that is evaluated back at the points.
        myd = KDEUnivariate(points)
        myd.fit(bw=adj * bw_nrd0(points), gridsize=512)
        splinefit = sp.interpolate.splrep(myd.support, myd.density)
        return sp.interpolate.splev(points, splinefit)

    # Always work on a float copy: with an integer input array the results
    # written back below would otherwise be silently truncated.
    lfdr_out = np.array(p_values, dtype=float)
    rm_na = np.isfinite(lfdr_out)
    p = lfdr_out[rm_na]

    # Check inputs
    if p.size and (p.min() < 0 or p.max() > 1):
        raise click.ClickException("p-values not in valid range [0,1].")
    if pi0 < 0 or pi0 > 1:
        raise click.ClickException("pi0 not in valid range [0,1].")

    # Nothing finite to estimate from: hand back the non-finite entries
    # untouched instead of failing inside the density estimate.
    if not p.size:
        return lfdr_out

    # Local FDR method for both probit and logit transformations
    if transf == "probit":
        p = np.maximum(p, eps)
        p = np.minimum(p, 1 - eps)
        x = scipy.stats.norm.ppf(p, loc=0, scale=1)
        y = smoothed_density(x)
        lfdr_vals = pi0 * scipy.stats.norm.pdf(x) / y
    elif transf == "logit":
        x = np.log((p + eps) / (1 - p + eps))
        y = smoothed_density(x)
        # Derivative of the inverse logit, i.e. the change-of-variable factor.
        dx = np.exp(x) / np.power((1 + np.exp(x)), 2)
        lfdr_vals = (pi0 * dx) / y
    else:
        raise click.ClickException("Invalid local FDR method.")

    if trunc:
        lfdr_vals[lfdr_vals > 1] = 1
    if monotone:
        # Running maximum over the values ordered by p-value, then map the
        # result back to the original order through the ranks.
        lfdr_vals = np.maximum.accumulate(lfdr_vals[p.ravel().argsort()])
        lfdr_vals = lfdr_vals[scipy.stats.rankdata(p, "min") - 1]

    lfdr_out[rm_na] = lfdr_vals
    return lfdr_out
Esempio n. 12
0
 def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
     """Evaluate a statsmodels univariate KDE of ``self.data`` on ``x_grid``.

     ``bandwidth`` is passed to ``fit`` as ``bw``; any further keyword
     arguments are forwarded to ``fit`` unchanged.
     """
     from statsmodels.nonparametric.kde import KDEUnivariate
     estimator = KDEUnivariate(self.data)
     estimator.fit(bw=bandwidth, **kwargs)
     density = estimator.evaluate(x_grid)
     return density
Esempio n. 13
0

# Further diagnostic plots of the fitted model's predictions.

# NOTE(review): figsize=(18, 9) at dpi=1600 is a 28800x14400-pixel canvas -
# confirm this is intended.
fig = plt.figure(figsize=(18,9), dpi=1600)
a = .2  # alpha (transparency) used by the plots below

# NOTE(review): newer matplotlib releases replaced `axisbg` with `facecolor` -
# confirm the targeted matplotlib version.
fig.add_subplot(221, axisbg="#DBDBDB")

"""
this is the "kernel density estimator", just like was used above,
to create a nice smoothed density plot of the predictions
the y-values look incorrect, but I'm guessing the shape is right
"""
kde_res = KDEUnivariate(res.predict())
kde_res.fit()

# `support` holds the x-values and `density` the matching density values
# produced by fit(); presumably `support` is the evaluation grid - verify
# against the statsmodels documentation.
plt.plot(kde_res.support,kde_res.density)
plt.fill_between(kde_res.support,kde_res.density, alpha=a)
plt.title("Distribution of our Predictions")

# Scatter the predicted survival probability against the male indicator
# column, to show that predicted survival probabilities are much lower
# for males than females.
fig.add_subplot(222, axisbg="#DBDBDB")
plt.scatter(res.predict(),x['C(Sex)[T.male]'] , alpha=a)
# NOTE(review): newer matplotlib releases renamed the `b` argument of grid()
# to `visible` - confirm the targeted matplotlib version.
plt.grid(b=True, which='major', axis='x')
plt.xlabel("Predicted chance of survival")
plt.ylabel("Gender Bool")
plt.title("The Change of Survival Probability by Gender (1 = Male)")
def kde_statsmodels_u(x, x_grid, bandwidth=0.2, **kwargs):
    """Evaluate a statsmodels univariate KDE of ``x`` on ``x_grid``.

    ``bandwidth`` is passed to ``fit`` as ``bw``; any further keyword
    arguments are forwarded to ``fit`` unchanged.
    """
    estimator = KDEUnivariate(x)
    estimator.fit(bw=bandwidth, **kwargs)
    density = estimator.evaluate(x_grid)
    return density
Esempio n. 15
0
def draw_logit_regression(df, kind):
    """
    Fit a logit survival model, write its summary to disk and plot diagnostics.

    Args:
        df: Data handed to ``dmatrices``; it must provide the columns named
            in the formula (Survived, Pclass, Sex, Age, SibSp, Embarked).
        kind: When equal to 1, return right after the summary has been
            written and skip all plots.

    Returns:
        dict or None: ``{'Logit': [fitted_result, formula]}`` when
        ``kind == 1``; otherwise ``None`` once the figures have been saved.

    Side effects:
        Writes ``logit_result.txt`` and, unless ``kind == 1``, saves
        ``1.eps``, ``2.eps`` and ``prediction.eps`` in the working directory.
    """
    # In the formula, `~` separates the response (left) from the features.
    formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp  + C(Embarked)'
    results = {}  # holds the regression result for easy analysis later
    y, x = dmatrices(formula, data=df, return_type='dataframe')
    model = sm.Logit(y, x)
    res = model.fit()
    results['Logit'] = [res, formula]

    # Write the summary through a context manager so the file is flushed and
    # closed on every path (including the early return below). `write` works
    # on both Python 2 and 3, unlike the `print >> w` statement.
    with open("logit_result.txt", "w") as w:
        w.write("%s\n" % (res.summary(),))

    # Value comparison: identity (`is`) on an int literal is unreliable.
    if kind == 1:
        return results

    # NOTE(review): newer matplotlib releases replaced `axisbg` with
    # `facecolor` and renamed grid()'s `b` argument to `visible` - confirm the
    # targeted matplotlib version before changing the calls below.

    # Plot Predictions Vs Actual
    plt.figure(figsize=(18,4))
    plt.subplot(121, axisbg="#DBDBDB")
    # generate predictions from our fitted model
    ypred = res.predict(x)
    plt.plot(x.index, ypred, 'bo', x.index, y, 'mo', alpha=.25)
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit predictions, Blue: \nFitted/predicted values: Red')
    plt.savefig("1.eps")

    # Residuals
    plt.subplot(122, axisbg="#DBDBDB")
    plt.plot(res.resid, 'r-')
    plt.grid(color='white', linestyle='dashed')
    plt.title('Logit Residuals')
    plt.savefig("2.eps")

    # Second figure: 2x2 grid of plots built from the model's predictions.
    fig = plt.figure(figsize=(18,9), dpi=1600)
    a = .2  # alpha (transparency) shared by the plots below

    # Smoothed density of the predictions.
    fig.add_subplot(221, axisbg="#DBDBDB")
    kde_res = KDEUnivariate(res.predict())
    kde_res.fit()
    plt.plot(kde_res.support, kde_res.density)
    plt.fill_between(kde_res.support, kde_res.density, alpha=a)
    plt.title("Distribution of our Predictions")

    fig.add_subplot(222, axisbg="#DBDBDB")
    plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
    plt.grid(b=True, which='major', axis='x')
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Gender Bool")
    plt.title("The Change of Survival Probability by Gender (1 = Male)")

    fig.add_subplot(223, axisbg="#DBDBDB")
    plt.scatter(res.predict(), x['C(Pclass)[T.3]'], alpha=a)
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Class Bool")
    plt.grid(b=True, which='major', axis='x')
    plt.title("The Change of Survival Probability by Lower Class (1 = 3rd Class)")

    fig.add_subplot(224, axisbg="#DBDBDB")
    plt.scatter(res.predict(), x.Age, alpha=a)
    plt.grid(True, linewidth=0.15)
    plt.title("The Change of Survival Probability by Age")
    plt.xlabel("Predicted chance of survival")
    plt.ylabel("Age")
    plt.savefig("prediction.eps")
my_sample = my_data.age206Pb_238U

# Plot the Density Histogram
fig, ax = plt.subplots(figsize=(8, 5))
bins = np.arange(0, 1500, 20)
ax.hist(my_sample,
        bins,
        color='#c7ddf4',
        edgecolor='k',
        density=True,
        label='Density Histogram - bins = 20 My')

# Compute and plot the KDE
# The density is evaluated on a regular grid with a step of 10, using a
# fixed bandwidth of 20.
age_eval = np.arange(0, 1500, 10)
kde = KDEUnivariate(my_sample)
kde.fit(bw=20)
pdf = kde.evaluate(age_eval)
ax.plot(age_eval,
        pdf,
        label='Gaussian KDE - bw = 20 Ma',
        linewidth=2,
        alpha=0.7,
        color='#ff464a')

# Adjust diagram parameters
ax.set_ylim(0, 0.0018)
ax.set_xlabel('Age (My)')
# Axis label spelling corrected (was 'Probability Densisty').
ax.set_ylabel('Probability Density')
ax.legend()
ax.grid(axis='y')
Esempio n. 17
0
def test_fit_self(reset_randomstate):
    """``KDE.fit`` must hand back a ``KDE`` instance."""
    sample = np.random.standard_normal(100)
    estimator = KDE(sample)
    assert isinstance(estimator, KDE)
    fitted = estimator.fit()
    assert isinstance(fitted, KDE)
Esempio n. 18
0
 def setup_class(cls):
     """Fit a non-FFT Gaussian KDE on ``Xi`` and store the "gau_d" reference density."""
     estimator = KDE(Xi)
     estimator.fit(kernel="gau", fft=False, bw="silverman")
     cls.res1 = estimator
     cls.res_density = KDEResults["gau_d"]
Esempio n. 19
0
# Finish the residual plot: x-axis from -1 to the number of residuals.
ax2.set_xlim(-1, len(res.resid_dev))
plt.title('Logit Residuals')

# Further diagnostic plots of the fitted model's predictions.

# NOTE(review): figsize=(18, 9) at dpi=1600 is a 28800x14400-pixel canvas -
# confirm this is intended.
fig = plt.figure(figsize=(18, 9), dpi=1600)
a = .2  # alpha (transparency) used by the plots below

# NOTE(review): newer matplotlib releases replaced `axisbg` with `facecolor` -
# confirm the targeted matplotlib version.
fig.add_subplot(221, axisbg="#DBDBDB")
"""
this is the "kernel density estimator", just like was used above,
to create a nice smoothed density plot of the predictions
the y-values look incorrect, but I'm guessing the shape is right
"""
kde_res = KDEUnivariate(res.predict())
kde_res.fit()

# `support` holds the x-values and `density` the matching density values
# produced by fit(); presumably `support` is the evaluation grid - verify
# against the statsmodels documentation.
plt.plot(kde_res.support, kde_res.density)
plt.fill_between(kde_res.support, kde_res.density, alpha=a)
plt.title("Distribution of our Predictions")

# Scatter the predicted survival probability against the male indicator
# column, to show that predicted survival probabilities are much lower
# for males than females.
fig.add_subplot(222, axisbg="#DBDBDB")
plt.scatter(res.predict(), x['C(Sex)[T.male]'], alpha=a)
# NOTE(review): newer matplotlib releases renamed the `b` argument of grid()
# to `visible` - confirm the targeted matplotlib version.
plt.grid(b=True, which='major', axis='x')
plt.xlabel("Predicted chance of survival")
plt.ylabel("Gender Bool")
plt.title("The Change of Survival Probability by Gender (1 = Male)")
def kde_statsmodels_u(x, x_grid, bandwidth=0.2, **kwargs):
    """Fit a statsmodels univariate KDE to ``x`` and evaluate it on ``x_grid``.

    ``bandwidth`` is passed to ``fit`` as ``bw``; remaining keyword arguments
    go to ``fit`` unchanged.
    """
    model = KDEUnivariate(x)
    model.fit(bw=bandwidth, **kwargs)
    values = model.evaluate(x_grid)
    return values
def kde_statsmodels_u(x, x_grid, bandwidth, **kwargs):
    """Fit a statsmodels univariate KDE to ``x`` and evaluate it on ``x_grid``.

    ``bandwidth`` is required and passed to ``fit`` as ``bw``; remaining
    keyword arguments go to ``fit`` unchanged.
    """
    model = KDEUnivariate(x)
    model.fit(bw=bandwidth, **kwargs)
    values = model.evaluate(x_grid)
    return values
Esempio n. 22
0
# Use the fully-qualified option name: the bare 'precision' pattern is
# ambiguous on current pandas releases, while 'display.precision' is accepted
# by old and new versions alike.
pd.set_option('display.precision', 0)
# Alternative:
# df.round({'Father': 0, 'Mother': 0, 'Height': 0})

###############################################################################
# Q2: Any Na in the dataframe?

# Rows containing at least one missing value, and the total count of missing
# cells.
null_data = df[df.isnull().any(axis=1)]
print("There are " + str(df.isnull().sum().sum()) + ' total missing values')

###############################################################################
# Q3: Plot height density for Mothers and Fathers

plt.figure()

# Both densities are evaluated on the same grid, so it is built once.
x_grid = np.linspace(140, 210)

kde_father = KDEUnivariate(df['Father'])
kde_father.fit(bw=2, kernel='gau')
pdf_est_father = kde_father.evaluate(x_grid)

kde_mother = KDEUnivariate(df['Mother'])
kde_mother.fit(bw=2, kernel='gau')
pdf_est_mother = kde_mother.evaluate(x_grid)

plt.plot(x_grid, pdf_est_father, color=my_blue, label='Father')
plt.fill_between(x_grid, 0, pdf_est_father, facecolor=my_blue, alpha=0.5)

plt.plot(x_grid, pdf_est_mother, color=my_purple, label='Mother')
plt.fill_between(x_grid, 0, pdf_est_mother, facecolor=my_purple, alpha=0.5)

plt.ylabel('Density', fontsize=18)
Esempio n. 23
0
 def setup_class(cls):
     """Fit a non-FFT biweight ("biw") KDE on ``Xi`` and store the "biw_d" reference density."""
     estimator = KDE(Xi)
     estimator.fit(kernel="biw", fft=False, bw="silverman")
     cls.res1 = estimator
     cls.res_density = KDEResults["biw_d"]
Esempio n. 24
0
 def kde_statsmodels_u(self, x_grid, bandwidth=0.2, **kwargs):
     """Fit a statsmodels univariate KDE to ``self.data`` and evaluate it on ``x_grid``.

     ``bandwidth`` is passed to ``fit`` as ``bw``; remaining keyword
     arguments go to ``fit`` unchanged.
     """
     from statsmodels.nonparametric.kde import KDEUnivariate
     model = KDEUnivariate(self.data)
     model.fit(bw=bandwidth, **kwargs)
     values = model.evaluate(x_grid)
     return values