Example #1
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
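
A minimal usage sketch for kde_sklearn above (hedged: it assumes the numpy/scikit-learn imports the snippet itself relies on; the shapes follow the documented `n x p` / `m x p` layout):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
data = rng.randn(500, 1)                       # n x p training sample
grid = np.linspace(-4, 4, 200)[:, np.newaxis]  # m x p evaluation points
dens = kde_sklearn(data, grid, bandwidth=0.4, kernel='gaussian')
# dens has shape (200,) and approximates the standard normal pdf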
    def pdf(self, token, years, bandwidth=5):

        """
        Estimate a density function from a token's rank series.

        Args:
            token (str)
            years (range)

        Returns: OrderedDict {year: density}
        """

        series = self.series(token)

        data = []
        for year, wpm in series.items():
            data += [year] * round(wpm)

        data = np.array(data)[:, np.newaxis]

        pdf = KernelDensity(bandwidth=bandwidth).fit(data)

        samples = OrderedDict()

        for year in years:
            samples[year] = np.exp(pdf.score_samples([[year]]))[0]

        return samples
Example #3
def KDE_plt(categories,inter_arrivals):
    KDEs = []
    for i in range(0,len(categories)):

        X = np.asarray(extract_cat_samples(inter_arrivals,categories,i))#for single inter-arrivals in a category
        #X = np_matrix(categories[i][0])#for avg(inter-arrival)/person in a category
        kde = KernelDensity(kernel='gaussian', bandwidth=4).fit(X)
        KDEs.append(kde) #to use for prob_return()
        max_sample = max_interarrival_mean(categories,inter_arrivals,i)
        X_plot = np.linspace(0,1.5*max_sample,2000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)

        plt.figure(i)
        plt.plot(X_plot[:, 0], np.exp(log_dens), '-',label="kernel = '{0}'".format('gaussian'))
            #plt.draw()
            #plt.pause(0.001)
        #plt.title("Non-Parametric Density Estimation for category=%s Visitors"%(i))
        plt.hist(combine_inner_lists(extract_cat_samples(inter_arrivals,categories,i)),bins=40,density=True,color="cyan",alpha=.3,label="histogram") #alpha, from 0 (transparent) to 1 (opaque)
       # plt.hist(np.asarray(categories[i][0]),bins=40,normed=1,color="cyan",alpha=.3,label="histogram") #alpha, from 0 (transparent) to 1 (opaque)
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        save_as='./app/static/img/cat_result/kde/kdeplt_cat'+str(i)+'.png' # dump result into kde folder
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(plt.figure(i))
    return KDEs
 def EstimateDensity(self,name,df,histogram,f,s,ax):
     # if the desired output is in Histogram format
     if(histogram):
         finRes = []
         lab = []
         for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 finRes.append(res)
                 lab.append(name[0]+ ' = ' + str(i))
         pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)
         
     # if the desired output is simple plot
     else:
             for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 res = res.reshape(res.shape[0],1)
                 X_plot = np.array(np.linspace(-1, 5,20)).reshape(20,1)
                 kde= KernelDensity(kernel='exponential', bandwidth=0.05)
                 kde.fit(res)
                 log_dens = kde.score_samples(X_plot)
                 ax.plot(X_plot,np.exp(log_dens),label=name[0]+ ' = ' + str(i))        
     ax.legend()
     ax.set_title(name[1] + " distribution for changing " + name[0])
Example #5
    def kde(self, term, bandwidth=2000, samples=1000, kernel='gaussian'):

        """
        Estimate the kernel density of the instances of term in the text.

        Args:
            term (str): A stemmed term.
            bandwidth (int): The kernel bandwidth.
            samples (int): The number of evenly-spaced sample points.
            kernel (str): The kernel function.

        Returns:
            np.array: The density estimate.
        """

        # Get the offsets of the term instances.
        terms = np.array(self.terms[term])[:, np.newaxis]

        # Fit the density estimator on the terms.
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(terms)

        # Score an evenly-spaced array of samples.
        x_axis = np.linspace(0, len(self.tokens), samples)[:, np.newaxis]
        scores = kde.score_samples(x_axis)

        # Scale the scores to integrate to 1.
        return np.exp(scores) * (len(self.tokens) / samples)
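
Because the scores are rescaled by len(self.tokens) / samples, summing the returned array approximates the integral of the density over the text and should come out close to 1. A self-contained sketch of that rescaling on synthetic offsets (hedged: it mirrors the method's arithmetic rather than calling the class):

import numpy as np
from sklearn.neighbors import KernelDensity

n_tokens, samples = 10000, 1000
offsets = np.random.RandomState(1).randint(0, n_tokens, 200)[:, np.newaxis]
kde = KernelDensity(kernel='gaussian', bandwidth=500).fit(offsets)
x_axis = np.linspace(0, n_tokens, samples)[:, np.newaxis]
dens = np.exp(kde.score_samples(x_axis)) * (n_tokens / samples)
print(dens.sum())  # ~1.0, up to a little mass leaking past the text boundaries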
def plot_kde_histogram2(X1, X2, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X1/X2. Assume data is 1D.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for (X, style) in [(X1, '-'), (X2, '--')]:
        X_samp = X.ravel()[:,np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range/3.0)
        plot_max = X_max + (X_range/3.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style)
    fig.savefig(f_name, facecolor='w', edgecolor='w',
                transparent=False, pad_inches=0.1)
    plt.close(fig)
    return
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The optimal bandwidth is calculated according to Silverman's rule of thumb.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """

    bw = get_silverman_bandwidth(df, column)

    kde = KernelDensity(kernel='gaussian', bandwidth=bw)

    x = df[column]

    kde.fit(x.values[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(np.ravel(x), bins=bins, alpha=0.5, color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)

    return ax
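
get_silverman_bandwidth is not defined in this snippet; a plausible stand-in, using the usual statement of Silverman's rule of thumb (the helper in the original codebase may differ):

import numpy as np

def get_silverman_bandwidth(df, column):
    # Hypothetical helper: Silverman's rule-of-thumb bandwidth,
    # 0.9 * min(std, IQR/1.34) * n**(-1/5).
    x = df[column].dropna().values
    iqr = np.percentile(x, 75) - np.percentile(x, 25)
    return 0.9 * min(np.std(x, ddof=1), iqr / 1.34) * len(x) ** (-0.2)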
Example #8
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4,
               n_jobs=1, n_samp_scaling=100, n_samp_tuning=1000, ax=None,
               **kwargs):
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt

    kwargs.setdefault('linewidths', 0)
    kwargs.setdefault('s', 20)
    kwargs.setdefault('cmap', 'winter')

    X = np.asarray([xs, ys]).T
    n = X.shape[0]
    samp_X = X[np.random.choice(n, min(n_samp_scaling, n), replace=False)]
    median_sqdist = np.median(euclidean_distances(samp_X, squared=True))
    bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)
    est = GridSearchCV(KernelDensity(), {'bandwidth': bws}, n_jobs=n_jobs)
    est.fit(X[np.random.choice(n, min(n_samp_tuning, n), replace=False)])
    bw = est.best_params_['bandwidth']

    kde = KernelDensity(bandwidth=bw)
    kde.fit(X)
    densities = kde.score_samples(X)
    if not log_color:
        np.exp(densities, out=densities)
    ax.scatter(xs, ys, c=densities, **kwargs)
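
A hedged usage sketch for kdescatter (it assumes the imports the body relies on: numpy, matplotlib, sklearn's KernelDensity and GridSearchCV, and sklearn.metrics.pairwise.euclidean_distances):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
xs = np.concatenate([rng.randn(300), rng.randn(300) + 4])
ys = np.concatenate([rng.randn(300), rng.randn(300) + 4])
kdescatter(xs, ys)            # colors each point by its estimated density
plt.colorbar(label='density')
plt.show()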
def draw_posterior_kld_hist(X_kld, X_vae, f_name, bins=25):
    """
    Plot KDE-smoothed histograms.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel('Posterior KLd Density')
    ax.set_title('Posterior KLds: Over-regularized vs. Standard')
    for (X, style, label) in [(X_kld, '-', 'ORK'), (X_vae, '--', 'VAR')]:
        X_samp = X.ravel()[:,np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range/4.0)
        plot_max = X_max + (X_range/4.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style, label=label)
    ax.legend()
    fig.savefig(f_name, facecolor='w', edgecolor='w',
                format='pdf', transparent=False, pad_inches=0.1)
    plt.close(fig)
    return
Example #10
def test2():
    arr = np.concatenate((np.linspace(0, 10, 10), np.linspace(2, 4, 10), np.linspace(7, 10, 10)))[:, np.newaxis]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(arr)
    X = np.linspace(0,10,1000)[:, np.newaxis]
    log_dens = kde.score_samples(X)
    plt.plot(X, log_dens)
    plt.show()
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new
    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan', bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
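
kde_opt4 is essentially a naive-Bayes classifier whose class-conditional densities are products of per-feature univariate KDEs. A minimal sketch of just that core idea (hypothetical names; the real pipeline above additionally picks bandwidths via kdeBGK10 / gaussian_kde):

import numpy as np
from sklearn.neighbors import KernelDensity

def kde_naive_bayes_scores(X_train, y_train, X_test, bandwidth=0.2):
    # One univariate KDE per (class, feature); multiply the densities.
    classes = np.unique(y_train)
    scores = np.ones((len(X_test), len(classes)))
    for ci, c in enumerate(classes):
        Xc = X_train[y_train == c]
        for j in range(X_train.shape[1]):
            kde = KernelDensity(bandwidth=bandwidth).fit(Xc[:, j:j + 1])
            scores[:, ci] *= np.exp(kde.score_samples(X_test[:, j:j + 1]))
    return scores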
Example #12
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """
    Given particle positions as a coordinate object, compute the
    surface density using a kernel density estimate.
    """

    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9.+0.1, grid_step) # deg
    ygrid = np.arange(26.5, 33.5+0.1, grid_step) # deg
    shp = (xgrid.size, ygrid.size)
    meshies = np.meshgrid(xgrid, ygrid)
    grid = np.vstack(list(map(np.ravel, meshies))).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x,y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    log_dens = np.log10(dens)

    return grid, log_dens
def plot_kde_histogram(X, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X. Assume data is univariate.
    """
    import matplotlib.pyplot as plt
    X = X.ravel()
    np.random.shuffle(X)
    X = X[0:min(X.shape[0], 1000000)]
    X_samp = X[:,np.newaxis]
    X_min = np.min(X_samp)
    X_max = np.max(X_samp)
    X_range = X_max - X_min
    sigma = X_range / float(bins)
    plot_min = X_min - (X_range/3.0)
    plot_max = X_max + (X_range/3.0)
    plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
    # make a kernel density estimator for the data in X
    kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
    # make a figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(plot_X, np.exp(kde.score_samples(plot_X)))
    fig.savefig(f_name, facecolor='w', edgecolor='w',
                transparent=False, pad_inches=0.1)
    plt.close(fig)
    return
Example #14
def max_prob(df):
    df_tmp = df.copy()

    arr = []
    for ind in df_tmp.index:
        row = df_tmp.loc[ind]
        d = row.dropna().values
        # d = d.dropna()
        if len(d)==0:
            centre = np.nan
            arr.append(centre)
            continue

        # arr = vals.sort(axis=0)
        # df_ordered = pd.DataFrame(vals, index=df.index, columns=df.columns)

        x_grid = np.linspace(d.min(), d.max(), 50)
        x_grid = x_grid.reshape(-1,1)
        d = d.reshape(-1,1)

        kde = KernelDensity().fit(d)
        log_dens = kde.score_samples(x_grid)
        vals = np.exp(log_dens).round(4)
        centre = x_grid[vals.argmax()][0]
        centre2 = round(centre, 4)
        # TODO first element adds unnecessary decimal places (use decimal places class to fix)
        arr.append(centre2)
    return arr
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
Example #16
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
def get_density_based_best_sample(X, known_votes, possibilities):
  total_votes = sum(map(lambda x: len(x), known_votes))
  print(total_votes)
  X = X.toarray()
  current_vectors = numpy.copy(X)
  #print 'X', X
  #print 'known_votes ', known_votes
  original_docs = len(X)
  possibilities = set([x[0] for x in possibilities])
  #print possibilities

  for i, sample in enumerate(known_votes):
    for k in range(len(sample)):
      current_vectors = numpy.append(current_vectors, [X[i]], axis=0)
  #print 'current_vectors ', current_vectors, len(current_vectors)
  #assert current_vectors != X
  model = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(current_vectors)
  scores = model.score_samples(X)
  
  if (total_votes % 3):
    #Explore low density regions
    sorted_scores = sorted(enumerate(scores), key = lambda x: x[1], reverse=True)
  else:
    #Exploit high density regions one time out of 3
    sorted_scores = sorted(enumerate(scores), key = lambda x: x[1])
  #print sorted_scores
  for i in range(original_docs):
    if sorted_scores[i][0] in possibilities:
      #print sorted_scores[i][0]
      return sorted_scores[i][0]
  return None
Example #18
def sklearn_kde(data, points):
    from sklearn.neighbors import KernelDensity
    from numpy import mean, std, exp  # names used below

    # Silverman bandwidth estimator
    n, d = data.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # standardize data so that we can use uniform bandwidth
    mu, sigma = mean(data, axis=0), std(data, axis=0)
    data, points = (data - mu)/sigma, (points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)
def find_kernel(data, numgrid = 1000, bw = 0.002):
	Xtrain = data[:,0:2]
	ytrain = data[:,2]
	# Set up the data grid for the contour plot
	xgrid = np.linspace(-74.1, -73.65, numgrid)
	ygrid = np.linspace(40.5, 40.8, numgrid)
	X, Y = np.meshgrid(xgrid, ygrid)

	xy = np.vstack([Y.ravel(), X.ravel()]).T

	# Plot map of with distributions of each species
	fig = plt.figure()
    # construct a kernel density estimate of the distribution
	kde = KernelDensity(bandwidth=bw,
                    kernel='gaussian')
	kde.fit(Xtrain, y = ytrain)

	# evaluate the density model on the grid points
	Z = np.exp(kde.score_samples(xy))
	Z = Z.reshape(X.shape)

    # plot contours of the density
	levels = np.linspace(0, Z.max(), 25)
	plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
	plt.title('BK CRIME')
	plt.show()
	return Z
Example #20
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation points.
    """
    from sklearn.neighbors import KernelDensity
    from numpy import mean, std, exp  # names used below

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # Standardize data so that we can use uniform bandwidth.
    # Note that we will need to scale the resulting density by sigma to
    # correct the area.
    mu, sigma = mean(sample_points, axis=0), std(sample_points, axis=0)
    data, points = (sample_points - mu)/sigma, (evaluation_points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)/np.prod(sigma)  # undo the x scaling on the data points
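
A quick sanity check of sklearn_density on 1-D normal data (a sketch; the returned values should track scipy's normal pdf to within a few percent):

import numpy as np
from scipy.stats import norm

pts = np.random.RandomState(0).randn(2000, 1)     # sample_points, n x d
grid = np.linspace(-3, 3, 50)[:, np.newaxis]      # evaluation_points
dens = sklearn_density(pts, grid)
print(np.max(np.abs(dens - norm.pdf(grid[:, 0]))))  # small, e.g. < 0.05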
Example #21
def kde_fit_quantiles(rtquants, nsamples=1000, bw=.1):
    """ takes quantile estimates and fits cumulative density function
    returns samples to pass to sns.kdeplot()
    """
    kdefit = KernelDensity(kernel='gaussian', bandwidth=bw).fit(rtquants)
    samples = kdefit.sample(n_samples=nsamples).flatten()
    return samples
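
Hedged usage sketch: pass quantile estimates as an n x 1 array and hand the resampled values to seaborn, as the docstring suggests:

import numpy as np
import seaborn as sns

rts = np.random.RandomState(0).lognormal(size=500)
rtquants = np.percentile(rts, [10, 30, 50, 70, 90])[:, np.newaxis]
samples = kde_fit_quantiles(rtquants, nsamples=2000, bw=.1)
sns.kdeplot(samples)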
Example #22
def test_KernelDensity_sampling(n_samples=100, n_features=3):
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))

    bandwidth = 0.2

    for kernel in ["gaussian", "tophat"]:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == "tophat":
            assert np.all(dist < bandwidth)
        elif kernel == "gaussian":
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ["epanechnikov", "exponential", "linear", "cosine"]:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)
Example #23
def xy_kde(xy,bandwidth,N_grid=100,levels=[0.8,0.6,0.4,0.2]):  
    
    x_edges = np.linspace(np.min(xy[:,0]),np.max(xy[:,0]),N_grid+1)
    y_edges = np.linspace(np.min(xy[:,1]),np.max(xy[:,1]),N_grid+1)
    x_centres = np.array([x_edges[b] + (x_edges[b+1]-x_edges[b])/2 
                          for b in range(N_grid)])
    y_centres = np.array([y_edges[b] + (y_edges[b+1]-y_edges[b])/2 
                          for b in range(N_grid)])
    x_grid, y_grid = np.meshgrid(x_centres,y_centres)
    xy_grid = np.array([np.ravel(x_grid),np.ravel(y_grid)]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid,N_grid))
    # this bit is taken from the corner_plot.py method.
    ######################################
    Hflat = H.flatten()
    inds = np.argsort(Hflat)[::-1]
    Hflat = Hflat[inds]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        try:
            V[i] = Hflat[sm <= v0][-1]
        except IndexError:
            V[i] = Hflat[0]
    #####################################
    V = np.sort(V)
    
    return H, V, x_grid, y_grid, bandwidth
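
A usage sketch for xy_kde: the returned V are density thresholds that enclose the requested fractions of probability mass, which is exactly what contour levels want (matplotlib import assumed):

import numpy as np
import matplotlib.pyplot as plt

xy = np.random.RandomState(0).multivariate_normal(
    [0, 0], [[1.0, 0.6], [0.6, 1.0]], size=2000)
H, V, x_grid, y_grid, bw = xy_kde(xy, bandwidth=0.3)
plt.contour(x_grid, y_grid, H, levels=V)   # 80/60/40/20% mass contours
plt.scatter(xy[:, 0], xy[:, 1], s=2, alpha=0.2)
plt.show()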
Example #24
def art_qi2(img, airmask, min_voxels=int(1e3), max_voxels=int(3e5), save_plot=True):
    r"""
    Calculates :math:`\text{QI}_2`, based on the goodness-of-fit of a centered
    :math:`\chi^2` distribution onto the intensity distribution of
    non-artifactual background (within the "hat" mask):


    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}x^{2n - 1}\, e^{-\frac{x}{2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts

    """

    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # S. Ogawa was born
    np.random.seed(1191935)

    data = img[airmask > 0]
    data = data[data > 0]

    # Write out figure of the fitting
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(data) < min_voxels:
        return 0.0, out_file

    modelx = data if len(data) < max_voxels else np.random.choice(
        data, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(data, 99), 1000)

    # Estimate data pdf with KDE on a random subsample
    kde_skl = KernelDensity(bandwidth=0.05 * np.percentile(data, 98),
                            kernel='gaussian').fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Find cutoff
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit X^2
    param = chi2.fit(modelx[modelx < np.percentile(data, 95)], 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Compute goodness-of-fit (gof)
    gof = float(np.abs(kde[-kdethi:] - chi_pdf[-kdethi:]).mean())
    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
Example #25
def get_log_density(x, bins):

    x_kde = bins[:, np.newaxis]
    bandwidth = 1.06 * np.std(x) * np.power(len(x), -0.2)
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(x[:, np.newaxis])
    log_density = kde.score_samples(x_kde)

    return log_density
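
The bandwidth above is the classic normal-reference rule, 1.06 * sigma * n**(-1/5). Minimal usage sketch (hedged):

import numpy as np

x = np.random.RandomState(0).randn(1000)
bins = np.linspace(-4, 4, 100)
density = np.exp(get_log_density(x, bins))  # ~ standard normal pdf on `bins`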
Example #26
def estimate_density(city):
    """Return a Gaussian KDE of venues in `city`."""
    kde = KernelDensity(bandwidth=175, rtol=1e-4)
    surround = xp.build_surrounding(DB.venue, city, likes=-1, checkins=1)
    kde.fit(surround.venues[:, :2])
    max_density = approximate_maximum_density(kde, surround.venues[:, :2])
    # pylint: disable=E1101
    return lambda xy: np.exp(kde.score_samples(xy))/max_density
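
approximate_maximum_density is not shown in this snippet; a plausible stand-in simply scores the fitted KDE back on the venue locations and takes the maximum (hypothetical; the original helper may sample a grid instead):

import numpy as np

def approximate_maximum_density(kde, locations):
    # Hypothetical helper: peak density, approximated at the data points.
    return np.exp(kde.score_samples(locations)).max()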
def fit_kde(costs, frac_std):
    """
    Fit a KDE to the costs, use a gaussian kernel and a bandwidth that is the
    specified fraction of the std.
    """
    bw = frac_std * np.std(costs)
    kde = KernelDensity(bandwidth=bw)
    return kde.fit(costs)
Example #28
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
def CrossValidationScore(Xs,h, kernel='gaussian'):
    kde = KernelDensity(bandwidth=h, kernel=kernel)
    ret = 0.
    for i in range(len(Xs)):
        x = np.concatenate([Xs[0:i], Xs[i+1:]])  # leave sample i out
        kde.fit(x)
        ret += kde.score_samples(Xs[i].reshape(1, -1))
    ret /= len(Xs)
    return ret
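
A sketch of using the leave-one-out score above to pick a bandwidth (hedged; Xs must be 2-D, as KernelDensity expects):

import numpy as np

Xs = np.random.RandomState(0).randn(200, 1)
grid = np.linspace(0.05, 1.0, 20)
scores = [CrossValidationScore(Xs, h) for h in grid]
best_h = grid[int(np.argmax(scores))]
print(best_h)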
Example #30
def build_classifiers(training_data, bandwidth):
  classifiers = {}
  for category in training_data:
    print('Classifier category: ' + category)
    print('Number of samples: ' + str(len(training_data[category])))
    kde = KernelDensity(bandwidth=bandwidth)
    classifiers[category] = kde.fit(training_data[category])

  return classifiers
                        tumor_x.append(c.loc[0])
                        tumor_y.append(c.loc[1])
                    else:
                        stroma_cnt += 1
                        stroma_features.append(c.features)
                        stroma_x.append(c.loc[0])
                        stroma_y.append(c.loc[1])

                # if stroma_cnt > 0 and tumor_cnt > 0:

                if stroma_cnt > 10 and tumor_cnt > 10:
                    eligible_patch += 1
                    # print("eligible patch")
                    xy = np.vstack([tumor_x, tumor_y])
                    s_xy = np.vstack([stroma_x, stroma_y]).T
                    kde_skl_1 = KernelDensity(bandwidth=16)
                    kde_skl_1.fit(xy.T)
                    sc_1 = kde_skl_1.score_samples(s_xy)

                    kde_skl_2 = KernelDensity(bandwidth=20)
                    kde_skl_2.fit(xy.T)
                    sc_2 = kde_skl_2.score_samples(s_xy)

                    kde_skl_3 = KernelDensity(bandwidth=24)
                    kde_skl_3.fit(xy.T)
                    sc_3 = kde_skl_3.score_samples(s_xy)

                    kde_skl_4 = KernelDensity(bandwidth=30)
                    kde_skl_4.fit(xy.T)
                    sc_4 = kde_skl_4.score_samples(s_xy)
Example #32
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# load the data
digits = load_digits()
data = digits.data

# project the 64-dimensional data to a lower dimension
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(data)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))
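
To visualize the draw, a hedged sketch along the lines of what the scikit-learn documentation does with these 4x11 grids:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(9, 11, figsize=(9, 8),
                         subplot_kw=dict(xticks=[], yticks=[]))
for j in range(11):
    axes[4, j].set_visible(False)  # blank spacer row between the two grids
    for i in range(4):
        axes[i, j].imshow(real_data[i, j].reshape(8, 8), cmap='binary')
        axes[i + 5, j].imshow(new_data[i, j].reshape(8, 8), cmap='binary')
axes[0, 5].set_title('Selection from the input data')
axes[5, 5].set_title('"New" digits drawn from the kernel density model')
plt.show()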
Example #33
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan', 'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights has a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
def cumi(x_orig,
         y_orig,
         z_orig,
         normalization=False,
         k=5,
         density_estimation_method="kde",
         k_density=5,
         bw=.01):
    """Calculates the uniformed conditional mutual information where the distribution for :math:`x` and :math:`z` is replaced by a uniform distribution.

    `cumi` takes two random variable :math:`x` and :math:`y` and estimated their mutual information conditioned on the
    third random variable :math:`z` using the KSG estimator while :math:`x`, :math:`y` is replaced by a uniform distribution.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        z_orig: `List`
            Another random variable from the time-series data.
        normalization: `bool` (Default: False)
            Whether to normalize the expression of :math:`x, y, z` by their standard deviation.
        k: `int` (Default: 5)
            Number for nearest neighbors used in entropy calculation
        density_estimation_method: `str` (Default: `kde`)
            Which 2D density estimator you would like to use. `kde` is kde estimator while `knn` is knn based estimator.
        k_density: `int` (Default: 5)
            The number of k nearest neighbors you would like to use when calculating the density (only applicable when
            density_estimation_method is set to `knn` or when using knn-based density estimation).
        bw: `float` (default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
    An estimated conditional mutual information value between two variables (x, y), conditioning on a third variable z, where
    the distribution of x, z is replaced by a uniform distribution.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)

    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)

    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz)
    tree_xz = ss.cKDTree(data_xz)
    tree_yz = ss.cKDTree(data_yz)
    tree_z = ss.cKDTree(z)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(data_xz)
        kde = np.exp(kernel.score_samples(data_xz))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_xz.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in data_xz
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**(dx + dz)
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [
        tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz
    ]
    information_samples = [0 for i in range(N)]
    for i in range(N):
        information_samples[i] += weight[i] * digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) -
            1)
        information_samples[i] += weight[i] * -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) -
            1)
        information_samples[i] += weight[i] * -digamma(
            sum(weight[j] for j in tree_yz.query_ball_point(
                data_yz[i], knn_dis[i], p=np.inf)) - weight[i])
        information_samples[i] += weight[i] * digamma(
            sum(
                weight[j]
                for j in tree_z.query_ball_point(z[i], knn_dis[i], p=np.inf)) -
            weight[i])
    return np.mean(information_samples)
#print dick
estimations = []
for relation, points in dick.items():
    print(relation)
    X = np.array(points[1])[:, np.newaxis]
    Y = np.array(points[0])[:, np.newaxis]

    #plt.scatter(Y,[0] * len(Y))#np.random.normal(0,0.03,len(Y)))
    #plt.scatter(X,[1] * len(X))#np.random.normal(1,0.03,len(X)))
    #plt.title(relation)
    #plt.ylabel("Correctness")
    #plt.xlabel("Confidence")

    #kde
    X_plot = np.linspace(0, 1, 1000)[:, np.newaxis]
    kde_incorrect = KernelDensity(kernel='gaussian', bandwidth=0.15).fit(X)
    kde_allpt = KernelDensity(kernel='gaussian', bandwidth=0.15).fit(Y)

    log_dens_incorrect = kde_incorrect.score_samples(X_plot)
    log_dens_allpt = kde_allpt.score_samples(X_plot)

    #ax = plt.gca()
    estimation = np.exp(
        np.subtract(log_dens_incorrect + np.log(len(X)),
                    log_dens_allpt + np.log(len(Y))))
    #ax.plot(X_plot, estimation)

    for i in range(0, len(estimation) - 1):
        if estimation[i] > estimation[i + 1]:
            estimation[i + 1] = estimation[i]
Example #36
    def segment(self, Z, verbose=True):
        """Fit the model using Z as data to be segmented.

        Args:
            Z (np array, shape (n_samples, 2)): Data to be segmented.
            verbose (bool): Verbosity.

        Returns:
            labels (np array, shape (n_samples,)): Segment label for each sample.
        """
        if verbose:
            print('Segmenting regions using watershed...')
            print('- num samples: {}'.format(len(Z)))

        # outliers
        if self.prune_outliers:
            if verbose:
                print('- pruning outliers')
            self.lof_ = LocalOutlierFactor(n_neighbors=self.outlier_neighbors,
                                           contamination=0.1)
            lof_pred = self.lof_.fit_predict(Z)
            lof_scores = self.lof_.negative_outlier_factor_
            lof_scores = minmax_scale(lof_scores)
            self.Z_crop_ = Z[lof_scores > self.outlier_threshold]
            self.Z_left_ = np.where(lof_scores > self.outlier_threshold)
            num_outliers = Z.shape[0] - self.Z_crop_.shape[0]
            print('-> outliers pruned: {}'.format(num_outliers))
        else:
            self.Z_crop_ = Z

        # normalize Z and inset
        self.Z_norm_ = minmax_scale(
            self.Z_crop_,
            feature_range=(0 + self.ngrid_pad, 1 - self.ngrid_pad),
            axis=0,
        )

        # estimate probability density using a Gaussian kernel
        if verbose:
            print('- performing KDE')
        self.kde_ = KernelDensity(kernel='gaussian',
                                  bandwidth=self.bandwidth).fit(self.Z_norm_)

        # convert density estimate to an image of probs and normalize
        if verbose:
            print('- scoring KDE')
        x, y = np.meshgrid(np.linspace(0, 1, self.ngrid),
                           np.linspace(0, 1, self.ngrid))
        log_dens = self.kde_.score_samples(
            np.array((x.flatten(), y.flatten())).T)
        self.P_ = np.reshape(log_dens, (self.ngrid, self.ngrid))
        self.P_ = np.exp(self.P_) / np.max(np.exp(self.P_))

        # find peaks
        if verbose:
            print('- finding peaks')
        self.peaks_ = peak_local_max(
            self.P_,
            min_distance=self.peak_min_distance,
            threshold_rel=self.peak_threshold_rel,
            exclude_border=False,
        )

        # convert peaks to image and dilate
        self.P_peaks_ = np.ones_like(self.P_)
        for peak in self.peaks_:
            for i in range(-self.peak_dialation, self.peak_dialation + 1):
                for j in range(-self.peak_dialation, self.peak_dialation + 1):
                    self.P_peaks_[(peak[0] + i, peak[1] + j)] = 0

        # euclidean distance transform
        if verbose:
            print('- computing edt')
        self.P_edt_ = ndi.distance_transform_edt(self.P_peaks_)

        # perform watershed on edt
        if verbose:
            print('- performing watershed on edt')
        markers = ndi.label(1 - self.P_peaks_)[0]  # use peaks as seed markers
        self.P_labels_ = watershed(self.P_edt_,
                                   markers,
                                   compactness=self.compactness)

        # find boundaries
        if verbose:
            print('- finding boundaries')
        self.P_bounds_ = find_boundaries(self.P_labels_)

        # find labels for Zs
        indices = np.round(self.Z_norm_ * self.ngrid).astype(int)
        self.Z_labels_ = self.P_labels_[indices[:, 1], indices[:,
                                                               0]]  # swap axes

        if verbose:
            print('-> num regions found: {}'.format(len(self.peaks_)))

        return self.Z_labels_
Example #37
def doqueries(givenfield, command):
    con = lite.connect(
        '/home/hartsuiker/Documents/dbdm/DDM2017/FinalProject/DDM17final.db')

    with con:
        cur = con.cursor()

        commandR1 = """SELECT ImageID,COUNT(DISTINCT StarID)
						FROM imagetable_H
						WHERE Flux1/dFlux1 > 5
						and ImageID in(
							SELECT ID
							FROM mastertable
							WHERE MJD between 56800 and 57300)
						and class = -1
						GROUP BY ImageID

						UNION

						SELECT ImageID,COUNT(DISTINCT StarID)
						FROM imagetable_Ks
						WHERE Flux1/dFlux1 > 5
						and ImageID in(
							SELECT ID
							FROM mastertable
							WHERE MJD between 56800 and 57300)
						and class = -1
						GROUP BY ImageID

						UNION

						SELECT ImageID,COUNT(DISTINCT StarID)
						FROM imagetable_Z
						WHERE Flux1/dFlux1 > 5
						and ImageID in(
							SELECT ID
							FROM mastertable
							WHERE MJD between 56800 and 57300)
						and class = -1
						GROUP BY ImageID

						UNION

						SELECT ImageID,COUNT(DISTINCT StarID)
						FROM imagetable_J
						WHERE Flux1/dFlux1 > 5
						and ImageID in(
							SELECT ID
							FROM mastertable
							WHERE MJD between 56800 and 57300)
						and class = -1
						GROUP BY ImageID

						UNION

						SELECT ImageID,COUNT(DISTINCT StarID)
						FROM imagetable_Y
						WHERE Flux1/dFlux1 > 5
						and ImageID in(
							SELECT ID
							FROM mastertable
							WHERE MJD between 56800 and 57300)
						and class = -1
						GROUP BY ImageID
						ORDER BY imageID asc
						"""

        commandR2 = '''SELECT h.StarID,j.mag1-h.mag1
						FROM imagetable_H as h
						join imagetable_J as j on h.StarID = j.StarID
						WHERE J.mag1-h.mag1 > 1.5
						ORDER BY h.StarID asc
					'''

        commandR3 = '''SELECT ks.StarID,ks.imageID,ABS(ks.Flux1-(
														SELECT AVG(ks2.Flux1)
														FROM imagetable_Ks as ks2
														WHERE ks.imageID = ks2.imageID)
														)/ks.dFlux1
						FROM imagetable_Ks as ks
						WHERE ABS(ks.Flux1 -(
							SElECT AVG(ks2.Flux1)
							FROM imagetable_Ks as ks2
							WHERE ks.imageID = ks2.imageID)) > 20 *ks.dFlux1
						ORDER BY ks.StarID asc,ks.imageID asc
					'''

        commandR4 = '''SELECT ID
						FROM mastertable
						WHERE FieldID = %s
						ORDER BY ID asc
					''' % (givenfield)

        commandR5 = '''SELECT y.StarID,y.Mag1,z.Mag1,j.Mag1,h.Mag1,ks.Mag1
						FROM imagetable_Y as y
						join imagetable_Z as z on z.StarID = y.StarID
						join imagetable_J as j on j.StarID = y.StarID
						join imagetable_H as h on h.StarID = y.StarID
						join imagetable_Ks as ks on ks.StarID = y.StarID
						join mastertable as m on m.ID = y.ImageID
						WHERE y.Flux1/y.dFlux1 > 30
						and z.Flux1/z.dFlux1 > 30
						and j.Flux1/j.dFlux1 > 30
						and h.Flux1/h.dFlux1 > 30
						and ks.Flux1/ks.dFlux1 > 30
						and y.class = -1
						and z.class = -1
						and j.class = -1
						and h.class = -1
						and ks.class = -1
						and m.FieldID = %s
						and ks.ImageID=(
							SELECT m2.ID
							FROM mastertable as m2
							WHERE m2.Filename = 'Field-%s-Ks-E001.fits')
						ORDER BY y.StarID asc
					''' % (givenfield, givenfield)

        commandR6 = '''SELECT y.Mag1-j.Mag1,j.Mag1-h.Mag1
						FROM imagetable_Y as y
						join imagetable_J as j on j.StarID = y.StarID
						join imagetable_H as h on h.StarID = y.StarID
						WHERE y.Mag1-j.Mag1 not NULL
						and j.Mag1-h.Mag1 not NULL
						and y.class = -1
						and j.class = -1
						and h.class = -1
						limit 100
					'''
        if command == 1:
            command = commandR1
        elif command == 2:
            command = commandR2
        elif command == 3:
            command = commandR3
        elif command == 4:
            command = commandR4
        elif command == 5:
            command = commandR5
        elif command == 6:
            command = commandR6
        rows = cur.execute(command)

        # for row in rows:
        # 	print row

        if command == commandR2:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            print(a)
            plt.hist(a[:, 1], bins=200)
            plt.ylabel('amount of objects', fontsize=50)
            plt.xlabel('J-H color', fontsize=50)
            plt.xticks(fontsize=40)
            plt.yticks(fontsize=40)
            plt.xlim(1.49, 1.75)
            plt.title('J-H color of all objects with J-H > 1.5', fontsize=60)
            plt.show()
            plt.close()

        if command == commandR3:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))
            print(a)
            plt.hist(a[:, 2], bins=280)
            plt.ylabel('amount of objects', fontsize=50)
            plt.xlabel('deviation from the mean flux [flux uncertainties]',
                       fontsize=50)
            plt.xticks(fontsize=40)
            plt.yticks(fontsize=40)
            plt.xlim(0, 145)
            plt.title(
                'deviation from the mean flux for all deviations > 20 times the flux uncertainty',
                fontsize=32)
            plt.show()
            plt.close()

        if command == commandR5:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))

            sns.kdeplot(a[:, 1], label='Y', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 2], label='Z', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 3], label='J', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 4], label='H', shade=True, linewidth=3.5)
            sns.kdeplot(a[:, 5], label='Ks', shade=True, linewidth=3.5)
            plt.title(
                'Kernel density plot in all filters of all objects in field ' +
                str(givenfield),
                fontsize=50)
            leg = plt.legend(fontsize=60, loc='upper left')
            for line in leg.get_lines():
                line.set_linewidth(6.0)
            plt.xlabel('Magnitude in given filter', fontsize=50)
            plt.ylabel('Normalized counts', fontsize=50)
            plt.xticks(fontsize=30)
            plt.yticks(fontsize=30)
            plt.show()
            plt.close()

        if command == commandR6:
            Q = 0
            for row in rows:
                if Q == 0:
                    a = np.array(row)
                    Q = 1
                else:
                    a = np.vstack((a, row))

            kf = KFold(n_splits=10)
            kf.get_n_splits(a)
            print('shape', np.shape(a))
            Max = -1e99
            # for i in range(1000):
            # 	print 0.001+i/1000.
            # 	array=[]
            # 	for train_index,test_index in kf.split(a):
            # 		a_train,a_test = a[train_index],a[test_index]
            # 		kde = KernelDensity(kernel='gaussian', bandwidth=0.001+i/1000.).fit(a_train)
            # 		log_dens = kde.score_samples(a_train)
            # 		loglikelihood = kde.score(a_test)
            # 		array = np.append(array,loglikelihood)
            # 	Loglikelihood = np.nanmean(array)
            # 	if Loglikelihood > Max:
            # 		Max=Loglikelihood
            # 		Bandwidth = 0.001+i/1000.
            # 		print 'new best value for the bandwidth: ',Bandwidth
            Bandwidth = 0.061  # calculated with the above for-loop for the first 2000 entries of the query
            kde = KernelDensity(kernel='gaussian', bandwidth=Bandwidth).fit(a)
            samples = kde.sample(100000)
            # plt.scatter(samples[:,0],samples[:,1])
            # plt.xlabel('Y-J',fontsize=50)
            # plt.ylabel('J-H',fontsize=50)
            # plt.xticks(fontsize=40)
            # plt.yticks(fontsize=40)
            # plt.title('sample of J-H color vs the Y-J color for 100,000 stars',fontsize=50)
            # plt.show()
            # plt.close()
            data = samples
            df = pd.DataFrame(data, columns=["Y-J", "J-H"])
            sns.jointplot(x="Y-J",
                          y="J-H",
                          data=df,
                          stat_func=None,
                          kind="kde")
            # plt.xlabel('Y-J',fontsize=40)
            # plt.ylabel('J-H',fontsize=40)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)
            # plt.title('sample of J-H color vs the Y-J color for 100,000 stars as 2D distribution',fontsize=40)
            plt.show()
            plt.close()
        con.commit()
Example #38
hit_prob = float(hits) / float(total)
out_prob = float(outs) / float(total)

out_rows = hit_vector.loc[hit_vector['events'].isin(out_list)]
single_rows = hit_vector.loc[hit_vector['events'] == 'Single']
double_rows = hit_vector.loc[hit_vector['events'] == 'Double']
triple_rows = hit_vector.loc[hit_vector['events'] == 'Triple']

hit_vector = hit_vector.drop(hit_vector.columns[[0, 1, 2]], axis=1)
out_rows = out_rows.drop(out_rows.columns[[0, 1, 2]], axis=1)
single_rows = single_rows.drop(single_rows.columns[[0, 1, 2]], axis=1)
double_rows = double_rows.drop(double_rows.columns[[0, 1, 2]], axis=1)
triple_rows = triple_rows.drop(triple_rows.columns[[0, 1, 2]], axis=1)
hit_rows = pd.concat([single_rows, double_rows, triple_rows])

kde = KernelDensity(bandwidth=4.53793103448)
kde2 = KernelDensity(bandwidth=5.5620689655172413)
heat_list = []

## grid = GridSearchCV(KernelDensity(),
#    {'bandwidth': np.linspace(0.1, 10.0, 30)},
#      cv=20) # 20-fold cross-validation
## grid.fit(hit_rows)
## print grid.best_params_

# print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

kde.fit(hit_vector)
kde2.fit(hit_rows)

for angle in range(-20, 5):
def KDE(x,y):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(x)
    score = sum(kde.score_samples(y[:,]))
    return score
Example #40
#class_road_pos = {}


# Purpose: write a two-dimensional list to a CSV file
# Inputs: file name, data list
def createListCSV(fileName="", dataList=[]):
    with open(fileName, "w", newline="") as csvFile:
        csvWriter = csv.writer(csvFile)
        for data in dataList:
            csvWriter.writerow(data)


# Read the ground-truth images and compute the center point of each class
with open(label_list) as file_object:
    lines = file_object.readlines()
    # line example: /home/yangshuhui/code/data/GT5label/label3/06753.png
    for line in lines:
        class_road_pos = kdeEstimates.split_img_center(line.rstrip())
        for cls, pos in class_road_pos.items():
            pos_list[cls].append(pos)

# Generate a KDE for each class
for cls, pos in pos_list.items():
    #createListCSV(cls, pos)
    if pos:
        X = np.array(pos)
        print(X)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
Example #41
        print("pval_silverman(data) = {}".format(pval_silverman(data)))
        t2 = time.time()
        print("Critical bandwidth computation time: {}".format(t1-t0))
        print("Silverman test computation time: {}".format(t2-t1))

        fig, ax = plt.subplots()
        ax.hist(data, bins=50, density=True)
        x_grid = np.linspace(np.min(data)-2, np.max(data)+2, 100)
        ax.plot(x_grid, KDE(data, h_crit).evaluate(x_grid), linewidth=2, color='black')
        plt.show()

    if 0:
        data = np.random.randn(1000)
        h = .5
        print("np.std(data) = {}".format(np.std(data)))
        resamp = KernelDensity(kernel='gaussian', bandwidth=h).fit(data.reshape(-1, 1)).sample(1000)/np.sqrt(1+h**2/np.var(data))
        print("np.std(resamp) = {}".format(np.std(resamp)))

    if 0:
        N = 1000
        data = np.hstack([np.random.randn(N // 2), np.random.randn(N // 4) + 4])
        h = 0.1
        print("is_unimodal_kde(h, data) = {}".format(is_unimodal_kde(h, data)))
        #plt.show()
        h_crit = critical_bandwidth_m_modes(data, 2)
        x = np.linspace(-3, 8, 200)
        y = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1)).score_samples(x.reshape(-1, 1))
        plt.plot(x, np.exp(y))
        plt.show()

    if 0:
Example #42
fcols = [c for c in original_data.columns if c != 'label']
classes = np.sort(np.unique(sampled_data["label"].values))
kdes = {}
for c in classes:
    print("\n\n=========================", "class", c,
          "============================\n\n")
    cdata = original_data.loc[original_data.label == c, fcols].values
    sampled_cdata = sampled_data.loc[sampled_data.label == c, fcols].values
    NUM_CDATA = cdata.shape[0]
    NUM_SDATA = sampled_cdata.shape[0]
    print("Fitting & Sampling...")
    if bandwidth is not None:
        for bw in [float(b) for b in bandwidth.split(",")]:
            print(" -- bw --", bw)
            kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(cdata)
            if not onlyNew:
                samples = kde.sample(NUM_SDATA, random_state=0)

                # random sampling.
                max_val = np.max(cdata)
                min_val = np.min(cdata)
                rnd_data = np.random.rand(
                    NUM_SDATA, cdata.shape[1]) * (max_val - min_val) + min_val

                print("Evaluating ...")
                kdes[c] = kde
                odata = cdata[0:NUM_SDATA, :]
                print("class", c, "original", kde.score(odata) / NUM_SDATA)
                print("class", c, "sklearn KDE sampled",
                      kde.score(samples) / NUM_SDATA)
Example #43
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

fig, ax = plt.subplots(1, 1)

r = stats.norm.rvs(size=20)
z = stats.norm.rvs(size=80)
v = stats.norm.rvs(size=150)

print(r)
print(z)
print(v)
print(stats.norm.fit(r))
print(stats.norm.fit(z))
print(stats.norm.fit(v))

x241 = np.linspace(-5, 5).reshape(-1, 1)
norm241 = stats.norm.pdf(x241)
ax.plot(norm241, 'r-')

kde = KernelDensity(kernel='gaussian').fit(x241)
norm2412 = np.exp(kde.score_samples(x241))

ax.plot(norm2412)
plt.show()
def showSurveyStatistics(simulatedSurvey,
                         pdfFile=None,
                         pngFile=None,
                         usekde=False):
    """
    Produce a plot with the survey statistics.

    Parameters
    ----------

    simulatedSurvey : Object containing the simulated survey.

    Keywords
    --------

    pdfFile : string
        Name of optional PDF file in which to save the plot.
    pngFile : string
        Name of optional PNG file in which to save the plot.
    usekde  : boolean
        If true use kernel density estimates to show the distribution of survey quantities instead of
        histograms.
    """
    try:
        _ = simulatedSurvey.observedParallaxes.shape
    except AttributeError:
        stderr.write("You have not generated the observations yet!\n")
        return

    parLimitPlot = 50.0
    plxSnrLim = 5.0

    positiveParallaxes = (simulatedSurvey.observedParallaxes > 0.0)
    goodParallaxes = (simulatedSurvey.observedParallaxes /
                      simulatedSurvey.parallaxErrors >= plxSnrLim)
    estimatedAbsMags = (
        simulatedSurvey.observedMagnitudes[positiveParallaxes] +
        5.0 * np.log10(simulatedSurvey.observedParallaxes[positiveParallaxes])
        - 10.0)
    relParErr = (simulatedSurvey.parallaxErrors[positiveParallaxes] /
                 simulatedSurvey.observedParallaxes[positiveParallaxes])
    deltaAbsMag = estimatedAbsMags - simulatedSurvey.absoluteMagnitudes[
        positiveParallaxes]

    useagab(usetex=False, fontfam='sans')
    fig = plt.figure(figsize=(27, 12))

    axA = fig.add_subplot(2, 3, 1)
    apply_tufte(axA, withgrid=False)
    axA.set_prop_cycle(cycler('color', get_distinct(3)))

    minPMinThird = np.power(simulatedSurvey.minParallax, -3.0)
    maxPMinThird = np.power(parLimitPlot, -3.0)
    x = np.linspace(simulatedSurvey.minParallax,
                    np.min([parLimitPlot, simulatedSurvey.maxParallax]), 1001)
    axA.plot(x,
             3.0 * np.power(x, -4.0) / (minPMinThird - maxPMinThird),
             '--',
             label='model',
             lw=3)

    if usekde:
        scatter = rse(simulatedSurvey.trueParallaxes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.trueParallaxes[:, None])
        samples = np.linspace(simulatedSurvey.trueParallaxes.min(),
                              simulatedSurvey.trueParallaxes.max(), 200)[:,
                                                                         None]
        logdens = kde.score_samples(samples)
        axA.plot(samples, np.exp(logdens), '-', lw=3, label='true')
    else:
        axA.hist(simulatedSurvey.trueParallaxes,
                 bins='auto',
                 density=True,
                 histtype='step',
                 lw=3,
                 label='true')

    if usekde:
        scatter = rse(simulatedSurvey.observedParallaxes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.observedParallaxes[:, None])
        samples = np.linspace(simulatedSurvey.observedParallaxes.min(),
                              simulatedSurvey.observedParallaxes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axA.plot(samples, np.exp(logdens), '-', lw=3, label='observed')
    else:
        axA.hist(simulatedSurvey.observedParallaxes,
                 bins='auto',
                 density=True,
                 histtype='step',
                 lw=3,
                 label='observed')

    axA.set_xlabel(r'$\varpi$,  $\varpi_\mathrm{true}$ [mas]')
    axA.set_ylabel(r'$p(\varpi)$, $p(\varpi_\mathrm{true})$')
    leg = axA.legend(loc='best', handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axA.text(0.025,
             0.9,
             'a',
             horizontalalignment='center',
             verticalalignment='center',
             transform=axA.transAxes,
             weight='bold',
             fontsize=30)

    axB = fig.add_subplot(2, 3, 2)
    apply_tufte(axB, withgrid=False)
    axB.set_prop_cycle(cycler('color', get_distinct(3)))

    m = np.linspace(simulatedSurvey.observedMagnitudes.min(),
                    simulatedSurvey.observedMagnitudes.max(), 1000)
    axB.plot(m,
             np.exp(simulatedSurvey.apparentMagnitude_lpdf(m)),
             '--',
             lw=3,
             label='model')

    if usekde:
        scatter = rse(simulatedSurvey.apparentMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.apparentMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.apparentMagnitudes.min(),
                              simulatedSurvey.apparentMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axB.plot(samples, np.exp(logdens), '-', label='true', lw=3)
    else:
        axB.hist(simulatedSurvey.apparentMagnitudes,
                 bins='auto',
                 density=True,
                 histtype='step',
                 lw=3,
                 label='true')

    if usekde:
        scatter = rse(simulatedSurvey.observedMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.observedMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.observedMagnitudes.min(),
                              simulatedSurvey.observedMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axB.plot(samples, np.exp(logdens), '-', label='observed', lw=3)
    else:
        axB.hist(simulatedSurvey.observedMagnitudes,
                 bins='auto',
                 density=True,
                 histtype='step',
                 lw=3,
                 label='observed')

    axB.set_xlabel("$m$, $m_\mathrm{true}$")
    axB.set_ylabel("$p(m)$, $p(m_\mathrm{true})$")
    leg = axB.legend(loc=(0.03, 0.55), handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axB.text(0.025,
             0.9,
             'b',
             horizontalalignment='center',
             verticalalignment='center',
             transform=axB.transAxes,
             weight='bold',
             fontsize=30)

    axC = fig.add_subplot(2, 3, 3)
    apply_tufte(axC, withgrid=False)
    axC.set_prop_cycle(cycler('color', get_distinct(3)))

    x = np.linspace(simulatedSurvey.absoluteMagnitudes.min(),
                    simulatedSurvey.absoluteMagnitudes.max(), 300)
    axC.plot(x,
             norm.pdf(x,
                      loc=simulatedSurvey.meanAbsoluteMagnitude,
                      scale=simulatedSurvey.stddevAbsoluteMagnitude),
             '--',
             lw=3,
             label='model')

    if usekde:
        scatter = rse(simulatedSurvey.absoluteMagnitudes)
        bw = 1.06 * scatter * simulatedSurvey.numberOfStarsInSurvey**(-0.2)
        kde = KernelDensity(bandwidth=bw)
        kde.fit(simulatedSurvey.absoluteMagnitudes[:, None])
        samples = np.linspace(simulatedSurvey.absoluteMagnitudes.min(),
                              simulatedSurvey.absoluteMagnitudes.max(),
                              200)[:, None]
        logdens = kde.score_samples(samples)
        axC.plot(samples, np.exp(logdens), '-', label='true', lw=3)
    else:
        axC.hist(simulatedSurvey.absoluteMagnitudes,
                 bins='auto',
                 density=True,
                 histtype='step',
                 lw=3,
                 label='true')

    if (simulatedSurvey.absoluteMagnitudes[goodParallaxes].size >= 3):
        if usekde:
            scatter = rse(simulatedSurvey.absoluteMagnitudes[goodParallaxes])
            bw = 1.06 * scatter * simulatedSurvey.absoluteMagnitudes[
                goodParallaxes].size**(-0.2)
            kde = KernelDensity(bandwidth=bw)
            kde.fit(simulatedSurvey.absoluteMagnitudes[goodParallaxes][:,
                                                                       None])
            samples = np.linspace(
                simulatedSurvey.absoluteMagnitudes[goodParallaxes].min(),
                simulatedSurvey.absoluteMagnitudes[goodParallaxes].max(),
                200)[:, None]
            logdens = kde.score_samples(samples)
            axC.plot(
                samples,
                np.exp(logdens),
                '-',
                label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim),
                lw=3)
        else:
            axC.hist(
                simulatedSurvey.absoluteMagnitudes[goodParallaxes],
                bins='auto',
                density=True,
                histtype='step',
                lw=3,
                label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))

    axC.set_xlabel("$M$")
    axC.set_ylabel("$p(M)$")
    leg = axC.legend(loc=(0.03, 0.55), handlelength=1.0)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axC.text(0.025,
             0.9,
             'c',
             horizontalalignment='center',
             verticalalignment='center',
             transform=axC.transAxes,
             weight='bold',
             fontsize=30)

    axD = fig.add_subplot(2, 3, 4)
    apply_tufte(axD, withgrid=False)
    axD.set_prop_cycle(cycler('color', get_distinct(3)))
    axD.plot(simulatedSurvey.trueParallaxesNoLim,
             simulatedSurvey.observedParallaxesNoLim -
             simulatedSurvey.trueParallaxesNoLim,
             'k,',
             label=r'$m_\mathrm{lim}=\infty$')
    axD.plot(simulatedSurvey.trueParallaxes,
             simulatedSurvey.observedParallaxes -
             simulatedSurvey.trueParallaxes,
             '.',
             label=r'$m_\mathrm{{lim}}={0}$'.format(
                 simulatedSurvey.apparentMagnitudeLimit))
    axD.plot(simulatedSurvey.trueParallaxes[positiveParallaxes],
             simulatedSurvey.observedParallaxes[positiveParallaxes] -
             simulatedSurvey.trueParallaxes[positiveParallaxes],
             '.',
             label=r'$\varpi>0$')
    axD.plot(simulatedSurvey.trueParallaxes[goodParallaxes],
             simulatedSurvey.observedParallaxes[goodParallaxes] -
             simulatedSurvey.trueParallaxes[goodParallaxes],
             'o',
             label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))
    axD.set_xlabel(r"$\varpi_\mathrm{true}$ [mas]")
    axD.set_ylabel("$\\varpi-\\varpi_\\mathrm{true}$ [mas]")
    leg = axD.legend(loc='best', handlelength=0.5, ncol=2)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axD.text(0.025,
             0.9,
             'd',
             horizontalalignment='center',
             verticalalignment='center',
             transform=axD.transAxes,
             weight='bold',
             fontsize=30)

    axE = fig.add_subplot(2, 3, 5)
    apply_tufte(axE, withgrid=False)
    axE.set_prop_cycle(cycler('color', get_distinct(3)))
    axE.plot(simulatedSurvey.trueParallaxesNoLim,
             simulatedSurvey.absoluteMagnitudesNoLim,
             'k,',
             label=r'$m_\mathrm{lim}=\infty$')
    axE.plot(simulatedSurvey.trueParallaxes,
             simulatedSurvey.absoluteMagnitudes,
             '.',
             label=r'$m_\mathrm{{lim}}={0}$'.format(
                 simulatedSurvey.apparentMagnitudeLimit))
    axE.plot(simulatedSurvey.trueParallaxes[positiveParallaxes],
             simulatedSurvey.absoluteMagnitudes[positiveParallaxes],
             '.',
             label=r'$\varpi>0$')
    axE.plot(simulatedSurvey.trueParallaxes[goodParallaxes],
             simulatedSurvey.absoluteMagnitudes[goodParallaxes],
             'o',
             label=r'$\varpi/\sigma_\varpi\geq{0:.1f}$'.format(plxSnrLim))
    axE.set_xlabel(r"$\varpi_\mathrm{true}$ [mas]")
    axE.set_ylabel("$M_\\mathrm{true}$")
    axE.axhline(y=simulatedSurvey.meanAbsoluteMagnitude)
    leg = axE.legend(loc='best', handlelength=0.5, ncol=2)
    for t in leg.get_texts():
        t.set_fontsize(14)
    axE.text(0.025,
             0.9,
             'e',
             horizontalalignment='center',
             verticalalignment='center',
             transform=axE.transAxes,
             weight='bold',
             fontsize=30)

    plt.suptitle(
        "Simulated survey statistics: $N_\\mathrm{{stars}}={0}$, ".format(
            simulatedSurvey.numberOfStars) +
        "$m_\\mathrm{{lim}}={0}$, ".format(
            simulatedSurvey.apparentMagnitudeLimit) +
        "$N_\\mathrm{{survey}}={0}$, ".format(
            simulatedSurvey.numberOfStarsInSurvey) +
        "${0}\\leq\\varpi\\leq{1}$, ".format(simulatedSurvey.minParallax,
                                             simulatedSurvey.maxParallax) +
        "$\\mu_M={0}$, ".format(simulatedSurvey.meanAbsoluteMagnitude) +
        "$\\sigma_M={0:.2f}$".format(simulatedSurvey.stddevAbsoluteMagnitude))

    if pdfFile is not None:
        plt.savefig(pdfFile)
    if pngFile is not None:
        plt.savefig(pngFile)
    if (pdfFile is None and pngFile is None):
        plt.show()
Beispiel #45
0
from sklearn.neighbors import KernelDensity


def KDE(X, X_plot):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X.reshape(-1, 1))
    log_dens = kde.score_samples(X_plot)
    return log_dens
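
A minimal usage sketch for the helper above; the evaluation grid must be a column vector because it is passed straight to score_samples:

import numpy as np

X = np.random.normal(size=200)                   # 1-D sample
X_plot = np.linspace(-4, 4, 100)[:, np.newaxis]  # grid as column vector
density = np.exp(KDE(X, X_plot))                 # score_samples returns log-density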
Beispiel #46
0
def estimate_jensen_shannon_divergence_from_numerical_distribution(
        particles,
        x_N,
        y_N,
        h=0.2,
        xlimit=[-4, 4],
        ylimit=[-4, 4],
        grid_N=100,
        plot=True):
    """
    :param particles:
    :param x_N:
    :param y_N:
    :param h:
    :param xlimit:
    :param ylimit:
    :param grid_N:
    :return:
    """

    # Fit the particles
    kde1 = KernelDensity(kernel='gaussian', bandwidth=h).fit(particles)

    # Create mesh grid
    x_grid_N = grid_N
    x_grid = np.linspace(xlimit[0], xlimit[1], x_grid_N)
    y_grid_N = grid_N
    y_grid = np.linspace(ylimit[0], ylimit[1], y_grid_N)

    x_GH, y_GH = np.meshgrid(x_grid, y_grid)
    xy_grid = np.vstack([x_GH.flatten(), y_GH.flatten()]).T
    log_pdf_kde = kde1.score_samples(xy_grid)
    pdf_kde = np.exp(log_pdf_kde)

    sigma_prior = 1
    sigma_y = 1

    # Compute the log prior; this is straightforward
    log_pdf_prior_M = scipy.stats.multivariate_normal.logpdf(
        xy_grid, np.zeros(2), sigma_prior * np.eye(2))
    log_pdf_prior_GH = log_pdf_prior_M.reshape((grid_N, grid_N))

    # Compute log likelihood
    log_pdf_lik_M = np.zeros(xy_grid.shape[0])

    for mm in range(xy_grid.shape[0]):
        a = xy_grid[mm, 0]
        b = xy_grid[mm, 1]
        log_ll = scipy.stats.norm.logpdf(y_N, a * b * x_N, sigma_y)
        log_pdf_lik_M[mm] = np.sum(log_ll)

    log_pdf_lik_GH = log_pdf_lik_M.reshape((grid_N, grid_N))

    # Compute unnormalized log posterior
    log_pdf_post_GH = log_pdf_lik_GH + log_pdf_prior_GH
    log_pdf_post_vector = log_pdf_post_GH.flatten()

    # Compute Posterior
    pdf_post_vector = np.exp(log_pdf_post_vector)
    pdf_post_vector = pdf_post_vector / np.sum(pdf_post_vector)

    # Normalize the KDE density over the grid so that both vectors are
    # probability distributions before mixing them
    pdf_kde = pdf_kde / np.sum(pdf_kde)

    # Compute the Jensen-Shannon divergence
    # JSD(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m), where m = 0.5 * (p + q)
    m = 0.5 * (pdf_post_vector + pdf_kde)
    jsd = 0.5 * entropy(pdf_post_vector, m) + 0.5 * entropy(pdf_kde, m)

    if plot:
        # Plot data
        plt.plot(x_N, y_N, 'k.')
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title("Data distribution")
        plt.show()
        # Plot contour plots for prior
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_prior_GH,
                                  "Prior distribution", -100)
        # Plot contour for likelihood
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_lik_GH, "likelihood",
                                  -100)
        # Plot contour plots for posterior
        make_log_pdf_contour_plot(x_grid, y_grid, log_pdf_post_GH, "posterior",
                                  -100)
        # Plot contour plots for KDE
        x_particle = particles[:, 0]
        y_particle = particles[:, 1]
        plt.scatter(x_particle, y_particle)
        plt.title("Particles")
        plt.show()
        make_log_pdf_contour_plot(x_grid, y_grid,
                                  np.reshape(log_pdf_kde, (grid_N, grid_N)),
                                  "kde(particles), jsd = {}".format(jsd), -100)

    return jsd
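
As a quick sanity check of the divergence formula used above, a self-contained sketch with toy probability vectors (not the function's data):

import numpy as np
from scipy.stats import entropy

p = np.array([0.1, 0.4, 0.5])
q = np.array([0.2, 0.3, 0.5])
m = 0.5 * (p + q)
jsd = 0.5 * entropy(p, m) + 0.5 * entropy(q, m)  # bounded by ln(2)
print(jsd)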
Beispiel #47
0
 def __init__(self, data, bandwidth, kernel):
     self.__kde = KernelDensity(bandwidth=bandwidth,
                                kernel=kernel).fit(data)
Beispiel #48
0

import numpy as np
import matplotlib.pyplot as pl
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity


def find_optimum_bandwidth(spike_times,
                           bandwidths=10**np.linspace(-1, 1, 100)):

    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths},
                        cv=LeaveOneOut())
    grid.fit(spike_times[:, None])
    bandwidth = grid.best_params_
    return bandwidth['bandwidth']


# bandwidth = find_optimum_bandwidth(spike_times)
# print(bandwidth)

bandwidth = 0.126
spike_times = np.sort(spike_times)

# instantiate and fit the KDE model
kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
kde.fit(spike_times[:, None])

# score_samples returns the log of the probability density
logprob = kde.score_samples(spike_times[:, None])

fig, ax = pl.subplots()
# ax.fill_between(spike_times, np.exp(logprob), alpha=0.5)
ax.plot(spike_times, np.exp(logprob), alpha=1, lw=2, color="k")
pl.savefig("fig.png")
pl.close()
Beispiel #49
0
 def set_grid(self, reset=False):
     if not Parallelograms.bandwidth_grid or reset:
         Parallelograms.bandwidth_grid = GridSearchCV(
             KernelDensity(kernel='tophat'),
             {'bandwidth': np.linspace(0.1, 1.0, 100)},
             cv=8)  # 8-fold cross-validation
     return Parallelograms.bandwidth_grid
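
A short usage sketch for the cached grid above, assuming a Parallelograms instance `p` and 1-D data (standard GridSearchCV attributes):

import numpy as np

samples = np.random.rand(200, 1)          # hypothetical data, one column
grid = p.set_grid()                       # shared, lazily built CV search
grid.fit(samples)
best_bw = grid.best_params_['bandwidth']  # selected tophat bandwidth
kde = grid.best_estimator_                # KernelDensity refit with best_bw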
Beispiel #50
0
class KDE(BaseOutlierDetector):
    """Outlier detector using Kernel Density Estimation (KDE).

    Parameters
    ----------
    algorithm : str, default 'auto'
        Tree algorithm to use. Valid algorithms are
        ['kd_tree'|'ball_tree'|'auto'].

    atol : float, default 0.0
        Desired absolute tolerance of the result.

    bandwidth : float, default 1.0
        Bandwidth of the kernel.

    breadth_first : bool, default True
        If true, use a breadth-first approach to the problem. Otherwise use a
        depth-first approach.

    contamination : float, default 0.1
        Proportion of outliers in the data set. Used to define the threshold.

    kernel : str, default 'gaussian'
        Kernel to use. Valid kernels are
        ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine'].

    leaf_size : int, default 40
        Leaf size of the underlying tree.

    metric : str, default 'euclidean'
        Distance metric to use.

    rtol : float, default 0.0
        Desired relative tolerance of the result.

    metric_params : dict, default None
        Additional parameters to be passed to the requested metric.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training sample.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    References
    ----------
    .. [#parzen62] Parzen, E.,
        "On estimation of a probability density function and mode,"
        Ann. Math. Statist., 33(3), pp. 1065-1076, 1962.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import KDE
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = KDE()
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def X_(self):
        """array-like of shape (n_samples, n_features): Training data.
        """

        return self.estimator_.tree_.data

    def __init__(
        self, algorithm='auto', atol=0., bandwidth=1.,
        breadth_first=True, contamination=0.1, kernel='gaussian', leaf_size=40,
        metric='euclidean', rtol=0., metric_params=None
    ):
        self.algorithm     = algorithm
        self.atol          = atol
        self.bandwidth     = bandwidth
        self.breadth_first = breadth_first
        self.contamination = contamination
        self.kernel        = kernel
        self.leaf_size     = leaf_size
        self.metric        = metric
        self.rtol          = rtol
        self.metric_params = metric_params

    def _check_is_fitted(self):
        super()._check_is_fitted()

        check_is_fitted(self, 'X_')

    def _fit(self, X):
        self.estimator_   = KernelDensity(
            algorithm     = self.algorithm,
            atol          = self.atol,
            bandwidth     = self.bandwidth,
            breadth_first = self.breadth_first,
            kernel        = self.kernel,
            leaf_size     = self.leaf_size,
            metric        = self.metric,
            rtol          = self.rtol,
            metric_params = self.metric_params
        ).fit(X)

        return self

    def _anomaly_score(self, X):
        return -self.estimator_.score_samples(X)
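
The detector above thresholds the negative log-density at the contamination quantile; a minimal sketch of the same idea with plain scikit-learn (not the kenchi API):

import numpy as np
from sklearn.neighbors import KernelDensity

X = np.array([
    [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
])
kde = KernelDensity(bandwidth=1.0).fit(X)
scores = -kde.score_samples(X)                      # higher = more anomalous
threshold = np.percentile(scores, 100 * (1 - 0.1))  # contamination = 0.1
labels = np.where(scores > threshold, -1, 1)        # -1 flags outliers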
Beispiel #51
0
kernel = 'gaussian'
bins = np.linspace(-1, 1, 200)
kde_path = './kde_%i_%i_%i_%i.jbl' % (window, overlap, decimation_rate,
                                      spectrum_bins_left)
if os.path.exists(kde_path):
    kdes = joblib.load(kde_path)
else:
    kdes = dict()
    for cls in raw_data_seg:
        kdes[cls] = dict()
        for run_name, run_seg in raw_data_seg[cls].items():
            kdes[cls][run_name] = np.zeros((run_seg.shape[0], bins.shape[0]))
            for i in range(0, run_seg.shape[0]):
                segment = run_seg[i]
                kde = KernelDensity(kernel=kernel,
                                    bandwidth=0.5).fit(segment[:, np.newaxis])
                kdes[cls][run_name][i] = np.exp(
                    kde.score_samples(bins[:, np.newaxis]))
    joblib.dump(kdes, kde_path)

from Functions.StatFunctions import KLDiv


def kl_dv_fn(kdes, k):
    kl_forward = {}
    kl_reverse = {}
    for cls in kdes:
        kl_forward[cls] = dict()
        kl_reverse[cls] = dict()
        for run in kdes[cls]:
            run_pdf = kdes[cls][run]
Beispiel #52
0
 def __init__(self, **kwargs):
     self.kde = KernelDensity(**kwargs)
     self.pre_whiten = PCA(whiten=True)
Beispiel #53
0
    while n < N_tr:
        print('Stock: ', count, '/24', ' - Training: ', n + 1, '/', N_tr)

        "%%%%%%%%%%%%%%%%%%%%% TRANING %%%%%%%%%%%%%%%%%%%%%"

        "Input vector"
        u_train = X_train[n, :][np.newaxis, :]

        size = np.zeros(len(Set_Dict_tr))
        kullback = np.ones(len(Set_Dict_tr))
        for i in Set_Dict_tr:
            prev_dict_ytr = Set_Dict_tr[str(i)].reshape(-1, 1)
            curr_dict_ytr = np.append(prev_dict_ytr,
                                      u_train[0, -1])[:, np.newaxis]

            kde_prev = KernelDensity(kernel='gaussian',
                                     bandwidth=0.05).fit(prev_dict_ytr)
            kde_curr = KernelDensity(kernel='gaussian',
                                     bandwidth=0.05).fit(curr_dict_ytr)

            den_prev = np.exp(kde_prev.score_samples(y_grid[:, None]))
            den_curr = np.exp(kde_curr.score_samples(y_grid[:, None]))

            kullback[int(i) - 1] = entropy(pk=den_prev, qk=den_curr)

            size[int(i) - 1] = np.shape(Set_Dictionaries[str(i)])[0]

            del prev_dict_ytr, curr_dict_ytr, kde_prev, kde_curr, den_prev, den_curr

        entr_near_cluster = np.min(kullback)
        near_cluster = np.argmin(kullback) + 1
Beispiel #54
0
def main(classifier, model, X_train, y_train, Y_train, X_test, y_test, Y_test,
         X_test_adv, Bandwidth):

    batch_size = 256


    X_test, X_test_adv, Y_test = get_testing_data(X_test, X_test_adv, y_test,
                                                  classifier)

    uncerts_normal = np.zeros((X_test.shape[0],), dtype=float)
    uncerts_adv = np.zeros((X_test.shape[0],), dtype=float)


    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model, X_train,
                                                batch_size=batch_size)
    X_test_normal_features = get_deep_representations(model, X_test,
                                                      batch_size=batch_size)
    X_test_adv_features = get_deep_representations(model, X_test_adv,
                                                   batch_size=batch_size)
    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]

    # print('class_inds:', class_inds)
    kdes = {}
    warnings.warn("Using pre-set kernel bandwidths that were determined "
                  "optimal for the specific CNN models of the paper. If you've "
                  "changed your model, you'll need to re-optimize the "
                  "bandwidth.")
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian',
                                bandwidth=Bandwidth) \
            .fit(X_train_features[class_inds[i]])

    preds_test_normal = classifier.predict(X_test)
    preds_test_adv = classifier.predict(X_test_adv)
    preds_test_normal = preds_test_normal.argmax(axis=1)
    preds_test_adv = preds_test_adv.argmax(axis=1)

    densities_normal = score_samples(
        kdes,
        X_test_normal_features,
        preds_test_normal
    )
    densities_adv = score_samples(
        kdes,
        X_test_adv_features,
        preds_test_adv
    )
    # print(densities_adv)
    ## Z-score the uncertainty and density values
    uncerts_normal_z, uncerts_adv_z = normalize(
        uncerts_normal,
        uncerts_adv
    )
    densities_normal_z, densities_adv_z = normalize(
        densities_normal,
        densities_adv
    )

    values, labels, lr = train_lr(
        densities_pos=densities_adv_z,
        densities_neg=densities_normal_z,
        uncerts_pos=uncerts_adv_z,
        uncerts_neg=uncerts_normal_z
    )

    ## Evaluate detector
    # Compute logistic regression model predictions
    probs = lr.predict_proba(values)[:, 1]

    # Compute AUC
    n_samples = len(X_test)

    FPR, TPR, auc_score = compute_roc(
        probs_neg=probs[:n_samples],
        probs_pos=probs[n_samples:]
    )
    print('FPR:', FPR)
    print('TPR:', TPR)
    print('auc:', auc_score)
    print('Detector ROC-AUC score: %0.4f' % auc_score)

    print('Total:',n_samples)
    print('Clean:',np.sum((probs[n_samples:])>0.5))
    print('Adv:',np.sum(probs[:n_samples]<0.5))

    print('P:',np.sum((probs[n_samples:])>0.5)/np.sum(probs>0.5))
    print('R:', np.sum(probs[n_samples:] > 0.5) / probs[:n_samples].shape[0])
    print('Detector ROC-AUC score: %0.4f' % auc_score)

    concat = np.vstack((FPR, TPR))

    return concat
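
The score_samples helper called above is not shown in this snippet; a plausible reconstruction (hypothetical, not the original code), assuming it scores each test point under the KDE of its predicted class:

import numpy as np


def score_samples(kdes, features, preds):
    # log-density of each sample under the KDE fitted to its predicted class
    return np.asarray([kdes[int(p)].score_samples(f.reshape(1, -1))[0]
                       for f, p in zip(features, preds)])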
Beispiel #55
0
def umi(x, y, k=5, density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed mutual information where the distribution for :math:`x` is replaced by a uniform distribution.

    `umi` takes two random variables x and y and estimates their mutual information using the KSG estimator while x is
    replaced by a uniform distribution.

    Arguments
    ---------
        x: `List`
            One random variable from the time-series data.
        y: `List`
            Another random variable from the time-series data.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.
        density_estimation_method: `str` (Default: `kde`)
            Which 2D density estimator to use: `kde` for a kernel density estimator, `knn` for a knn-based estimator.
        k_density: `int` (Default: 5)
            The number of nearest neighbors used when calculating the density (only applicable when
            density_estimation_method is `knn`).
        bw: `float` (Default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
    An estimated uniform mutual information value between the two variables (x, y), where the distribution for x is
    replaced by a uniform distribution.
    """
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)

    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)

    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]
    ans = (digamma(k) + 2 * log(N - 1) - digamma(N)
           + vd(dx) + vd(dy) - vd(dx + dy))

    weight_y = np.zeros(N)
    for i in range(N):
        weight_y[i] = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
    weight_y *= N / np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
        ans += -weight[i] * log(nx) / N
        # ans += -ny * log(ny) / N / (len(tree_y.query_ball_point(y[i], knn_dis[i], p=2))-1)
        ans += -weight[i] * log(ny) / N
    return ans
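
A hypothetical call with toy data (each variable a list of 1-D points, as the docstring requires; assumes the snippet's helpers such as vd and digamma are in scope):

import numpy as np

x = [[v] for v in np.random.rand(200)]             # toy 1-D samples
y = [[v[0] + 0.1 * np.random.randn()] for v in x]  # noisy copy of x
print(umi(x, y, k=5, density_estimation_method="kde", bw=0.05))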
Beispiel #56
0
x1_kgroups = X[np.where(zh_kgroups == 0)][:, np.newaxis]
x2_kgroups = X[np.where(zh_kgroups == 1)][:, np.newaxis]
acc_kgroups = metric.accuracy(z, zh_kgroups)
t.add_row(['kernel k-groups', acc_kgroups])

print(t)

### kernel density estimation for truth
X_plot = np.linspace(low, high, num_points)[:, np.newaxis]
x1_true = X[np.where(z == 0)][:, np.newaxis]
x2_true = X[np.where(z == 1)][:, np.newaxis]

fig = plt.figure()
ax = fig.add_subplot(111)

kde1 = KernelDensity(kernel='gaussian', bandwidth=bw).fit(x1_true)
log_dens1 = kde1.score_samples(X_plot)
kde2 = KernelDensity(kernel='gaussian', bandwidth=bw).fit(x2_true)
log_dens2 = kde2.score_samples(X_plot)
ax.fill_between(X_plot[:, 0], np.exp(log_dens1), alpha=.3, color='k')
ax.plot(X_plot[:, 0], np.exp(log_dens1), color='k', label='truth')
ax.fill_between(X_plot[:, 0], np.exp(log_dens2), alpha=.3, color='k')
ax.plot(X_plot[:, 0], np.exp(log_dens2), color='k')

xs = np.linspace(low, high, num_points)
ax.plot(xs,
        scipy.stats.norm.pdf(xs, x1_mu_kmeans, np.sqrt(x1_var_kmeans)),
        label="%s" % (methods[0]),
        color=colors[0])
ax.plot(xs,
        scipy.stats.norm.pdf(xs, x2_mu_kmeans, np.sqrt(x2_var_kmeans)),
Beispiel #57
0
X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
bins = np.linspace(-5, 10, 10)

fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
fig.subplots_adjust(hspace=0.05, wspace=0.05)

# histogram 1
ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', **density_param)
ax[0, 0].text(-3.5, 0.31, "Histogram")

# histogram 2
ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', **density_param)
ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted")

# tophat KDE
kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X)
log_dens = kde.score_samples(X_plot)
ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")

# Gaussian KDE
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
log_dens = kde.score_samples(X_plot)
ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density")

for axi in ax.ravel():
    axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
    axi.set_xlim(-4, 9)
    axi.set_ylim(-0.02, 0.34)

Beispiel #58
0
fnames = ['GlobalTemp_1.txt', 'GlobalTemp_2.txt']
data = load_data(fnames)

# Sanity check
print(data['GlobalTemp_1.txt'][6, 0])  # Should be 0.74
print(data['GlobalTemp_2.txt'][6, 0])  # Should be 1.07

# Remove -99.99 from row 8
data_row8_1997 = remove_99(data['GlobalTemp_1.txt'][6, None])
data_row8_2017 = remove_99(data['GlobalTemp_2.txt'][6, None])

x1 = np.linspace(-2, 4, 1000)
x2 = np.linspace(-2, 4, 1000)
kde1 = KernelDensity(kernel='epanechnikov',
                     bandwidth=0.4).fit(data_row8_1997[:, None])
kde2 = KernelDensity(kernel='epanechnikov',
                     bandwidth=0.4).fit(data_row8_2017[:, None])


def f_kde1(x):
    return np.exp(kde1.score_samples([[x]]))


def f_kde2(x):
    return np.exp(kde2.score_samples([[x]]))


# Remember: score_samples returns the log of the probability density.
p1 = np.exp(kde1.score_samples(x1[:, None]))
p2 = np.exp(kde2.score_samples(x2[:, None]))
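
Since both densities are evaluated on the same grid, a natural follow-up (an assumption about intent, not part of the original script) is to compare the two years through the overlap of their estimated densities:

# overlapping area of the two KDEs; 1.0 means identical distributions
overlap = np.trapz(np.minimum(p1, p2), x1)
print('density overlap 1997 vs 2017: %.3f' % overlap)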
Beispiel #59
0
def window_analysis(Windows,
                    ref_labels,
                    labels1,
                    Chr=1,
                    ncomp=4,
                    amova=True,
                    supervised=True,
                    include_who=[],
                    range_sample=[130, 600],
                    rand_sample=0,
                    clsize=15,
                    cl_freqs=5,
                    Bandwidth_split=20,
                    quantile=0.1,
                    centre_d=True,
                    PC_sel=0):

    kde_class_labels = labels1
    kde_label_dict = {
        z:
        [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in list(set(kde_class_labels))
    }

    if include_who:
        include = [
            x for x in range(len(kde_class_labels))
            if kde_class_labels[x] in include_who
        ]
        ref_labels = include_who
        kde_class_labels = [kde_class_labels[x] for x in include]

        kde_label_dict = {
            z: [
                x for x in range(len(kde_class_labels))
                if kde_class_labels[x] == z
            ]
            for z in include_who
        }

    if rand_sample:
        sample = rand_sample
        sample_range = [0, sample]
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in np.random.choice(
                    list(Windows[Chr].keys()), sample, replace=True)
            }
        }

    if range_sample:
        sample_range = range_sample
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in list(sorted(Windows[Chr].keys()))
                [sample_range[0]:sample_range[1]]
            }
        }

    Results = {'header': ['Chr', 'window'], 'info': [], 'coords': []}

    Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []}

    pc_density = []
    pc_coords = []

    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        if Sequences.shape[1] <= 3:
            Results['info'].append([Chr, c, 0, 0])
            print('skipping window {}: fewer than 4 sites'.format(c))
            continue

        Sequences = np.nan_to_num(Sequences)

        pca = PCA(n_components=ncomp, whiten=False,
                  svd_solver='randomized').fit(Sequences)
        data = pca.transform(Sequences)

        from sklearn.preprocessing import scale

        if include_who:
            data = data[include, :]

        ##### PC density
        PC = PC_sel

        pc_places = data[:, PC]

        if centre_d:
            pc_places = scale(pc_places, with_std=False)

        X_plot = np.linspace(-8, 8, 100)

        Focus_labels = list(range(data.shape[0]))

        bandwidth_pc = estimate_bandwidth(pc_places.reshape(-1, 1),
                                          quantile=quantile,
                                          n_samples=len(pc_places))
        if bandwidth_pc <= 1e-3:
            bandwidth_pc = 0.01

        bandwidth = estimate_bandwidth(data,
                                       quantile=quantile,
                                       n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            bandwidth = 0.01

        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth_pc).fit(
            np.array(pc_places).reshape(-1, 1))

        log_dens = kde.score_samples(X_plot.reshape(-1, 1))

        pc_density.append(np.exp(log_dens))
        pc_coords.append(pc_places)

        PC_var['coords'].append([Chr, c])
        PC_var['info'].append([x for x in pca.explained_variance_])
        ###
        params = {
            'bandwidth': np.linspace(np.min(data), np.max(data),
                                     Bandwidth_split)
        }
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params,
                            verbose=0)

        ######################################
        ####### TEST global Likelihood #######
        ######################################

        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth

        ms = MeanShift(bandwidth=bandwidth,
                       cluster_all=False,
                       min_bin_freq=clsize)
        ms.fit(data[Focus_labels, :])
        labels = ms.labels_

        Tree = {
            x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
            for x in [g for g in list(set(labels)) if g != -1]
        }
        Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

        Tree = {x: Tree[x] for x in Keep}
        Ngps = len(Tree)
        SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

        these_freqs = []
        ### Extract MScluster likelihood by sample

        for hill in SpaceX.keys():

            if len(Tree[hill]) >= cl_freqs:
                if not supervised:
                    cl_seqs = Sequences[Tree[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

            grid.fit(data[Tree[hill], :])

            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_

            P_dist = kde.score_samples(data[Tree[hill], :])
            Dist = kde.score_samples(data)
            P_dist = np.nan_to_num(P_dist)
            Dist = np.nan_to_num(Dist)

            if np.std(P_dist) == 0:
                Dist = np.array(
                    [int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                Dist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Dist)
            Dist = np.nan_to_num(Dist)

            Construct['coords'].append([Chr, c, hill])
            Construct['info'].append(Dist)

            #########################################
        ############# AMOVA ################
        #########################################

        if supervised:
            labels = [x for x in kde_class_labels if x in ref_labels]
            Who = [
                z for z in it.chain(*[kde_label_dict[x] for x in ref_labels])
            ]
            Ngps = len(ref_labels)

            #print(ref_labels)
            for hill in ref_labels:

                if len(kde_label_dict[hill]) >= cl_freqs:
                    if include_who:
                        Seq_specific = Sequences[include, :]
                    else:
                        Seq_specific = Sequences

                    cl_seqs = Seq_specific[kde_label_dict[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

        else:
            Who = [
                x for x in range(len(labels))
                if labels[x] != -1 and labels[x] in Keep
            ]
            labels = [labels[x] for x in Who]
            Who = [Focus_labels[x] for x in Who]

        #
        if len(these_freqs) > 1:
            Pairwise = return_fsts2(np.array(these_freqs))
            sim_fst.extend(Pairwise.fst)

        if len(list(set(labels))) == 1:
            Results['info'].append([Chr, c, 0, 1])
            #Results['info'].append([AMOVA,Ngps])
            continue

        if amova:
            clear_output()
            AMOVA, Cig = AMOVA_FM42(data[Who, :],
                                    labels,
                                    n_boot=0,
                                    metric='euclidean')
            print('counting: {}, Ngps: {}'.format(AMOVA, Ngps))
            Results['info'].append([Chr, c, AMOVA, Ngps])

    Results['info'] = pd.DataFrame(
        np.array(Results['info']),
        columns=['chrom', 'window', 'AMOVA', 'Ngps'])

    if len(sim_fst) > 3:
        X_plot = np.linspace(0, .3, 100)

        freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(
            np.array(sim_fst).reshape(-1, 1))

        log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1))

        fig_roost_dens = [
            go.Scatter(x=X_plot,
                       y=np.exp(log_dens),
                       mode='lines',
                       fill='tozeroy',
                       name='',
                       line=dict(color='blue', width=2))
        ]
        ##

        layout = go.Layout(
            title='allele frequency distribution across clusters',
            yaxis=dict(title='density'),
            xaxis=dict(title='fst'))

        fig = go.Figure(data=fig_roost_dens, layout=layout)

    else:
        fig = []

    return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
Beispiel #60
0
class Watershed:
    """Watershed segemention for two-dimensional data.

    Performs probability density estimation of data, then applies watershed
    to segement into discrete regions.

    Basically a wrapper on sklearn to manage preprocessing and store data.

    Args:
        prune_outliers (bool): To prune or not to prune.
        outlier_neighbors (int): Number of neighbors to use.
        outlier_thresholds (float): Outlier threshold.
        bandwidth (float): KDE bandwidth.
        ngrid (int): KDE grid resolution.
        ngrid_pad (float): Grid inset padding.
        peak_min_distance (int): Minimum number of pixels separating peaks.
        peak_threshold_rel (float): Minimum relative intensity of peaks.
        peak_dialation (float): Peak dialation factor.
        compactness (float): Compactness factor for compact watershed.

    Attributes:
        lof_: LocalOutlierFactor.
        kde_: KernelDensity estimator.
        Z_crop_: Z cropped for outliers.
        Z_norm_: Z normalized and inset.
        Z_labels_ (np array, shape (n_samples,)): Segment labels.
        P_ (np array, shape (ngrid, ngrid)): Probabilty density map.
        P_peaks_ (np array, shape (ngrid, ngrid)): KDE peaks.
        P_edt_ (np array, shape (ngrid, ngrid)): Euclidean distance transform.
        P_labels_ (np array, shape (ngrid, ngrid)): KDE Labels.
        P_bounds_ (np array, shape (ngrid, ngrid)): KDE Bounds .
        peaks_ (np array, shape (num_peaks, 2)): List of KDE peaks.
    """
    def __init__(
        self,
        prune_outliers=False,
        outlier_neighbors=1,
        outlier_threshold=0.7,
        bandwidth=1.0 / 40,
        ngrid=600,
        ngrid_pad=0.07,
        peak_min_distance=10,
        peak_threshold_rel=0.1,
        peak_dialation=2,
        compactness=0.01,
    ):
        self.prune_outliers = prune_outliers
        self.outlier_neighbors = outlier_neighbors
        self.outlier_threshold = outlier_threshold
        self.bandwidth = bandwidth
        self.ngrid = ngrid
        self.ngrid_pad = ngrid_pad
        self.peak_min_distance = peak_min_distance
        self.peak_threshold_rel = peak_threshold_rel
        self.peak_dialation = peak_dialation
        self.compactness = compactness

    def segment(self, Z, verbose=True):
        """Fit the model using Z as data to be segmented.

        Args:
            Z (np array, shape (n_samples, 2)): Data to be segmented.
            verbose (bool): Verbosity.

        Returns:
            labels (np array, shape (n_samples,)): Segment label for each sample.
        """
        if verbose:
            print('Segmenting regions using watershed...')
            print('- num samples: {}'.format(len(Z)))

        # outliers
        if self.prune_outliers:
            if verbose:
                print('- pruning outliers')
            self.lof_ = LocalOutlierFactor(n_neighbors=self.outlier_neighbors,
                                           contamination=0.1)
            lof_pred = self.lof_.fit_predict(Z)
            lof_scores = self.lof_.negative_outlier_factor_
            lof_scores = minmax_scale(lof_scores)
            self.Z_crop_ = Z[lof_scores > self.outlier_threshold]
            self.Z_left_ = np.where(lof_scores > self.outlier_threshold)
            num_outliers = Z.shape[0] - self.Z_crop_.shape[0]
            print('-> outliers pruned: {}'.format(num_outliers))
        else:
            self.Z_crop_ = Z

        # normalize Z and inset
        self.Z_norm_ = minmax_scale(
            self.Z_crop_,
            feature_range=(0 + self.ngrid_pad, 1 - self.ngrid_pad),
            axis=0,
        )

        # estimate probability density using a Gaussian kernel
        if verbose:
            print('- performing KDE')
        self.kde_ = KernelDensity(kernel='gaussian',
                                  bandwidth=self.bandwidth).fit(self.Z_norm_)

        # convert density estimate to an image of probs and normalize
        if verbose:
            print('- scoring KDE')
        x, y = np.meshgrid(np.linspace(0, 1, self.ngrid),
                           np.linspace(0, 1, self.ngrid))
        log_dens = self.kde_.score_samples(
            np.array((x.flatten(), y.flatten())).T)
        self.P_ = np.reshape(log_dens, (self.ngrid, self.ngrid))
        self.P_ = np.exp(self.P_) / np.max(np.exp(self.P_))

        # find peaks
        if verbose:
            print('- finding peaks')
        self.peaks_ = peak_local_max(
            self.P_,
            min_distance=self.peak_min_distance,
            threshold_rel=self.peak_threshold_rel,
            exclude_border=False,
        )

        # convert peaks to image and dialate
        self.P_peaks_ = np.ones_like(self.P_)
        for peak in self.peaks_:
            for i in range(-self.peak_dialation, self.peak_dialation + 1):
                for j in range(-self.peak_dialation, self.peak_dialation + 1):
                    self.P_peaks_[(peak[0] + i, peak[1] + j)] = 0

        # euclidean distance transform
        if verbose:
            print('- computing edt')
        self.P_edt_ = ndi.distance_transform_edt(self.P_peaks_)

        # perform watershed on edt
        if verbose:
            print('- performing watershed on edt')
        markers = ndi.label(1 - self.P_peaks_)[0]  # use peaks as seed markers
        self.P_labels_ = watershed(self.P_edt_,
                                   markers,
                                   compactness=self.compactness)

        # find boundaries
        if verbose:
            print('- finding boundaries')
        self.P_bounds_ = find_boundaries(self.P_labels_)

        # find labels for Zs
        indices = np.round(self.Z_norm_ * self.ngrid).astype(int)
        self.Z_labels_ = self.P_labels_[indices[:, 1], indices[:,
                                                               0]]  # swap axes

        if verbose:
            print('-> num regions found: {}'.format(len(self.peaks_)))

        return self.Z_labels_
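
A hypothetical usage sketch for the class above, with 2-D points drawn from a few synthetic blobs (assumes the snippet's dependencies, e.g. scikit-image and scipy.ndimage, are installed):

import numpy as np

rng = np.random.RandomState(0)
Z = np.vstack([rng.normal(loc, 0.05, size=(200, 2))
               for loc in ([0.2, 0.2], [0.8, 0.3], [0.5, 0.8])])

ws = Watershed(bandwidth=1.0 / 40, ngrid=300)
labels = ws.segment(Z)
print('regions found:', len(np.unique(labels)))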