Example #1
File: ch6.py Project: syting/esl
def figure_6_14():
    """Reproduces figure 6.14 in ESLii displaying a density estimate for sbp
    levels in chd/no-chd groups using a Gaussian kernel density estimate
    """
    sa = eslii.read_sa_heart_data()
    sbp = sa["sbp"]
    sbp_chd = sa[sa["chd"] == 1]["sbp"].sort_values()
    sbp_no_chd = sa[sa["chd"] == 0]["sbp"].sort_values()

    kde_chd = KernelDensity(kernel='gaussian', bandwidth=7.5).fit(
        sbp_chd.values.reshape(-1, 1))
    chd_log_dens = kde_chd.score_samples(sbp_chd.values.reshape(-1, 1))
    plt.subplot(121)
    plt.plot(sbp_chd, np.exp(chd_log_dens), label="CHD")

    kde_no_chd = KernelDensity(kernel='gaussian', bandwidth=7.5).fit(
        sbp_no_chd.values.reshape(-1, 1))
    no_chd_log_dens = kde_no_chd.score_samples(
        sbp_no_chd.values.reshape(-1, 1))
    plt.plot(sbp_no_chd, np.exp(no_chd_log_dens), label="no CHD")
    plt.legend(loc='best')

    sbp_range = np.linspace(min(sbp), max(sbp), 100).reshape((100, 1))
    chd_dens = np.exp(kde_chd.score_samples(sbp_range))
    no_chd_dens = np.exp(kde_no_chd.score_samples(sbp_range))
    p_chd = float(len(sbp_chd))/(len(sbp_chd) + len(sbp_no_chd))
    posterior_est = [p_chd * chd_dens[i] /
                     (p_chd * chd_dens[i] + (1 - p_chd) * no_chd_dens[i])
                     for i in range(len(sbp_range))]
    plt.subplot(122)
    plt.plot(sbp_range, posterior_est)
    plt.show()
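Note: since chd_dens and no_chd_dens are NumPy arrays, the list comprehension above can be collapsed into a single vectorized Bayes-rule step; a minimal sketch of the equivalent computation:

# Vectorized equivalent of the posterior_est list comprehension:
# P(chd | sbp) = pi * f1(sbp) / (pi * f1(sbp) + (1 - pi) * f0(sbp))
posterior_est = p_chd * chd_dens / (p_chd * chd_dens + (1 - p_chd) * no_chd_dens)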
Example #2
def cistrans(args):
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_cistrans'.format(cob.name)
    # np.newaxis adds an empty axis in that position of the slice
    # the sklearn module requires the values to be in the rows:
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    cis = cob.coex \
            .score[cob.coex.distance <= args.cis_distance]\
            .values[:,np.newaxis]
    trans = cob.coex\
            .score[np.isinf(cob.coex.distance)]\
            .values[:,np.newaxis]
    X_plot = np.linspace(-10,10,1000)[:,np.newaxis]
    print('Found {:,} cis interactions and {:,} trans interactions'.format(
        cis.shape[0], trans.shape[0]))
    # Fit the kernel
    kd = KernelDensity(bandwidth=0.2)
    kd.fit(cis)
    cis_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot,cis_kde,alpha=0.5,label='Cis Interactions')
    # Fit the trans KDE (on the first 50,000 interactions)
    kd.fit(trans[0:50000])
    trans_kde = np.exp(kd.score_samples(X_plot))
    plt.fill(X_plot,trans_kde,alpha=0.5,label='Trans Interactions')
    plt.legend()
    plt.title('Cis vs Trans Density: {}'.format(cob.name))
    # Calculate the mann whitney U test
    u, pval = sp.stats.mannwhitneyu(cis[:,0], trans[:,0])
    print('P-val: {}'.format(pval))
    plt.savefig(args.out+'.png')
Example #3
    def plot_samples(self, folder = '', title = ''):
        print("Saving plots...")
        numbins = int(8 * math.log(len(self.samples[self.store_list[0]])))
        
        for p in self.store_list:
            if p == 'D':
                continue
            samples = np.array([self.samples[p]]).T
            a = np.min(samples)
            b = np.max(samples)
            band = 0.1 * (b-a + 0.001)
            kde = KD(kernel='gaussian', bandwidth=band).fit(samples)
            n, bins, patches = plt.hist(self.samples[p], numbins, density=True)
            log_dens = kde.score_samples(np.array([bins]).T)
            plt.plot(bins, np.exp(log_dens), 'r-')
            MAP = self.get_MAP(kde, a, b)
            self.params['MAP'][p] = MAP
            plt.plot([MAP], np.exp(kde.score_samples(np.array([[MAP]]))), 'go')
            plt.title(title + " MAP estimate: " + str(MAP))
            plt.ylabel("Posterior(" + p + ")")
            plt.xlabel(p)
            x1,x2,y1,y2 = plt.axis()
            plt.axis((-3,3,y1,y2))
            if p == 'L' or p == 'T':
                plt.axis((0,1,y1,y2))
            plt.savefig(folder + p + "_" + title)
            plt.clf()
        
        self.params['MAP']['D'] = self.params['D']
        if not self.bkt:
            print("Working on difficulty params...")

            p = 'D'

            data = np.array(self.samples[p])
            for j in range(self.data['num_problems']):
                samples = np.array([data[:,j]]).T
                #print samples
                a = np.min(samples)
                b = np.max(samples)
                band = 0.1 * (b-a + 0.001)
                kde = KD(kernel='gaussian', bandwidth=band).fit(samples)
                n, bins, patches = plt.hist(samples, numbins, density=True)
                log_dens = kde.score_samples(np.array([bins]).T)
                plt.plot(bins, np.exp(log_dens), 'r-')
                MAP = self.get_MAP(kde, a, b)
                self.params['MAP']['D'][j] = MAP
                plt.plot([MAP], np.exp(kde.score_samples(np.array([[MAP]]))), 'go')
                plt.title(title + " MAP estimate: " + str(MAP))
                plt.ylabel("Posterior(" + p + ")")
                plt.xlabel("Problem " + str(j))
                x1,x2,y1,y2 = plt.axis()
                plt.axis((-3,3,y1,y2))
                plt.savefig(folder + "Difficulty/problem" + str(j) +  "_" + title)
                plt.clf()

        print("Plots saved!")
Example #4
class TwoClassKDE(object):
    """Class for Kernel Density Estimator on two labels. Likelihood ratio at a point is ratio of class-1 likelihood estimate to class-0 likelihood estimate, times the class odds, where this is calculated as the posterior mean estimate under Beta(1, 1) prior, given the observations. If no points are observed for one of the classes, a default (improper) uniform prior is assumed for that class. """
    def __init__(self, **kwargs):
        """Takes same parameters as KernelDensity estimator."""
        self.kde0 = KernelDensity(**kwargs)
        self.kde1 = KernelDensity(**kwargs)
    def fit(self, X, y):
        """Fits KDE models on the data. X is array of data points, y is array of 0-1 labels."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            self.kde0.fit(X0)
        if (self.n1 > 0):
            self.kde1.fit(X1)
    def fit_with_optimal_bandwidth(self, X, y, gridsize = 101, dynamic_range = 100, cv = 10, verbose = 0, n_jobs = 1):
        """Determines optimal bandwidth using the following strategy: For each subset (0 or 1) of the dataset, 1) set b = 1.06 * sigma * n^(-1/5), the Silverman's rule of thumb estimate for the optimal bandwidth. sigma is the sample standard deviation of the samples after zero-centering the columns (note: ideally each column will have comparable variance), 2) set up a grid (of size gridsize) of bandwidth values to try, ranging from b / alpha to b * alpha in geometric progression, where alpha = sqrt(dynamic_range), 3) compute average likelihood of the estimator on the data using cv-fold cross-validation, 4) select the bandwidth with the highest likelihood."""
        y = np.asarray(y, dtype = int)
        self.n0, self.n1 = (y == 0).sum(), (y == 1).sum()
        assert (self.n0 + self.n1 == len(y)), "y must be vector of 1's and 0's."
        X0, X1 = X[y == 0], X[y == 1]
        if (self.n0 > 0):
            log_b0 = np.log(1.06) + np.log((X0 - X0.mean(axis = 0)).std()) - 0.2 * np.log(self.n0)
            grid0 = GridSearchCV(self.kde0, {'bandwidth' : np.exp(np.linspace(log_b0 - 0.5 * np.log(dynamic_range), log_b0 + 0.5 * np.log(dynamic_range), gridsize))}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid0.fit(X0)
            self.kde0 = grid0.best_estimator_
        if (self.n1 > 0):
            log_b1 = np.log(1.06) + np.log((X1 - X1.mean(axis = 0)).std()) - 0.2 * np.log(self.n1)
            grid1 = GridSearchCV(self.kde1, {'bandwidth' : np.exp(np.linspace(log_b1 - 0.5 * np.log(dynamic_range), log_b1 + 0.5 * np.log(dynamic_range), gridsize))}, cv = cv, verbose = verbose, n_jobs = n_jobs)
            grid1.fit(X1)
            self.kde1 = grid1.best_estimator_    
    def get_params(self, **kwargs):
        return self.kde0.get_params(**kwargs)
    def set_params(self, **params):
        self.kde0.set_params(**params)
        self.kde1.set_params(**params)
        return self
    def score_samples(self, X):
        """Evaluate the density model on the data. Returns vector of log-likelihood ratios of class 1 over class 0."""
        p1_est = (self.n1 + 1) / (self.n0 + self.n1 + 2)
        class_log_odds = np.log(p1_est) - np.log(1 - p1_est)
        scores0 = self.kde0.score_samples(X) if (self.n0 > 0) else np.zeros(len(X), dtype = float)
        scores1 = self.kde1.score_samples(X) if (self.n1 > 0) else np.zeros(len(X), dtype = float)
        return scores1 - scores0 + class_log_odds
    def score(self, X, y = None):
        """Compute the overall log-likelihood ratio under the model."""
        return self.score_samples(X).sum()
    def predict_proba(self, X):
        """Probability estimates."""
        scores = self.score_samples(X)
        p0s = 1 / (1 + np.exp(scores))
        return np.array([p0s, 1 - p0s]).transpose()
    def predict_log_proba(self, X):
        """Log of probability estimates."""
        return np.log(self.predict_proba(X))
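A minimal usage sketch for TwoClassKDE on synthetic 1-D data (the data and parameter values are illustrative, not from the original source):

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (50, 1)), rng.normal(3, 1, (50, 1))])
y = np.array([0] * 50 + [1] * 50)

clf = TwoClassKDE(kernel='gaussian', bandwidth=0.5)
clf.fit(X, y)
# columns are P(y=0 | x) and P(y=1 | x)
print(clf.predict_proba(np.array([[0.0], [3.0]])))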
def pdf_estimate(images, labels, W, method, t):
	"""
	Uses kernel density estimation to compute the pdf of neural activation data.

	Args:
		images (numpy array): input images
		labels (numpy array): input labels associated with the neuron activations
		W (numpy array): weights of the hidden neurons
		method (str): method to approximate the pdf
		t (float): temperature of the softmax when the network was trained

	Returns:
		(list of regressor or kde objects): list of marginal pdfs
		(regressor or kde object): pdf
		(numpy array): labels of the data points used to compute the pdf (useful to compute prior)
	"""

	classes = np.unique(labels)
	n_classes = len(np.unique(labels))
	n_trials = len(labels)

	""" computes the activation of the hidden neurons for the given input images """
	activ = ex.propagate_layerwise(images, W, t=t)

	n_subsample = 1000 #number of data points to use to compute the pdf in the 'subsample' and 'fit' methods
	subsample_idx = np.random.choice(n_trials, size=n_subsample, replace=False)
	activ_subs = activ[subsample_idx, :]

	n_train_fit = 500 # number of data points to use to fit the pdf in the 'fit' method
	train_fit_idx = np.random.choice(n_trials, size=n_train_fit, replace=False)
	activ_fit = activ[train_fit_idx, :]

	if method=='full':
		pdf_labels = np.copy(labels)
		pdf_evidence = KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ)
		pdf_marginals = []
		for c in classes:
			pdf_marginals.append(KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ[pdf_labels==c]))

	if method=='subsample':
		pdf_labels = labels[subsample_idx]
		pdf_evidence = KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ_subs)
		pdf_marginals = []
		for c in classes:
			pdf_marginals.append(KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ_subs[pdf_labels==c]))

	if method=='fit':
		pdf_labels = labels[subsample_idx]
		pdf_evidence_full = KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ_subs)
		pdf_evidence = KNeighborsRegressor().fit(activ_fit, pdf_evidence_full.score_samples(activ_fit))
		pdf_marginals = []
		for c in classes:
			pdf_marginal_full = KernelDensity(bandwidth=5e-1, kernel='gaussian', rtol=1e-100).fit(activ_subs[pdf_labels==c])
			pdf_marginals.append(KNeighborsRegressor().fit(activ_fit, pdf_marginal_full.score_samples(activ_fit)))

	return pdf_marginals, pdf_evidence, pdf_labels
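The 'fit' branch above approximates the KDE by regressing its log-density onto a nearest-neighbour model, trading accuracy for faster evaluation; a self-contained sketch of that idea (synthetic data, illustrative names):

import numpy as np
from sklearn.neighbors import KernelDensity, KNeighborsRegressor

rng = np.random.RandomState(0)
activ = rng.normal(size=(1000, 5))  # stand-in for hidden-neuron activations

kde = KernelDensity(bandwidth=0.5).fit(activ)       # exact, but slow to evaluate
fit_points = activ[:500]
fast_pdf = KNeighborsRegressor().fit(fit_points, kde.score_samples(fit_points))

# the regressor now approximates the KDE log-density at new points
print(fast_pdf.predict(rng.normal(size=(3, 5))))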
    def initialize_optimization_plot(self):

        if self.dataset is None:
            print("Set data first before initializing plot options!")
            return


        if self.parameters is None:
            print("Parameter needs to be set!")
            return

        ab_list = [
            AB_INDICES['A-A'],
            AB_INDICES['C-C'],
            AB_INDICES['E-R'],
            AB_INDICES['R-E'],
            AB_INDICES['K-E'],
            AB_INDICES['E-E'],
            AB_INDICES['K-K'],
            AB_INDICES['K-R'],
            AB_INDICES['V-I'],
            AB_INDICES['I-L'],
            AB_INDICES['S-T'],
            AB_INDICES['S-S'],
            AB_INDICES['K-P'],
            AB_INDICES['N-N'],
            AB_INDICES['W-W'],
            AB_INDICES['G-F']
        ]

        couplings_contacts, couplings_noncontacts, avg_lambda_pair = self.dataset.get_decoy_set(size=self.size_evaluationset)
        self.evaluation_set['contact'] = np.array(couplings_contacts).transpose()
        self.evaluation_set['bg'] = np.array(couplings_noncontacts).transpose()

        bandwidth = 0.01
        self.evaluation_set_kde = {}
        self.evaluation_set_kde['x_grid'] = np.linspace(-0.5, 0.5, 500)
        self.evaluation_set_kde['contact'] = {}
        self.evaluation_set_kde['bg'] = {}

        # kernel density estimate for couplings wijab
        for ab in ab_list:
            kde_contact = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(self.evaluation_set['contact'][ab].reshape(-1, 1))
            kde_bg = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(self.evaluation_set['bg'][ab].reshape(-1, 1))

            ### add empirical distribution for example data points
            self.evaluation_set_kde['contact'][ab] = np.exp(kde_contact.score_samples(self.evaluation_set_kde['x_grid'].reshape(-1, 1)))
            self.evaluation_set_kde['bg'][ab] = np.exp(kde_bg.score_samples(self.evaluation_set_kde['x_grid'].reshape(-1, 1)))

        #sample points according to regularizer
        std_dev = np.sqrt(1.0/avg_lambda_pair)
        regularizer = np.random.normal(scale=std_dev, size=10000)
        kde_reg = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(regularizer.reshape(-1, 1))
        self.evaluation_set_kde['regularizer'] = np.exp(kde_reg.score_samples(self.evaluation_set_kde['x_grid'].reshape(-1, 1)))
Example #7
def test1():
	X = [[1], [2], [4], [3], [2], [8], [8], [9], [10], [12], [11], [9]]
	kde = KernelDensity(kernel='gaussian', bandwidth=0.4).fit(X)
	scores = np.exp(kde.score_samples(X))
	print(scores)
	Y = [[1], [2], [2], [1], [5], [6], [6], [7], [9], [10], [8], [7]]
	density = np.exp(kde.score_samples(Y))
	print(density)
Example #8
def get_P_binary_v_tot(proj_sep, delta_v_tot, num_sys=100000):
    """ This function calculates the probability of a
    random star having the observed proper motion

    Parameters
    ----------
    proj_sep : float
        Projected separation between two stars
    delta_v_tot : float
        Total velocity difference between two stars
    num_sys : int, optional
        Number of binary systems to simulate if the binary catalog
        has not been generated yet

    Returns
    -------
    P(proj_sep, delta_v_tot) : float
        Probability that angular separation, pm+RV difference
        is due to a genuine binary
    """

    # Catalog check
    global binary_set

    if binary_set is None:
        generate_binary_set(num_sys=num_sys)

    # Use a Gaussian KDE
    global binary_v_tot_kde
    # We work in log space for the set of binaries

    if binary_v_tot_kde is None:
        kwargs = {'kernel':'tophat'}
        binary_v_tot_kde = KernelDensity(bandwidth=0.1, **kwargs)
        binary_v_tot_kde.fit( np.array([np.log10(binary_set['proj_sep']), np.log10(binary_set['delta_v_tot'])]).T )

    if isinstance(delta_v_tot, np.ndarray) and isinstance(proj_sep, np.ndarray):
        values = np.array([np.log10(proj_sep), np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))

    elif isinstance(delta_v_tot, np.ndarray):
        values = np.array([np.log10(proj_sep)*np.ones(len(delta_v_tot)), np.log10(delta_v_tot)]).T
        prob_binary = np.exp(binary_v_tot_kde.score_samples(values))
    else:
        prob_binary = np.exp(binary_v_tot_kde.score_samples(
            np.array([[np.log10(proj_sep), np.log10(delta_v_tot)]])))


    # Convert back from log10-space to linear-space
    # the log(10) terms convert from log10 to ln
    prob_binary = prob_binary / (proj_sep*np.log(10.)) / (delta_v_tot*np.log(10.))

    return prob_binary
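The final division is the change of variables from log10-space back to linear space: if y = log10(x), then p_X(x) = p_Y(log10 x) / (x ln 10). A quick numerical check of that identity (synthetic data, not from the original source):

import numpy as np
from scipy.stats import lognorm
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
x = rng.lognormal(mean=0.0, sigma=0.5, size=200000)

# KDE of the base-10 log of the data
kde = KernelDensity(bandwidth=0.02).fit(np.log10(x)[:, None])
x0 = 1.5
p_log10 = np.exp(kde.score_samples(np.array([[np.log10(x0)]])))[0]
p_linear = p_log10 / (x0 * np.log(10.0))
print(p_linear, lognorm.pdf(x0, s=0.5))  # the two values should be close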
def plot_agglomerative():
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    m = 16
    k = 3
    X, y = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3, random_state=2255)
    agg = AgglomerativeClustering(n_clusters=3)

    eps = X.std() / 2.

    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)]

    ax = plt.gca()
    for i, x in enumerate(X):
        ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    for i in range((m-1)):
        agg.n_clusters = X.shape[0] - i
        agg.fit(X)

        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] > 1:
                points = X[agg.labels_ == cluster]
                other_points = X[agg.labels_ != cluster]

                kde = KernelDensity(bandwidth=0.9).fit(points)
                scores = kde.score_samples(gridpoints)
                score_inside = np.min(kde.score_samples(points))
                score_outside = np.max(kde.score_samples(other_points))
                levels = .80 * score_inside + .20 * score_outside
                ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels],
                           colors='k', linestyles='solid', linewidths=0.8)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def draw_posterior_kld_hist(X_kld, X_vae, f_name, bins=25):
    """
    Plot KDE-smoothed histograms.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel('Posterior KLd Density')
    ax.set_title('Posterior KLds: Over-regularized vs. Standard')
    for (X, style, label) in [(X_kld, '-', 'ORK'), (X_vae, '--', 'VAR')]:
        X_samp = X.ravel()[:,np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range/4.0)
        plot_max = X_max + (X_range/4.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style, label=label)
    ax.legend()
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w', \
        orientation='portrait', format='pdf', \
        transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new
    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan', bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
Example #12
def kde_sklearn(data, grid, **kwargs):
    """
    Kernel Density Estimation with Scikit-learn

    Parameters
    ----------
    data : numpy.array
        Data points used to compute a density estimator. It
        has `n x p` dimensions, representing n points and p
        variables.
    grid : numpy.array
        Data points at which the density will be estimated. It
        has `m x p` dimensions, representing m points and p
        variables.

    Returns
    -------
    out : numpy.array
        Density estimate. Has `m x 1` dimensions
    """
    kde_skl = KernelDensity(**kwargs)
    kde_skl.fit(data)
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(grid)
    return np.exp(log_pdf)
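A minimal usage sketch for kde_sklearn (the values are illustrative):

import numpy as np

data = np.random.normal(size=(500, 2))          # n = 500 points, p = 2 variables
grid = np.random.uniform(-3, 3, size=(100, 2))  # m = 100 evaluation points
dens = kde_sklearn(data, grid, kernel='gaussian', bandwidth=0.4)
print(dens.shape)  # (100,)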
def find_kernel(data, numgrid=1000, bw=0.002):
	Xtrain = data[:, 0:2]
	ytrain = data[:, 2]
	# Set up the data grid for the contour plot
	xgrid = np.linspace(-74.1, -73.65, numgrid)
	ygrid = np.linspace(40.5, 40.8, numgrid)
	X, Y = np.meshgrid(xgrid, ygrid)

	xy = np.vstack([Y.ravel(), X.ravel()]).T

	# Plot map with the density distribution
	fig = plt.figure()
	# construct a kernel density estimate of the distribution
	kde = KernelDensity(bandwidth=bw, kernel='gaussian')
	kde.fit(Xtrain)  # KernelDensity ignores labels, so ytrain goes unused

	# evaluate the density on the grid
	Z = np.exp(kde.score_samples(xy))
	Z = Z.reshape(X.shape)

	# plot contours of the density
	levels = np.linspace(0, Z.max(), 25)
	plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
	plt.title('BK CRIME')
	plt.show()
	return Z
Example #14
def sklearn_density(sample_points, evaluation_points):
    """
    Estimate the probability density function from which a set of sample
    points was drawn and return the estimated density at the evaluation points.
    """
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = sample_points.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # Standardize data so that we can use uniform bandwidth.
    # Note that we will need to scale the resulting density by sigma to
    # correct the area.
    mu, sigma = mean(sample_points, axis=0), std(sample_points, axis=0)
    data, points = (sample_points - mu)/sigma, (evaluation_points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)/np.prod(sigma)  # undo the x scaling on the data points
Example #15
def sklearn_kde(data, points):
    from sklearn.neighbors import KernelDensity

    # Silverman bandwidth estimator
    n, d = data.shape
    bandwidth = (n * (d + 2) / 4.)**(-1. / (d + 4))

    # standardize data so that we can use uniform bandwidth
    mu, sigma = mean(data, axis=0), std(data, axis=0)
    data, points = (data - mu)/sigma, (points - mu)/sigma

    #print("starting grid search for bandwidth over %d points"%n)
    #from sklearn.grid_search import GridSearchCV
    #from numpy import logspace
    #params = {'bandwidth': logspace(-1, 1, 20)}
    #fitter = GridSearchCV(KernelDensity(), params)
    #fitter.fit(data)
    #kde = fitter.best_estimator_
    #print("best bandwidth: {0}".format(kde.bandwidth))
    #import time; T0 = time.time()
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth,
                        rtol=1e-6, atol=1e-6)
    #print("T:%6.3f   fitting"%(time.time()-T0))
    kde.fit(data)
    #print("T:%6.3f   estimating"%(time.time()-T0))
    log_pdf = kde.score_samples(points)
    #print("T:%6.3f   done"%(time.time()-T0))
    return exp(log_pdf)
def get_density_based_best_sample(X, known_votes, possibilities):
  total_votes = sum(len(votes) for votes in known_votes)
  print(total_votes)
  X = X.toarray()
  current_vectors = numpy.copy(X)
  #print 'X', X
  #print 'known_votes ', known_votes
  original_docs = len(X)
  possibilities = set([x[0] for x in possibilities])
  #print possibilities

  for i, sample in enumerate(known_votes):
    for k in range(len(sample)):
      current_vectors = numpy.append(current_vectors, [X[i]], axis=0)
  #print 'current_vectors ', current_vectors, len(current_vectors)
  #assert current_vectors != X
  model = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(current_vectors)
  scores = model.score_samples(X)
  
  if (total_votes % 3):
    #Explore low density regions
    sorted_scores = sorted(enumerate(scores), key = lambda x: x[1], reverse=True)
  else:
    #Exploit high density regions 1 times out of 3
    sorted_scores = sorted(enumerate(scores), key = lambda x: x[1])
  #print sorted_scores
  for i in range(original_docs):
    if sorted_scores[i][0] in possibilities:
      #print sorted_scores[i][0]
      return sorted_scores[i][0]
  return None
def plot_kde_histogram2(X1, X2, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X1/X2. Assume data is 1D.
    """
    import matplotlib.pyplot as plt
    # make a figure and configure an axis
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for (X, style) in [(X1, '-'), (X2, '--')]:
        X_samp = X.ravel()[:,np.newaxis]
        X_min = np.min(X_samp)
        X_max = np.max(X_samp)
        X_range = X_max - X_min
        sigma = X_range / float(bins)
        plot_min = X_min - (X_range/3.0)
        plot_max = X_max + (X_range/3.0)
        plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
        # make a kernel density estimator for the data in X
        kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
        ax.plot(plot_X, np.exp(kde.score_samples(plot_X)), linestyle=style)
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w', \
        orientation='portrait', format=None, \
        transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
Example #19
def xy_kde(xy,bandwidth,N_grid=100,levels=[0.8,0.6,0.4,0.2]):  
    
    x_edges = np.linspace(np.min(xy[:,0]),np.max(xy[:,0]),N_grid+1)
    y_edges = np.linspace(np.min(xy[:,1]),np.max(xy[:,1]),N_grid+1)
    x_centres = np.array([x_edges[b] + (x_edges[b+1]-x_edges[b])/2 
                          for b in range(N_grid)])
    y_centres = np.array([y_edges[b] + (y_edges[b+1]-y_edges[b])/2 
                          for b in range(N_grid)])
    x_grid, y_grid = np.meshgrid(x_centres,y_centres)
    xy_grid = np.array([np.ravel(x_grid),np.ravel(y_grid)]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid,N_grid))
    # this bit is taken from the corner_plot.py method.
    ######################################
    Hflat = H.flatten()
    inds = np.argsort(Hflat)[::-1]
    Hflat = Hflat[inds]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        try:
            V[i] = Hflat[sm <= v0][-1]
        except IndexError:
            V[i] = Hflat[0]
    #####################################
    V = np.sort(V)
    
    return H, V, x_grid, y_grid, bandwidth
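A sketch of how xy_kde's return values feed a contour plot, with V supplying the mass-fraction levels (usage assumed from the corner-plot convention referenced above):

import numpy as np
import matplotlib.pyplot as plt

xy = np.random.multivariate_normal([0, 0], [[1.0, 0.5], [0.5, 1.0]], size=5000)
H, V, x_grid, y_grid, bw = xy_kde(xy, bandwidth=0.2)
plt.contour(x_grid, y_grid, H, levels=V, colors='k')  # 80/60/40/20% mass contours
plt.show()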
def plot_kde_histogram(X, f_name, bins=25):
    """
    Plot KDE-smoothed histogram of the data in X. Assume data is univariate.
    """
    import matplotlib.pyplot as plt
    X = X.ravel()
    np.random.shuffle(X)
    X = X[0:min(X.shape[0], 1000000)]
    X_samp = X[:,np.newaxis]
    X_min = np.min(X_samp)
    X_max = np.max(X_samp)
    X_range = X_max - X_min
    sigma = X_range / float(bins)
    plot_min = X_min - (X_range/3.0)
    plot_max = X_max + (X_range/3.0)
    plot_X = np.linspace(plot_min, plot_max, 1000)[:,np.newaxis]
    # make a kernel density estimator for the data in X
    kde = KernelDensity(kernel='gaussian', bandwidth=sigma).fit(X_samp)
    # make a figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(plot_X, np.exp(kde.score_samples(plot_X)))
    fig.savefig(f_name, dpi=None, facecolor='w', edgecolor='w', \
        orientation='portrait', format=None, \
        transparent=False, bbox_inches=None, pad_inches=0.1)
    plt.close(fig)
    return
def plot_sklearn_kde(df, support, column='AirTime', bins=50):
    """
    Plots a KDE and a histogram using sklearn.KernelDensity.
    Uses Gaussian kernels.
    The optimal bandwidth is calculated according to Silverman's rule of thumb.

    Parameters
    ----------
    df: A pandas.DataFrame
    support: A 1-d numpy array.
             Input data points for the probability density function.

    Returns
    -------
    A matplotlib.axes.Axes instance.
    """

    bw = get_silverman_bandwidth(df, column)

    kde = KernelDensity(kernel='gaussian', bandwidth=bw)

    x = df[column].values

    kde.fit(x[:, np.newaxis])
    y = kde.score_samples(support[:, np.newaxis])

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(np.ravel(x), bins=bins, alpha=0.5, color=sns.xkcd_rgb["denim blue"], density=True)
    ax.plot(support, np.exp(y))
    ax.set_xlabel(column, fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    ax.set_title('Kernel Density Plot', fontsize=14)
    sns.despine(ax=ax, offset=5, trim=True)

    return ax
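plot_sklearn_kde calls a get_silverman_bandwidth helper that is not shown here; a plausible sketch under Silverman's rule of thumb for 1-D data (an assumption, not the project's actual helper):

import numpy as np

def get_silverman_bandwidth(df, column):
    # Hypothetical helper: Silverman's rule of thumb for a Gaussian kernel,
    # bw = 0.9 * min(std, IQR / 1.34) * n^(-1/5)
    x = df[column].dropna().values
    n = len(x)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    return 0.9 * min(x.std(ddof=1), iqr / 1.34) * n ** (-0.2)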
Example #22
def max_prob(df):
    df_tmp = df.copy()

    arr = []
    for ind in df_tmp.index:
        row = df_tmp.loc[ind]
        d = row.dropna().values
        # d = d.dropna()
        if len(d)==0:
            centre = np.nan
            arr.append(centre)
            continue

        # arr = vals.sort(axis=0)
        # df_ordered = pd.DataFrame(vals, index=df.index, columns=df.columns)

        x_grid = np.linspace(d.min(), d.max(), 50)
        x_grid = x_grid.reshape(-1,1)
        d = d.reshape(-1,1)

        kde = KernelDensity().fit(d)
        log_dens = kde.score_samples(x_grid)
        vals = np.exp(log_dens).round(4)
        centre = x_grid[vals.argmax()][0]
        centre2 = round(centre, 4)
        # TODO first element adds unnecessary decimal places (use decimal places class to fix)
        arr.append(centre2)
    return arr
Example #23
def test2():
    arr = np.concatenate((np.linspace(0, 10, 10), np.linspace(2, 4, 10), np.linspace(7, 10, 10)))[:, np.newaxis]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(arr)
    X = np.linspace(0,10,1000)[:, np.newaxis]
    log_dens = kde.score_samples(X)
    plt.plot(X, log_dens)
    plt.show()
Example #24
def surface_density(c, bandwidth=0.2, grid_step=0.02):
    """
    Given particle positions as a coordinate object, compute the
    surface density using a kernel density estimate.
    """

    if not HAS_SKLEARN:
        raise ImportError("scikit-learn is required to use this function.")

    xgrid = np.arange(2., 9.+0.1, grid_step) # deg
    ygrid = np.arange(26.5, 33.5+0.1, grid_step) # deg
    shp = (xgrid.size, ygrid.size)
    meshies = np.meshgrid(xgrid, ygrid)
    grid = np.vstack([m.ravel() for m in meshies]).T

    x = c.l.degree
    y = c.b.degree
    skypos = np.vstack((x,y)).T

    kde = KernelDensity(bandwidth=bandwidth, kernel='epanechnikov')
    kde.fit(skypos)

    dens = np.exp(kde.score_samples(grid)).reshape(meshies[0].shape)
    log_dens = np.log10(dens)

    return grid, log_dens
Example #25
def kdescatter(xs, ys, log_color=False, atol=1e-4, rtol=1e-4,
               n_jobs=1, n_samp_scaling=100, n_samp_tuning=1000, ax=None,
               **kwargs):
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt

    kwargs.setdefault('linewidths', 0)
    kwargs.setdefault('s', 20)
    kwargs.setdefault('cmap', 'winter')

    X = np.asarray([xs, ys]).T
    n = X.shape[0]
    samp_X = X[np.random.choice(n, min(n_samp_scaling, n), replace=False)]
    median_sqdist = np.median(euclidean_distances(samp_X, squared=True))
    bws = np.logspace(-2, 2, num=10) * np.sqrt(median_sqdist)
    est = GridSearchCV(KernelDensity(), {'bandwidth': bws}, n_jobs=n_jobs)
    est.fit(X[np.random.choice(n, min(n_samp_tuning, n), replace=False)])
    bw = est.best_params_['bandwidth']

    kde = KernelDensity(bandwidth=bw)
    kde.fit(X)
    densities = kde.score_samples(X)
    if not log_color:
        np.exp(densities, out=densities)
    ax.scatter(xs, ys, c=densities, **kwargs)
Example #26
def KDE_plt(categories,inter_arrivals):
    KDEs = []
    for i in range(0,len(categories)):

        X = np.asarray(extract_cat_samples(inter_arrivals,categories,i))#for single inter-arrivals in a category
        #X = np_matrix(categories[i][0])#for avg(inter-arrival)/person in a category
        kde = KernelDensity(kernel='gaussian', bandwidth=4).fit(X)
        KDEs.append(kde) #to use for prob_return()
        max_sample = max_interarrival_mean(categories,inter_arrivals,i)
        X_plot = np.linspace(0,1.5*max_sample,2000)[:, np.newaxis]
        log_dens = kde.score_samples(X_plot)

        plt.figure(i)
        plt.plot(X_plot[:, 0], np.exp(log_dens), '-',label="kernel = '{0}'".format('gaussian'))
            #plt.draw()
            #plt.pause(0.001)
        #plt.title("Non-Parametric Density Estimation for category=%s Visitors"%(i))
        plt.hist(combine_inner_lists(extract_cat_samples(inter_arrivals,categories,i)),bins=40,density=True,color="cyan",alpha=.3,label="histogram") #alpha, from 0 (transparent) to 1 (opaque)
       # plt.hist(np.asarray(categories[i][0]),bins=40,normed=1,color="cyan",alpha=.3,label="histogram") #alpha, from 0 (transparent) to 1 (opaque)
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        save_as='./app/static/img/cat_result/kde/kdeplt_cat'+str(i)+'.png' # dump result into kde folder
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(plt.figure(i))
    return KDEs
 def EstimateDensity(self,name,df,histogram,f,s,ax):
     # if the desired output is in Histogram format
     if(histogram):
         finRes = []
         lab = []
         for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 finRes.append(res)
                 lab.append(name[0]+ ' = ' + str(i))
         pl.hist(finRes, bins=2, density=True, histtype='bar', label=lab)
         
     # if the desired output is simple plot
     else:
             for i in range(5):
             res = np.array(df[ df[f] == i][s])
             if(res.shape[0]>0):
                 res = res.reshape(res.shape[0],1)
                 X_plot = np.array(np.linspace(-1, 5,20)).reshape(20,1)
                 kde= KernelDensity(kernel='exponential', bandwidth=0.05)
                 kde.fit(res)
                 log_dens = kde.score_samples(X_plot)
                 ax.plot(X_plot,np.exp(log_dens),label=name[0]+ ' = ' + str(i))        
     ax.legend()
     ax.set_title(name[1] + " distribution for changing " + name[0])
Example #28
    def kde(self, term, bandwidth=2000, samples=1000, kernel='gaussian'):

        """
        Estimate the kernel density of the instances of term in the text.

        Args:
            term (str): A stemmed term.
            bandwidth (int): The kernel bandwidth.
            samples (int): The number of evenly-spaced sample points.
            kernel (str): The kernel function.

        Returns:
            np.array: The density estimate.
        """

        # Get the offsets of the term instances.
        terms = np.array(self.terms[term])[:, np.newaxis]

        # Fit the density estimator on the terms.
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(terms)

        # Score an evenly-spaced array of samples.
        x_axis = np.linspace(0, len(self.tokens), samples)[:, np.newaxis]
        scores = kde.score_samples(x_axis)

        # Scale the scores to integrate to 1.
        return np.exp(scores) * (len(self.tokens) / samples)
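The trailing rescaling makes the returned densities sum to roughly 1 across the sample grid; a quick standalone check of that normalization (synthetic term offsets, illustrative only):

import numpy as np
from sklearn.neighbors import KernelDensity

n_tokens, samples = 100000, 1000
offsets = np.random.randint(0, n_tokens, size=200)[:, np.newaxis]
kde = KernelDensity(kernel='gaussian', bandwidth=2000).fit(offsets)
x_axis = np.linspace(0, n_tokens, samples)[:, np.newaxis]
density = np.exp(kde.score_samples(x_axis)) * (n_tokens / samples)
print(density.sum())  # close to 1, up to edge effects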
Example #29
def art_qi2(img, airmask, min_voxels=int(1e3), max_voxels=int(3e5), save_plot=True):
    r"""
    Calculates :math:`\text{QI}_2`, based on the goodness-of-fit of a centered
    :math:`\chi^2` distribution onto the intensity distribution of
    non-artifactual background (within the "hat" mask):


    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}x^{2n - 1}\, e^{-\frac{x}{2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts

    """

    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # S. Ogawa was born
    np.random.seed(1191935)

    data = img[airmask > 0]
    data = data[data > 0]

    # Write out figure of the fitting
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(data) < min_voxels:
        return 0.0, out_file

    modelx = data if len(data) < max_voxels else np.random.choice(
        data, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(data, 99), 1000)

    # Estimate data pdf with KDE on a random subsample
    kde_skl = KernelDensity(bandwidth=0.05 * np.percentile(data, 98),
                            kernel='gaussian').fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Find cutoff
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit X^2
    param = chi2.fit(modelx[modelx < np.percentile(data, 95)], 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Compute goodness-of-fit (gof)
    gof = float(np.abs(kde[-kdethi:] - chi_pdf[-kdethi:]).mean())
    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
Example #30
def test_kde_sample_weights():
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)
        for algorithm in ['auto', 'ball_tree', 'kd_tree']:
            for metric in ['euclidean', 'minkowski', 'manhattan',
                           'chebyshev']:
                if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
                    kde = KernelDensity(algorithm=algorithm, metric=metric)

                    # Test that adding a constant sample weight has no effect
                    kde.fit(X, sample_weight=weights_neutral)
                    scores_const_weight = kde.score_samples(test_points)
                    sample_const_weight = kde.sample(random_state=1234)
                    kde.fit(X)
                    scores_no_weight = kde.score_samples(test_points)
                    sample_no_weight = kde.sample(random_state=1234)
                    assert_allclose(scores_const_weight, scores_no_weight)
                    assert_allclose(sample_const_weight, sample_no_weight)

                    # Test equivalence between sampling and (integer) weights
                    kde.fit(X, sample_weight=weights)
                    scores_weight = kde.score_samples(test_points)
                    sample_weight = kde.sample(random_state=1234)
                    kde.fit(X_repetitions)
                    scores_ref_sampling = kde.score_samples(test_points)
                    sample_ref_sampling = kde.sample(random_state=1234)
                    assert_allclose(scores_weight, scores_ref_sampling)
                    assert_allclose(sample_weight, sample_ref_sampling)

                    # Test that sample weights has a non-trivial effect
                    diff = np.max(np.abs(scores_no_weight - scores_weight))
                    assert diff > 0.001

                    # Test invariance with respect to arbitrary scaling
                    scale_factor = rng.rand()
                    kde.fit(X, sample_weight=(scale_factor * weights))
                    scores_scaled_weight = kde.score_samples(test_points)
                    assert_allclose(scores_scaled_weight, scores_weight)
Example #31
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)

for i in range(2):
    plt.subplot(1, 2, i + 1)

    # construct a kernel density estimate of the distribution
    print(" - computing KDE in spherical coordinates")
    kde = KernelDensity(bandwidth=0.04,
                        metric='haversine',
                        kernel='gaussian',
                        algorithm='ball_tree')
    kde.fit(Xtrain[ytrain == i])

    # evaluate only on the land: -9999 indicates ocean
    Z = -9999 + np.zeros(land_mask.shape[0])
    Z[land_mask] = np.exp(kde.score_samples(xy))
    Z = Z.reshape(X.shape)

    # plot contours of the density
    levels = np.linspace(0, Z.max(), 25)
    plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)

    if basemap:
        print(" - plot coastlines using basemap")
        m = Basemap(projection='cyl',
                    llcrnrlat=Y.min(),
                    urcrnrlat=Y.max(),
                    llcrnrlon=X.min(),
                    urcrnrlon=X.max(),
                    resolution='c')
        m.drawcoastlines()
#kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'], 
kde = KernelDensity(bandwidth=1, 
                    kernel='gaussian')
kde.fit(Zarr[:, None])


### Plots
# Remove large values for ease of plotting
Zarr = Zarr[(Zarr < 100) & (Zarr > -100)]
x_d = np.linspace(-100,100,1000)
cfit = cauchy.pdf(x_d,loc=loc,scale=sca)
nfit = norm.pdf(x_d,loc=locnorm,scale=scanorm)
tfit = t.pdf(x_d,df=dft,loc=loct,scale=scat)

logprob_kde = kde.score_samples(x_d[:, None])

pdf_cmb_array = []
for x in x_d:
    pdf_cmb_array.append(1/ncomb * pdf_cmb(x))
#    pdf_cmb_array.append(pdf_cmb(x))

pdf_cmb_array = np.array(pdf_cmb_array)

_ = plt.hist(Zarr,bins=100,density=True,histtype='step')
plt.plot(x_d,cfit,'k-') # Cauchy fit
plt.plot(x_d,nfit,'k--') # Normal fit
#plt.plot(x_d,tfit,'k-.') # Student-t fit

plt.plot(x_d,pdf_cmb_array,'r--') # Mixture
plt.fill_between(x_d, np.exp(logprob_kde), alpha=0.5)
plt.savefig(output_dir + 'ws_mean_vs_mld_change.pdf')

plt.figure()
plt.scatter(mean_highest_max_ws[under_ice], mld_change[under_ice], c='k', s=2)
plt.xlabel(
    'Average of highest 48 hours of wind speeds between float profile pairs (m/s)'
)
plt.ylabel('MLD change (m)')
plt.savefig(output_dir + 'ws_highest_vs_mld_change.pdf')

plt.figure()
mld_axis = arange(-50, 50, 1)
kde_without_storms \
    = KernelDensity(kernel='gaussian',bandwidth=5.0)\
    .fit(mld_change[logical_and(under_ice,num_periods_with_storms == 0)].reshape(-1,1))
log_dens_without_storms = kde_without_storms.score_samples(
    mld_axis.reshape(-1, 1))
kde_with_storms \
    = KernelDensity(kernel='gaussian',bandwidth=5.0)\
    .fit(mld_change[logical_and(under_ice,num_periods_with_storms > 0)].reshape(-1,1))
log_dens_with_storms = kde_with_storms.score_samples(mld_axis.reshape(-1, 1))
plt.fill_between(mld_axis,
                 exp(log_dens_without_storms),
                 color='k',
                 alpha=0.5,
                 label='Zero storms',
                 zorder=2)
plt.fill_between(mld_axis,
                 exp(log_dens_with_storms),
                 color='r',
                 alpha=0.5,
                 label='One or more storms',
Example #34
class KDE(KernelDensity, BaseDetector):

    def __init__(self, bandwidth=1.0, algorithm='auto',
                 kernel='gaussian', metric="euclidean", atol=0, rtol=0, contamination=0.1,
                 breadth_first=True, leaf_size=40, metric_params=None, random_state=42):
        """Kernel density estimation (KDE)
        Parameters
        ----------
        bandwidth : float
            The bandwidth of the kernel.

        algorithm : str
            The tree algorithm to use.  Valid options are
            ['kd_tree'|'ball_tree'|'auto'].  Default is 'auto'.

        kernel : str
            The kernel to use.  Valid kernels are
            ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']
            Default is 'gaussian'.

        metric : str
            The distance metric to use.

        atol : float
            The desired absolute tolerance of the result.  A larger tolerance will
            generally lead to faster execution. Default is 0.

        rtol : float
            The desired relative tolerance of the result.

        breadth_first : bool
            If true (default), use a breadth-first approach to the problem.
            Otherwise use a depth-first approach.

        leaf_size : int
            Specify the leaf size of the underlying tree.

        metric_params : dict
            Additional parameters to be passed to the tree for use with the
            metric.
        """
        self.algorithm = algorithm
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.metric = metric
        self.atol = atol
        self.rtol = rtol
        self.breadth_first = breadth_first
        self.leaf_size = leaf_size
        self.metric_params = metric_params
        self.contamination = contamination
        self.random_state = random_state

        # run the choose algorithm code so that exceptions will happen here
        # we're using clone() in the GenerativeBayes classifier,
        # so we can't do this kind of logic in __init__
        self._choose_algorithm(self.algorithm, self.metric)

        if bandwidth <= 0:
            raise ValueError("bandwidth must be positive")
        if kernel not in VALID_KERNELS:
            raise ValueError("invalid kernel: '{0}'".format(kernel))

    def fit(self, X_train, y_train=None):
        """Fit KDE.

        Parameters
        ----------
        X_train : numpy array of shape (n_samples, n_features)
            The input samples.

        y_train : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self : object
            the fitted estimator.
        """
        X_train = _check_X(X_train)
        self.model_ = KernelDensity(bandwidth=self.bandwidth,
                                    algorithm=self.algorithm,
                                    kernel=self.kernel,
                                    metric=self.metric,
                                    atol=self.atol,
                                    rtol=self.rtol,
                                    breadth_first=self.breadth_first,
                                    leaf_size=self.leaf_size,
                                    metric_params=self.metric_params)

        self.model_.fit(X_train)

        return self

    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detector.
        After invert_order(), the higher the score, the more likely x is predicted to be abnormal.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        return invert_order(self.model_.score_samples(X))

    def predict_proba(self, X):
        raise NotImplementedError
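A minimal usage sketch for the KDE detector, assuming the PyOD-style helpers it references (invert_order, _check_X, BaseDetector) are importable alongside it (synthetic data, illustrative values):

import numpy as np

rng = np.random.RandomState(42)
X_train = rng.normal(size=(200, 2))
X_test = np.vstack([rng.normal(size=(5, 2)), [[6.0, 6.0]]])  # last row is an outlier

det = KDE(bandwidth=1.0).fit(X_train)
scores = det.decision_function(X_test)
print(scores)  # the outlier should get the largest (most abnormal) score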
bandwidths = 10**np.linspace(-1, 1, 200)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=5,
                    verbose=1)
grid.fit(Pdarr)

print('Best params:', grid.best_params_)

# Instantiate and fit the KDE model
print("Instantiate and fit the KDE model")
kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'],
                    kernel='gaussian')
kde.fit(Pdarr)

# Score_samples returns the log of the probability density
x_d = np.linspace(0, 100, 1000)
logprob = kde.score_samples(x_d[:, None])

### CORRECT DOMONKOS
for cat in ['SC012', 'EK6', 'AR3']:
    Ptorr = data.loc[data['cathode'] == cat, 'totalPressure']
    do = data.loc[data['cathode'] == cat, 'orificeDiameter'] * 0.1
    data.loc[data['cathode'] == cat, 'pressureDiameter'] = Ptorr * do

Pdarr_corr = np.array(data.pressureDiameter)
Pdarr_corr = Pdarr_corr[~np.isnan(Pdarr_corr)]

## KERNEL DENSITY
# Calculate best kernel density bandwidth
bandwidths = 10**np.linspace(-1, 1, 200)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
Example #36
def sample_and_visualize(tp):

    pos = pos_dat[:, :, tp]
    drn = dir_dat[:, :, tp]
    vel = vel_dat[:, tp]

    # convert to spherical coords
    sph_coords = appendSpherical_np(pos)
    theta = sph_coords[:, 4]  # latitude in [-pi/2, pi/2]
    phi = sph_coords[:, 5]  # longitude in [-pi, pi]

    # compute KD-Tree for fast neighbor search
    # note: KD-Tree does not support haversine metric
    kd = KDTree(pos)

    theta_idx = np.argsort(theta)
    theta_sorted = theta[theta_idx]
    phi_idx = np.argsort(phi)
    phi_sorted = phi[phi_idx]

    # compute velocity vector components
    # reference: https://stackoverflow.com/questions/707985/
    # calculate-3d-vector-perpendicular-to-plane-described-by-a-point-and-true-north-h
    origin = [0, 0, 0]
    north_vec = [0, 0, sph_rad]

    east_dir = np.cross(north_vec - pos,
                        pos - origin)  # points due east, tangent to the sphere
    north_dir = np.cross(east_dir, pos -
                         origin)  # points due north, tangent to the sphere

    east_dir = norm_vec_list(east_dir)
    north_dir = norm_vec_list(north_dir)

    vec_east = np.multiply(east_dir[:,0], drn[:,0]) + np.multiply(east_dir[:,1], drn[:,1]) + \
                np.multiply(east_dir[:,2], drn[:,2])
    vec_east = np.multiply(vel, vec_east)

    vec_north = np.multiply(north_dir[:,0], drn[:,0]) + np.multiply(north_dir[:,1], drn[:,1]) + \
                np.multiply(north_dir[:,2], drn[:,2])
    vec_north = np.multiply(vel, vec_north)

    # sampling mesh
    sample_theta = np.linspace(-np.pi / 2, np.pi / 2, 180)
    sample_phi = np.linspace(-np.pi, np.pi, 360)
    lats, lons = np.meshgrid(sample_theta, sample_phi)
    lats_flat = lats.ravel()
    lons_flat = lons.ravel()

    sample_X = sph_rad * np.multiply(np.sin(lats_flat + np.pi / 2),
                                     np.cos(lons_flat + np.pi))
    sample_Y = sph_rad * np.multiply(np.sin(lats_flat + np.pi / 2),
                                     np.sin(lons_flat + np.pi))
    sample_Z = sph_rad * np.cos(lats_flat + np.pi / 2)

    # KDE
    kde = KernelDensity(bandwidth=0.02, metric='haversine')
    kde.fit(np.vstack([theta, phi]).T,
            sample_weight=np.divide(1.0, num_agents))

    latlon = np.vstack([lats_flat, lons_flat]).T
    density_est = np.exp(kde.score_samples(latlon))
    density_est = density_est.reshape((360, 180))

    # compute velocity at sample points
    vel_east = np.zeros(np.size(sample_X))
    vel_north = np.zeros(np.size(sample_X))

    nearest_neighbor_dist = 0.005
    ind, dist = kd.query_radius(np.vstack([sample_X, sample_Y, sample_Z]).T,
                                nearest_neighbor_dist,
                                count_only=False,
                                return_distance=True)

    for i in range(np.size(sample_X)):

        num_neighbors = np.size(ind[i])
        vec_east_sum = np.sum(vec_east[ind[i]])
        vec_north_sum = np.sum(vec_north[ind[i]])

        if num_neighbors > 0:
            vel_east[i] = np.divide(vec_east_sum, num_neighbors)
            vel_north[i] = np.divide(vec_north_sum, num_neighbors)

    vel_east = vel_east.reshape((360, 180))
    vel_north = vel_north.reshape((360, 180))

    # shift indices by pi/2 and pi
    vel_east = np.roll(vel_east, (180, 0), axis=(0, 1))
    vel_east = np.fliplr(vel_east)
    vel_north = np.roll(vel_north, (180, 0), axis=(0, 1))
    vel_north = np.fliplr(vel_north)

    ###################################
    ## plot maps
    ###################################

    ytic_vals = np.array(list(np.linspace(0, 180, 6) - 90))
    xtic_vals = np.array(list(np.linspace(-180, 180, 10)))

    plt.figure(figsize=(8, 3 * 3), dpi=300)

    plt.subplot(311)
    plt.imshow(density_est.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(0, 1)

    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)

    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Local Density", fontsize=8)

    plt.subplot(312)
    plt.imshow(vel_north.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(-v_max, v_max)

    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)

    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Velocity (North)", fontsize=8)

    plt.subplot(313)
    plt.imshow(vel_east.T)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=6)
    plt.clim(-v_max, v_max)

    plt.yticks(np.linspace(0, 180, 6), ytic_vals.astype(int), fontsize=7)
    plt.xticks(np.linspace(0, 360, 10), xtic_vals.astype(int), fontsize=7)

    plt.xlabel("Longitude", fontsize=8)
    plt.ylabel("Latitude", fontsize=8)
    plt.title("Velocity (East)", fontsize=8)

    plt.subplots_adjust(wspace=0.2, hspace=0.4)

    plt.savefig(map_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## plot agents
    ###################################

    plot_quiver_flag = False

    R = sph_rad
    theta_val = theta_sorted[
        theta_idx.argsort()] + np.pi / 2  # latitude in [0, pi]
    phi_val = phi_sorted[phi_idx.argsort()] + np.pi  # longitude in [0, 2*pi]

    X_coord = R * np.sin(theta_val) * np.cos(phi_val)
    Y_coord = R * np.sin(theta_val) * np.sin(phi_val)
    Z_coord = R * np.cos(theta_val)

    # sphere parameterization
    u, v = np.mgrid[0:2 * np.pi:36j, 0:np.pi:18j]
    xs = R * np.cos(u) * np.sin(v)
    ys = R * np.sin(u) * np.sin(v)
    zs = R * np.cos(v)

    fig = plt.figure(figsize=(3, 3), dpi=300)
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    # plot empty plot, with points (without a line)
    points, = ax.plot([], [], [],
                      'ro',
                      markersize=0.3,
                      alpha=1.0,
                      fillstyle="full",
                      markerfacecolor="red",
                      markeredgecolor='red',
                      zorder=10)

    quivers = ax.quiver([], [], [], [], [], [],
                        color='lightblue',
                        linewidth=0.8,
                        normalize=False,
                        zorder=5)

    # set initial viewing angles
    azimuth, elev = 75, 21
    ax.set_xlim([-R, R])
    ax.set_ylim([-R, R])
    ax.set_zlim([-R, R])
    ax.view_init(elev, azimuth)

    plot_idx = plot_visible(azimuth, elev, points, X_coord, Y_coord, Z_coord,
                            quivers, drn[:, 0], drn[:, 1], drn[:, 2], vel,
                            plot_quiver_flag)

    if plot_quiver_flag:
        rndr = plt.gcf().canvas.get_renderer()
        quivers.draw(rndr)

    fig.canvas.draw_idle()

    ax.plot_surface(xs,
                    ys,
                    zs,
                    linewidth=0.1,
                    zorder=0,
                    edgecolor='gray',
                    color='white',
                    shade=False)

    plt.axis("off")
    plt.savefig(abm_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## plot overlays on sphere
    ###################################

    lats_vals = lats + np.pi / 2
    lons_vals = lons + np.pi

    x = R * np.sin(lats_vals) * np.cos(lons_vals)
    y = R * np.sin(lats_vals) * np.sin(lons_vals)
    z = R * np.cos(lats_vals)

    fig = plt.figure(figsize=(4 * 3, 3), dpi=300)

    # plot density

    ax1 = fig.add_subplot(1, 3, 1, projection='3d')

    points, = ax1.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)

    azimuth, elev = 75, 21
    ax1.set_xlim([-R, R])
    ax1.set_ylim([-R, R])
    ax1.set_zlim([-R, R])
    ax1.view_init(elev, azimuth)

    ls = LightSource(75, 0)
    rho_colors = ls.shade(density_est,
                          cmap=cm.viridis,
                          blend_mode='soft',
                          vert_exag=1)
    rho_plt = ax1.plot_surface(x,
                               y,
                               z,
                               rstride=1,
                               cstride=1,
                               linewidth=0,
                               edgecolor='white',
                               facecolors=rho_colors,
                               antialiased=False,
                               shade=True)

    cbar1 = fig.colorbar(rho_plt, ax=ax1, shrink=0.75)
    cbar1.ax.tick_params(labelsize=7)
    cbar1.mappable.set_clim(0, 1)

    plt.title("Local Density", fontsize=8)
    plt.axis("off")

    # plot velocity (north)

    ax2 = fig.add_subplot(1, 3, 2, projection='3d')

    points, = ax2.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)

    azimuth, elev = 75, 21
    ax2.set_xlim([-R, R])
    ax2.set_ylim([-R, R])
    ax2.set_zlim([-R, R])
    ax2.view_init(elev, azimuth)

    ls = LightSource(75, 0)
    vel_north_colors = ls.shade(vel_north,
                                cmap=cm.viridis,
                                blend_mode='soft',
                                vert_exag=1)
    vel_north_plt = ax2.plot_surface(x,
                                     y,
                                     z,
                                     rstride=1,
                                     cstride=1,
                                     linewidth=0,
                                     edgecolor='white',
                                     facecolors=vel_north_colors,
                                     antialiased=False,
                                     shade=True)

    cbar2 = fig.colorbar(vel_north_plt, ax=ax2, shrink=0.75)
    cbar2.ax.tick_params(labelsize=7)
    cbar2.mappable.set_clim(-v_max, v_max)

    plt.title("Velocity (North)", fontsize=8)
    plt.axis("off")

    # plot velocity (east)

    ax3 = fig.add_subplot(1, 3, 3, projection='3d')

    points, = ax3.plot([], [], [],
                       'ro',
                       markersize=1.5,
                       alpha=0.5,
                       fillstyle="full",
                       markerfacecolor="red",
                       markeredgecolor='none',
                       zorder=10)

    azimuth, elev = 75, 21
    ax3.set_xlim([-R, R])
    ax3.set_ylim([-R, R])
    ax3.set_zlim([-R, R])
    ax3.view_init(elev, azimuth)

    ls = LightSource(75, 0)
    vel_east_colors = ls.shade(vel_east,
                               cmap=cm.viridis,
                               blend_mode='soft',
                               vert_exag=1)
    vel_east_plt = ax3.plot_surface(x,
                                    y,
                                    z,
                                    rstride=1,
                                    cstride=1,
                                    linewidth=0,
                                    edgecolor='white',
                                    facecolors=vel_east_colors,
                                    antialiased=False,
                                    shade=True)

    cbar3 = fig.colorbar(vel_east_plt, ax=ax3, shrink=0.75)
    cbar3.ax.tick_params(labelsize=7)
    cbar3.mappable.set_clim(-v_max, v_max)

    plt.title("Velocity (East)", fontsize=8)
    plt.axis("off")

    # save figure

    plt.savefig(overlay_folder + os.sep + repr(tp).zfill(3) + ".png")
    plt.close()

    ###################################
    ## save results
    ###################################

    feat_vec = np.vstack([
        lats.ravel(),
        lons.ravel(),
        density_est.ravel(),
        vel_north.ravel(),
        vel_east.ravel()
    ]).T

    output_fname = npy_folder + os.sep + repr(tp).zfill(3) + ".npy"
    savedict = {'feat_vec': feat_vec, 'theta': sample_theta, 'phi': sample_phi}
    np.save(output_fname, savedict)

    output_fname = mat_folder + os.sep + repr(tp).zfill(3) + ".mat"
    sio.savemat(output_fname, savedict)
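
Reading these results back has a small gotcha: np.save pickles the dict, so the load needs allow_pickle=True and .item() to unwrap the 0-d object array. A minimal sketch (the file name is hypothetical):

import numpy as np

saved = np.load("000.npy", allow_pickle=True).item()
feat_vec = saved['feat_vec']  # columns: lat, lon, density, v_north, v_east
theta, phi = saved['theta'], saved['phi']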
Beispiel #37
0
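This fragment assumes X, an (n, 1) array of samples, plus the usual imports from earlier context; a minimal, hedged setup to make it self-contained could be:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

# hypothetical bimodal sample standing in for the undefined X
rng = np.random.RandomState(0)
X = np.concatenate([rng.normal(4, 1, 300),
                    rng.normal(12, 1.5, 200)])[:, np.newaxis]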
X_plot = np.linspace(-5, 20, 1000)[:, np.newaxis]

from collections import Counter
fig, ax = plt.subplots()
n, bins, patches = ax.hist(X, density=1)
c = Counter(X.ravel())  # was Counter(x); 'x' is undefined in this fragment
y = list(c.values())
#ax.plot(bins, y, '--')

colors = ['navy', 'cornflowerblue', 'darkorange']
kernels = ['gaussian', 'epanechnikov']  # zip() below stops at the shorter list
lw = 2

for color, kernel in zip(colors, kernels):
    kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0],
            np.exp(log_dens),
            color=color,
            lw=lw,
            linestyle='-',
            label="kernel = '{0}'".format(kernel))

ax.legend(loc='upper left')
ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

ax.set_xlim(2, 20)
ax.set_ylim(-0.02, 1)
plt.show()

from sklearn.neighbors import KernelDensity
Beispiel #38
0
def art_qi2(img,
            airmask,
            min_voxels=int(1e3),
            max_voxels=int(3e5),
            save_plot=True):
    r"""
    Calculates :math:`\text{QI}_2`, based on the goodness-of-fit of a centered
    :math:`\chi^2` distribution onto the intensity distribution of
    non-artifactual background (within the "hat" mask):


    .. math ::

        \chi^2_n = \frac{2}{(\sigma \sqrt{2})^{2n} \, (n - 1)!}x^{2n - 1}\, e^{-\frac{x^2}{2\sigma^2}}

    where :math:`n` is the number of coil elements.

    :param numpy.ndarray img: input data
    :param numpy.ndarray airmask: input air mask without artifacts

    """

    from sklearn.neighbors import KernelDensity
    from scipy.stats import chi2
    from mriqc.viz.misc import plot_qi2

    # S. Ogawa was born
    np.random.seed(1191935)

    data = img[airmask > 0]
    data = data[data > 0]

    # Write out figure of the fitting
    out_file = op.abspath('error.svg')
    with open(out_file, 'w') as ofh:
        ofh.write('<p>Background noise fitting could not be plotted.</p>')

    if len(data) < min_voxels:
        return 0.0, out_file

    modelx = data if len(data) < max_voxels else np.random.choice(
        data, size=max_voxels)

    x_grid = np.linspace(0.0, np.percentile(data, 99), 1000)

    # Estimate data pdf with KDE on a random subsample
    kde_skl = KernelDensity(bandwidth=0.05 * np.percentile(data, 98),
                            kernel='gaussian').fit(modelx[:, np.newaxis])
    kde = np.exp(kde_skl.score_samples(x_grid[:, np.newaxis]))

    # Find cutoff
    kdethi = np.argmax(kde[::-1] > kde.max() * 0.5)

    # Fit X^2
    param = chi2.fit(modelx[modelx < np.percentile(data, 95)], 32)
    chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])

    # Compute goodness-of-fit (gof)
    gof = float(np.abs(kde[-kdethi:] - chi_pdf[-kdethi:]).mean())
    if save_plot:
        out_file = plot_qi2(x_grid, kde, chi_pdf, modelx, kdethi)

    return gof, out_file
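
The heart of the measure (a KDE of the background intensities compared against a fitted chi-square curve) can be sketched standalone with synthetic data; this only illustrates the comparison, not mriqc's full pipeline:

import numpy as np
from scipy.stats import chi2
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
noise = rng.chisquare(df=32, size=20000)  # stand-in for background voxels

x_grid = np.linspace(0.0, np.percentile(noise, 99), 1000)
kde = KernelDensity(bandwidth=0.05 * np.percentile(noise, 98),
                    kernel='gaussian').fit(noise[:, np.newaxis])
dens = np.exp(kde.score_samples(x_grid[:, np.newaxis]))

param = chi2.fit(noise, 32)  # 32 is the starting guess for the df shape
chi_pdf = chi2.pdf(x_grid, *param[:-2], loc=param[-2], scale=param[-1])
print("mean |KDE - chi2|:", np.abs(dens - chi_pdf).mean())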
Beispiel #39
0
    def ShowSingleComponentVariation(self, X, listOfComponents=[0, 1]):
        #matplotlib.rcParams['font.size'] = 14

        showAsTraces = (np.shape(self.objectPixels)[0] == 1)
        assert (all([(x in range(self.numBasisFunctions))
                     for x in listOfComponents]))

        X_rep = self.RepresentUsingModel(X)

        percentilesToShow = [1, 10, 30, 70, 90, 99]
        numReadDataSamplePerPercentile = 4
        representationPercentiles = []
        for percentile in percentilesToShow:
            representationPercentiles.append(
                np.percentile(self.dataRepresentation, percentile, axis=0))
        medianRepVec = np.percentile(self.dataRepresentation, 50, axis=0)

        for eigVecInd in listOfComponents:
            plt.figure()
            gs = gridspec.GridSpec(numReadDataSamplePerPercentile + 2,
                                   len(percentilesToShow))

            # calculate the Gaussian smoothed distribution of values along the eigenvector direction
            sigmaOfKDE = 0.12
            pdfStart = min(self.dataRepresentation[:,
                                                   eigVecInd]) - 3 * sigmaOfKDE
            pdfStop = max(self.dataRepresentation[:,
                                                  eigVecInd]) + 3 * sigmaOfKDE
            xAxis = np.linspace(pdfStart, pdfStop, 200)
            PDF_Model = KernelDensity(
                kernel='gaussian', bandwidth=sigmaOfKDE).fit(
                    self.dataRepresentation[:, eigVecInd].reshape(-1, 1))
            logPDF = PDF_Model.score_samples(xAxis.reshape(-1, 1))
            percentileValuesToShow = [
                representationPercentiles[x][eigVecInd]
                for x in range(len(representationPercentiles))
            ]
            percentilesToShowLogPDF = PDF_Model.score_samples(
                np.array(percentileValuesToShow).reshape(-1, 1))

            # show distribution of current component and red dots at the percentiles to show
            plt.subplot(gs[0, :])
            plt.fill(xAxis, np.exp(logPDF), fc='b', alpha=0.9)
            plt.scatter(percentileValuesToShow,
                        np.exp(percentilesToShowLogPDF),
                        c='r',
                        s=300)
            plt.title(
                '%.3f%s explained' %
                (100 * self.PCAModel.explained_variance_ratio_[eigVecInd],
                 '%'))

            for plotCol, currPrecentile in enumerate(percentilesToShow):
                currPrecentileRepVec = medianRepVec.copy()
                currPrecentileRepVec[eigVecInd] = representationPercentiles[
                    plotCol][eigVecInd]

                currPrecentileImage = np.zeros(np.shape(self.objectPixels))
                currPrecentileImage[
                    self.objectPixels] = self.ReconstructUsingModel(
                        currPrecentileRepVec).ravel()

                # show the median image with the current percentile value
                # substituted for this component
                plt.subplot(gs[1, plotCol])
                if showAsTraces:
                    plt.plot(currPrecentileImage)
                    plt.title('percentile: ' +
                              str(percentilesToShow[plotCol]) + '%')
                elif np.shape(self.objectPixels)[2] == 3:
                    currPrecentileImage[currPrecentileImage > 1] = 1.0
                    currPrecentileImage[currPrecentileImage < 0] = 0.0
                    plt.imshow(currPrecentileImage)
                    plt.title('percentile: ' +
                              str(percentilesToShow[plotCol]) + '%')
                    plt.axis('off')
                else:
                    plt.imshow(currPrecentileImage, cmap='gray')
                    plt.title('percentile: ' +
                              str(percentilesToShow[plotCol]) + '%')
                    plt.axis('off')

                # find the most suitable candidates in X for the current percentile
                distFromPercentile = abs(
                    X_rep[:, eigVecInd] -
                    representationPercentiles[plotCol][eigVecInd])
                X_inds = np.argpartition(distFromPercentile,
                                         numReadDataSamplePerPercentile
                                         )[:numReadDataSamplePerPercentile]
                for k, X_ind in enumerate(X_inds):
                    currNearestPrecentileImage = np.zeros(
                        np.shape(self.objectPixels))
                    currNearestPrecentileImage[self.objectPixels] = X[
                        X_ind, :].ravel()

                    plt.subplot(gs[2 + k, plotCol])
                    if showAsTraces:
                        plt.plot(currNearestPrecentileImage)
                        plt.title('Close Neighbor')
                    else:
                        plt.imshow(currNearestPrecentileImage, cmap='gray')
                        plt.title('Close Neighbor')
                        plt.axis('off')
            plt.tight_layout()
Beispiel #40
0
    def ShowModelVariations(self, numVariations=6):
        #matplotlib.rcParams['font.size'] = 14

        showAsTraces = (np.shape(self.objectPixels)[0] == 1)
        numVariations = min(numVariations, self.numBasisFunctions)

        numVarsPerFigure = min(6, numVariations)
        numFigures = int(np.ceil(float(numVariations) / numVarsPerFigure))

        lowRepVec = np.percentile(self.dataRepresentation, 2, axis=0)
        medianRepVec = np.percentile(self.dataRepresentation, 50, axis=0)
        highRepVec = np.percentile(self.dataRepresentation, 98, axis=0)

        for figureInd in range(numFigures):
            plt.figure()
            for plotCol in range(numVarsPerFigure):
                eigVecInd = numVarsPerFigure * figureInd + plotCol
                if eigVecInd >= self.numBasisFunctions:
                    break

                # create the low and high percentile representation vectors
                currLowPrecentileRepVec = medianRepVec.copy()
                currLowPrecentileRepVec[eigVecInd] = lowRepVec[eigVecInd]
                currHighPrecentileRepVec = medianRepVec.copy()
                currHighPrecentileRepVec[eigVecInd] = highRepVec[eigVecInd]

                # create blank images
                deltaImage = np.zeros(np.shape(self.objectPixels))
                medianImage = np.zeros(np.shape(self.objectPixels))
                lowPrecentileImage = np.zeros(np.shape(self.objectPixels))
                highPrecentileImage = np.zeros(np.shape(self.objectPixels))

                # fill the object pixels with the relevant data
                deltaImage[self.objectPixels] = self.PCAModel.components_[
                    eigVecInd, :].ravel()
                lowPrecentileImage[
                    self.objectPixels] = self.ReconstructUsingModel(
                        currLowPrecentileRepVec).ravel()
                medianImage[self.objectPixels] = self.ReconstructUsingModel(
                    medianRepVec).ravel()
                highPrecentileImage[
                    self.objectPixels] = self.ReconstructUsingModel(
                        currHighPrecentileRepVec).ravel()

                # calculate the Gaussian smoothed distribution of values along the eigenvector direction
                sigmaOfKDE = 0.12
                pdfStart = min(
                    self.dataRepresentation[:, eigVecInd]) - 3 * sigmaOfKDE
                pdfStop = max(
                    self.dataRepresentation[:, eigVecInd]) + 3 * sigmaOfKDE
                xAxis = np.linspace(pdfStart, pdfStop, 200)
                PDF_Model = KernelDensity(
                    kernel='gaussian', bandwidth=sigmaOfKDE).fit(
                        self.dataRepresentation[:, eigVecInd].reshape(-1, 1))
                logPDF = PDF_Model.score_samples(xAxis.reshape(-1, 1))

                # show distribution of current component
                plt.subplot(5, numVarsPerFigure,
                            0 * numVarsPerFigure + plotCol + 1)
                plt.fill(xAxis, np.exp(logPDF), fc='b', alpha=0.9)
                plt.title(
                    '%.3f%s explained' %
                    (100 * self.PCAModel.explained_variance_ratio_[eigVecInd],
                     '%'))

                # show variance direction (eigenvector)
                plt.subplot(5, numVarsPerFigure,
                            1 * numVarsPerFigure + plotCol + 1)
                if showAsTraces:
                    plt.plot(deltaImage)
                    plt.title('eigenvector ' + str(eigVecInd))
                elif np.shape(self.objectPixels)[2] == 3:
                    deltaImage = 0.1 / deltaImage.std() * deltaImage + 0.5
                    deltaImage[deltaImage > 1] = 1.0
                    deltaImage[deltaImage < 0] = 0.0
                    plt.imshow(deltaImage)
                    plt.title('eigenvector ' + str(eigVecInd))
                    plt.axis('off')
                else:
                    plt.imshow(deltaImage)
                    plt.title('eigenvector ' + str(eigVecInd))
                    plt.axis('off')

                # show 2nd percentile image
                plt.subplot(5, numVarsPerFigure,
                            2 * numVarsPerFigure + plotCol + 1)
                if showAsTraces:
                    plt.plot(lowPrecentileImage)
                    plt.title('2nd percentile')
                elif np.shape(self.objectPixels)[2] == 3:
                    lowPrecentileImage[lowPrecentileImage > 1] = 1.0
                    lowPrecentileImage[lowPrecentileImage < 0] = 0.0
                    plt.imshow(lowPrecentileImage)
                    plt.title('2nd percentile')
                    plt.axis('off')
                else:
                    plt.imshow(lowPrecentileImage, cmap='gray')
                    plt.title('2nd percentile')
                    plt.axis('off')

                # show median image
                plt.subplot(5, numVarsPerFigure,
                            3 * numVarsPerFigure + plotCol + 1)
                if showAsTraces:
                    plt.plot(medianImage)
                    plt.title('median')
                else:
                    plt.imshow(medianImage, cmap='gray')
                    plt.title('median')
                    plt.axis('off')

                # show 98th percentile image
                plt.subplot(5, numVarsPerFigure,
                            4 * numVarsPerFigure + plotCol + 1)
                if showAsTraces:
                    plt.plot(highPrecentileImage)
                    plt.title('98th percentile')
                elif np.shape(self.objectPixels)[2] == 3:
                    highPrecentileImage[highPrecentileImage > 1] = 1.0
                    highPrecentileImage[highPrecentileImage < 0] = 0.0
                    plt.imshow(highPrecentileImage)
                    plt.title('98th percentile')
                    plt.axis('off')
                else:
                    plt.imshow(highPrecentileImage, cmap='gray')
                    plt.title('98th percentile')
                    plt.axis('off')
            plt.tight_layout()
Beispiel #41
0
def feature_distribution_plot_mult_modes(feats,
                                         feat_names,
                                         grp_col,
                                         grp_colors,
                                         grp_modes,
                                         ncol,
                                         plot_ranges=None,
                                         dens_num=100):
    """Creats a lattice of histogram plots for the given grouped features,
    where each group can have a different plot type.

    Args:
        feats: A dataframe containing the features (on the columns) and a
            grouping variable.
        feat_names: A list of strings of features to plot, matching the column 
            names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each
            group in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        grp_modes: A dict containing the plotting mode for each group. Each
            mode must be one of the following:
                'hist' - A histogram
                'dens' - A fitted density histogram
                'markers' - Scatter plot
        ncol: The number of columns (int) in the lattice plot.

    Returns:
        A plotly figure.

    """

    grps = np.unique(feats[grp_col])

    figs = []
    cnt = 0
    for f in feat_names:

        if cnt == 0:
            show_leg = True
        else:
            show_leg = False

        if plot_ranges is not None:
            layout = go.Layout(title=f,
                               titlefont=dict(size=10),
                               autosize=False,
                               xaxis=dict(range=plot_ranges[f]))
        else:
            layout = go.Layout(title=f,
                               titlefont=dict(size=10),
                               autosize=False)

        traces = []
        if plot_ranges is not None:
            f_min = plot_ranges[f][0]
            f_max = plot_ranges[f][1]
        else:
            f_min = min(feats.loc[:, f])
            f_max = max(feats.loc[:, f])
        xpts = np.linspace(f_min, f_max, dens_num)
        for g in grps:

            data = feats.loc[feats[grp_col] == g, f]

            if grp_modes[g] == 'dens':
                y = np.array(data)
                y = y[~np.isnan(y)]
                kde = KernelDensity(kernel='gaussian',
                                    bandwidth=0.2).fit(y[:, np.newaxis])
                log_dens = kde.score_samples(xpts[:, np.newaxis])
                plt = go.Scatter(x=xpts,
                                 y=np.exp(log_dens),
                                 mode='lines',
                                 line=dict(color=grp_colors[g], width=2),
                                 name=g,
                                 showlegend=show_leg)
            elif grp_modes[g] == 'markers':
                plt = go.Scatter(x=list(data),
                                 y=[1.0] * len(data),
                                 mode='markers',
                                 marker=dict(color=grp_colors[g], size=5),
                                 name=g,
                                 showlegend=show_leg)
            else:
                plt = go.Histogram(x=list(data),
                                   marker=Marker(color=grp_colors[g]),
                                   name=g,
                                   showlegend=show_leg)

            traces.append(plt)

        fig = go.Figure(data=traces, layout=layout)
        figs.append(fig)
        cnt = cnt + 1

    nrow = int(np.ceil(float(len(feat_names)) / float(ncol)))

    return subplot_helper_fig(nrow, ncol, figs)
Beispiel #42
0
def feature_distribution_plot(feats,
                              feat_names,
                              grp_col,
                              grp_colors,
                              ncol,
                              plot_ranges=None,
                              dens_est=False,
                              dens_num=100,
                              title_font=12,
                              all_show_leg=True,
                              lattice=True,
                              titles=None,
                              highlight_samps=[],
                              hl_clr='black'):
    """Creats a lattice of histogram plots for the given grouped features

    Args:
        feats: A dataframe containing the features (on the columns) and a
            grouping variable.
        feat_names: A list of strings of features to plot, matching the column 
            names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each
            group in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        ncol: The number of columns (int) in the lattice plot.

    Returns:
        A plotly figure.

    """
    grps = np.unique(feats[grp_col])

    figs = []
    cnt = 0
    for f in feat_names:

        if cnt == 0:
            show_leg = all_show_leg
        else:
            show_leg = False

        if titles is None:
            t = f
        else:
            t = titles[f]

        if plot_ranges is not None:
            layout = go.Layout(title=t,
                               titlefont=dict(size=10),
                               autosize=False,
                               xaxis=dict(range=plot_ranges[f]))
        else:
            layout = go.Layout(title=t,
                               titlefont=dict(size=10),
                               autosize=False)

        traces = []
        if plot_ranges is not None:
            f_min = plot_ranges[f][0]
            f_max = plot_ranges[f][1]
        else:
            f_min = 0.7 * min(feats.loc[:, f])
            f_max = 1.3 * max(feats.loc[:, f])
        xpts = np.linspace(f_min, f_max, dens_num)

        for g in grps:

            data = feats.loc[feats[grp_col] == g, f]
            y = np.array(data)
            y = y[~np.isnan(y)]
            #if (dens_est) & (len(y) > 0) &\
            #(len(np.unique(y)) > 3*len(y) / 4):
            if (dens_est) & (len(y) > 0):
                kde = KernelDensity(kernel='gaussian',
                                    bandwidth=0.2).fit(y[:, np.newaxis])
                log_dens = kde.score_samples(xpts[:, np.newaxis])
                plt = go.Scatter(x=xpts,
                                 y=np.exp(log_dens),
                                 mode='lines',
                                 line=dict(color=grp_colors[g], width=2),
                                 name=g,
                                 showlegend=show_leg)
                traces.append(plt)

            else:
                if len(y) > 0:
                    plt = go.Histogram(x=list(data),
                                       marker=Marker(color=grp_colors[g]),
                                       name=g,
                                       showlegend=show_leg)

                    traces.append(plt)

        if len(highlight_samps) > 0:
            for hs in highlight_samps:
                plt = go.Scatter(x=[feats.loc[hs, f]] * 2,
                                 y=[0.0, 1.0],
                                 line=dict(color=hl_clr, width=2),
                                 mode='lines')
                traces.append(plt)

        fig = go.Figure(data=traces, layout=layout)
        fig['layout'].update(titlefont=dict(size=title_font))
        figs.append(fig)
        cnt = cnt + 1

    if lattice:
        nrow = int(np.ceil(float(len(feat_names)) / float(ncol)))

        return subplot_helper_fig(nrow, ncol, figs)

    return figs
Beispiel #43
0
def plot_scatter(X, out_prefix, title, kde=True):
    """Draws a 2D scatter plot (png) of the core and accessory distances

    Also draws contours of the kernel density estimate

    Args:
        X (numpy.array)
            n x 2 array of core and accessory distances for n samples.
        out_prefix (str)
            Prefix for output plot file (.png will be appended)
        title (str)
            The title to display above the plot
        kde (bool)
            Whether to draw kernel density estimate contours

            (default = True)
    """
    # Plot results - max 1M for speed
    max_plot_samples = 1000000
    if X.shape[0] > max_plot_samples:
        X = utils.shuffle(X, random_state=random.randint(
            1, 10000))[0:max_plot_samples, ]

    # Kernel estimate uses scaled data 0-1 on each axis
    scale = np.amax(X, axis=0)
    X /= scale

    plt.figure(figsize=(11, 8), dpi=160, facecolor='w', edgecolor='k')
    if kde:
        xx, yy, xy = get_grid(0, 1, 100)

        # KDE estimate
        kde = KernelDensity(bandwidth=0.03,
                            metric='euclidean',
                            kernel='epanechnikov',
                            algorithm='ball_tree')
        kde.fit(X)
        z = np.exp(kde.score_samples(xy))
        z = z.reshape(xx.shape).T

        levels = np.linspace(z.min(), z.max(), 10)
        # Rescale contours
        plt.contour(xx * scale[0],
                    yy * scale[1],
                    z,
                    levels=levels[1:],
                    cmap='plasma')
        scatter_alpha = 1
    else:
        scatter_alpha = 0.1

    # Plot on correct scale
    plt.scatter(X[:, 0] * scale[0],
                X[:, 1] * scale[1],
                s=1,
                alpha=scatter_alpha)

    plt.title(title)
    plt.xlabel('Core distance (' + r'$\pi$' + ')')
    plt.ylabel('Accessory distance (' + r'$a$' + ')')
    plt.savefig(out_prefix + ".png")
    plt.close()
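
get_grid is not shown in this excerpt; given how xx, yy and xy are used above, a plausible (hypothetical) reconstruction would be:

import numpy as np

def get_grid(minimum, maximum, resolution):
    # square mesh plus the flattened (resolution**2, 2) coordinate
    # array that score_samples expects
    xx, yy = np.meshgrid(np.linspace(minimum, maximum, resolution),
                         np.linspace(minimum, maximum, resolution))
    xy = np.vstack([xx.ravel(), yy.ravel()]).T
    return xx, yy, xy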
Beispiel #44
0
from sklearn.neighbors import NearestCentroid
import numpy as np

# X is assumed to be the input from the matching sklearn docs example
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)

print(clf.predict([[-0.8, -1]]))

from sklearn.neighbors import KernelDensity
# note: the haversine metric expects (lat, lon) pairs in radians;
# a stray truncated argument in the original call has been dropped
kde = KernelDensity(bandwidth=0.04,
                    metric='haversine',
                    kernel='gaussian',
                    algorithm='ball_tree')

#kde.fit(Xtrain[ytrain == i])
kde.fit(X)
kde.score_samples(X)

from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

X, y = load_iris(return_X_y=True)
km = KMeans(n_clusters=5, random_state=1).fit(X)

dists = euclidean_distances(km.cluster_centers_)

import numpy as np
tri_dists = dists[np.triu_indices(5, 1)]
max_dist, avg_dist, min_dist = tri_dists.max(), tri_dists.mean(), tri_dists.min()

def get_kde(X_vals, dist, kernel='tophat', bandwidth=0.3):
    eps = 1e-15
    scores = np.array(dist)
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(
        np.log10(1 - scores + eps)[:, np.newaxis])
    log_dens = kde.score_samples(X_vals)
    return np.exp(log_dens)
def kl_divergence_error(y, y_hat):
    kd = KernelDensity(bandwidth=0.75).fit(y.reshape(-1, 1))
    yp = kd.score_samples(y.reshape(-1, 1))
    kd = KernelDensity(bandwidth=0.75).fit(y_hat.reshape(-1, 1))
    ypg = kd.score_samples(y_hat.reshape(-1, 1))
    # score_samples returns log-densities; entropy() expects (unnormalized)
    # probabilities, so exponentiate before computing the KL divergence
    return entropy(np.exp(yp), np.exp(ypg))
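
Neither helper above imports its dependencies in this excerpt; a small, self-contained usage sketch (all names and data synthetic):

import numpy as np
from scipy.stats import entropy
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
scores = rng.uniform(0.5, 0.999, 200)            # synthetic similarity scores
X_vals = np.linspace(-4, 0, 100)[:, np.newaxis]  # grid over log10(1 - score)
dens = get_kde(X_vals, scores)

a, b = rng.normal(0, 1, 500), rng.normal(0.2, 1, 500)
print("KL estimate:", kl_divergence_error(a, b))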
Beispiel #47
0
def main(num_dim,
         periodic_signal,
         bw,
         step,
         r_fname='training.mat',
         w_fname="proba.mat"):
    num_dim = int(num_dim)
    periodic_signal = int(periodic_signal)
    bw = float(bw)
    step = float(step)

    mat = scipy.io.loadmat(r_fname)
    keyto_concat = ['X_training', 'Y_training']
    to_concat = []
    for k in keyto_concat:
        print(mat[k].shape)
        to_concat.append(mat[k])
    to_concat[-1] = to_concat[-1][:, 0:num_dim]  # select coordinates of Y

    values = np.hstack(to_concat)
    print(values.shape)

    _, N = values.shape  # number of dimensions
    print('number of dimensions')
    print(N)
    mins = []
    maxs = []
    for i in range(N):
        min_tmp = values[:, i].min()
        max_tmp = values[:, i].max()
        delta = max_tmp - min_tmp
        max_tmp = max_tmp + delta / 10.
        min_tmp = min_tmp - delta / 10.
        mins.append(min_tmp)
        maxs.append(max_tmp)
    mins[0] = 0.
    maxs[0] = 1.

    print(values)

    # add the same values at X-1 and X+1 to make sure that the estimated pdf is for a periodic signal
    if periodic_signal:
        Xp = values[:, 0] + 1
        Xm = values[:, 0] - 1
        Y = values[:, 1:]

        to_concat = []
        to_concat.append(Xp)
        to_concat.append(Y)
        to_concat = np.column_stack(to_concat)
        to_concat2 = []
        to_concat2.append(to_concat)
        to_concat2.append(values)
        values = np.vstack(to_concat2)

        to_concat = []
        to_concat.append(Xm)
        to_concat.append(Y)
        to_concat = np.column_stack(to_concat)
        to_concat2 = []
        to_concat2.append(to_concat)
        to_concat2.append(values)
        values = np.vstack(to_concat2)

        print(values.shape)

    kde = KernelDensity(bandwidth=bw)
    kde.fit(values)

    # generate grid
    to_exec = ""
    to_exec += "np.mgrid["
    for i in np.arange(N):
        to_exec += "%f:%f:%fj," % (
            mins[i], maxs[i], step
        )  # select number of samples in each dimension
    to_exec = to_exec[:-1]
    to_exec += "]"
    print(to_exec)

    meshes = eval(to_exec)
    print(meshes.shape)

    size_grid = meshes[0].shape
    print('size_grid')
    print(size_grid)

    Z = np.vstack([X.reshape(1, X.size) for X in meshes]).transpose()
    print(Z.shape)

    # score_samples() returns the log-likelihood of the samples
    log_pdf = kde.score_samples(Z)
    probas = np.exp(log_pdf)

    print(probas.shape)
    probas = probas.transpose().reshape(size_grid)
    print(probas.shape)

    mdict = {'Proba_XY': probas}
    i = 1
    for X in meshes:
        mdict['X_%d' % i] = X
        i += 1
    print(mdict.keys())
    scipy.io.savemat(w_fname, mdict)

    proba_plot = probas
    for i in np.arange(N - 1, 1, -1):
        proba_plot = proba_plot.sum(i)
    plotcolormap(np.rot90(proba_plot),
                 extent=[mins[0], maxs[0], mins[1], maxs[1]])
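
The string-building plus eval used for the grid above also works without eval; a sketch of the same construction with meshgrid (step interpreted as a per-axis sample count, as in the original complex-step mgrid expression):

import numpy as np

def make_grid(mins, maxs, num):
    # one axis per dimension, num samples per axis, matrix ('ij') indexing,
    # matching np.mgrid[lo:hi:numj, ...]
    axes = [np.linspace(lo, hi, int(num)) for lo, hi in zip(mins, maxs)]
    meshes = np.meshgrid(*axes, indexing='ij')
    Z = np.vstack([m.reshape(1, m.size) for m in meshes]).T
    return meshes, Z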
Beispiel #48
0
def kde(data):
    kd = KernelDensity(kernel='tophat', bandwidth=0.02).fit(data)
    return kd.score_samples(data)
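
Two things worth remembering when calling this helper (assuming KernelDensity has been imported from sklearn.neighbors): fit expects a 2D array, and score_samples returns log-densities. For example:

import numpy as np

data = np.random.RandomState(0).rand(100, 1)  # shape (n_samples, n_features)
log_dens = kde(data)
dens = np.exp(log_dens)  # back to densities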
Beispiel #49
0
# Note that the first minimum is not universally the best threshold;
# sometimes the second minimum works better!
print("Positions of the minima: ", min_vals)
print("Suggested threshold is the position of the first minimum: ",
      min_vals[0])
print(
    "Please verify with the graph. There is a chance subsequent minima may work better."
)
print("Elapsed time in seconds:", round(10.0 * (end - beg)) / 10.0)

# sklearn, with similar results
if use_sklearn:
    beg2 = time.time()
    kernel = 'gaussian'
    kde2 = KernelDensity(kernel=kernel, bandwidth=10).fit(data[:, np.newaxis])
    log_dens = kde2.score_samples(xvals[:, np.newaxis])
    yvals2 = np.exp(log_dens).reshape(-1)
    min_pos2 = argrelextrema(yvals2, np.less)
    min_vals2 = xvals[min_pos2]
    end2 = time.time()
    print("Elapsed time for sklearn kernel estimation in seconds:",
          round(10.0 * (end2 - beg2)) / 10.0)
    print("Suggested threshold is the position of the first minimum2: ",
          min_vals2[0])
    print("Positions of the minima2: ", min_vals2)

# Plot the kernel-density estimate and highlight the minima
if not options.no_plot:
    plt.figure(1)
    plt.hist(data, bins=100, density=True, label="Data histogram")
    plt.plot(xvals, yvals, label="KDE", c="red")
Beispiel #50
0
def run_benchmark(df_path,
                  n,
                  numScore,
                  tol,
                  cols,
                  bwValue=None,
                  bwMult=1.0,
                  denorm=False,
                  use_std=False):
    params = {
        "algorithm": "sklearn",
        "dataset": df_path,
        "dim": len(cols),
        "num_train": n,
        "num_test": numScore,
        "train_time": None,
        "test_time": None,
        # "num_kernels": None
    }
    print(params)
    data = pd.read_csv(df_path)[cols].iloc[:n].values

    trainstart = time.time()
    if bwValue is None:
        bw = bwMult * estimate_kde_bw(data, use_std=use_std)
        print("BW: {}".format(bw))
    else:
        bw = bwValue * np.ones(len(cols))
        print("BW: {}".format(bwValue))
    if numScore is None:
        numScore = len(data)

    internal_bw = 1
    if denorm:
        internal_bw = 1.0 / (math.sqrt(2 * math.pi))
    scaled_data = (data / bw) * internal_bw

    # Normalized Computations
    kde = KernelDensity(
        bandwidth=internal_bw,
        kernel='gaussian',
        algorithm='kd_tree',
        rtol=tol,
    )
    kde.fit(scaled_data)
    train_time = time.time() - trainstart
    params["train_time"] = 1000 * train_time
    print("Trained in {}".format(train_time), flush=True)

    scorestart = time.time()
    scores = np.exp(kde.score_samples(scaled_data[:numScore]))
    score_time = time.time() - scorestart
    params["test_time"] = 1000 * score_time
    print("Scored in {}".format(score_time), flush=True)
    print("Rate: {}".format(numScore / score_time))

    self_density = get_self_density(data.shape[1], data.shape[0])
    scores_minus_self = scores - self_density

    # scale scores back
    if denorm:
        final_scores = scores_minus_self
    else:
        final_scores = scores_minus_self / np.prod(bw)

    q = np.percentile(final_scores, 1.0)
    print("Quantile: {}".format(q))
    print("Final Output:")
    print(params)
    return final_scores
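
The scaling trick above (divide the data by per-dimension bandwidths, run a unit-bandwidth kernel, then divide the densities by prod(bw)) can be checked directly against a hand-rolled product-Gaussian KDE; a small sketch with synthetic data:

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
data = rng.normal(size=(500, 2))
bw = np.array([0.3, 0.7])  # per-dimension bandwidths

# scaled-data KDE with unit internal bandwidth, rescaled afterwards
kde = KernelDensity(bandwidth=1.0, kernel='gaussian').fit(data / bw)
dens = np.exp(kde.score_samples(data[:1] / bw)) / np.prod(bw)

# manual product-Gaussian KDE at the same query point
diffs = (data - data[0]) / bw
k = np.exp(-0.5 * np.sum(diffs ** 2, axis=1)) / (2 * np.pi) ** (len(bw) / 2)
print(dens[0], k.mean() / np.prod(bw))  # the two values should agree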
Beispiel #51
0
alpha_cm = plt.cm.Reds
alpha_cm._init()
# ramp the alpha channel so low densities fade to transparent
alpha_cm._lut[:-3, -1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)

sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(
    ['Date', 'Trap', 'Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)

xv, yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(), yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100, 100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
plt.imshow(zv,
           origin='lower',
           cmap=alpha_cm,
           extent=lon_lat_box,
           aspect=aspect)

locations = traps[['Longitude', 'Latitude']].values
plt.scatter(locations[:,0], locations[:,1], marker='x')

plt.savefig('heatmap.png')
Beispiel #52
0
    print(samples.median())
    print(samples.std())

    plt.hist(samples['mu_1'], bins=15)
    plt.hist(samples['mu_2'], bins=15)
    plt.grid()
    plt.show()

    mu1_list = matrix[:, 0]
    mu2_list = matrix[:, 1]
    kde1 = KernelDensity(kernel='gaussian',
                         bandwidth=0.5).fit(mu1_list.reshape(-1, 1))
    kde2 = KernelDensity(kernel='gaussian',
                         bandwidth=0.5).fit(mu2_list.reshape(-1, 1))
    x_plot = np.linspace(0, 10, len(mu1_list)).reshape(-1, 1)
    prob1 = np.exp(kde1.score_samples(x_plot))
    prob2 = np.exp(kde2.score_samples(x_plot))
    plt.plot(x_plot, prob1)
    plt.plot(x_plot, prob2)
    plt.legend(['mu1', 'mu2'])
    plt.ylabel('density')
    plt.title('Kernel Density')
    plt.grid()
    plt.show()

    mixture = []
    for i in range(len(mu1_list)):
        mixture.append(max(prob1[i], prob2[i]))
    plt.plot(x_plot, mixture)
    plt.hist(samples['mu_1'], density=True, bins='fd')
    plt.hist(samples['mu_2'], density=True, bins='fd')
Beispiel #53
0
def plot(X:'array', threshold_freq:float = 0.0001, nbins:int = None, figsize:tuple = (15,8), supply:bool = False)->tuple:
    """
    Plot a histogram with densities estimated by KDE using different kernels.
    X -- 2D array of data with shape (n, 1).
    threshold_freq -- frequency (probability) threshold below which a spike is discarded as a local maximum (default 0.0001).
    nbins -- number of bins (default None). If omitted, it is estimated from the data.
    figsize -- figure size (default (15, 8)).
    supply -- whether to also return the ax object (default False).
    return -- ((x/y values of histogram), (x/y values of density for gaussian kernel), list of positions of the local maximums)

    NOTE - if supply = True, the axis object ax is returned as well.
    """

    # estimate x limits
    add = truncate(np.ptp(X)) * 0.5 / 10.
    xmin = truncate(np.min(X)) - add if truncate(np.min(X)) < 0 else truncate(np.min(X)) + add
    xmax = truncate(np.max(X)) - add if truncate(np.max(X)) < 0 else truncate(np.max(X)) + add

    # estimate local maximum
    steps_avg = local_maximums_kde_gaussian(X.copy(), threshold_freq) 
    #print('local maximums (gaussian): %s'%steps_avg)

    # number of bins (rule of thumb: R * n^(1/3) / (3.49 * sigma));
    # the original divided by 3.49 and then multiplied by sigma
    if nbins is None:
        R = truncate(np.ptp(X))
        n = len(X)
        sigma = np.nanstd(X)
        nbins = truncate((R * n ** (1.0 / 3.0)) / (3.49 * sigma))
    bins = np.linspace(truncate(np.min(X)), truncate(np.max(X)), nbins)

    # x for plot
    X_plot = np.linspace(truncate(np.min(X)), truncate(np.max(X)), 1000)[:, np.newaxis]

    # create fig/axes
    fig, ax = plt.subplots(figsize = figsize)
    # plot the input data distribution
    h_y, h_x, _ = ax.hist(X , density = True, bins = bins, color = 'grey', label = 'input distribution', alpha = 0.2)
    # settings
    colors = ['cornflowerblue', 'darkorange', 'navy']
    kernels = ['tophat', 'epanechnikov', 'gaussian']
    lw = 2
    # calculate kde and plot
    for color, kernel in zip(colors, kernels):
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), color=color, lw=lw,
                linestyle='-', label="kernel = '{0}'".format(kernel))
        # store gaussian results
        if kernel == 'gaussian':
            l_y = np.exp(log_dens)
            l_x = X_plot[:, 0]
    # set legend
    ax.legend(loc='upper left')
    # plot points on the bottom
    if len(X) < 10000:
        ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
    # plot estimated local maximums
    for avg in steps_avg:
        ax.axvline(avg, color='k', linestyle='--')
    # set chart limits
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(-0.02, np.max(h_y) + 0.05)
    # set title
    ax.set_title("%s points / %s bins"%(len(X), nbins))
    # set labels
    ax.set_ylabel("freq")
    # display / return
    if supply:
        return ((h_x, h_y), (l_x, l_y), steps_avg, ax)
    else:
        plt.show()
        return ((h_x, h_y), (l_x, l_y), steps_avg)
Beispiel #54
0
# NOTE: this snippet is truncated in the source; df1 was presumably built
# like df2 below, over index range(0, 1000)
df1 = train_in.reindex(index=range(0, 1000),
                       columns=list([
                           'x_ 1', 'x_ 2', 'x_ 3', 'x_ 4', 'x_ 5', 'x_ 6',
                           'x_ 7', 'x_ 8', 'x_ 9', 'x_11', 'x_12'
                       ]))
df2 = train_in.reindex(index=range(0, 5000),
                       columns=list([
                           'x_ 1', 'x_ 2', 'x_ 3', 'x_ 4', 'x_ 5', 'x_ 6',
                           'x_ 7', 'x_ 8', 'x_ 9', 'x_11', 'x_12'
                       ]))

df5 = train_in.reindex(index=range(0, 1000),
                       columns=list(['x_ 10', 'x_ 13', 'x_ 14']))
df6 = train_in.reindex(index=range(0, 5000),
                       columns=list(['x_ 10', 'x_ 13', 'x_ 14']))

df3 = train_in.reindex(index=range(0, 2500), columns=list(train_in.columns))
df4 = train_in.reindex(index=range(0, 5000), columns=list(train_in.columns))

#print(len(df1))
#print(len(df2))
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(df3)
log_dens = kde.score_samples(df4)

print(log_dens)
k = np.arange(1, 5001)
df = pd.DataFrame({
    'Point_ID': k,
    'Output': log_dens
},
                  index=None,
                  columns=['Point_ID', 'Output'])

df.to_csv('test_out1.csv', index=False)
Beispiel #55
0
def KDE(x, y):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(x)
    score = sum(kde.score_samples(y))  # y[:, ] in the original was a no-op slice
    return score
Beispiel #56
0
 def displayFamiliesDistribution(self, directory, label=None):
     if label is None:
         self.displayFamiliesDistribution(directory,
                                          label=labels_tools.MALICIOUS)
         # the original passed MALICIOUS twice; the second call is
         # presumably meant to cover the benign families
         self.displayFamiliesDistribution(directory,
                                          label=labels_tools.BENIGN)
         return
     families = self.families[labels_tools.labelBooleanToString(label)]
     bandwidth = 0.1
     num_points = 50
     eps = 0.00001
     kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
     fig, (ax) = plt.subplots(1, 1)
     i = 0
     for family in families:
         predictions = families[family]
         predictions_np = np.asarray(predictions)
         if i % 3 == 0:
             linestyle = 'solid'
         elif i % 3 == 1:
             linestyle = 'dashed'
         else:
             linestyle = 'dotted'
         linewidth = 2
         if np.var(predictions_np) < eps:
             linewidth = 4
             mean = np.mean(predictions_np)
             x = np.arange(0, 1, 0.1)
             x = np.sort(np.append(x, [mean, mean - eps, mean + eps]))
             density = [1 if v == mean else 0 for v in x]
         else:
             density_predictions = [[x] for x in predictions_np]
             kde.fit(density_predictions)
             # Computes the x axis
             p_max = np.amax(predictions_np)
             p_min = np.amin(predictions_np)
             delta = p_max - p_min
             density_delta = 1.1 * delta
             x = np.arange(0, 1, density_delta / num_points)
             x_density = [[y] for y in x]
             # kde.score_samples returns the 'log' of the density
             log_density = kde.score_samples(x_density).tolist()
             density = list(map(math.exp, log_density))
         ax.plot(x,
                 density,
                 label=family,
                 linewidth=linewidth,
                 linestyle=linestyle)
         fig_f, (ax_f) = plt.subplots(1, 1)
         ax_f.plot(x,
                   density,
                   linewidth=4,
                   color=colors_tools.getLabelColor(label))
         ax_f.set_title(family)
         ax_f.set_xlabel('P(Malicious)')
         ax_f.set_ylabel('Density')
         filename = label + '_family_' + family + '_prediction_distributions.png'
         fig_f.savefig(path.join(directory, filename))
         plt.close(fig_f)
         i += 1
     ax.legend(bbox_to_anchor=(0., 0.95, 1., .102),
               loc=3,
               ncol=5,
               mode='expand',
               borderaxespad=0.,
               fontsize='xx-small')
     ax.set_xlabel('P(Malicious)')
     ax.set_ylabel('Density')
     filename = label + '_families_prediction_distributions.png'
     fig.savefig(path.join(directory, filename))
     plt.close(fig)
Beispiel #57
0
class gaussian_kde_wrapper(object):
    def __init__(self,
                 hyperparameter,
                 param_name,
                 data,
                 oob_strategy='resample',
                 bandwidth=0.4):  # 'bandwith' typo fixed throughout
        if oob_strategy not in ['resample', 'round', 'ignore']:
            raise ValueError()
        self.oob_strategy = oob_strategy
        self.param_name = param_name
        self.hyperparameter = hyperparameter
        reshaped = np.reshape(data, (len(data), 1))

        if self.hyperparameter.log:
            if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
                # self.probabilities = {val: self.distrib.pdf(np.log2(val)) for val in range(self.hyperparameter.lower, self.hyperparameter.upper)}
                raise ValueError(
                    'Log Integer hyperparameter not supported: %s' %
                    param_name)
            # self.distrib = gaussian_kde(np.log2(data))
            # self.distrib = KernelDensity(kernel='gaussian').fit(np.log2(np.reshape(data, (len(data), 1))))
            self.distrib = KernelDensity(kernel='gaussian',
                                         bandwidth=bandwidth).fit(
                                             np.log2(reshaped))
        else:
            # self.distrib = gaussian_kde(data)
            self.distrib = KernelDensity(kernel='gaussian',
                                         bandwidth=bandwidth).fit(reshaped)
        pass

    def pdf(self, x):
        x = np.reshape(x, (len(x), 1))
        if self.hyperparameter.log:
            x = np.log2(x)
        log_dens = self.distrib.score_samples(x)
        return np.exp(log_dens)

    def rvs(self, *args, **kwargs):
        # assumes a samplesize of 1, for random search
        while True:
            sample = self.distrib.sample(
                n_samples=1, random_state=kwargs['random_state'])[0][0]
            if self.hyperparameter.log:
                value = np.power(2, sample)
            else:
                value = sample
            if isinstance(self.hyperparameter, UniformIntegerHyperparameter):
                value = int(round(value))

            if self.hyperparameter.lower <= value <= self.hyperparameter.upper:
                return value
            elif self.oob_strategy == 'ignore':
                # TODO: hacky fail safe for some hyperparameters
                if hasattr(self.hyperparameter, 'lower_hard'
                           ) and self.hyperparameter.lower_hard > value:
                    continue
                if hasattr(self.hyperparameter, 'upper_hard'
                           ) and self.hyperparameter.upper_hard < value:
                    continue

                return value
            elif self.oob_strategy == 'round':
                if value < self.hyperparameter.lower:
                    return self.hyperparameter.lower
                elif value > self.hyperparameter.upper:
                    return self.hyperparameter.upper
Beispiel #58
0
def exe_kde(x_lon, y_lat, year, month, violation_code, kernel, bandwidth, metric):

    # Build shp
    shp = get_shp()

    # Build map
    fig = plt.figure(dpi = 1000)
    ax = fig.add_subplot(111)
    ax.axis('off')

    map = Basemap(projection = 'cyl',
                  resolution = 'h',
                  lat_0 = 43.0389025,
                  lon_0 = -87.9064736,
                  llcrnrlon = -88.080736,
                  llcrnrlat = 42.917670,
                  urcrnrlon = -87.839722,
                  urcrnrlat = 43.19712)

    map.readshapefile(shp, name = 'mke_nbhd')

    patches_mke_nbhd = []

    for info, shape in zip(map.mke_nbhd_info, map.mke_nbhd):
        if info['NEIGHBORHD'] != None:
            patches_mke_nbhd.append(Polygon(np.array(shape), True))

    ax.add_collection(PatchCollection(patches_mke_nbhd,
                                      edgecolor = '#000000',
                                      facecolor = '#bfbfbf',
                                      linewidths = 0.45,
                                      zorder = 5))

    # Build KDE
    k, m, kde_bw = kernel.lower(), metric.lower(), float(bandwidth)
    xy = np.stack([x_lon, y_lat])
    d, n = xy.shape[0], xy.shape[1]
    kde = KernelDensity(kernel = k,
                        bandwidth = kde_bw,
                        metric = m)
    kde.fit(xy.T)
    xmin, xmax, ymin, ymax = -88.080736, -87.839722, 42.917670, 43.19712

    # For all intents and purposes, this is grid size; in other words, this affects
    # resolution of the density plots
    X, Y = np.mgrid[xmin:xmax:1000j, ymin:ymax:1000j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    Z = np.reshape(np.exp(kde.score_samples(positions.T)), X.shape)

    # Build save
    cmap = colors.ListedColormap(['#ffffff',
                                  '#ffebeb',
                                  '#ffd8d8',
                                  '#ffc4c4',
                                  '#ffb1b1',
                                  '#ff9d9d',
                                  '#ff8a8a',
                                  '#ff7676',
                                  '#ff6262',
                                  '#ff4f4f',
                                  '#ff3b3b',
                                  '#ff2828',
                                  '#ff1414'])
    plt.imshow(np.rot90(Z),
               cmap = cmap,
               extent = [xmin, xmax, ymin, ymax],
               alpha = 0.5,
               zorder = 10)
               
    plt.scatter(x_lon,
                y_lat,
                c = '#0000ff',
                s = 1.75,
                alpha = 0.5,
                linewidths = 0,
                edgecolors = None,
                zorder = 15)

    s_yr, s_mo, s_vc = str(year), month.lower()[:3], str(violation_code)
    s_ke, s_bw, s_me = kernel.lower()[:3], str(kde_bw).replace('.', ''), metric.lower()[:3]
    plot_path_long = path.plot_path_long()
    plt.savefig(plot_path_long + '%s%s%s%s%s%s.png' % (s_yr, s_mo, s_vc, s_ke, s_bw, s_me),
                bbox_inches = 'tight',
                dpi = 1000)
Beispiel #59
0
def feature_distribution_plot_users(feats,
                                    users,
                                    feat_names,
                                    grp_col,
                                    grp_colors,
                                    dens_est=False,
                                    dens_num=50):
    """Creates a lattice of histogram plots for grouped features for many users.
    
    Each column contains the histogram feature plots for a single user.

    Args:
        feats: A dictionary containing the features (on the columns) and a
            grouping variable for each user.
        users: A subset of the keys of 'feats' to plot
        feat_names: A list of strings of features to plot, matching the column 
            names in feats.
        grp_col: The name (string) of the column in feats used for grouping.
        grp_colors: A dict containing the colors (rgb strings) for each
            group in the grouping column. (E.g. {'group1':'rgb(228,26,28)'}).
        dens_est: A boolean indicating whether or not to show a density
            estimation plot instead of a histogram (the number of columns
            is derived from the number of users).

    Returns:
        A plotly figure.

    """

    grps = np.unique(feats[users[0]][grp_col])

    figs = []
    cnt = 0
    user_plt_cnt = 0
    for f in feat_names:

        vals = []
        for u in users:
            vals.extend(feats[u][f])
        fmin = np.nanpercentile(vals, 2)
        fmax = np.nanpercentile(vals, 98)
        #fmin = min([min(feats[u][f]) for u in users])
        #fmax = max([max(feats[u][f]) for u in users])

        feat_plot_cnt = 0
        for u in users:

            if cnt == 0:
                show_leg = True
            else:
                show_leg = False

            if user_plt_cnt == 0:
                plt_title = u
            else:
                plt_title = ''

            if feat_plot_cnt == 0:
                y_title = f
            else:
                y_title = ''

            layout = go.Layout(title=plt_title,
                               titlefont=dict(size=10),
                               yaxis=dict(title=y_title,
                                          titlefont=dict(size=18),
                                          color='black'),
                               xaxis=dict(range=[fmin, fmax]),
                               autosize=False)

            f_min = min(feats[u].loc[:, f])
            f_max = max(feats[u].loc[:, f])
            xpts = np.linspace(f_min, f_max, dens_num)
            traces = []
            for g in grps:

                data = feats[u].loc[feats[u][grp_col] == g, f]
                if dens_est:
                    y = np.array(data)
                    y = y[~np.isnan(y)]
                    kde = KernelDensity(kernel='gaussian',
                                        bandwidth=0.75).fit(y[:, np.newaxis])
                    log_dens = kde.score_samples(xpts[:, np.newaxis])
                    plt = go.Scatter(x=xpts,
                                     y=np.exp(log_dens),
                                     mode='lines',
                                     line=dict(color=grp_colors[g], width=2),
                                     name=g,
                                     showlegend=show_leg)
                else:

                    plt = go.Histogram(x=list(data),
                                       marker=Marker(color=grp_colors[g]),
                                       name=g,
                                       showlegend=show_leg)
                traces.append(plt)

            fig = go.Figure(data=traces, layout=layout)
            figs.append(fig)
            cnt = cnt + 1
            feat_plot_cnt = feat_plot_cnt + 1
        user_plt_cnt = user_plt_cnt + 1

    # Lay out the lattice as one column per user and one row per feature
    # (this overrides any ncol argument passed in).
    ncol = len(users)
    nrow = len(feat_names)

    return subplot_helper_fig(nrow, ncol, figs)
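
A minimal usage sketch (assuming pandas DataFrames keyed by user id; the
name feature_hist_lattice is a stand-in for the truncated function name
above, and the feature and group names are made up):

import numpy as np
import pandas as pd

# Two hypothetical users, each with one feature and a grouping column.
rng = np.random.RandomState(0)
feats = {u: pd.DataFrame({'hr_mean': rng.normal(mu, 1.0, 200),
                          'grp': rng.choice(['active', 'rest'], 200)})
         for u, mu in [('user_a', 60.0), ('user_b', 70.0)]}

fig = feature_hist_lattice(feats,
                           users=['user_a', 'user_b'],
                           feat_names=['hr_mean'],
                           grp_col='grp',
                           grp_colors={'active': 'rgb(228,26,28)',
                                       'rest': 'rgb(55,126,184)'},
                           dens_est=True,
                           dens_num=50)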
Beispiel #60
0
def visualize(network, title, pos):
    """
    Visualize the network on a world map, overlaying a kernel density
    estimate of the infected nodes. Node positions are read from the
    'lon'/'lat' node attributes; 'title' and 'pos' are currently unused.
    """
    print("-- Starting to Visualize --")
    
    colors = []
    colori = []
    i_edge_colors = []
    d_edge_colors = []
    default = []
    infected = []
    nstart = []
    ninfect = []
    
    # Partition the nodes by color: "#A0C8F0" (light blue) marks the
    # starting population; green ("#30cc1f"), red, and purple mark nodes
    # touched by the infection.
    for node in network.nodes():
        colorn = network.nodes[node]["color"]
        if colorn == "#A0C8F0":
            nstart.append(node)
            colors.append(network.nodes[node]["color"])
        elif colorn == "#30cc1f" or colorn == "red" or colorn == "purple":
            ninfect.append(node)
            colori.append(network.nodes[node]["color"])            
            
    # Classify the edges by the color of their source node: edges leaving
    # a red node are drawn red, all others gray.
    for i, j in network.edges():
        color = network.nodes[i]["color"]
        if color == "#A0C8F0" or color == "#30cc1f" or color == "purple":
            color = "#A6A6A6"
            default.append((i, j))
            d_edge_colors.append(color)
        else:
            color = "red"
            infected.append((i, j))
            i_edge_colors.append(color)
            
            
    plt.figure(figsize=(30,20))
    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.coastlines()
    
    # Node positions (lon, lat) for drawing the network.
    node_positions = {n: (float(d['lon']), float(d['lat']))
                      for n, d in network.nodes(data=True)}

    # Make a density plot of the infection: collect the coordinates of
    # currently infected nodes.
    xp = []
    yp = []
    for node in network.nodes():
        if network.nodes[node]["status"] == 'i':
            xp.append(network.nodes[node]['lon'])
            yp.append(network.nodes[node]['lat'])
    
    if len(xp) >= 1:
        # np.float was removed from NumPy; use the builtin float instead.
        m1, m2 = np.asarray(xp, dtype=float), np.asarray(yp, dtype=float)
        xmin, xmax = -180, 180
        ymin, ymax = -90, 90
        
        # Estimate the density on a regular lon/lat grid. Coordinates are
        # converted to radians; the Euclidean metric on radians is only a
        # rough proxy for great-circle distance (see the sketch after this
        # function).
        Xp, Yp = np.mgrid[xmin:xmax:250j, ymin:ymax:250j]
        XpYp = np.vstack([Xp.ravel(), Yp.ravel()]).T
        XpYp = np.radians(XpYp)
        values = np.column_stack((m1, m2))
        kernel = KernelDensity(bandwidth=0.035)
        kernel.fit(np.radians(values))
        Z = np.exp(kernel.score_samples(XpYp))
        Z = Z.reshape(Xp.shape)
  
        # Plot the result. Copy the colormap before mutating it, since
        # changing registered colormaps in place is deprecated in newer
        # Matplotlib releases.
        cmap = plt.cm.jet.copy()
        cmap.set_under('white')
        plt.imshow(np.rot90(Z),
                   norm=plt.Normalize(vmin=0.1 * Z.max(), vmax=Z.max()),
                   cmap=cmap, extent=[xmin, xmax, ymin, ymax],
                   alpha=0.3, interpolation='gaussian')
    
        
    # First pass - gray lines for default edges
    nx.draw_networkx_edges(network, pos=node_positions, edgelist=default,
            width=0.005,
            edge_color=d_edge_colors,
            alpha=0.005,
            arrows=False)
   
    # Second pass - red lines for infected edges
    nx.draw_networkx_edges(network, pos=node_positions, edgelist=infected,
            width=0.1,
            edge_color=i_edge_colors,
            alpha=0.25,
            arrows=False)

    # Third pass - small nodes for the starting population
    nx.draw_networkx_nodes(network,
            pos=node_positions,
            nodelist=nstart,
            linewidths=0.2,
            node_size=5,
            node_color=colors)

    # Fourth pass - larger nodes for infected nodes
    # (with_labels removed: it is not an argument of draw_networkx_nodes)
    nx.draw_networkx_nodes(network,
            pos=node_positions,
            nodelist=ninfect,
            linewidths=0.2,
            node_size=20,
            node_color=colori)
        
    plt.axis('off')

    # Zero-pad the frame index so saved files sort in order.
    number_files = str(len(os.listdir())).zfill(3)

    plt.savefig("infection-{0}.png".format(number_files),
                bbox_inches='tight', dpi=72)
    plt.show()
    plt.close()
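
The KDE above is fit with the default Euclidean metric on radian
coordinates, which only approximates distances on the globe. A minimal
alternative sketch using great-circle distances, assuming scikit-learn's
BallTree-backed KernelDensity (it accepts metric='haversine' and expects
[lat, lon] pairs in radians); the coordinates below are made up:

import numpy as np
from sklearn.neighbors import KernelDensity

# Hypothetical infected-node coordinates in degrees.
lons = np.array([-74.0, 2.35, 139.7])
lats = np.array([40.7, 48.86, 35.7])

# The haversine metric expects [lat, lon] in radians.
coords = np.radians(np.column_stack((lats, lons)))
kde = KernelDensity(bandwidth=0.035, metric='haversine',
                    kernel='gaussian', algorithm='ball_tree')
kde.fit(coords)

# Evaluate the density on a lat/lon grid (also [lat, lon] in radians).
grid_lat, grid_lon = np.mgrid[-90:90:250j, -180:180:250j]
grid = np.radians(np.column_stack((grid_lat.ravel(), grid_lon.ravel())))
Z = np.exp(kde.score_samples(grid)).reshape(grid_lat.shape)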