-
Notifications
You must be signed in to change notification settings - Fork 1
/
fig_density_estimation.py
105 lines (92 loc) · 3.78 KB
/
fig_density_estimation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Author: Jake VanderPlas
# License: BSD
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
# To report a bug or issue, use the following forum:
# https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from astroML.density_estimation import KNeighborsDensity
from astroML.plotting import hist
# Scikit-learn 0.14 added sklearn.neighbors.KernelDensity, which is a very
# fast kernel density estimator based on a KD Tree. We'll use this if
# available (and raise a warning if it isn't).
try:
from sklearn.neighbors import KernelDensity
use_sklearn_KDE = True
except:
import warnings
warnings.warn("KDE will be removed in astroML version 0.3. Please "
"upgrade to scikit-learn 0.14+ and use "
"sklearn.neighbors.KernelDensity.", DeprecationWarning)
from astroML.density_estimation import KDE
use_sklearn_KDE = False
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)
#------------------------------------------------------------
# Generate our data: a mix of several Cauchy distributions
# this is the same data used in the Bayesian Blocks figure
np.random.seed(0)
N = 10000
mu_gamma_f = [(5, 1.0, 0.1),
(7, 0.5, 0.5),
(9, 0.1, 0.1),
(12, 0.5, 0.2),
(14, 1.0, 0.1)]
true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x)
for (mu, gamma, f) in mu_gamma_f])
x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N))
for (mu, gamma, f) in mu_gamma_f])
np.random.shuffle(x)
x = x[x > -10]
x = x[x < 30]
#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1)
N_values = (500, 5000)
subplots = (211, 212)
k_values = (10, 100)
for N, k, subplot in zip(N_values, k_values, subplots):
ax = fig.add_subplot(subplot)
xN = x[:N]
t = np.linspace(-10, 30, 1000)
# Compute density with KDE
if use_sklearn_KDE:
kde = KernelDensity(0.1, kernel='gaussian')
kde.fit(xN[:, None])
dens_kde = np.exp(kde.score_samples(t[:, None]))
else:
kde = KDE('gaussian', h=0.1).fit(xN[:, None])
dens_kde = kde.eval(t[:, None]) / N
# Compute density with Bayesian nearest neighbors
nbrs = KNeighborsDensity('bayesian', n_neighbors=k).fit(xN[:, None])
dens_nbrs = nbrs.eval(t[:, None]) / N
# plot the results
ax.plot(t, true_pdf(t), ':', color='black', zorder=3,
label="Generating Distribution")
ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')
hist(xN, bins='blocks', ax=ax, normed=True, zorder=1,
histtype='stepfilled', color='k', alpha=0.2,
label="Bayesian Blocks")
ax.plot(t, dens_nbrs, '-', lw=1.5, color='gray', zorder=2,
label="Nearest Neighbors (k=%i)" % k)
ax.plot(t, dens_kde, '-', color='black', zorder=3,
label="Kernel Density (h=0.1)")
# label the plot
ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
transform=ax.transAxes)
ax.set_ylabel('$p(x)$')
ax.legend(loc='upper right')
if subplot == 212:
ax.set_xlabel('$x$')
ax.set_xlim(0, 20)
ax.set_ylim(-0.01, 0.4001)
plt.show()