Ejemplo n.º 1
0
"""
# Author: Jake VanderPlas <*****@*****.**>
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt

from sklearn.mixture import GMM
from astroML.datasets import fetch_great_wall
from astroML.decorators import pickle_results

#------------------------------------------------------------
# load great wall data
# X stays at module level: compute_GMM below fits on this global directly
# instead of receiving the data as an argument.
X = fetch_great_wall()


#------------------------------------------------------------
# Create a function which will save the results to a pickle file
#  for large number of clusters, computation will take a long time!
@pickle_results('great_wall_GMM.pkl')
def compute_GMM(n_clusters, n_iter=1000, min_covar=3, covariance_type='full'):
    """Fit a Gaussian mixture model to the module-level data ``X``.

    The function is wrapped by ``pickle_results('great_wall_GMM.pkl')`` so
    that the (potentially slow) fit can be saved to a pickle file.

    Parameters
    ----------
    n_clusters
        Number of mixture components, passed as the first argument to ``GMM``.
    n_iter
        Forwarded to ``GMM`` as ``n_iter``.
    min_covar
        Forwarded to ``GMM`` as ``min_covar``.
    covariance_type
        Forwarded to ``GMM`` as ``covariance_type``.

    Returns
    -------
    The ``GMM`` estimator after ``fit(X)`` has been called on it.
    """
    clf = GMM(n_clusters, covariance_type=covariance_type,
              n_iter=n_iter, min_covar=min_covar)
    clf.fit(X)
    # A single pre-formatted argument prints "converged: <value>" identically
    # under Python 2 and Python 3 (the old print statement is a SyntaxError
    # on Python 3).
    print("converged: %s" % (clf.converged_,))
    return clf

#------------------------------------------------------------
# Compute a grid on which to evaluate the result
Ejemplo n.º 2
0
def question4():
    """Kernel density estimation on the astroML "Great Wall" dataset.

    Builds an evaluation grid over x in [-375, -175] and y in [-300, 200]
    and defines two nested routines:

    * ``Qa`` - compares four KDE kernels at a fixed bandwidth and saves
      'great-wall_raw.svg' and 'Question4a.svg'.
    * ``Qb`` - picks the bandwidth of an exponential-kernel KDE by 10-fold
      cross validation and saves 'GreatWall-KDE-bandwidth-likelyhood.svg'
      and 'GreatWall-KDE-best-bandwidth.svg'.

    Only ``Qb`` is executed; ``Qa`` is kept so the kernel comparison can be
    regenerated by calling it here.  The function returns nothing; its
    results are the printed messages, the saved figures and the windows
    opened by ``plt.show()``.
    """
    # Imported locally (as fetch_great_wall already was) so the function
    # does not rely on module-level imports being present.
    from astroML.datasets import fetch_great_wall
    from sklearn.model_selection import KFold
    from sklearn.neighbors import KernelDensity

    X = fetch_great_wall()

    bw = 5  # bandwidth for the KDE kernel comparison in Qa

    # Create the grid on which to evaluate the results
    ratio = 50. / 125.
    sizefactor = 250  # default = 125
    Nx = int(ratio * sizefactor)
    Ny = int(sizefactor)

    xmin, xmax = (-375, -175)
    ymin, ymax = (-300, 200)

    xgrid = np.linspace(xmin, xmax, Nx)
    ygrid = np.linspace(ymin, ymax, Ny)
    mesh = np.meshgrid(xgrid, ygrid)

    # np.vstack needs a real sequence: on Python 3 ``map`` yields an
    # iterator, which NumPy rejects/deprecates, so build a list instead.
    Xgrid = np.vstack([axis.ravel() for axis in mesh]).T

    # The density arrays have shape (Ny, Nx) with row 0 at y == ymin, so the
    # image must be drawn with the origin at the bottom, and ``extent`` is
    # needed for the ticks to be in the same units as the axis labels
    # instead of pixel indices.
    imshow_kwargs = dict(interpolation='nearest',
                         cmap=plt.get_cmap('hot'),
                         origin='lower',
                         extent=(xmin, xmax, ymin, ymax))

    def grid_density(kernel, bandwidth):
        """Fit a KDE on X and return it evaluated on the grid, shape (Ny, Nx).

        The density is multiplied by the number of points in X.
        """
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        log_dens = kde.score_samples(Xgrid)
        return X.shape[0] * np.exp(log_dens).reshape((Ny, Nx))

    def Qa():
        """Plot the raw points and a 2x2 comparison of four KDE kernels."""
        kernels = (
            ('gaussian', "Gaussian KDE"),
            ('tophat', "Tophat KDE"),
            ('exponential', "Exponential KDE"),
            ('epanechnikov', "Epanechnikov KDE"),
        )
        densities = [grid_density(kernel, bw) for kernel, _ in kernels]

        # Scatter plot of the raw data; saved and closed without showing.
        plt.figure(figsize=(12. * 2. / 5., 12))
        plt.scatter(X.T[0], X.T[1], edgecolor='none', s=2, color='black')
        plt.axis('equal')
        plt.xlim((xmin, xmax))
        plt.ylim((ymin, ymax))
        plt.title('Great Wall galaxies')
        plt.xlabel('Width [Mly]')
        plt.ylabel('Height [Mly]')
        plt.savefig('great-wall_raw.svg', bbox_inches='tight')
        plt.close()

        # One panel per kernel, filled row by row.
        fig, axes = plt.subplots(2, 2, figsize=(6.5, 12))
        for panel, (_, title), dens in zip(axes.ravel(), kernels, densities):
            panel.imshow(dens, **imshow_kwargs)
            panel.set_title(title)
            panel.set_xlabel('Width [Mly]')
            panel.set_ylabel('Height [Mly]')

        plt.savefig('Question4a.svg', bbox_inches='tight')
        print('Best kernel: exponential')

        plt.show()

    def Qb():
        """Select the exponential-KDE bandwidth by 10-fold cross validation."""
        print('Starting cross validation...')

        # different values for the bandwidth
        bwrange = np.linspace(1, 20, 20)

        # set the number of folds
        kf = KFold(n_splits=10)

        likelyhood = np.zeros(len(bwrange))

        print('Finding the best bandwidth...')
        for i, bandwidth in enumerate(bwrange):
            print('{0} of {1}'.format(i + 1, len(bwrange)))
            fold_scores = []
            for train_i, test_i in kf.split(X):
                kde = KernelDensity(bandwidth=bandwidth,
                                    kernel='exponential').fit(X[train_i])
                # Only the held-out score is needed; evaluating the KDE on
                # the training fold as well was unused, expensive work.
                fold_scores.append(kde.score(X[test_i]))

            likelyhood[i] = np.mean(fold_scores)

        best_i = np.argmax(likelyhood)
        bestbandwidth = bwrange[best_i]
        print('Highest likelyhood ({0}) at bandwidth = {1}'.format(
            round(likelyhood[best_i], 2), bestbandwidth))

        plt.plot(bwrange,
                 likelyhood,
                 color='black',
                 alpha=0.8,
                 label='Likelyhood')
        plt.scatter(bestbandwidth,
                    likelyhood[best_i],
                    marker='x',
                    s=100,
                    color='orange',
                    label='Maximum likelyhood')

        plt.xlabel('Bandwidth [Mly]')
        plt.ylabel('Likelyhood')
        plt.legend(loc='best')

        plt.title('Great wall KDE bandwidth likelyhood')
        plt.savefig('GreatWall-KDE-bandwidth-likelyhood.svg',
                    bbox_inches='tight')
        plt.show()

        # show the KDE refitted on all of X with the selected bandwidth
        bestdens = grid_density('exponential', bestbandwidth)

        plt.figure()
        plt.imshow(bestdens, **imshow_kwargs)
        plt.title('Great Wall KDE with bandwidth = {0}'.format(
            round(bestbandwidth, 2)))
        plt.xlabel('Width [Mly]')
        plt.ylabel('Height [Mly]')
        plt.savefig('GreatWall-KDE-best-bandwidth.svg', bbox_inches='tight')
        plt.show()

    Qb()
Ejemplo n.º 3
0
# IPython log file


from astroML import datasets
X = datasets.fetch_great_wall()
A = datasets.fetch_moving_objects()
X.shape
# Successive attempts at a scatter plot of the two columns of X; the last
# one swaps the columns and uses small black markers.
plt.scatter(*X.T)
plt.scatter(*X.T, s=1, c='k')
plt.scatter(X[:, 1], X[:, 0], s=1, c='k')
X.shape
fig, ax = plt.subplots()
ax.set_facecolor('black')
# From here on ``ax`` is an array of two Axes, not a single Axes.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), facecolor='black')
# Style every panel: black background with white spines and ticks.
# The per-panel attributes (spines, xaxis, yaxis) live on the individual
# Axes ``a``; looking them up on the array ``ax`` raises AttributeError.
for a in ax.ravel():
    a.set_facecolor('black')
    for spine in a.spines.values():
        spine.set_color('w')
    for tick in a.xaxis.get_major_ticks() + a.yaxis.get_major_ticks():
        for child in tick.get_children():
            child.set_color('w')
for a in ax.ravel():
    a.set_facecolor('black')
    for spine in a.spines.values():
Ejemplo n.º 4
0
def test_fetch_great_wall():
    """Check that the Great Wall dataset loads with shape (8014, 2)."""
    expected_shape = (8014, 2)
    great_wall = fetch_great_wall()
    assert great_wall.shape == expected_shape