""" # Author: Jake VanderPlas <*****@*****.**> # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from sklearn.mixture import GMM from astroML.datasets import fetch_great_wall from astroML.decorators import pickle_results #------------------------------------------------------------ # load great wall data X = fetch_great_wall() #------------------------------------------------------------ # Create a function which will save the results to a pickle file # for large number of clusters, computation will take a long time! @pickle_results('great_wall_GMM.pkl') def compute_GMM(n_clusters, n_iter=1000, min_covar=3, covariance_type='full'): clf = GMM(n_clusters, covariance_type=covariance_type, n_iter=n_iter, min_covar=min_covar) clf.fit(X) print "converged:", clf.converged_ return clf #------------------------------------------------------------ # Compute a grid on which to evaluate the result
def question4():
    """Kernel density estimation of the astroML Great Wall data set.

    Loads the data with ``fetch_great_wall``, builds a regular evaluation
    grid and defines two nested steps:

    * ``Qa`` compares four KDE kernels at a fixed bandwidth and writes
      ``great-wall_raw.svg`` and ``Question4a.svg``.
    * ``Qb`` selects the bandwidth of the exponential kernel by 10-fold
      cross-validation and writes
      ``GreatWall-KDE-bandwidth-likelyhood.svg`` and
      ``GreatWall-KDE-best-bandwidth.svg``.

    Only ``Qb`` is executed; ``Qa`` is defined but not called.

    ``np``, ``plt``, ``KernelDensity`` and ``KFold`` are used without being
    imported here, so they are expected to be available at module level.
    """
    from astroML.datasets import fetch_great_wall

    X = fetch_great_wall()
    bw = 5  # bandwidth used for the kernel comparison in Qa

    # Create the grid on which to evaluate the results
    ratio = 50. / 125.
    sizefactor = 250  # default = 125
    Nx = int(ratio * sizefactor)
    Ny = int(sizefactor)
    xmin, xmax = (-375, -175)
    ymin, ymax = (-300, 200)
    xgrid = np.linspace(xmin, xmax, Nx)
    ygrid = np.linspace(ymin, ymax, Ny)
    mesh = np.meshgrid(xgrid, ygrid)
    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # np.vstack does not accept as a sequence of arrays.
    Xgrid = np.vstack([np.ravel(coords) for coords in mesh]).T

    # Shared imshow settings. ``extent`` maps pixel indices to the grid
    # coordinates so the axis labels are meaningful, and ``origin='lower'``
    # puts row 0 (ymin) at the bottom, matching the scatter plot orientation.
    image_kwargs = dict(interpolation='nearest',
                        cmap=plt.get_cmap('hot'),
                        origin='lower',
                        extent=(xmin, xmax, ymin, ymax))

    def _grid_density(kernel, bandwidth):
        """Fit a KDE to X and return it on the grid, scaled by len(X)."""
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        log_dens = kde.score_samples(Xgrid)
        return X.shape[0] * np.exp(log_dens).reshape((Ny, Nx))

    def Qa():
        """Plot the raw points and the KDE for four different kernels."""
        kernels = (('gaussian', "Gaussian KDE"),
                   ('tophat', "Tophat KDE"),
                   ('exponential', "Exponential KDE"),
                   ('epanechnikov', "Epanechnikov KDE"))
        # Evaluate every kernel on the grid before any plotting starts.
        densities = [_grid_density(kernel, bw) for kernel, _ in kernels]

        # Scatter plot of the raw data, saved to disk and closed unshown.
        plt.figure(figsize=(12. * 2. / 5., 12))
        plt.scatter(X.T[0], X.T[1], edgecolor='none', s=2, color='black')
        plt.axis('equal')
        plt.xlim((xmin, xmax))
        plt.ylim((ymin, ymax))
        plt.title('Great Wall galaxies')
        plt.xlabel('Width [Mly]')
        plt.ylabel('Height [Mly]')
        plt.savefig('great-wall_raw.svg', bbox_inches='tight')
        plt.close()

        # One panel per kernel, filled in row-major order.
        fig, axes = plt.subplots(2, 2, figsize=(6.5, 12))
        for ax, dens, (_, title) in zip(axes.ravel(), densities, kernels):
            ax.imshow(dens, **image_kwargs)
            ax.set_title(title)
            ax.set_xlabel('Width [Mly]')
            ax.set_ylabel('Height [Mly]')
        plt.savefig('Question4a.svg', bbox_inches='tight')
        print('Best kernel: exponential')
        plt.show()

    def Qb():
        """Choose the exponential-kernel bandwidth by cross-validation."""
        print('Starting cross validation...')
        # different values for the bandwidth
        bwrange = np.linspace(1, 20, 20)
        # set the number of folds
        kf = KFold(n_splits=10)
        likelyhood = np.zeros(len(bwrange))

        print('Finding the best bandwidth...')
        for i, bandwidth in enumerate(bwrange):
            print('{0} of {1}'.format(i + 1, len(bwrange)))
            fold_scores = []
            for train_i, test_i in kf.split(X):
                kde = KernelDensity(bandwidth=bandwidth,
                                    kernel='exponential').fit(X[train_i])
                # Score only the held-out fold; evaluating the training
                # points as well was wasted work whose result was discarded.
                fold_scores.append(kde.score(X[test_i]))
            likelyhood[i] = np.mean(fold_scores)

        best = np.argmax(likelyhood)
        bestbandwidth = bwrange[best]
        print('Highest likelyhood ({0}) at bandwidth = {1}'.format(
            round(likelyhood[best], 2), bestbandwidth))

        plt.plot(bwrange, likelyhood, color='black', alpha=0.8,
                 label='Likelyhood')
        plt.scatter(bestbandwidth, likelyhood[best], marker='x', s=100,
                    color='orange', label='Maximum likelyhood')
        plt.xlabel('Bandwidth [Mly]')
        plt.ylabel('Likelyhood')
        plt.legend(loc='best')
        plt.title('Great wall KDE bandwidth likelyhood')
        plt.savefig('GreatWall-KDE-bandwidth-likelyhood.svg',
                    bbox_inches='tight')
        plt.show()

        # show the KDE with the highest likelihood bandwidth
        bestdens = _grid_density('exponential', bestbandwidth)
        plt.figure()
        plt.imshow(bestdens, **image_kwargs)
        plt.title('Great Wall KDE with bandwidth = {0}'.format(
            round(bestbandwidth, 2)))
        plt.xlabel('Width [Mly]')
        plt.ylabel('Height [Mly]')
        plt.savefig('GreatWall-KDE-best-bandwidth.svg', bbox_inches='tight')
        plt.show()

    Qb()
# NOTE(review): the next line is a flattened interactive-session history, kept
# verbatim. Statement nesting was lost when the line breaks were removed, the
# same loops recur with small variations (`ax` vs `ax.ravel()`, `ax.spines` vs
# `a.spines`), and the final `for` has no body in this view. It also uses
# `plt` without importing it here. Restore the original line breaks from the
# source log before trying to execute any of it -- TODO confirm.
# IPython log file from astroML import datasets X = datasets.fetch_great_wall() A = datasets.fetch_moving_objects() X.shape plt.scatter(*X.T) plt.scatter(*X.T, s=1, c='k') plt.scatter(X[:, 1], X[:, 0], s=1, c='k') X.shape fig, ax = plt.subplots() ax.set_facecolor('black') fig, ax = plt.subplots(1, 2, figsize=(10, 5), facecolor='black') for a in ax: a.set_facecolor('black') for spine in ax.spines.values(): spine.set_color('w') for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks(): for child in tick.get_children(): child.set_color('w') for a in ax.ravel(): a.set_facecolor('black') for spine in ax.spines.values(): spine.set_color('w') for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks(): for child in tick.get_children(): child.set_color('w') for a in ax.ravel(): a.set_facecolor('black') for spine in a.spines.values():
def test_fetch_great_wall():
    """The Great Wall loader must return an array of shape (8014, 2)."""
    expected_shape = (8014, 2)
    galaxies = fetch_great_wall()
    assert galaxies.shape == expected_shape