import numpy as np

from astroML.utils import split_samples


def test_split_samples():
    X = np.arange(100.)
    y = np.arange(100.)

    # default fractions are [0.75, 0.25]
    X_divisions, y_divisions = split_samples(X, y)

    assert len(X_divisions[0]) == len(y_divisions[0]) == 75
    assert len(X_divisions[1]) == len(y_divisions[1]) == 25
    # the two divisions together cover the full sample
    assert len(set(X_divisions[0]) | set(X_divisions[1])) == 100
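#----------------------------------------------------------------------
# A minimal usage sketch (not part of the original test): split_samples
# also accepts explicit fractions and a random_state for reproducible,
# many-way splits.
import numpy as np
from astroML.utils import split_samples

X = np.arange(100.)
y = np.arange(100.)
(X_a, X_b, X_c), (y_a, y_b, y_c) = split_samples(
    X, y, fractions=[0.6, 0.2, 0.2], random_state=0)
print(len(X_a), len(X_b), len(X_c))  # expect roughly 60, 20, 20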
import numpy as np

from astroML.datasets import fetch_rrlyrae_combined
from astroML.utils import split_samples

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#----------------------------------------------------------------------
# get data and split into training & testing sets
X, y = fetch_rrlyrae_combined()
X = X[:, [1, 0, 2, 3]]  # rearrange columns for better 1-color results
(X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25],
                                                     random_state=0)

N_tot = len(y)
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform LDA
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)
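#----------------------------------------------------------------------
# A sketch of the loop that typically follows this setup (not part of the
# original fragment): fit an LDA classifier on the first nc colors and
# score completeness/contamination on the test set.  The modern
# scikit-learn import path is assumed here.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from astroML.utils import completeness_contamination

for nc in Ncolors:
    clf = LinearDiscriminantAnalysis()
    clf.fit(X_train[:, :nc], y_train)  # train on the first nc colors
    predictions.append(clf.predict(X_test[:, :nc]))
    classifiers.append(clf)

completeness, contamination = completeness_contamination(predictions, y_test)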
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from astroML.utils import split_samples
from astroML.utils import completeness_contamination

# load signal (label 1) and background (label 0) samples; drop rows with NaNs
signal = np.load('ClassSample_training_1.npy')
background = np.load('ClassSample_training_0.npy')
data = np.concatenate([signal, background])
data = data[~np.isnan(data).any(axis=1)]

# columns 0-7 are the features; column 8 is the class label
(features_train, features_test), (labels_train, labels_test) = \
    split_samples(data[:, :8], data[:, 8], fractions=[0.75, 0.25])

fig_roc = plt.figure()
ax_roc = fig_roc.add_subplot(111)

# fit a logistic regression on the first i features, for increasing i
# (the upper bound excludes the label column, so i runs from 2 to 8)
featureIDs = np.arange(2, data.shape[1])
classification = []
predictions = []
scores = []
for i in featureIDs:
    logr = LogisticRegression()
    logr.fit(features_train[:, :i], labels_train)
    labels_pred = logr.predict_proba(features_test[:, :i])[:, 1]
    classification.append(logr)
    predictions.append(labels_pred)

    # one ROC curve per feature count
    fpr, tpr, thresholds = roc_curve(labels_test, labels_pred)
    ax_roc.plot(fpr, tpr, label="%d features" % i)

ax_roc.set_xlabel("False positive rate")
ax_roc.set_ylabel("True positive rate")
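#----------------------------------------------------------------------
# A possible continuation (not in the original script): finish the figure
# and put the imported completeness_contamination to use by thresholding
# the predicted probabilities at 0.5.
binary_preds = [(p > 0.5).astype(int) for p in predictions]
completeness, contamination = completeness_contamination(binary_preds,
                                                         labels_test)
print("completeness:", completeness)
print("contamination:", contamination)

ax_roc.legend(loc='lower right')
plt.show()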
# The figure produced by this code is published in the textbook
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
# For more information, see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt

from sklearn.naive_bayes import GaussianNB
from astroML.datasets import fetch_rrlyrae_combined
from astroML.utils import split_samples
from astroML.utils import completeness_contamination

#----------------------------------------------------------------------
# get data and split into training & testing sets
X, y = fetch_rrlyrae_combined()
X = X[:, [1, 0, 2, 3]]  # rearrange columns for better 1-color results
(X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25],
                                                     random_state=0)

N_tot = len(y)
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform Naive Bayes
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)
order = np.array([1, 0, 2, 3])
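#----------------------------------------------------------------------
# A sketch of the loop that typically follows (not part of the original
# fragment): fit GaussianNB on the first nc colors, then score
# completeness and contamination on the test set.
for nc in Ncolors:
    clf = GaussianNB()
    clf.fit(X_train[:, :nc], y_train)
    predictions.append(clf.predict(X_test[:, :nc]))
    classifiers.append(clf)

completeness, contamination = completeness_contamination(predictions, y_test)
print("completeness:", completeness)
print("contamination:", contamination)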
import argparse
import os
from time import time

import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.manifold import Isomap
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import KNeighborsClassifier

from astroML.classification import GMMBayes
from astroML.utils import split_samples

import prep_catalog

# The helper functions used below (simplify_classlabels, save_dimreduce,
# plot_dimreduce, plot_dimreduce_3D, plot_LDA_3D) are assumed to be defined
# elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description='Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
                        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
                        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]
    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']

    # add a 'color' column keyed to the classification scheme, if missing
    if 'color' not in dat.colnames:
        if 'kaggle' in filename:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in filename:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    n_neighbors = [10, 12, 15, 20]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):

        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'

            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)
            (X_train, X_test), (y_train, y_test) = split_samples(
                X, y, [0.75, 0.25], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            # embed the full sample rather than just the training split
            X_train = X
            y_train = simplify_classlabels(y)

            print("performing " + method + " LLE with", n_neigh,
                  "nearest neighbors")
            print("on training sample of", len(X_train), "objects")

            t0 = time()
            A = LLE(n_neighbors=n_neigh, n_components=n_components,
                    eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_test)
            t1 = time()

            metadata = {'method': method, 'N': n_neigh, 'd': n_components,
                        'error': error, 'time': t1 - t0,
                        'sample': filename + '_total'}
            save_dimreduce(dat, Y_train, y_train, metadata,
                           filename + '_total')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:, 1], Y_test, y_test[:, 1],
                              method, n_neigh, error, t1 - t0, filename,
                              two=False)

        #==================================================================#
        elif args.alg == 'ISO':
            method = 'IsoMap'
            print("performing IsoMap with", n_neigh, "nearest neighbors")
            print("on training sample of", len(dat), "objects")

            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            t0 = time()
            A = Isomap(n_neighbors=n_neigh, n_components=n_components,
                       eigen_solver='dense')
            error = A.fit(X).reconstruction_error()
            Y = A.fit_transform(X)
            t1 = time()

            print("%s: %.2g sec" % (args.alg, t1 - t0))
            print("reconstruction error: ", error)

            print("begin plotting")
            plot_dimreduce(Y, Xc, method, n_neigh, filename, axis=0)
            plot_dimreduce(Y, Xc, method, n_neigh, filename, axis=1)
            plot_dimreduce(Y, Xc, method, n_neigh, filename, axis=2)
            plot_dimreduce_3D(Y, Xc, Y, Xc, method, n_neigh, (t1 - t0),
                              error, filename)

        #==================================================================#
        elif args.alg == 'LDA':
            print("performing LDA")

            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)
            (X_train, X_test), (y_train, y_test) = split_samples(
                X, y, [0.75, 0.25], random_state=0)

            # reduce to 3 dimensions with LDA: fit once on the training set,
            # then project both splits
            DRclf = LDA(n_components=3, priors=None)
            DRclf.fit(X_train, y_train)
            DRtrain = DRclf.transform(X_train)
            DRtest = DRclf.transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon',
                               'darkgreen', 'lightgreen', 'lightseagreen',
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, filename)

            # classify the LDA-reduced data with three different classifiers
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))

            #------------------------------------------
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)
            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))

            #------------------------------------------
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)
            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))

            #------------------------------------------
            # plot the data in the G-M20 plane
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            ax = fig.add_subplot(121)
            im = ax.scatter(X[:, 3], X[:, 4], c=Xc, cmap=plt.cm.Spectral,
                            s=4, lw=0)
            im.set_clim(-0.5, 1)
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M_{20}$')
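            #------------------------------------------
            # A small follow-up sketch (not in the original): report the
            # three classifiers' results as accuracies rather than raw
            # match counts.
            for name, matches in [('LDA', matchesLDA),
                                  ('KNN', matchesKNN),
                                  ('GMMBayes', matchesGMMB)]:
                print("%s accuracy: %.3f" % (name, np.mean(matches)))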