Exemple #1
0
def test_split_samples():
    X = np.arange(100.)
    y = np.arange(100.)

    X_divisions, y_divisions = split_samples(X, y)

    assert (len(X_divisions[0]) == len(y_divisions[0]) == 75)
    assert (len(X_divisions[1]) == len(y_divisions[1]) == 25)
    assert (len(set(X_divisions[0]) | set(X_divisions[1])) == 100)
Exemple #2
0
#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots

setup_text_plots(fontsize=8, usetex=True)

#----------------------------------------------------------------------
# get data and split into training & testing sets
X, y = fetch_rrlyrae_combined()
X = X[:, [1, 0, 2, 3]]  # rearrange columns for better 1-color results
(X_train, X_test), (y_train, y_test) = split_samples(X,
                                                     y, [0.75, 0.25],
                                                     random_state=0)

N_tot = len(y)
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform LDA
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from astroML.utils import split_samples
from astroML.utils import completeness_contamination
from sklearn.metrics import roc_curve

signal = np.load('ClassSample_training_1.npy')
background = np.load('ClassSample_training_0.npy')
data = np.concatenate([signal, background])
data = data[~np.isnan(data).any(axis=1)]

(features_train, features_test), (labels_train, labels_test) = split_samples(data[:,:8], data[:,8], fractions=[0.75,0.25])

fig_roc = plt.figure()
ax_roc = fig_roc.add_subplot(111)
featureIDs = np.arange(2, data.shape[1]+1)
classification = []
predictions = []
scores = []
for i in featureIDs:
    logr = LogisticRegression()
    logr.fit(features_train[:,:i], labels_train)
    labels_pred = logr.predict_proba(features_test[:,:i])[:,1]
    classification.append(logr)
    predictions.append(labels_pred)
    fpr, tpr, thresholds = roc_curve(labels_test, labels_pred)
    ax_roc.plot(fpr, tpr, label="%d features"%(i))
#    print fpr, tpr
ax_roc.set_xlabel("False positive rate")
ax_roc.set_ylabel("True positive rate")
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt

from sklearn.naive_bayes import GaussianNB
from astroML.datasets import fetch_rrlyrae_combined
from astroML.utils import split_samples
from astroML.utils import completeness_contamination

#----------------------------------------------------------------------
# get data and split into training & testing sets
X, y = fetch_rrlyrae_combined()
X = X[:, [1, 0, 2, 3]]  # rearrange columns for better 1-color results
(X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25],
                                                     random_state=0)

N_tot = len(y)
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform Naive Bayes
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

order = np.array([1, 0, 2, 3])
def main():
    
    parser = argparse.ArgumentParser(description=
                                'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'
                           
            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], 
                                                       random_state=0)
            
            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_train)
            t1 = time()
            #'''        

            metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], 
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#

        elif args.alg == 'ISO':
            method='IsoMap'
                
            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"
            
            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            
            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error
            
            print "begin plotting"
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, 
                              n_neigh, (t1-t0), error, sample)
            
        elif args.alg == 'LDA':
            
            print "performing LDA"
            
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon', 
                               'darkgreen', 'lightgreen', 'lightseagreen', 
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            
            matchesLDA = (y_pred == y_test)
            print np.sum(matchesLDA)

            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print np.sum(matchesKNN)

            pdb.set_trace()
            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print np.sum(matchesGMMB)

            pdb.set_trace()
            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, 
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)
            
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()


            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
                
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)
            
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()
            

            #pdb.set_trace()
            #'''
            
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            
            pdb.set_trace()