Example 1
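These snippets exercise astroML's GMMBayes classifier (Example 14 imports it via "from astroML.classification import GMMBayes") and generally assume numpy as np. This first helper fits one GMMBayes model per attribute set, despite the SVM in its name, and returns the fitted classifiers with their test-fold predictions; attributes, X, and y are globals of the original project.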
def compute_SVM_results(i_train, i_test, n_components=5):
    classifiers = []
    predictions = []

    for i in range(len(attributes)):
        # Split the i-th attribute's features and labels into folds.
        Xtrain = X[i][i_train]
        Xtest = X[i][i_test]
        ytrain = y[i][i_train]

        clf = GMMBayes(n_components,
                       min_covar=1E-5,
                       covariance_type='full',
                       random_state=0)
        clf.fit(Xtrain, ytrain)
        y_pred = clf.predict(Xtest)

        classifiers.append(clf)
        predictions.append(y_pred)

    return classifiers, predictions
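For reference, here is a minimal self-contained sketch of the same fit/predict pattern; the synthetic two-class data and the _demo names are illustrative stand-ins for the project globals, not part of the original code.

# Minimal sketch of the GMMBayes fit/predict pattern used above
# (assumes astroML is installed; data are synthetic stand-ins).
import numpy as np
from astroML.classification import GMMBayes

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(0, 1, size=(100, 2)),
                    rng.normal(5, 1, size=(100, 2))])
y_demo = np.concatenate([np.zeros(100), np.ones(100)])

clf = GMMBayes(3, min_covar=1E-5, covariance_type='full', random_state=0)
clf.fit(X_demo, y_demo)
print("training accuracy:", (clf.predict(X_demo) == y_demo).mean())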
Example 2
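A pytest unit test: fit should emit a UserWarning when a class has fewer samples than the requested number of mixture components.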
def test_too_many_components_warning():
    X = np.random.normal(0, 1, size=(3, 2))
    y = np.zeros(3)

    ncm = 5
    clf = GMMBayes(ncm)

    with pytest.warns(UserWarning,
                      match="Expected n_samples >= "
                      "n_components but got "):
        clf.fit(X, y)
Example 3
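A pytest unit test: fit should raise when X and y have mismatched lengths.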
def test_incompatible_shapes_exception():
    X = np.random.normal(0, 1, size=(100, 2))
    y = np.zeros(99)

    ncm = 1
    clf = GMMBayes(ncm)

    with pytest.raises(Exception) as e:
        clf.fit(X, y)

    assert str(e.value) == "X and y have incompatible shapes"
Example 4
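A pytest unit test: fit should raise when n_components is a sequence whose length does not match the number of classes.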
def test_incompatible_number_of_components_exception():
    X = np.random.normal(0, 1, size=(100, 2))
    y = np.zeros(100)

    ncm = [1, 2, 3]
    clf = GMMBayes(ncm)

    with pytest.raises(Exception) as e:
        clf.fit(X, y)

    assert str(e.value) == ("n_components must be compatible with "
                            "the number of classes")
Example 5
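A smoke test on two well-separated 2-D Gaussian blobs; for each component count the classifier should recover the labels exactly.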
def test_gmm2d():
    x1 = np.random.normal(0, 1, size=(100, 2))
    x2 = np.random.normal(10, 1, size=(100, 2))
    X = np.vstack((x1, x2))
    y = np.zeros(200)
    y[100:] = 1

    for ncm in (1, 2, 3):
        clf = GMMBayes(ncm)
        clf.fit(X, y)

        predicted = clf.predict(X)
        assert_allclose(y, predicted)
Example 6
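The 1-D analogue of the previous test, with the feature vector reshaped to a single-column matrix before fitting.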
def test_gmm1d():
    x1 = np.random.normal(0, 1, size=100)
    x2 = np.random.normal(10, 1, size=100)
    X = np.concatenate((x1, x2)).reshape((200, 1))
    y = np.zeros(200)
    y[100:] = 1

    ncm = 1
    clf = GMMBayes(ncm)
    clf.fit(X, y)

    predicted = clf.predict(X)
    assert_allclose(y, predicted)
Example 7
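Fits a grid of classifiers over component counts (Ncomp) and nested feature subsets (the first nc color columns); X_train, y_train, and X_test are globals of the original project.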
def compute_GMMbayes(Ncolors, Ncomp):
    classifiers = []
    predictions = []

    for ncm in Ncomp:
        classifiers.append([])
        predictions.append([])
        for nc in Ncolors:
            clf = GMMBayes(ncm, min_covar=1E-5, covariance_type='full')
            clf.fit(X_train[:, :nc], y_train)
            y_pred = clf.predict(X_test[:, :nc])

            classifiers[-1].append(clf)
            predictions[-1].append(y_pred)

    return classifiers, predictions
Example 8
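A variant of Example 1 that omits the fixed random_state.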
def compute_SVM_results(i_train, i_test, n_components=5):
    classifiers = []
    predictions = []

    for i in range(len(attributes)):
        # Split the i-th attribute's features and labels into folds.
        Xtrain = X[i][i_train]
        Xtest = X[i][i_test]
        ytrain = y[i][i_train]

        clf = GMMBayes(n_components, min_covar=1E-5, covariance_type='full')
        clf.fit(Xtrain, ytrain)
        y_pred = clf.predict(Xtest)

        classifiers.append(clf)
        predictions.append(y_pred)

    return classifiers, predictions
Example 9
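Times GMMBayes training and scoring, then computes ROC data through the project's roc_calc helper.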
def gmm_bayes_analysis(X_train, X_test, y_train, y_test):
    clf = GMMBayes()

    t1 = time.time()
    clf.fit(X_train, y_train)
    t2 = time.time()

    t_train = t2 - t1

    t1 = time.time()
    score = clf.score(X_test, y_test)
    t2 = time.time()

    t_test = t2 - t1

    # Generate graphs/data for analysis
    tpr, fpr, roc_auc = roc_calc(GMMBayes(), X_train, X_test, y_train, y_test)

    return tpr, fpr, roc_auc, t_train, t_test, score
Example 10
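A fragment of a classifier-comparison script: it averages accuracy over 100 refits of a 3-component model and accumulates a 6x6 confusion matrix in box.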
                #posterior[m] = knc.predict_proba(X_test)

        print("Error-Correcting Output Code: ", np.mean(
            accuracy) / 0.72, np.std(accuracy) / 0.72)
        print(k)
        for i in range(0, 6):
            for j in range(0, 6):
                print('{:5.2f} '.format(box[i, j] / 100.0), end='')
            print()

    #end GNB

    box = np.zeros([6, 6])
    accuracy = np.zeros(100)
    for m in range(0, 100):
        gmm = GMMBayes(n_components=3)
        y_pred = gmm.fit(X_train, y_train).predict(X_test)
        n = 0  # correct-prediction counter (not initialized in the original fragment)
        for i in range(0, len(y_pred)):
            if y_pred[i] == y_test[i]:
                n = n + 1
                accuracy[m] = accuracy[m] + 1
            box[y_test[i] - 1,
                y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1
    print("Gaussian Mixture Models, n_components=3: ", np.mean(
        accuracy) / 0.72, np.std(accuracy) / 0.72)
    for i in range(0, 6):
        for j in range(0, 6):
            print('{:5.2f} '.format(box[i, j] / 100.0), end='')
        print()

    box = np.zeros([6, 6])
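Example 11
A model-selection fragment: it scans n_components over a small range, records each fit's test-fold score, and refits GMMBayes with the best value.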
ytrain = y[i][i_train]
ytest = y[i][i_test]

print("Xtrain", Xtrain.shape)
print("ytrain", ytrain.shape)
print(Xtrain)
print(ytrain)
print(np.isnan(Xtrain).sum())
print(np.isnan(ytrain).sum())

n_componentss = np.arange(5, 7)
scores = np.zeros(len(n_componentss))

for j, n_components in enumerate(n_componentss):
    clf = GMMBayes(n_components, min_covar=1E-5, covariance_type='full',
                   random_state=0)
    clf.fit(Xtrain, ytrain)
    y_pred = clf.predict(Xtest)
    print(y_pred)
    print("score, ", (y_pred == ytest).sum())
    #fpr, tpr, thresholds = roc_curve(ytest, y_prob)
    #aucs[j] = auc(fpr, tpr)
    scores[j] = 1. * (y_pred == ytest).sum()

imax = np.argmax(scores)
print("optimal N is ", n_componentss[imax])

N_comp = n_componentss[imax]
clf = GMMBayes(N_comp, min_covar=1E-5, covariance_type='full',
               random_state=0)
clf.fit(Xtrain, ytrain)
Example 12
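Shuffles a labeled data set, keeps 80% for training, fits GMMBayes, and plots a ROC curve and score histograms of the training-set class probabilities with matplotlib (imported as pl).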
np.random.shuffle(training_data_full)

n_samples = training_data_full.shape[0]
# Slice indices must be integers.
n_train = int(n_samples * 0.8)
training_set = training_data_full[:n_train, :]
test_set = training_data_full[n_train:, :]

features = training_set[:, :-1]
labels = training_set[:, -1]

# Fit and plot
pl.figure()
ax1 = pl.subplot(121)
ax2 = pl.subplot(122)
for n_clusters in [2]:
    print(n_clusters)
    gmmb = GMMBayes(n_clusters)
    gmmb.fit(features, labels)
    scores = gmmb.predict_proba(features)

    fpr, tpr, thresholds = roc_curve(labels, scores[:, 1])

    ax1.plot(fpr, tpr, label='%d clusters' % n_clusters)
    if n_clusters == 15:
        # density=True replaces matplotlib's removed `normed` argument.
        ax2.hist(scores[labels == 0, 1], bins=100, density=True,
                 alpha=0.5, label='Background')
        ax2.hist(scores[labels == 1, 1], bins=100, density=True,
                 alpha=0.5, label='Signal')
        ax2.legend(loc='best')

#np.save('fpr_tpr_GMMBayes_15clusters.npy', np.vstack([fpr, tpr]))
Example 13
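A confusion-matrix fragment repeating 100 refits of a six-component model and collecting per-object accuracies and class posteriors; the snippet begins mid-if, so its opening branch is missing.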
        X_test = sample2
        y_test = labels[272:, i]
    else:
        X_train = training
        y_train = labels[:172, i]
        X_test = sampletest
        y_test = labels[172:, i]

    if i >= 2:
        j = 3
    else:
        j = 6

    accuracy = np.zeros(72)
    posterior = np.empty([10000, 72, 6])
    box = np.zeros([6, 6])
    for m in range(0, 100):
        gmm = GMMBayes(n_components=6)
        gmm.fit(X_train, y_train)
        y_pred = gmm.predict(X_test)

        n = 0
        # Note: this inner loop reuses the outer loop variable i.
        for i in range(0, len(y_pred)):
            if y_pred[i] == y_test[i]:
                #print(i, y_pred[i], y_test[i])
                n = n + 1
                accuracy[i] = accuracy[i] + 1
            box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1
        posterior[m] = gmm.predict_proba(X_test)
    print(30, 20, sum(accuracy[0:8]) / 8.0, sum(accuracy[8:18]) / 10.0,
          sum(accuracy[18:30]) / 12.0, sum(accuracy[30:43]) / 13.0,
          sum(accuracy[43:56]) / 13.0, sum(accuracy[56:72]) / 16.0,
          sum(accuracy) / 72.0)
    means = np.empty([72, 6])
    stds = np.empty([72, 6])
    grid = np.empty([6, 6])
Example 14
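A dimensionality-reduction driver (LLE variants, Isomap, LDA) from the same project, in which GMMBayes is one of several classifiers compared on LDA-reduced features.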
for i in range(0, 5):
    if i == 1 or i == 3:
        X_train = training2
        y_train = labels[100:172, i]
        X_test = sample2
        y_test = labels[272:, i]
    else:
        X_train = training
        y_train = labels[:172, i]
        X_test = sampletest
        y_test = labels[172:, i]

    ncomp = 0
    ncorrect = 0
    for j in range(1, 9):
        gmm = GMMBayes(n_components=j)
        gmm.fit(X_train, y_train)
        y_pred = gmm.predict(X_test)

        n = 0
        for k in range(0, len(y_pred)):
            if y_pred[k] == y_test[k]:
                #print i, y_pred[i], y_test[i]
                n = n + 1
        if n > ncorrect:
            ncorrect = n
            ncomp = j
    print('{:3d}/{:3d}, {:2.2%}, n_components={:d}'.format(
        ncorrect, len(y_test), ncorrect * 1.0 / len(y_test), ncomp))
def onpick(event):
    pass  # body missing from the original fragment


def main():
    parser = argparse.ArgumentParser(
        description='Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'
                           
            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)

            # split_samples fractions must sum to <= 1; the original's
            # [0.75, 0.35] appears to be a typo for [0.75, 0.25].
            (dat1, dat2), (thing1, thing2) = split_samples(dat, dat, [0.75, 0.25],
                                                           random_state=0)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y,
                                                                 [0.75, 0.25],
                                                                 random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print("performing " + method + " LLE with", n_neigh,
                  "nearest neighbors")
            print("on training sample of", len(X_train), "objects")

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_train)
            t1 = time()
            #'''        

            metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], 
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#

        elif args.alg == 'ISO':
            method = 'IsoMap'

            print("performing IsoMap with", n_neigh, "nearest neighbors")
            print("on training sample of", len(dat), "objects")
            
            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            
            t1 = time()
            print("%s: %.2g sec" % (args.alg, t1 - t0))
            print("reconstruction error: ", error)

            print("begin plotting")
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, 
                              n_neigh, (t1-t0), error, sample)
            
        elif args.alg == 'LDA':
            
            print("performing LDA")
            
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon', 
                               'darkgreen', 'lightgreen', 'lightseagreen', 
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))

            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))

            pdb.set_trace()
            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))

            pdb.set_trace()
            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, 
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)
            
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()


            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
                
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)
            
            predict = A.predict(train)  # note: A and train exist only in the commented-out block above
            #print "Predicted classes:", predict
            #pdb.set_trace()
            

            #pdb.set_trace()
            #'''
            
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            
            pdb.set_trace()