def compute_SVM_results(i_train, i_test, n_components=5):
    """Fit one GMM Bayes classifier per attribute and predict its test split.

    Despite the name, no SVM is involved: every model is a ``GMMBayes``
    built with a fixed ``random_state=0``.

    The function reads the module-level names ``attributes``, ``X`` and
    ``y``; ``X[i]`` and ``y[i]`` are indexed with ``i_train`` / ``i_test``
    for each position ``i`` in ``attributes``.

    Parameters
    ----------
    i_train, i_test
        Indexers selecting the training and test rows of ``X[i]`` / ``y[i]``.
    n_components : int, optional
        Number of mixture components passed to ``GMMBayes`` (default 5).

    Returns
    -------
    classifiers : list
        The fitted ``GMMBayes`` instances, one per attribute.
    predictions : list
        ``clf.predict`` output on the test split, one per attribute.
    """
    classifiers = []
    predictions = []

    for i in range(len(attributes)):
        Xtrain = X[i][i_train]
        Xtest = X[i][i_test]
        ytrain = y[i][i_train]

        clf = GMMBayes(n_components, min_covar=1E-5,
                       covariance_type='full', random_state=0)
        clf.fit(Xtrain, ytrain)

        classifiers.append(clf)
        predictions.append(clf.predict(Xtest))

    return classifiers, predictions
def test_too_many_components_warning():
    """Fitting with more components than samples must emit a UserWarning."""
    samples = np.random.normal(0, 1, size=(3, 2))
    targets = np.zeros(3)

    # Five components requested for only three samples.
    model = GMMBayes(5)

    expected_message = ("Expected n_samples >= "
                        "n_components but got ")
    with pytest.warns(UserWarning, match=expected_message):
        model.fit(samples, targets)
def test_incompatible_shapes_exception():
    """Fitting with mismatched X / y lengths must raise with a clear message."""
    X = np.random.normal(0, 1, size=(100, 2))
    y = np.zeros(99)  # one label short of the 100 samples in X

    ncm = 1
    clf = GMMBayes(ncm)

    with pytest.raises(Exception) as e:
        # Call fit directly: wrapping the call in ``assert`` would make the
        # statement under test vanish when Python runs with -O.
        clf.fit(X, y)

    # Checked after the ``with`` block so the assertion is actually reached
    # once the exception has been captured.
    assert str(e.value) == "X and y have incompatible shapes"
def test_incompatible_number_of_components_exception():
    """A per-class component list of the wrong length must raise on fit."""
    X = np.random.normal(0, 1, size=(100, 2))
    y = np.zeros(100)  # a single class

    ncm = [1, 2, 3]  # three entries for one class
    clf = GMMBayes(ncm)

    with pytest.raises(Exception) as e:
        # Call fit directly: wrapping the call in ``assert`` would make the
        # statement under test vanish when Python runs with -O.
        clf.fit(X, y)

    # Checked after the ``with`` block so the assertion is actually reached
    # once the exception has been captured.
    assert str(e.value) == ("n_components must be compatible with "
                            "the number of classes")
def test_gmm2d():
    """Two well-separated 2-D blobs are classified perfectly for 1-3 components."""
    blob_low = np.random.normal(0, 1, size=(100, 2))
    blob_high = np.random.normal(10, 1, size=(100, 2))
    features = np.vstack((blob_low, blob_high))

    # First 100 rows belong to class 0, the remaining 100 to class 1.
    targets = np.zeros(200)
    targets[100:] = 1

    for n_components in (1, 2, 3):
        model = GMMBayes(n_components)
        model.fit(features, targets)
        assert_allclose(targets, model.predict(features))
def test_gmm1d():
    """Two well-separated 1-D blobs are classified perfectly with one component."""
    blob_low = np.random.normal(0, 1, size=100)
    blob_high = np.random.normal(10, 1, size=100)

    # Stack both samples into a single (200, 1) feature column.
    features = np.concatenate((blob_low, blob_high)).reshape((200, 1))

    # First 100 rows belong to class 0, the remaining 100 to class 1.
    targets = np.zeros(200)
    targets[100:] = 1

    model = GMMBayes(1)
    model.fit(features, targets)
    assert_allclose(targets, model.predict(features))
def compute_GMMbayes(Ncolors, Ncomp):
    """Fit a GMMBayes model for every (component count, feature count) pair.

    Reads the module-level ``X_train``, ``y_train`` and ``X_test``. For each
    entry of ``Ncomp`` and each entry ``nc`` of ``Ncolors`` a model is trained
    on the first ``nc`` columns of ``X_train`` and evaluated on the same
    columns of ``X_test``.

    Returns
    -------
    classifiers, predictions : list of list
        Outer index follows ``Ncomp``, inner index follows ``Ncolors``.
    """
    classifiers = []
    predictions = []

    for n_mix in Ncomp:
        fitted_row = []
        predicted_row = []

        for n_cols in Ncolors:
            model = GMMBayes(n_mix, min_covar=1E-5, covariance_type='full')
            model.fit(X_train[:, :n_cols], y_train)
            predicted = model.predict(X_test[:, :n_cols])

            fitted_row.append(model)
            predicted_row.append(predicted)

        classifiers.append(fitted_row)
        predictions.append(predicted_row)

    return classifiers, predictions
def compute_SVM_results(i_train, i_test, n_components=5):
    """Fit one GMM Bayes classifier per attribute and predict its test split.

    Despite the name, no SVM is involved: every model is a ``GMMBayes``.
    No ``random_state`` is passed, so the seeding is left to ``GMMBayes``.

    The function reads the module-level names ``attributes``, ``X`` and
    ``y``; ``X[i]`` and ``y[i]`` are indexed with ``i_train`` / ``i_test``
    for each position ``i`` in ``attributes``.

    Parameters
    ----------
    i_train, i_test
        Indexers selecting the training and test rows of ``X[i]`` / ``y[i]``.
    n_components : int, optional
        Number of mixture components passed to ``GMMBayes`` (default 5).

    Returns
    -------
    classifiers : list
        The fitted ``GMMBayes`` instances, one per attribute.
    predictions : list
        ``clf.predict`` output on the test split, one per attribute.
    """
    classifiers = []
    predictions = []

    for i in range(len(attributes)):
        Xtrain = X[i][i_train]
        Xtest = X[i][i_test]
        ytrain = y[i][i_train]

        clf = GMMBayes(n_components, min_covar=1E-5, covariance_type='full')
        clf.fit(Xtrain, ytrain)

        classifiers.append(clf)
        predictions.append(clf.predict(Xtest))

    return classifiers, predictions
def gmm_bayes_analysis(X_train, X_test, y_train, y_test):
    """Time a default GMMBayes fit and score, then compute ROC data.

    Returns
    -------
    tuple
        ``(tpr, fpr, roc_auc, t_train, t_test, score)`` where ``t_train`` and
        ``t_test`` are wall-clock durations in seconds measured with
        ``time.time()`` and ``score`` is ``clf.score(X_test, y_test)``.
    """
    model = GMMBayes()

    # Wall-clock cost of training.
    fit_started = time.time()
    model.fit(X_train, y_train)
    fit_finished = time.time()
    t_train = fit_finished - fit_started

    # Wall-clock cost of scoring the held-out data.
    score_started = time.time()
    score = model.score(X_test, y_test)
    score_finished = time.time()
    t_test = score_finished - score_started

    # Generate graphs/data for analysis; roc_calc receives a fresh, unfitted
    # GMMBayes rather than the model timed above.
    tpr, fpr, roc_auc = roc_calc(GMMBayes(), X_train, X_test, y_train, y_test)

    return tpr, fpr, roc_auc, t_train, t_test, score
# Script fragment (Python 2 print statements). It first prints the mean and
# standard deviation of `accuracy`, each divided by 0.72, for the
# "Error-Correcting Output Code" run, then `k`, then the 6x6 `box` matrix
# divided by 100.
# It then resets `box` and `accuracy` and runs 100 repetitions of
# GMMBayes(n_components=3): fit on X_train/y_train, predict X_test, and for
# every matching prediction increment `n`, `accuracy[m]` and the
# box[y_test - 1, y_pred - 1] cell. The same summary is printed for the GMM
# run and `box` is reset again.
# NOTE(review): `n` is incremented but never initialised in the visible code,
# and X_train / X_test / y_train / y_test / k come from outside this
# fragment - confirm against the full script.
# NOTE(review): the division by 0.72 presumably converts a count out of 72
# test samples into a percentage - TODO confirm.
#posterior[m] = knc.predict_proba(X_test) print "Error-Correcting Output Code: ", np.mean( accuracy) / 0.72, np.std(accuracy) / 0.72 print k for i in range(0, 6): for j in range(0, 6): print '{:5.2f} '.format(box[i, j] / 100.0), print #end GNB box = np.zeros([6, 6]) accuracy = np.zeros(100) for m in range(0, 100): gmm = GMMBayes(n_components=3) y_pred = gmm.fit(X_train, y_train).predict(X_test) for i in range(0, len(y_pred)): if y_pred[i] == y_test[i]: n = n + 1 accuracy[m] = accuracy[m] + 1 box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1 print "Gaussian Mixture Models, n_components=3: ", np.mean( accuracy) / 0.72, np.std(accuracy) / 0.72 for i in range(0, 6): for j in range(0, 6): print '{:5.2f} '.format(box[i, j] / 100.0), print box = np.zeros([6, 6])
# Script fragment (Python 2 print statements). It selects the train/test
# labels for index `i`, prints the shapes, contents and NaN counts of
# Xtrain / ytrain, then tries each n_components in np.arange(5, 7): a
# GMMBayes (full covariance, min_covar=1E-5, random_state=0) is fitted on the
# training data and scored by the number of test predictions equal to ytest.
# The n_components with the highest count (first one on ties, per np.argmax)
# is printed, stored in N_comp and used to fit the final `clf`.
# NOTE(review): Xtrain, Xtest, y, i, i_train and i_test are assigned outside
# the visible fragment - confirm against the full script.
ytrain = y[i][i_train] ytest = y[i][i_test] print "Xtrain",Xtrain.shape print "ytrain",ytrain.shape print Xtrain print ytrain print np.isnan(Xtrain).sum() print np.isnan(ytrain).sum() n_componentss = np.arange(5,7) scores = np.zeros(len(n_componentss)) for j,n_components in enumerate(n_componentss): clf = GMMBayes(n_components, min_covar=1E-5, covariance_type='full', random_state=0) clf.fit(Xtrain, ytrain) y_pred = clf.predict(Xtest) print y_pred print "score, ", (y_pred==ytest).sum() #fpr, tpr, thresholds = roc_curve(ytest, y_prob) #aucs[j]= auc(fpr,tpr) scores[j]=1.*(y_pred==ytest).sum() imax = np.argmax(scores) print "optimal N is ",n_componentss[imax] N_comp = n_componentss[imax] clf = GMMBayes(N_comp, min_covar=1E-5, covariance_type='full', random_state=0) clf.fit(Xtrain, ytrain)
# Script fragment (Python 2 print statements); it begins inside an if/else
# that picks the train/test arrays and is cut off in the middle of the final
# print expression.
# Visible behaviour: choose X_train/y_train/X_test/y_test for label column
# `i`, set `j` to 3 or 6 depending on `i`, then run 100 repetitions of
# GMMBayes(n_components=6). Each repetition fits, predicts, counts matches
# per test sample in `accuracy`, accumulates box[y_test - 1, y_pred - 1] and
# stores predict_proba(X_test) in posterior[m].
# NOTE(review): the inner `for i in range(0, len(y_pred))` rebinds the `i`
# that selected the label column above - verify nothing later relies on the
# outer value.
# NOTE(review): `posterior` is allocated with 10000 slots but only indices
# 0-99 are written in the visible loop; the use of `j` is not visible here.
X_test = sample2 y_test = labels[272:, i] else: X_train = training y_train = labels[:172, i] X_test = sampletest y_test = labels[172:, i] if i >= 2: j = 3 else: j = 6 accuracy = np.zeros(72) posterior = np.empty([10000, 72, 6]) box = np.zeros([6, 6]) for m in range(0, 100): gmm = GMMBayes(n_components=6) gmm.fit(X_train, y_train) y_pred = gmm.predict(X_test) n = 0 for i in range(0, len(y_pred)): if y_pred[i] == y_test[i]: #print i, y_pred[i], y_test[i] n = n + 1 accuracy[i] = accuracy[i] + 1 box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1 posterior[m] = gmm.predict_proba(X_test) print 30, 20, sum(accuracy[0:8]) / 8.0, sum(accuracy[8:18]) / 10.0, sum( accuracy[18:30]) / 12.0, sum(accuracy[30:43]) / 13.0, sum( accuracy[43:56]) / 13.0, sum(
# Script fragment (Python 2 print statement). It shuffles
# `training_data_full` in place, takes the first 80% of rows as the training
# set and the rest as the test set, and treats the last column as the label
# and the other columns as features.
# For each entry of the cluster list it fits GMMBayes on the training
# features, calls predict_proba on those same training features, and plots
# the ROC curve built from column 1 of the result on `ax1`.
# NOTE(review): `n_samples*0.8` is a float; recent NumPy releases reject
# non-integer slice bounds - confirm the NumPy version this targets.
# NOTE(review): the loop list is [2], so the `n_clusters == 15` histogram
# branch cannot run as written; `test_set` is not used in the visible code.
# NOTE(review): hist(normed=True) has been removed from newer Matplotlib in
# favour of density=True - confirm the Matplotlib version this targets.
np.random.shuffle(training_data_full) n_samples = training_data_full.shape[0] training_set = training_data_full[:n_samples*0.8, :] test_set = training_data_full[n_samples*0.8:, :] features = training_set[:, :-1] labels = training_set[:, -1] # Fit and plot pl.figure() ax1 = pl.subplot(121) ax2 = pl.subplot(122) for n_clusters in [2]: print n_clusters gmmb = GMMBayes(n_clusters) gmmb.fit(features, labels) scores = gmmb.predict_proba(features) fpr, tpr, thresholds = roc_curve(labels, scores[:,1]) ax1.plot(fpr, tpr, label='%d clusters'%n_clusters) if n_clusters == 15: ax2.hist(scores[labels==0, 1], bins=100, normed=True, alpha=0.5, label='Background') ax2.hist(scores[labels==1, 1], bins=100, normed=True, alpha=0.5, label='Signal') ax2.legend(loc='best') #np.save('fpr_tpr_GMMBayes_15clusters.npy', np.vstack([fpr, tpr]))
# Script fragment (Python 2 print statement). For each label column i in
# 0..4 it selects the train/test arrays (columns 1 and 3 use `training2` /
# `sample2` with different label slices), then tries GMMBayes with
# n_components = 1..8, counting how many test predictions equal y_test.
# The component count with the strictly highest number of correct
# predictions (the earliest one on ties) is reported together with the
# accuracy.
# NOTE(review): the fragment ends with a bare ''' - presumably the delimiter
# of a triple-quoted block used to disable code in the full script; confirm.
for i in range(0,5): if i==1 or i==3: X_train = training2 y_train = labels[100:172,i] X_test = sample2 y_test = labels[272:,i] else: X_train = training y_train = labels[:172,i] X_test = sampletest y_test = labels[172:,i] ncomp = 0 ncorrect = 0 for j in range(1,9): gmm = GMMBayes(n_components=j) gmm.fit(X_train, y_train) y_pred = gmm.predict(X_test) n=0 for k in range(0,len(y_pred)): if y_pred[k] == y_test[k]: #print i, y_pred[i], y_test[i] n = n+1 if n > ncorrect: ncorrect = n ncomp = j print '{:3d}/{:3d}, {:2.2%}, n_components={:d}'.format(ncorrect, len(y_test), ncorrect*1.0/len(y_test), ncomp) '''
# Script fragment (Python 2 print statements); it begins inside an if/else
# that picks the train/test arrays.
# Visible behaviour: choose X_train/y_train/X_test/y_test for label column
# `i`, set `j` to 3 or 6 depending on `i`, then run 100 repetitions of
# GMMBayes(n_components=6). Each repetition fits, predicts, counts matches
# per test sample in `accuracy`, accumulates box[y_test - 1, y_pred - 1] and
# stores predict_proba(X_test) in posterior[m]. Afterwards it prints the
# average of `accuracy` over fixed index ranges (0-8, 8-18, 18-30, 30-43,
# 43-56, 56-72) and overall, and allocates the `means`, `stds` and `grid`
# arrays.
# NOTE(review): the inner `for i in range(0, len(y_pred))` rebinds the `i`
# that selected the label column above - verify nothing later relies on the
# outer value.
# NOTE(review): `posterior` is allocated with 10000 slots but only indices
# 0-99 are written in the visible loop; the use of `j` is not visible here.
X_test = sample2 y_test = labels[272:,i] else: X_train = training y_train = labels[:172,i] X_test = sampletest y_test = labels[172:,i] if i>=2: j=3 else: j=6 accuracy = np.zeros(72) posterior = np.empty([10000,72,6]) box = np.zeros([6,6]) for m in range(0,100): gmm = GMMBayes(n_components=6) gmm.fit(X_train, y_train) y_pred = gmm.predict(X_test) n=0 for i in range(0,len(y_pred)): if y_pred[i] == y_test[i]: #print i, y_pred[i], y_test[i] n = n+1 accuracy[i] = accuracy[i]+1 box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1 posterior[m] = gmm.predict_proba(X_test) print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy[56:72])/16.0, sum(accuracy)/72.0 means = np.empty([72,6]) stds = np.empty([72,6]) grid = np.empty([6,6])
# Script fragment (Python 2 print statement). For each label column i in
# 0..4 it selects the train/test arrays (columns 1 and 3 use `training2` /
# `sample2` with different label slices), then tries GMMBayes with
# n_components = 1..8, counting how many test predictions equal y_test.
# The component count with the strictly highest number of correct
# predictions (the earliest one on ties) is reported together with the
# accuracy.
# NOTE(review): the trailing ''' is presumably the delimiter of a
# triple-quoted block used to disable code in the full script, and the body
# of `onpick` lies outside this fragment - confirm against the full file.
for i in range(0, 5): if i == 1 or i == 3: X_train = training2 y_train = labels[100:172, i] X_test = sample2 y_test = labels[272:, i] else: X_train = training y_train = labels[:172, i] X_test = sampletest y_test = labels[172:, i] ncomp = 0 ncorrect = 0 for j in range(1, 9): gmm = GMMBayes(n_components=j) gmm.fit(X_train, y_train) y_pred = gmm.predict(X_test) n = 0 for k in range(0, len(y_pred)): if y_pred[k] == y_test[k]: #print i, y_pred[i], y_test[i] n = n + 1 if n > ncorrect: ncorrect = n ncomp = j print '{:3d}/{:3d}, {:2.2%}, n_components={:d}'.format( ncorrect, len(y_test), ncorrect * 1.0 / len(y_test), ncomp) ''' def onpick(event):
# main(): command-line driver for dimensionality reduction of a catalog
# (Python 2 print statements throughout).
#
# Arguments: `--alg` (string, default 'MLLE') selects the algorithm and the
# positional `catalog` is the table file read with Table.read.
#
# Flow visible in this function:
#   * If the table has no 'color' column, one is added through prep_catalog
#     (color_data2 or color_data, chosen by substring tests on `sample`) and
#     the table is written back to args.catalog with overwrite=True.
#   * For each neighbour count in [10, 12, 15, 20], with n_components = 3:
#       - 'MLLE' / 'LLE' / 'LTSA' / 'HLLE': whiten the data, split it, then
#         replace X_train / y_train with the full X / simplified y; fit LLE,
#         save the embedding via save_dimreduce and plot it in 3D.
#       - 'ISO': fit Isomap on `train` and plot the embedding.
#       - 'LDA': fit a 3-component LDA on a 75/25 split, then count matching
#         predictions for LDA, KNeighborsClassifier(5) and GMMBayes(9) on the
#         LDA-transformed data, with pdb.set_trace() breakpoints between the
#         steps, followed by plotting.
#
# NOTE(review): `sample`, `train`, `traincols`, `targets`, `Y2` and (in the
# LDA branch) `A` are used but not assigned anywhere in the visible function;
# several of their assignments appear only in commented-out lines. Confirm
# whether they are module-level names.
# NOTE(review): in the LLE branch Y_test is computed as A.transform(X_train),
# i.e. from the training data, and the model is fitted twice (fit, then
# fit_transform) - confirm this is intended.
# NOTE(review): prep_catalog.whiten_data is unpacked into two values in the
# LLE branch and three in the LDA branch - verify its return signature.
# NOTE(review): plot_dimreduce_3D receives (error, t1-t0) in the LLE branch
# but ((t1-t0), error) in the ISO branch - verify the expected order.
def main(): parser = argparse.ArgumentParser(description= 'Perform Dimensionality Reduction') parser.add_argument('--alg', type=str, default='MLLE', help='Algorithm to reduce dimensionality.') parser.add_argument('catalog', type=str, help='Specify the catalog on which to perform DimReduce.') args = parser.parse_args() #dat = Table.read('catalogs/ZEST_catalog_colors.fits') #training_sample = dat[0:10000] #testing_sample = dat[10001:20000] #zkeys = ['cc', 'aa', 'm20', 'gg'] base = os.path.basename(args.catalog) filename = os.path.splitext(base)[0] dat = Table.read(args.catalog) mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']# #dat.remove_column('color') if 'color' not in dat.colnames: if 'kaggle' in sample: dat = prep_catalog.color_data2(dat, 'gz2class') if 'direct' in sample: dat = prep_catalog.color_data(dat, 'zclass') dat.write(args.catalog, overwrite=True) #dat = prep_catalog.adjust_asym(dat, mkeys[2]) #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys) n_neighbors = [10,12,15,20] #n_neighbors = [7] n_components = 3 for i, n_neigh in enumerate(n_neighbors): if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']: if args.alg == 'MLLE': method = 'modified' elif args.alg == 'LLE': method = 'standard' elif args.alg == 'LTSA': method = 'ltsa' elif args.alg == 'HLLE': method = 'hessian' #replace_panoptes(dat) #pdb.set_trace() #sample = 'directbig_panoptes' X, y = prep_catalog.whiten_data(dat, mkeys) (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], random_state=0) (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.35], random_state=0) y_train = simplify_classlabels(y_train) y_test = simplify_classlabels(y_test) #filename = 'modified_7_directbig_new' X_train = X y_train = simplify_classlabels(y) #''' #sample ='direct_zcut' #Y_train, Y_test = open_previous_LLE(filename) #cut = np.where(X1['REDSHIFT'] <= 0.05) #X1_cut = X1[cut] #QC_plots(X1_cut) #Y_train = np.array(Y_train)[cut] #col_train = np.array(col_train)[cut] #X = Table(X) 
#cut_out_mixedup_region(X, np.array(Y_train)) #''' print "performing "+method+" LLE with",n_neigh,\ "nearest neighbors" print "on training sample of",len(X_train),"objects" t0 = time() A = LLE(n_neigh, n_components, eigen_solver='auto', method=method) error = A.fit(X_train).reconstruction_error_ Y_train = A.fit_transform(X_train) Y_test = A.transform(X_train) t1 = time() #''' metadata = {'method':method, 'N':n_neigh, 'd':n_components, 'error':error, 'time':t1-t0, 'sample':filename+'_total'} save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total') #metadata = {'method':method, 'N':n_neigh, 'd':n_components, # 'error':error, 'time':t1-t0, 'sample':filename+'_test'} #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test') # plot in 3D plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], method, n_neigh, error, t1-t0, filename, two=False) #====================================================================# elif args.alg == 'ISO': method='IsoMap' print "performing IsoMap with",n_neigh,"nearest neighbors" print "on training sample of",len(dat),"objects" t0 = time() A = Isomap(n_neigh, n_components, eigen_solver='dense') error = A.fit(train).reconstruction_error() Y = A.fit_transform(train) #Y2 = A.transform(test) t1 = time() print "%s: %.2g sec" %(args.alg, t1-t0) print "reconstruction error: ", error print "begin plotting" plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0) plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1) plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2) plot_dimreduce_3D(Y, traincols, Y, traincols, method, n_neigh, (t1-t0), error, sample) elif args.alg == 'LDA': print "performing LDA" X, Xc, y = prep_catalog.whiten_data(dat, mkeys) (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) DRclf = LDA(3, priors=None) #DRclf.fit(X_train, y_train) DRtrain = DRclf.fit(X_train, y_train).transform(X_train) DRtest = DRclf.fit(X_train, y_train).transform(X_test) 
classes = np.unique(y_train) colors = np.array(['darkred', 'red', 'lightsalmon', 'darkgreen', 'lightgreen', 'lightseagreen', 'indigo', 'darkviolet', 'plum']) plot_LDA_3D(DRtrain, y_train, classes, colors, sample) pdb.set_trace() #classifiers = [] #predictions = [] #Nparams = np.arange(1, X.shape[1]+1) #for nc in Nparams: clf = LDA() clf.fit(DRtrain, y_train) y_pred = clf.predict(DRtest) matchesLDA = (y_pred == y_test) print np.sum(matchesLDA) pdb.set_trace() #------------------------------------------ from sklearn.neighbors import KNeighborsClassifier knc = KNeighborsClassifier(5) knc.fit(DRtrain, y_train) y_pred = knc.predict(DRtest) matchesKNN = (y_pred == y_test) print np.sum(matchesKNN) pdb.set_trace() #------------------------------------------ from astroML.classification import GMMBayes gmmb = GMMBayes(9) gmmb.fit(DRtrain, y_train) y_pred = gmmb.predict(DRtest) matchesGMMB = (y_pred == y_test) print np.sum(matchesGMMB) pdb.set_trace() #------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) pdb.set_trace() im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, s=4, lw=0) #cmap=plt.cm.binary,, zorder=2 im.set_clim(-0.5, 1) #im = ax.imshow(Z, origin='lower', aspect='auto', # cmap=plt.cm.binary, zorder=1, # extent=xlim + ylim) #im.set_clim(0, 1.5) #ax.contour(xx, yy, Z, [0.5], colors='k') #ax.set_xlim(xlim) #ax.set_ylim(ylim) ax.set_xlabel('$G$') ax.set_ylabel('$M20$') #pred, true = classification_loss(predictions, y_test) #completeness, contamination = completeness_contamination(pred, true) pdb.set_trace() #''' #t0 = time() #A = LDA(n_components, priors=None) #Y = A.fit_transform(train, targets) #Y2 = A.fit(train, targets).transform(train) #t1 = time() #print "%s: %.2g sec" %(args.alg, t1-t0) predict = A.predict(train) #print "Predicted classes:", predict 
#pdb.set_trace() #pdb.set_trace() #''' plot_LDA_3D(Y2, targets, classes, colors, sample) plot_LDA(Y2, targets, classes, colors, sample, axis=0) plot_LDA(Y2, targets, classes, colors, sample, axis=1) plot_LDA(Y2, targets, classes, colors, sample, axis=2) pdb.set_trace()