def test_completeness_contamination():
    """Exercise completeness_contamination on three hand-built label pairs.

    The cases are: every label positive in both inputs, every label negative
    in both inputs, and a mixed pair whose expected completeness and
    contamination are both 0.5.
    """
    # Case 1: both inputs are all ones.
    first = np.ones(100)
    second = np.ones(100)
    completeness, contamination = completeness_contamination(first, second)
    assert_allclose(completeness, 1)
    assert_allclose(contamination, 0)

    # Case 2: both inputs are all zeros.
    first = np.zeros(100)
    second = np.zeros(100)
    completeness, contamination = completeness_contamination(first, second)
    assert_allclose(completeness, 0)
    assert_allclose(contamination, 0)

    # Case 3: 50 ones then 50 zeros, against 25 ones, 50 zeros, 25 ones.
    first = np.concatenate((np.ones(50), np.zeros(50)))
    second = np.concatenate((np.ones(25), np.zeros(50), np.ones(25)))
    completeness, contamination = completeness_contamination(first, second)
    assert_allclose(completeness, 0.5)
    assert_allclose(contamination, 0.5)
def do_stuff(model):
    """Fit ``model``, draw its class map with the labelled points, and print scores.

    The function reads the module-level arrays ``omegals``, ``omegae1s``,
    ``classifications`` and the index/mask ``trainers``. It fits ``model`` on
    the ``trainers`` subset, predicts a class for every object, evaluates
    ``predict_proba`` on a 1000x1000 grid and draws the winning class as a
    background image with the labelled objects scattered on top. Finally it
    prints the model and the completeness/contamination of each class.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``; the
        probability output is indexed with columns 0, 1 and 2.
    """
    # train
    model.fit(np.transpose([omegals[trainers], omegae1s[trainers]]),
              classifications[trainers])

    # predict all classifications
    predictions = model.predict(np.transpose([omegals, omegae1s]))

    # predict the classification probabilities on a grid
    xlim = (0, 100 * np.pi / 180.)
    ylim = (0, 100 * np.pi / 180.)
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 1000),
                         np.linspace(ylim[0], ylim[1], 1000))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])

    # color grid by prediction: a cell gets class k only when column k is
    # strictly larger than both other columns; exact ties stay at 0
    gridclass = np.zeros(Z[:, 1].shape)
    gridclass[(Z[:, 0] > Z[:, 1]) & (Z[:, 0] > Z[:, 2])] = 0
    gridclass[(Z[:, 1] > Z[:, 2]) & (Z[:, 1] > Z[:, 0])] = 1
    gridclass[(Z[:, 2] > Z[:, 0]) & (Z[:, 2] > Z[:, 1])] = 2
    gridclass = gridclass.reshape(xx.shape)

    clf()
    cmap = matplotlib.colors.ListedColormap(['red', 'blue', 'green'])
    bounds = [0, .666, 1.333, 2]
    norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)
    # the grid is transposed so that the second feature (omegae1s) runs along
    # the x axis, matching the scatter calls below
    img = plt.imshow(np.transpose(gridclass), origin='lower',
                     interpolation='nearest', cmap=cmap,
                     extent=[0, 100, 0, 100], alpha=0.5)
    cbar = plt.colorbar(img, cmap=cmap, norm=norm, boundaries=bounds,
                        ticks=[.25, 1, 1.75])
    # tick labels are set on the colorbar's y axis
    cbar.ax.set_yticklabels(['shell', 'stream', 'unsure'])

    # overplot correct answers (radians converted to degrees)
    for label, color in (('stream', 'blue'), ('shell', 'red'),
                         ('unsure', 'green')):
        members = classifications == label
        plt.scatter(omegae1s[members] * 180. / np.pi,
                    omegals[members] * 180. / np.pi,
                    color=color, s=75, edgecolor='w')

    plt.xlim([0, 100])
    plt.ylim([0, 100])
    # raw strings: '\P' is not a valid escape sequence
    plt.xlabel(r'$\Psi_E^1$ [degrees]', size='x-large')
    plt.ylabel(r'$\Psi_L$ [degrees]', size='x-large')

    print(model)
    # Compare against the exact labels used above. The previous code compared
    # with 'shell ' (trailing space, used only to align the printout), so the
    # shell masks were always empty. '%-6s' keeps the printed alignment.
    for name in ['stream', 'shell', 'unsure']:
        com, con = completeness_contamination(predictions == name,
                                              classifications == name)
        print('%-6s completeness:%1.2f; contamination:%1.2f'
              % (name, com, con))
# Fit one linear SVC per value in C_vals and record, for each class label,
# the completeness and contamination measured on the test set.
for i, C in enumerate(C_vals):
    clf = SVC(C=C, kernel='linear', class_weight=None)
    clf.fit(X_train, y_train)

    # Predict unknown values
    y_pred = clf.predict(X_test)

    # Compute confusion matrix
    #cm = confusion_matrix(y_test, y_pred)
    #print 'Confusion matrix:'
    #print cm

    # Completeness/contamination, one column per class label
    for j, c in enumerate([1, 2, 4, 5, 6]):
        #print 'Class:', c
        compl_i, cont_i = completeness_contamination(y_pred == c,
                                                     y_test == c)
        completeness[i, j], contamination[i, j] = compl_i, cont_i

    #print 'Completeness:', completeness
    #print 'Contamination:', contamination

    # Plot matrix
    #plot_confusion_matrix(cm)
    #plt.show()

# Plot contamination, completeness
plt.subplot(211)
plt.semilogx(C_vals, completeness)
plt.xlabel('C')
plt.ylabel('Completeness')
#----------------------------------------------------------------------
# perform LDA: fit one classifier per leading subset of feature columns
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

for nc in Ncolors:
    # use only the first nc columns for both training and prediction
    clf = LDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

completeness, contamination = completeness_contamination(predictions,
                                                         y_test)

print("completeness", completeness)
print("contamination", contamination)

#------------------------------------------------------------
# Compute the decision boundary from the second classifier in the list
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71),
                     np.linspace(ylim[0], ylim[1], 81))

# the grid is passed with yy as the first feature and xx as the second
grid_points = np.c_[yy.ravel(), xx.ravel()]
Z = clf.predict_proba(grid_points)
Z = Z[:, 1].reshape(xx.shape)
# perform Naive Bayes: fit one classifier per leading subset of feature columns
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

order = np.array([1, 0, 2, 3])

for nc in Ncolors:
    # use only the first nc columns for both training and prediction
    clf = GaussianNB()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

completeness, contamination = completeness_contamination(predictions, y_test)

# Single-argument print calls emit the same text as the former
# `print "label", value` statements, and are valid on Python 2 and Python 3.
print("%s %s" % ("completeness", completeness))
print("%s %s" % ("contamination", contamination))

#------------------------------------------------------------
# Compute the decision boundary from the second classifier in the list
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 81),
                     np.linspace(ylim[0], ylim[1], 71))

# the grid is passed with yy as the first feature and xx as the second
Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])
Z = Z[:, 1].reshape(xx.shape)
}] trainRoutine = [{ "Compile": [ keras.optimizers.Adam(lr=0.03), 'mean_squared_error', ['binary_accuracy', 'categorical_accuracy'] ], "Train": [1000, None, 2] }] nets[col].trainRoutine(routineSettings, trainRoutine) predictions = np.around(nets[col].predictModel( X_test_unbalanced[:, featCols[col]]).reshape( nets[col].predictModel( X_test_unbalanced[:, featCols[col]]).shape[0], )) completeness, contamination = completeness_contamination( predictions, (y_test_unbalanced)) cont.append(contamination) comp.append(completeness) nets[col].saveModel('astro_sliceOPy_' + str(col)) # create a figure object ax = fig.add_subplot(1, 4, col + 1) nets[col].plotLearningCurve(ax, Plot_Dict={ 'loss': "Loss", 'val_loss': "Test Loss" }) # if col == 0: # ax = fig.add_subplot(1,1,1) # nets[col].contourPlot(ax) #%% plt.show()
# Add one tanh hidden layer per iteration, then a single sigmoid output unit.
for layer in range(1, depth):
    print(layer)
    layers.append(keras.layers.Dense(width, kernel_initializer='normal',
                                     activation='tanh'))
layers.append(keras.layers.Dense(1, kernel_initializer='normal',
                                 activation='sigmoid'))

model = keras.Sequential(layers)
model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=LR),
              loss='binary_crossentropy',
              metrics=['binary_accuracy', 'categorical_accuracy'])
print(X_train.shape)
history = model.fit(X_train, y_train, batch_size=BatchSize, epochs=Epochs,
                    verbose=2)

# Run inference once and reuse the result; the previous code called
# model.predict(X) a second time only to read the output's first dimension.
raw_predictions = model.predict(X)
predictions = np.around(raw_predictions.reshape(raw_predictions.shape[0],))
completeness, contamination = completeness_contamination(predictions, (y))

# scores[0] is taken as the test loss
scores = model.evaluate(X_test, y_test)
lossTest = scores[0]

# record this configuration and its results
widthD.append(width)
depthD.append(depth)
testLoss.append(lossTest)
trainLoss.append(history.history['loss'][-1])
comp.append(completeness)
cont.append(contamination)
print("completeness", completeness)
print("contamination", contamination)

loss_data = history.history['loss']
epoch_data = np.arange(0, len((loss_data)))
# iterate through and show results: one ROC curve on ax1 and one
# completeness-vs-efficiency curve on ax2 per named set of probabilities
for name, y_prob in zip(names, probs):
    fpr, tpr, thresh = roc_curve(y_test, y_prob)

    # add (0, 0) as first point
    fpr = np.concatenate(([0], fpr))
    tpr = np.concatenate(([0], tpr))

    ax1.plot(fpr, tpr, label=labels[name])

    # score the thresholded probabilities at every entry of `thresholds`
    comp = np.zeros_like(thresholds)
    cont = np.zeros_like(thresholds)
    for i, t in enumerate(thresholds):
        y_pred = y_prob >= t
        comp[i], cont[i] = completeness_contamination(y_pred, y_test)

    # efficiency is plotted as 1 - contamination
    ax2.plot(1 - cont, comp, label=labels[name])

# ROC panel
ax1.set_xlim(0, 0.04)
ax1.set_ylim(0, 1.02)
ax1.xaxis.set_major_locator(plt.MaxNLocator(5))
ax1.set_xlabel('false positive rate')
ax1.set_ylabel('true positive rate')
ax1.legend(loc=4, prop=dict(size=12))

# completeness/efficiency panel
ax2.set_xlabel('efficiency')
ax2.set_ylabel('completeness')
ax2.set_xlim(0, 1.0)
ax2.set_ylim(0.2, 1.02)

plt.show()
def classifier(m_1, m_2, M_c, s, ax_pdf, ax_data, ax_log_pdf, ax_log_data,
               output_directory):
    """Derive a dividing value in ``M_c`` from half the data and plot the results.

    A threshold is computed from the first half of ``M_c``/``s``, reported on
    stdout, and drawn as a dashed vertical line on the four supplied axes.
    Three PDF figures are then written to ``output_directory``:
    ``classifier_comparison.pdf``, ``mass-distribution.pdf`` and
    ``classifier-2D.pdf``. The last one shows a ``LinearSVC`` fitted on
    ``log10`` of the first half of ``(m_1, m_2)``.

    Parameters
    ----------
    m_1, m_2, M_c : arrays indexed with the mask ``s``
    s : array used both as a mask (``M_c[s]``, ``~s``) and compared with 1 / 0
    ax_pdf, ax_data, ax_log_pdf, ax_log_data : axes receiving the dividing line
    output_directory : directory the figures are saved into
    """
    half = len(s) // 2
    M_c_front, M_c_end = M_c[:half], M_c[half:]
    s_front, s_end = s[:half], s[half:]

    index_pos = (s == 1)
    index_neg = (s == 0)

    # extremes over the full sample, used only for the report below
    Mcem_max = max(M_c[index_pos])
    Mcnotem_min = min(M_c[index_neg])

    # Training a classifier with half the data set
    train = M_c[:len(M_c) // 2]
    index_pos_half = index_pos[:half]
    index_neg_half = index_neg[:half]

    # Dividing line: the largest positive-class value in the training half,
    # plus half its distance to the smallest negative-class value
    pos_half_max = max(train[index_pos_half])
    neg_half_min = min(train[index_neg_half])
    line_half = pos_half_max + abs(pos_half_max - neg_half_min) / 2.

    # Report whether the line separates the two classes over the full sample
    if Mcem_max < line_half < Mcnotem_min:
        print("It works")
    else:
        print("It doesn't work")
    print("The Minimum M_c for the Others is: ", Mcnotem_min)
    print("The Maximum M_c for the EM CP is: ", Mcem_max)
    print("The Dividing line trained by half the data is: ", line_half)

    for axis in (ax_pdf, ax_data, ax_log_pdf, ax_log_data):
        axis.axvline(line_half, color="black", linestyle="--")

    # Training-vs-rest comparison: the first half is drawn hollow in the
    # lower band, the second half filled in the upper band
    fig_train, ax = plt.subplots()
    groups = (
        (M_c_front, s_front, False, 0.0, 0.5, "red", "none", "s"),
        (M_c_end, s_end, False, 0.5, 1.0, "red", "red", "s"),
        (M_c_front, s_front, True, 0.0, 0.5, "blue", "none", "o"),
        (M_c_end, s_end, True, 0.5, 1.0, "blue", "blue", "o"),
    )
    for values, flags, invert, low, high, edge, face, marker in groups:
        mask = ~flags if invert else flags
        selected = values[mask]
        # random vertical position inside the band
        heights = np.random.uniform(low, high, size=np.shape(selected))
        ax.scatter(selected, heights, edgecolor=edge, facecolor=face,
                   marker=marker)
    ax.axvline(line_half, color="black", linestyle="--")
    ax.set_xlabel(r"$\mathcal{M}_c\ [M_\odot]$")
    ax.semilogx()
    ax.yaxis.set_ticklabels([])
    fig_train.savefig(path.join(output_directory,
                                "classifier_comparison.pdf"))

    # Mass plane with the curve returned by gw.m_2 for the dividing value
    fig_2d, ax_2d = plt.subplots()
    m_1_smooth = np.logspace(0, 1.3, 1000)
    ax_2d.scatter(m_1[s], m_2[s], color="red", marker="s")
    ax_2d.scatter(m_1[~s], m_2[~s], color="blue", marker="o")
    ax_2d.plot(m_1_smooth, gw.m_2(m_1_smooth, line_half), "k--")
    ax_2d.set_xlabel(r"$m_1\ [M_\odot]$")
    ax_2d.set_ylabel(r"$m_2\ [M_\odot]$")
    ax_2d.loglog()
    fig_2d.savefig(path.join(output_directory, "mass-distribution.pdf"))

    # Linear SVM on log10 of the first half; scored on that same half
    m1_m2 = np.column_stack((m_1, m_2))
    train2 = np.log10(m1_m2[:len(m1_m2) // 2])
    clf = LinearSVC(C=100, class_weight='balanced').fit(train2,
                                                        index_pos_half)
    index_pos_half_pred = clf.predict(train2)
    completeness2, contamination2 = completeness_contamination(
        index_pos_half_pred, index_pos_half)
    print("2D completeness: ", completeness2)
    print("2D contamination: ", contamination2)

    # Evaluate the SVM on a 500x500 log-spaced grid spanning the data range
    grid_m1 = np.logspace(np.log10(m_1.min()), np.log10(m_1.max()), 500,
                          endpoint=True)
    grid_m2 = np.logspace(np.log10(m_2.min()), np.log10(m_2.max()), 500,
                          endpoint=True)
    xx, yy = np.meshgrid(grid_m1, grid_m2)
    Z = clf.predict(np.log10(np.c_[xx.ravel(), yy.ravel()]))
    Z = Z.reshape(xx.shape)
    print(np.unique(s))

    fig2d, ax2d = plt.subplots()
    ax2d.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8,
                  antialiased=False, extend='neither')
    ax2d.scatter(m_1, m_2, c=s, cmap=plt.cm.Paired)
    ax2d.set_xlabel('m$_1$')
    ax2d.set_ylabel('m$_2$')
    ax2d.loglog()
    ax2d.set_xlim(m_1.min(), m_1.max())
    ax2d.set_ylim(m_2.min(), m_2.max())
    fig2d.savefig(path.join(output_directory, "classifier-2D.pdf"))
def classifier(m_1, m_2, M_c, s, ax_pdf, ax_data, ax_log_pdf, ax_log_data,
               output_directory):
    """Compute a one-dimensional cut in ``M_c`` and a 2-D linear SVM, and plot both.

    The cut is derived from the first half of the sample, printed together
    with the class extremes of the full sample, and marked on the four axes
    passed in. The function saves ``classifier_comparison.pdf``,
    ``mass-distribution.pdf`` and ``classifier-2D.pdf`` under
    ``output_directory``.

    Parameters
    ----------
    m_1, m_2, M_c : arrays indexed with the mask ``s``
    s : array used both as a mask (``M_c[s]``, ``~s``) and compared with 1 / 0
    ax_pdf, ax_data, ax_log_pdf, ax_log_data : axes receiving the dividing line
    output_directory : directory the figures are saved into
    """
    split = len(s) // 2
    M_c_front = M_c[:split]
    M_c_end = M_c[split:]
    s_front = s[:split]
    s_end = s[split:]

    index_pos = (s == 1)
    index_neg = (s == 0)
    M_c_em = M_c[index_pos]
    M_c_not_em = M_c[index_neg]
    Mcem_max = max(M_c_em)
    Mcnotem_min = min(M_c_not_em)

    # Training a classifier with half the data set
    train = M_c[:len(M_c) // 2]
    index_pos_half = index_pos[:split]
    index_neg_half = index_neg[:split]

    # Calculating the dividing line (using half the data): upper edge of the
    # positive class plus half the gap to the lower edge of the negative class
    upper_edge_pos = max(train[index_pos_half])
    lower_edge_neg = min(train[index_neg_half])
    half_gap = abs(upper_edge_pos - lower_edge_neg) / 2.
    line_half = upper_edge_pos + half_gap

    # Print the max of the positive class and min of the other class, along
    # with the dividing line and whether it lies strictly between them
    separates = Mcem_max < line_half < Mcnotem_min
    print("It works" if separates else "It doesn't work")
    print("The Minimum M_c for the Others is: ", Mcnotem_min)
    print("The Maximum M_c for the EM CP is: ", Mcem_max)
    print("The Dividing line trained by half the data is: ", line_half)

    for target in [ax_pdf, ax_data, ax_log_pdf, ax_log_data]:
        target.axvline(line_half, color="black", linestyle="--")

    # Scatter both halves with a random vertical offset: the first half in
    # [0, 0.5) with hollow markers, the second half in [0.5, 1) filled
    fig_train, ax = plt.subplots()
    front_pos = M_c_front[s_front]
    ax.scatter(front_pos,
               np.random.uniform(0.0, 0.5, size=np.shape(front_pos)),
               edgecolor="red", facecolor="none", marker="s")
    end_pos = M_c_end[s_end]
    ax.scatter(end_pos,
               np.random.uniform(0.5, 1.0, size=np.shape(end_pos)),
               edgecolor="red", facecolor="red", marker="s")
    front_neg = M_c_front[~s_front]
    ax.scatter(front_neg,
               np.random.uniform(0.0, 0.5, size=np.shape(front_neg)),
               edgecolor="blue", facecolor="none", marker="o")
    end_neg = M_c_end[~s_end]
    ax.scatter(end_neg,
               np.random.uniform(0.5, 1.0, size=np.shape(end_neg)),
               edgecolor="blue", facecolor="blue", marker="o")
    ax.axvline(line_half, color="black", linestyle="--")
    ax.set_xlabel(r"$\mathcal{M}_c\ [M_\odot]$")
    ax.semilogx()
    ax.yaxis.set_ticklabels([])
    comparison_file = path.join(output_directory,
                                "classifier_comparison.pdf")
    fig_train.savefig(comparison_file)

    # m_1 / m_2 plane with the curve gw.m_2 returns for the dividing value
    fig_2d, ax_2d = plt.subplots()
    m_1_smooth = np.logspace(0, 1.3, 1000)
    ax_2d.scatter(m_1[s], m_2[s], color="red", marker="s")
    ax_2d.scatter(m_1[~s], m_2[~s], color="blue", marker="o")
    ax_2d.plot(m_1_smooth, gw.m_2(m_1_smooth, line_half), "k--")
    ax_2d.set_xlabel(r"$m_1\ [M_\odot]$")
    ax_2d.set_ylabel(r"$m_2\ [M_\odot]$")
    ax_2d.loglog()
    fig_2d.savefig(path.join(output_directory, "mass-distribution.pdf"))

    # Fit a linear SVM on log10(m_1, m_2) of the first half and score it on
    # the same rows it was trained on
    m1_m2 = np.column_stack((m_1, m_2))
    train2 = np.log10(m1_m2[:len(m1_m2) // 2])
    svm = LinearSVC(C=100, class_weight='balanced')
    clf = svm.fit(train2, index_pos_half)
    index_pos_half_pred = clf.predict(train2)
    completeness2, contamination2 = completeness_contamination(
        index_pos_half_pred, index_pos_half)
    print("2D completeness: ", completeness2)
    print("2D contamination: ", contamination2)

    # Predict on a log-spaced 500x500 grid covering the observed mass ranges
    xx, yy = np.meshgrid(
        np.logspace(np.log10(m_1.min()), np.log10(m_1.max()), 500,
                    endpoint=True),
        np.logspace(np.log10(m_2.min()), np.log10(m_2.max()), 500,
                    endpoint=True))
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(np.log10(grid)).reshape(xx.shape)
    print(np.unique(s))

    fig2d, ax2d = plt.subplots()
    ax2d.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8,
                  antialiased=False, extend='neither')
    ax2d.scatter(m_1, m_2, c=s, cmap=plt.cm.Paired)
    ax2d.set_xlabel('m$_1$')
    ax2d.set_ylabel('m$_2$')
    ax2d.loglog()
    ax2d.set_xlim(m_1.min(), m_1.max())
    ax2d.set_ylim(m_2.min(), m_2.max())
    fig2d.savefig(path.join(output_directory, "classifier-2D.pdf"))
# iterate through and show results
for name, y_prob in zip(names, probs):
    # ROC curve for this set of probabilities
    fpr, tpr, thresh = roc_curve(y_test, y_prob)

    # add (0, 0) as first point
    fpr = np.concatenate(([0], fpr))
    tpr = np.concatenate(([0], tpr))
    ax1.plot(fpr, tpr, label=labels[name])

    # completeness and contamination at each probability threshold
    comp = np.zeros_like(thresholds)
    cont = np.zeros_like(thresholds)
    for i, t in enumerate(thresholds):
        y_pred = y_prob >= t
        scores = completeness_contamination(y_pred, y_test)
        comp[i], cont[i] = scores

    # x axis of the second panel is 1 - contamination
    ax2.plot(1 - cont, comp, label=labels[name])

# axis limits, labels and legend for the ROC panel
ax1.set_xlim(0, 0.04)
ax1.set_ylim(0, 1.02)
ax1.xaxis.set_major_locator(plt.MaxNLocator(5))
ax1.set_xlabel('false positive rate')
ax1.set_ylabel('true positive rate')
ax1.legend(loc=4)

# axis limits and labels for the completeness panel
ax2.set_xlabel('efficiency')
ax2.set_ylabel('completeness')
ax2.set_xlim(0, 1.0)
ax2.set_ylim(0.2, 1.02)

plt.show()
# Load the galaxy table through the project parser.
galaxy_parser = Classify_Galaxies_Parser.Galaxy_Parser(
    'Galaxies_hands_on_Chap9_larger.txt', precondition=True,
    replaceMean=True, trainfrac=0.8)

# Single-argument print calls emit the same text as the former Python 2
# print statements, and are valid on both Python 2 and Python 3.
print(galaxy_parser.data_test.shape)
print(galaxy_parser.data_train.shape)
print(galaxy_parser.datanames)
print('%s %s' % ('Num ellipticals: ', np.sum(galaxy_parser.labels_test)))
print('%s %s' % ('Num non-ellipticals: ',
                 np.sum(1. - galaxy_parser.labels_test)))

# Create and fit SVM classifier for different c (20 values from 1 to 10,
# evenly spaced in log10)
cs = 10**np.linspace(0., 1, 20)
contaminations = np.zeros_like(cs)
completenesses = np.zeros_like(cs)
for i, C in enumerate(cs):
    svm = LinearSVC(loss='squared_hinge', C=C)
    svm.fit(galaxy_parser.data_train, galaxy_parser.labels_train)

    # Evaluate SVM classifier on the test split
    predicted_labels = svm.predict(galaxy_parser.data_test)
    completeness, contamination = completeness_contamination(
        predicted_labels, galaxy_parser.labels_test)
    contaminations[i] = contamination
    completenesses[i] = completeness

# Both scores against C on a logarithmic x axis
pl.semilogx(cs, contaminations, '*-', label='contamination')
pl.semilogx(cs, completenesses, '*-', label='completeness')
pl.legend(loc='best')
pl.xlabel('C')
pl.show()
# Sweep the values in `trees`, fitting one random forest per value and keeping
# the predictions of the one with the lowest test error.
# NOTE(review): rms_test is written below but not created here; it is
# presumably allocated just above this fragment — confirm.
rms_train = np.zeros(len(trees))
i_best = 0
label_fit_best = None
completeness = []
contamination = []

for i, t in enumerate(trees):
    # NOTE(review): t is passed as the first positional argument, which in
    # scikit-learn is n_estimators, while the message printed below says
    # "Depth of tree" — confirm which was intended.
    clf = RandomForestClassifier(t)
    clf.fit(data_train, labels_train)

    label_fit_train = clf.predict(data_train)
    label_fit = clf.predict(data_test)

    # Root-mean-square error: square, average, then take the root. The old
    # np.mean(np.sqrt(d ** 2)) took the root per element, which is the mean
    # absolute error rather than the RMS the variable names describe.
    rms_train[i] = np.sqrt(np.mean((label_fit_train - labels_train) ** 2))
    rms_test[i] = np.sqrt(np.mean((label_fit - labels_test) ** 2))

    tmp_completeness, tmp_contamination = completeness_contamination(
        label_fit, labels_test)
    contamination.append(tmp_contamination)
    completeness.append(tmp_completeness)

    # "<=" means a later entry wins ties
    if rms_test[i] <= rms_test[i_best]:
        i_best = i
        label_fit_best = label_fit

best_tree = trees[i_best]

# Single-argument print calls emit the same layout as the former Python 2
# print statements, and are valid on both Python 2 and Python 3.
print("%s %s" % ("Depth of tree", best_tree))
# np.mean avoids the integer floor division that made
# sum(labels_train/len(labels_train)) collapse to 0 for integer labels on
# Python 2.
print("%s %s" % ("Fraction of stars: ", np.mean(labels_train)))

plt.figure(figsize=(7, 7))
plt.title('Random Forest', fontsize=15)
plt.plot(trees, completeness, color='teal', label='Completeness',
         linewidth=4)