Code Example #1
import numpy as np
from numpy.testing import assert_allclose

from astroML.utils import completeness_contamination


def test_completeness_contamination():
    completeness, contamination = completeness_contamination(
        np.ones(100), np.ones(100))

    assert_allclose(completeness, 1)
    assert_allclose(contamination, 0)

    completeness, contamination = completeness_contamination(
        np.zeros(100), np.zeros(100))

    assert_allclose(completeness, 0)
    assert_allclose(contamination, 0)

    completeness, contamination = completeness_contamination(
        np.concatenate((np.ones(50), np.zeros(50))),
        np.concatenate((np.ones(25), np.zeros(50), np.ones(25))))

    assert_allclose(completeness, 0.5)
    assert_allclose(contamination, 0.5)
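For reference, here is a minimal sketch of the quantity these tests exercise, assuming the standard definitions used throughout these examples (completeness = TP / (TP + FN), contamination = FP / (TP + FP)). The real `completeness_contamination` in `astroML.utils` also broadcasts over multi-dimensional inputs, so this simplified version is illustrative only:

import numpy as np

def completeness_contamination_sketch(predicted, true):
    # hedged, simplified re-implementation; not the astroML source
    predicted = np.asarray(predicted).astype(bool)
    true = np.asarray(true).astype(bool)
    tp = np.sum(predicted & true)    # true positives
    fp = np.sum(predicted & ~true)   # false positives
    fn = np.sum(~predicted & true)   # false negatives
    completeness = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    contamination = fp / (tp + fp) if (tp + fp) > 0 else 0.0
    return completeness, contamination

This reproduces the assertions above: all-ones inputs give (1, 0), all-zeros give (0, 0), and the half-overlapping case gives (0.5, 0.5).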
Code Example #2
File: ch9.py Project: afcarl/soma
def do_stuff(model):
    # train the classifier on the training subset
    model.fit(np.transpose([omegals[trainers], omegae1s[trainers]]),
              classifications[trainers])

    # predict classifications for all points
    predictions = model.predict(np.transpose([omegals, omegae1s]))

    # predict the classification probabilities on a grid
    xlim = (0, 100 * np.pi / 180.)
    ylim = (0, 100 * np.pi / 180.)
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 1000),
                         np.linspace(ylim[0], ylim[1], 1000))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])

    # color the grid by the most probable class
    gridclass = np.zeros(Z[:, 1].shape)
    gridclass[(Z[:, 0] > Z[:, 1]) & (Z[:, 0] > Z[:, 2])] = 0
    gridclass[(Z[:, 1] > Z[:, 2]) & (Z[:, 1] > Z[:, 0])] = 1
    gridclass[(Z[:, 2] > Z[:, 0]) & (Z[:, 2] > Z[:, 1])] = 2
    gridclass = gridclass.reshape(xx.shape)

    plt.clf()
    cmap = matplotlib.colors.ListedColormap(['red', 'blue', 'green'])
    bounds = [0, .666, 1.333, 2]
    norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)
    img = plt.imshow(np.transpose(gridclass), origin='lower',
                     interpolation='nearest', cmap=cmap,
                     extent=[0, 100, 0, 100], alpha=0.5)
    cbar = plt.colorbar(img, cmap=cmap, norm=norm, boundaries=bounds,
                        ticks=[.25, 1, 1.75])
    cbar.ax.set_yticklabels(['shell', 'stream', 'unsure'])

    # overplot the true classifications
    plt.scatter(omegae1s[classifications == 'stream'] * 180. / np.pi,
                omegals[classifications == 'stream'] * 180. / np.pi,
                color='blue', s=75, edgecolor='w')
    plt.scatter(omegae1s[classifications == 'shell'] * 180. / np.pi,
                omegals[classifications == 'shell'] * 180. / np.pi,
                color='red', s=75, edgecolor='w')
    plt.scatter(omegae1s[classifications == 'unsure'] * 180. / np.pi,
                omegals[classifications == 'unsure'] * 180. / np.pi,
                color='green', s=75, edgecolor='w')
    plt.xlim([0, 100])
    plt.ylim([0, 100])
    plt.xlabel(r'$\Psi_E^1$ [degrees]', size='x-large')
    plt.ylabel(r'$\Psi_L$ [degrees]', size='x-large')

    print(model)
    for name in ['stream', 'shell', 'unsure']:
        com, con = completeness_contamination(predictions == name,
                                              classifications == name)
        print(name + ' completeness:%1.2f; contamination:%1.2f' % (com, con))
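`do_stuff` assumes `omegals`, `omegae1s`, `classifications`, and `trainers` are defined in the enclosing script; a hypothetical call (not part of the original file) works with any scikit-learn-style classifier that provides `fit`, `predict`, and `predict_proba`:

from sklearn.naive_bayes import GaussianNB

do_stuff(GaussianNB())  # any classifier exposing fit/predict/predict_proba fits here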
Code Example #3
# completeness and contamination are assumed preallocated earlier in the
# script with shape (len(C_vals), 5)
for i, C in enumerate(C_vals):
    clf = SVC(kernel='linear', class_weight=None, C=C)
    clf.fit(X_train, y_train)

    # Predict unknown values
    y_pred = clf.predict(X_test)

    # Compute confusion matrix
    #cm = confusion_matrix(y_test, y_pred)
    #print('Confusion matrix:')
    #print(cm)

    # Completeness/contamination
    for j, c in enumerate([1, 2, 4, 5, 6]):
        #print('Class:', c)
        compl_i, cont_i = completeness_contamination(y_pred == c,
                                                     y_test == c)
        completeness[i, j] = compl_i
        contamination[i, j] = cont_i
        #print('Completeness:', completeness)
        #print('Contamination:', contamination)

    # Plot matrix
    #plot_confusion_matrix(cm)
    #plt.show()


# Plot contamination, completeness
plt.subplot(211)
plt.semilogx(C_vals, completeness)
plt.xlabel('C')
plt.ylabel('Completeness')
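The excerpt ends after the completeness panel; a plausible continuation (an assumption, following the `subplot(211)` layout above) plots contamination in the lower panel:

plt.subplot(212)
plt.semilogx(C_vals, contamination)
plt.xlabel('C')
plt.ylabel('Contamination')
plt.show()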
Code Example #4
#----------------------------------------------------------------------
# perform LDA
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

for nc in Ncolors:
    clf = LDA()  # sklearn's LinearDiscriminantAnalysis, imported as LDA
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

completeness, contamination = completeness_contamination(predictions, y_test)

print("completeness", completeness)
print("contamination", contamination)

#------------------------------------------------------------
# Compute the decision boundary
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71),
                     np.linspace(ylim[0], ylim[1], 81))

# note the column order (yy first): it must match the feature order
# the classifier was trained on
Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])
Z = Z[:, 1].reshape(xx.shape)
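The excerpt computes the class-probability grid `Z` but stops before drawing it; one common way to show the resulting decision boundary (a sketch, not the original figure code) is a contour at p = 0.5:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.contour(xx, yy, Z, [0.5], colors='k')  # boundary where P(class 1) = 0.5
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.show()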
Code Example #5
# perform Naive Bayes
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

order = np.array([1, 0, 2, 3])

for nc in Ncolors:
    clf = GaussianNB()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

completeness, contamination = completeness_contamination(predictions, y_test)

print "completeness", completeness
print "contamination", contamination

#------------------------------------------------------------
# Compute the decision boundary
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 81),
                     np.linspace(ylim[0], ylim[1], 71))

Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])
Z = Z[:, 1].reshape(xx.shape)
Code Example #6
    }]

    trainRoutine = [{
        "Compile": [
            keras.optimizers.Adam(lr=0.03), 'mean_squared_error',
            ['binary_accuracy', 'categorical_accuracy']
        ],
        "Train": [1000, None, 2]
    }]

    nets[col].trainRoutine(routineSettings, trainRoutine)
    # predict once, then round the outputs to 0/1 labels
    raw_pred = nets[col].predictModel(X_test_unbalanced[:, featCols[col]])
    predictions = np.around(raw_pred.reshape(raw_pred.shape[0], ))
    completeness, contamination = completeness_contamination(
        predictions, (y_test_unbalanced))
    cont.append(contamination)
    comp.append(completeness)
    nets[col].saveModel('astro_sliceOPy_' + str(col))
    # add this model's panel to the (assumed pre-created) 1x4 figure
    ax = fig.add_subplot(1, 4, col + 1)
    nets[col].plotLearningCurve(ax,
                                Plot_Dict={
                                    'loss': "Loss",
                                    'val_loss': "Test Loss"
                                })
#    if col == 0:
#        ax = fig.add_subplot(1,1,1)
#        nets[col].contourPlot(ax)
#%%
plt.show()
Code Example #7
 	
    # presumably executed inside a loop over (width, depth) hyperparameters,
    # accumulating results in widthD/depthD/testLoss/trainLoss/comp/cont
    for layer in range(1, depth):
        print(layer)
        layers.append(keras.layers.Dense(width, kernel_initializer='normal',
                                         activation='tanh'))

    layers.append(keras.layers.Dense(1, kernel_initializer='normal',
                                     activation='sigmoid'))

    model = keras.Sequential(layers)

    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=LR),
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy', 'categorical_accuracy'])
    print(X_train.shape)
    history = model.fit(X_train, y_train, batch_size=BatchSize,
                        epochs=Epochs, verbose=2)

    # predict once, then round the sigmoid outputs to 0/1 labels
    raw_pred = model.predict(X)
    predictions = np.around(raw_pred.reshape(raw_pred.shape[0], ))

    completeness, contamination = completeness_contamination(predictions, y)

    scores = model.evaluate(X_test, y_test)
    lossTest = scores[0]

    widthD.append(width)
    depthD.append(depth)
    testLoss.append(lossTest)
    trainLoss.append(history.history['loss'][-1])
    comp.append(completeness)
    cont.append(contamination)
    print("completeness", completeness)
    print("contamination", contamination)
    loss_data = history.history['loss']
    epoch_data = np.arange(0, len(loss_data))
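`loss_data` and `epoch_data` are prepared but never plotted within the excerpt; a plausible continuation (an assumption, not part of the original file) draws the learning curve:

import matplotlib.pyplot as plt

plt.plot(epoch_data, loss_data)
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.show()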
Code Example #8
File: fig_ROC_curve.py Project: MQQ/astroML
# iterate through and show results
for name, y_prob in zip(names, probs):
    fpr, tpr, thresh = roc_curve(y_test, y_prob)

    # add (0, 0) as first point
    fpr = np.concatenate([[0], fpr])
    tpr = np.concatenate([[0], tpr])

    ax1.plot(fpr, tpr, label=labels[name])

    # sweep completeness/contamination over a threshold grid; `thresholds`
    # is defined earlier in the full script (see the note after this example)
    comp = np.zeros_like(thresholds)
    cont = np.zeros_like(thresholds)
    for i, t in enumerate(thresholds):
        y_pred = (y_prob >= t)
        comp[i], cont[i] = completeness_contamination(y_pred, y_test)
    ax2.plot(1 - cont, comp, label=labels[name])

ax1.set_xlim(0, 0.04)
ax1.set_ylim(0, 1.02)
ax1.xaxis.set_major_locator(plt.MaxNLocator(5))
ax1.set_xlabel('false positive rate')
ax1.set_ylabel('true positive rate')
ax1.legend(loc=4, prop=dict(size=12))

ax2.set_xlabel('efficiency')
ax2.set_ylabel('completeness')
ax2.set_xlim(0, 1.0)
ax2.set_ylim(0.2, 1.02)

plt.show()
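`thresholds` is not defined inside the excerpt (note that the `thresh` returned by `roc_curve` is unused); the sweep relies on a uniform grid over [0, 1) defined before the loop in the full script. A plausible definition, offered here as an assumption consistent with the loop above:

thresholds = np.linspace(0, 1, 1001)[:-1]  # assumed threshold grid for the sweep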
Code Example #9
def classifier(m_1, m_2, M_c, s, ax_pdf, ax_data, ax_log_pdf, ax_log_data,
               output_directory):
    M_c_front = M_c[:len(s) // 2]
    M_c_end = M_c[len(s) // 2:]
    s_front = s[:len(s) // 2]
    s_end = s[len(s) // 2:]

    index_pos = (s == 1)
    index_neg = (s == 0)
    M_c_em = M_c[index_pos]
    M_c_not_em = M_c[index_neg]
    Mcem_max = max(M_c_em)
    Mcnotem_min = min(M_c_not_em)

    #Training a classifier with half the data set
    train = M_c[:len(M_c) // 2]

    index_pos_half = index_pos[:len(s) // 2]
    index_neg_half = index_neg[:len(s) // 2]
    Mcem_half = train[index_pos_half]
    Mcnotem_half = train[index_neg_half]

    # Calculate the dividing line (using half the data): halfway between the
    # maximum EM-bright M_c and the minimum M_c of the other class
    Mcem_half_max = max(Mcem_half)
    Mcnotem_half_min = min(Mcnotem_half)
    distance = abs(Mcem_half_max - Mcnotem_half_min) / 2.
    line_half = Mcem_half_max + distance

    #Print the Max Mc of EM CP and Min of other along with the dividing line Mc
    print("It works"
          if Mcem_max < line_half < Mcnotem_min else "It doesn't work")
    print("The Minimum M_c for the Others is: ", Mcnotem_min)
    print("The Maximum M_c for the EM CP is: ", Mcem_max)
    print("The Dividing line trained by half the data is: ", line_half)

    for ax in [ax_pdf, ax_data, ax_log_pdf, ax_log_data]:
        ax.axvline(line_half, color="black", linestyle="--")

    fig_train, ax = plt.subplots()

    ax.scatter(M_c_front[s_front],
               np.random.uniform(0.0, 0.5, size=np.shape(M_c_front[s_front])),
               edgecolor="red",
               facecolor="none",
               marker="s")
    ax.scatter(M_c_end[s_end],
               np.random.uniform(0.5, 1.0, size=np.shape(M_c_end[s_end])),
               edgecolor="red",
               facecolor="red",
               marker="s")
    ax.scatter(M_c_front[~s_front],
               np.random.uniform(0.0, 0.5, size=np.shape(M_c_front[~s_front])),
               edgecolor="blue",
               facecolor="none",
               marker="o")
    ax.scatter(M_c_end[~s_end],
               np.random.uniform(0.5, 1.0, size=np.shape(M_c_end[~s_end])),
               edgecolor="blue",
               facecolor="blue",
               marker="o")

    ax.axvline(line_half, color="black", linestyle="--")

    ax.set_xlabel(r"$\mathcal{M}_c\ [M_\odot]$")

    ax.semilogx()
    ax.yaxis.set_ticklabels([])

    fig_train.savefig(path.join(output_directory, "classifier_comparison.pdf"))

    fig_2d, ax_2d = plt.subplots()

    m_1_smooth = np.logspace(0, 1.3, 1000)

    ax_2d.scatter(m_1[s], m_2[s], color="red", marker="s")
    ax_2d.scatter(m_1[~s], m_2[~s], color="blue", marker="o")

    ax_2d.plot(m_1_smooth, gw.m_2(m_1_smooth, line_half), "k--")

    ax_2d.set_xlabel(r"$m_1\ [M_\odot]$")
    ax_2d.set_ylabel(r"$m_2\ [M_\odot]$")

    ax_2d.loglog()

    fig_2d.savefig(path.join(output_directory, "mass-distribution.pdf"))

    m1_m2 = np.column_stack((m_1, m_2))
    train2 = np.log10(m1_m2[:len(m1_m2) // 2])
    clf = LinearSVC(C=100, class_weight='balanced').fit(train2, index_pos_half)
    index_pos_half_pred = clf.predict(train2)
    completeness2, contamination2 = completeness_contamination(
        index_pos_half_pred, index_pos_half)
    print("2D completeness: ", completeness2)
    print("2D contamination: ", contamination2)

    xx, yy = np.meshgrid(
        np.logspace(np.log10(m_1.min()),
                    np.log10(m_1.max()),
                    500,
                    endpoint=True),
        np.logspace(np.log10(m_2.min()),
                    np.log10(m_2.max()),
                    500,
                    endpoint=True))

    Z = clf.predict(np.log10(np.c_[xx.ravel(), yy.ravel()]))

    Z = Z.reshape(xx.shape)
    print(np.unique(s))
    fig2d, ax2d = plt.subplots()
    ax2d.contourf(xx,
                  yy,
                  Z,
                  cmap=plt.cm.Paired,
                  alpha=0.8,
                  antialiased=False,
                  extend='neither')
    ax2d.scatter(m_1, m_2, c=s, cmap=plt.cm.Paired)
    ax2d.set_xlabel('m$_1$')
    ax2d.set_ylabel('m$_2$')
    ax2d.loglog()
    ax2d.set_xlim(m_1.min(), m_1.max())
    ax2d.set_ylim(m_2.min(), m_2.max())

    fig2d.savefig(path.join(output_directory, "classifier-2D.pdf"))
Code Example #10
galaxy_parser = Classify_Galaxies_Parser.Galaxy_Parser('Galaxies_hands_on_Chap9_larger.txt', 
    precondition=True, replaceMean=True, trainfrac=0.8)

print(galaxy_parser.data_test.shape)
print(galaxy_parser.data_train.shape)
print(galaxy_parser.datanames)
print('Num ellipticals: ', np.sum(galaxy_parser.labels_test))
print('Num non-ellipticals: ', np.sum(1. - galaxy_parser.labels_test))

# Create and fit SVM classifier for different c
cs = 10**np.linspace(0., 1, 20)
contaminations = np.zeros_like(cs)
completenesses = np.zeros_like(cs)
for i, C in enumerate(cs):
    svm = LinearSVC(loss='squared_hinge', C=C)
    svm.fit(galaxy_parser.data_train, galaxy_parser.labels_train)

    # Evaluate SVM classifier
    predicted_labels = svm.predict(galaxy_parser.data_test)
    completeness, contamination = completeness_contamination(predicted_labels,
         galaxy_parser.labels_test)
    contaminations[i] = contamination
    completenesses[i] = completeness

pl.semilogx(cs, contaminations, '*-', label='contamination')
pl.semilogx(cs, completenesses, '*-', label='completeness')
pl.legend(loc='best')
pl.xlabel('C')


pl.show()
Code Example #11
    rms_train = np.zeros(len(trees))
    rms_test = np.zeros(len(trees))  # used below; assumed preallocated like rms_train
    i_best = 0
    label_fit_best = None

    completeness = []
    contamination = []
    for i, t in enumerate(trees):
        clf = RandomForestClassifier(t)
        clf.fit(data_train, labels_train)

        label_fit_train = clf.predict(data_train)
        label_fit = clf.predict(data_test)
        # for 0/1 labels these reduce to the mean misclassification rate
        rms_train[i] = np.mean(np.sqrt((label_fit_train - labels_train) ** 2))
        rms_test[i] = np.mean(np.sqrt((label_fit - labels_test) ** 2))

        tmp_completeness, tmp_contamination = completeness_contamination(label_fit, labels_test)
        contamination.append(tmp_contamination)
        completeness.append(tmp_completeness)

        if rms_test[i] <= rms_test[i_best]:
            i_best = i
            label_fit_best = label_fit

    best_tree = trees[i_best]
    print("Number of trees:", best_tree)
    print("Fraction of stars:", np.sum(labels_train) / len(labels_train))


    plt.figure(figsize = (7,7))
    plt.title('Random Forest', fontsize=15)
    plt.plot(trees, completeness, color='teal', label='Completeness', linewidth=4)