completeness, contamination = completeness_contamination(predictions, y_test) print "completeness", completeness print "contamination", contamination #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(8, 4)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto',
# Number of cancer samples, computed exactly as the original expression did
# (size / ndim).  NOTE(review): this equals the row count only when
# cancer_pt is 2-D with two columns -- confirm against the data loader.
cancer_count = cancer_pt.size // cancer_pt.ndim
# float() forces true division; NumPy integer / int floor-divides under
# Python 2, which would collapse the ratio to 0 or 1.
sensitivity = truepos_n / float(cancer_count)


# In[51]:

# Generate grids for the entire plot; only the y extent depends on inRedox.
y_extent = yaxis_range_rdx if inRedox else yaxis_range
xx, yy = np.meshgrid(np.linspace(0, xaxis_range, grid_resol),
                     np.linspace(0, y_extent, grid_resol))
plot_grid = np.c_[xx.ravel(), yy.ravel()]

# Calculate the prediction probability (column 1 of predict_proba) for each
# point on the grid, reshaped back to the grid layout for contouring.
grid_z = clf.predict_proba(plot_grid)[:, 1].reshape(xx.shape)


# In[99]:

xx  # notebook cell output; has no effect when run as a plain script


# In[95]:

plt.figure()
# The 0.5 probability level is the decision boundary.
plt.contour(xx, yy, grid_z, [0.5], linewidths=2., colors='k')
# Floor division keeps the legend count an integer ("N =10", not "N =10.0").
plt.scatter(X, Y, c='r', marker='^',
            label='Cancer (N =' + str(cancer_pt.size // 2) + ')')
completeness, contamination = completeness_contamination( predictions, y_test) print "completeness", completeness print "contamination", contamination #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1],
def crossValidate(itr):
    """Run one random hold-out split, plot it, and return the test metrics.

    A QDA classifier is fitted on the randomly selected training rows,
    evaluated on the remaining rows, and the 0.5-probability contour is drawn
    together with the training and test points.  The figure is saved as
    "cv<itr>.jpg" in the current working directory.

    Relies on module-level state: randTestData, t_data_perc, norDataNum,
    cnDataNum, train_data, labels, normal_pt, cancer_pt, x1, y1, x2, y2,
    inRedox, xaxis_range, yaxis_range, yaxis_range_rdx, grid_resol,
    feature_x and feature_y.

    Args:
        itr: Iteration identifier; only used to build the output file name.

    Returns:
        Tuple (specificity, sensitivity): the fraction of held-out normal
        points predicted as 0 and of held-out cancer points predicted as 1.
    """
    # presumably randTestData returns (training count, boolean mask) --
    # verify against its definition.
    norTrainNum, nor_isTraining = randTestData(t_data_perc, norDataNum)
    cnTrainNum, cn_isTraining = randTestData(t_data_perc, cnDataNum)
    # NOTE(review): assumes train_data/labels list the normal rows first and
    # the cancer rows second, matching this concatenation order -- confirm.
    isTraining = np.hstack((nor_isTraining, cn_isTraining))

    # Complement masks and sizes of the held-out sets.
    nor_isTest = np.logical_not(nor_isTraining)
    cn_isTest = np.logical_not(cn_isTraining)
    norTestNum = norDataNum - norTrainNum
    cnTestNum = cnDataNum - cnTrainNum

    # Training
    clf = QDA()
    trained_clf = clf.fit(train_data[isTraining], labels[isTraining])

    # Using the remaining data for testing.  float() forces true division:
    # NumPy integer / int floor-divides under Python 2, which would collapse
    # both metrics to 0 or 1.
    normal_pred = trained_clf.predict(normal_pt[nor_isTest])
    trueneg_n = (normal_pred == 0).sum()
    specificity = trueneg_n / float(norTestNum)

    cancer_pred = trained_clf.predict(cancer_pt[cn_isTest])
    truepos_n = (cancer_pred == 1).sum()
    sensitivity = truepos_n / float(cnTestNum)

    # Generate grids for the entire plot; only the y extent depends on inRedox.
    y_extent = yaxis_range_rdx if inRedox else yaxis_range
    xx, yy = np.meshgrid(
        np.linspace(0, xaxis_range, grid_resol),
        np.linspace(0, y_extent, grid_resol),
    )
    plot_grid = np.c_[xx.ravel(), yy.ravel()]

    # Calculate the prediction probability for each point on the grid
    grid_z = trained_clf.predict_proba(plot_grid)[:, 1].reshape(xx.shape)

    # NOTE(review): the figure is left open after saving, so repeated calls
    # accumulate figures; close it at the call site if that becomes a problem.
    plt.figure()
    # The 0.5 probability level is the decision boundary.
    plt.contour(xx, yy, grid_z, [0.5], linewidths=2.0, colors="k")
    # Triangles: held-out points.  Circles: training points.
    plt.scatter(
        x1[cn_isTest],
        y1[cn_isTest],
        c="r",
        marker="^",
        label="Cancer (N =" + str(cnTestNum) + ")",
    )
    plt.scatter(
        x2[nor_isTest],
        y2[nor_isTest],
        c="g",
        marker="^",
        label="Normal(N =" + str(norTestNum) + ")",
    )
    plt.scatter(
        x1[cn_isTraining],
        y1[cn_isTraining],
        c="r",
        marker="o",
        label="Trn_Cancer (N =" + str(cnTrainNum) + ")",
    )
    plt.scatter(
        x2[nor_isTraining],
        y2[nor_isTraining],
        c="g",
        marker="o",
        label="Trn_Normal(N =" + str(norTrainNum) + ")",
    )
    plt.axis("tight")
    plt.xlabel(feature_x, fontsize="large")
    plt.ylabel(feature_y, fontsize="large")
    plt.legend()
    plt.suptitle(feature_x + " vs. \n" + feature_y, fontsize=16)
    plt.title(
        "Specificity: "
        + "{0:.3f}".format(specificity)
        + " ; "
        + "Sensitivity:"
        + "{0:.3f}".format(sensitivity),
        fontsize=12,
    )
    plt.savefig("cv" + str(itr) + ".jpg")
    return specificity, sensitivity
X_test = pd.concat(test) all_ids.append(np.concatenate(idx)) X_test = X_test.drop(['id'], axis=1) X_test = np.asarray(X_test.astype(float)) current_prediction_lda = np.empty((X_test.shape[0], 6)) # number of test samples X number of labels current_prediction_lr = np.empty((X_test.shape[0], 6)) # number of test samples X number of labels current_prediction_qda = np.empty((X_test.shape[0], 6)) # number of test samples X number of labels X_test = data_preprocess_test(X_test) for i in range(6): print 'testing subject_id=',subject_id current_prediction_lr[:,i] = lr.predict_proba(X_test)[:,1] current_prediction_qda[:,i] = qda.predict_proba(X_test)[:,1] current_prediction_lda[:,i] = lda.predict_proba(X_test)[:,1] # print 'predicted:',current_prediction[:,i] all_predictions_lda.append(current_prediction_lda) all_predictions_qda.append(current_prediction_qda) all_predictions_lr.append(current_prediction_lr) all_predictions_avg.append( (current_prediction_lda+current_prediction_qda+current_prediction_lr)/3 ) print 'testing complete' print 'ids ',np.concatenate(all_ids).shape print 'predictions ',np.concatenate(all_predictions_avg).shape
remove = remove.union(redundant) print("For correlation coefficient = ", coefficient) #print(remove) #print(add) train_data = pd.DataFrame(data=train_data_g, columns = df.columns)[df.columns- remove].values test_data = pd.DataFrame(data=test_data_g, columns = df.columns)[df.columns- remove].values print("num of featurs = ", train_data.shape[1]) clf = QDA(); # This gets the time in ipython shell. print("Modelling time:") %time clf.fit(train_data, train_labels) print("Modelling time ends") print("prediction time starts:") %time predicted_labels = clf.predict(test_data) print("prediction time ends") #print(classification_report(test_labels, clf.predict(test_data))) print(classification_report(test_labels, predicted_labels)) print("num of featurs = ", train_data.shape[1]) y_true = test_labels; y_pred_proba = clf.predict_proba(test_data); fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba[:, 1]) roc_auc = auc(fpr, tpr) print("ROC AUC =", roc_auc) print("\n\n\n")
angle = 180 * angle / np.pi # convert to degrees # filled gaussian at 2 standard deviation ell = mpl.patches.Ellipse(mean, 2 * v[0]**0.5, 2 * v[1]**0.5, 180 + angle, color=color) ell.set_clip_box(splot.bbox) ell.set_alpha(0.5) splot.add_artist(ell) xx, yy = np.meshgrid(np.linspace(4, 8.5, 200), np.linspace(1.5, 4.5, 200)) X_grid = np.c_[xx.ravel(), yy.ravel()] zz_lda = lda.predict_proba(X_grid)[:, 1].reshape(xx.shape) zz_qda = qda.predict_proba(X_grid)[:, 1].reshape(xx.shape) pl.figure() splot = pl.subplot(1, 2, 1) pl.contourf(xx, yy, zz_lda > 0.5, alpha=0.5) pl.scatter(X[y == 0, 0], X[y == 0, 1], c='b', label=target_names[0]) pl.scatter(X[y == 1, 0], X[y == 1, 1], c='r', label=target_names[1]) pl.contour(xx, yy, zz_lda, [0.5], linewidths=2., colors='k') plot_ellipse(splot, lda.means_[0], lda.covariance_, 'b') plot_ellipse(splot, lda.means_[1], lda.covariance_, 'r') pl.legend() pl.axis('tight') pl.title('Linear Discriminant Analysis') splot = pl.subplot(1, 2, 2) pl.contourf(xx, yy, zz_qda > 0.5, alpha=0.5)
def plot_ellipse(splot, mean, cov, color):
    """Draw a semi-transparent covariance ellipse on a subplot.

    Args:
        splot: Axes-like object; its bbox clips the ellipse and the ellipse
            is attached to it with add_artist.
        mean: Centre of the ellipse.
        cov: Symmetric 2x2 covariance matrix.
        color: Fill colour passed through to the patch.
    """
    v, w = linalg.eigh(cov)
    # eigh returns the eigenvectors as COLUMNS, so the vector paired with
    # v[0] is w[:, 0] (a row of w is not an eigenvector in general).
    u = w[:, 0] / linalg.norm(w[:, 0])
    # arctan2 avoids the division by zero that u[1] / u[0] hits for a
    # vertical axis; any 180-degree difference leaves the ellipse unchanged.
    angle = np.arctan2(u[1], u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # Ellipse takes full axis lengths, so 2 * sqrt(eigenvalue) draws the
    # contour at one standard deviation on either side of the mean.
    # angle is passed by keyword: newer matplotlib rejects it positionally.
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              angle=180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)


# Evaluate both fitted classifiers on a dense grid covering the plot area.
xx, yy = np.meshgrid(np.linspace(4, 8.5, 200),
                     np.linspace(1.5, 4.5, 200))
X_grid = np.c_[xx.ravel(), yy.ravel()]
# Column 1 of predict_proba, reshaped back to the grid layout.
zz_lda = lda.predict_proba(X_grid)[:, 1].reshape(xx.shape)
zz_qda = qda.predict_proba(X_grid)[:, 1].reshape(xx.shape)

pl.figure()
# Left panel: LDA regions, samples, 0.5 boundary and class ellipses.
splot = pl.subplot(1, 2, 1)
pl.contourf(xx, yy, zz_lda > 0.5, alpha=0.5)
pl.scatter(X[y == 0, 0], X[y == 0, 1], c='b', label=target_names[0])
pl.scatter(X[y == 1, 0], X[y == 1, 1], c='r', label=target_names[1])
pl.contour(xx, yy, zz_lda, [0.5], linewidths=2., colors='k')
# Both LDA ellipses use the same lda.covariance_ and differ only in mean.
plot_ellipse(splot, lda.means_[0], lda.covariance_, 'b')
plot_ellipse(splot, lda.means_[1], lda.covariance_, 'r')
pl.legend()
pl.axis('tight')
pl.title('Linear Discriminant Analysis')

# Right panel: QDA regions.
splot = pl.subplot(1, 2, 2)
pl.contourf(xx, yy, zz_qda > 0.5, alpha=0.5)
def crossValidate(itr):
    """Run one random hold-out split, plot it, and return the test metrics.

    A QDA classifier is fitted on the randomly selected training rows,
    evaluated on the remaining rows, and the 0.5-probability contour is drawn
    together with the training and test points.  The figure is saved as
    'cv<itr>.jpg' in the current working directory.

    Relies on module-level state: randTestData, t_data_perc, norDataNum,
    cnDataNum, train_data, labels, normal_pt, cancer_pt, x1, y1, x2, y2,
    inRedox, xaxis_range, yaxis_range, yaxis_range_rdx, grid_resol,
    feature_x and feature_y.

    Args:
        itr: Iteration identifier; only used to build the output file name.

    Returns:
        Tuple (specificity, sensitivity): the fraction of held-out normal
        points predicted as 0 and of held-out cancer points predicted as 1.
    """
    # presumably randTestData returns (training count, boolean mask) --
    # verify against its definition.
    norTrainNum, nor_isTraining = randTestData(t_data_perc, norDataNum)
    cnTrainNum, cn_isTraining = randTestData(t_data_perc, cnDataNum)
    # NOTE(review): assumes train_data/labels list the normal rows first and
    # the cancer rows second, matching this concatenation order -- confirm.
    isTraining = np.hstack((nor_isTraining, cn_isTraining))

    # Complement masks and sizes of the held-out sets.
    nor_isTest = np.logical_not(nor_isTraining)
    cn_isTest = np.logical_not(cn_isTraining)
    norTestNum = norDataNum - norTrainNum
    cnTestNum = cnDataNum - cnTrainNum

    #Training
    clf = QDA()
    trained_clf = clf.fit(train_data[isTraining], labels[isTraining])

    #Using the remaining data for testing.  float() forces true division:
    #NumPy integer / int floor-divides under Python 2, which would collapse
    #both metrics to 0 or 1.
    normal_pred = trained_clf.predict(normal_pt[nor_isTest])
    trueneg_n = (normal_pred == 0).sum()
    specificity = trueneg_n / float(norTestNum)

    cancer_pred = trained_clf.predict(cancer_pt[cn_isTest])
    truepos_n = (cancer_pred == 1).sum()
    sensitivity = truepos_n / float(cnTestNum)

    #Generate grids for the entire plot; only the y extent depends on inRedox.
    y_extent = yaxis_range_rdx if inRedox else yaxis_range
    xx, yy = np.meshgrid(np.linspace(0, xaxis_range, grid_resol),
                         np.linspace(0, y_extent, grid_resol))
    plot_grid = np.c_[xx.ravel(), yy.ravel()]

    #Calculate the prediction probability for each point on the grid
    grid_z = trained_clf.predict_proba(plot_grid)[:, 1].reshape(xx.shape)

    # NOTE(review): the figure is left open after saving, so repeated calls
    # accumulate figures; close it at the call site if that becomes a problem.
    plt.figure()
    #The 0.5 probability level is the decision boundary.
    plt.contour(xx, yy, grid_z, [0.5], linewidths=2., colors='k')
    #Triangles: held-out points.  Circles: training points.
    plt.scatter(x1[cn_isTest], y1[cn_isTest], c='r', marker='^',
                label='Cancer (N =' + str(cnTestNum) + ')')
    plt.scatter(x2[nor_isTest], y2[nor_isTest], c='g', marker='^',
                label='Normal(N =' + str(norTestNum) + ')')
    plt.scatter(x1[cn_isTraining], y1[cn_isTraining], c='r', marker='o',
                label='Trn_Cancer (N =' + str(cnTrainNum) + ')')
    plt.scatter(x2[nor_isTraining], y2[nor_isTraining], c='g', marker='o',
                label='Trn_Normal(N =' + str(norTrainNum) + ')')
    plt.axis('tight')
    plt.xlabel(feature_x, fontsize='large')
    plt.ylabel(feature_y, fontsize='large')
    plt.legend()
    plt.suptitle(feature_x + ' vs. \n' + feature_y, fontsize=16)
    plt.title('Specificity: ' + '{0:.3f}'.format(specificity) + ' ; ' +
              'Sensitivity:' + '{0:.3f}'.format(sensitivity), fontsize=12)
    plt.savefig('cv' + str(itr) + '.jpg')
    return specificity, sensitivity
# In[119]:

cancer_pred = trained_clf.predict(cancer_pt)
truepos_n = (cancer_pred == 1).sum()
# float() forces true division; NumPy integer / int floor-divides under
# Python 2, which would collapse the ratio to 0 or 1.
sensitivity = truepos_n / float(cancer_ndata)


# In[120]:

# Generate grids for the entire plot: 100 x 100 x 200 sample points.
xx, yy, zz = np.meshgrid(np.linspace(0, 255, 100),
                         np.linspace(0, 255, 100),
                         np.linspace(0, 0.2, 200))
plot_grid = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]

# Calculate the prediction probability (column 1 of predict_proba) for each
# point on the grid, reshaped back to the 3-D grid layout.
grid_result = clf.predict_proba(plot_grid)[:, 1].reshape(xx.shape)


# In[124]:

# Distance of every grid probability from the 0.5 decision level.
a = abs(grid_result - 0.5)
sur_x, sur_y = np.meshgrid(np.linspace(0, 255, 100),
                           np.linspace(0, 255, 100))
# For each (i, j) column of the grid pick the z value whose probability is
# closest to 0.5.  Vectorised replacement for the former 100 x 100 loop; the
# bounds now follow the grid shape instead of a hard-coded 100.
nearest = a.argmin(axis=2)
rows, cols = np.indices(nearest.shape)
sur_z = zz[rows, cols, nearest]
sur_z  # notebook cell output; has no effect when run as a plain script