plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space") plt.title("Pairwise distances distribution for n_components=%d" % n_components) cb = plt.colorbar() cb.set_label('Sample pairs counts') rates = projected_dists / dists print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates))) plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components)) plt.figure() plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param) plt.xlabel("Squared distances rate: projected / original") plt.ylabel("Distribution of samples pairs") plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components) plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components)) plt.clf() # TODO: compute the expected value of eps and add them to the previous plot # as vertical lines / region np.random.seed(0) digits = load_digits() X, y = digits.data, digits.target johnson_lindenstrauss(digits.data, 'digits') X, y, data = getCreditCardData('./Data/ccdefault.xls', subset=0.2) johnson_lindenstrauss(data, 'credit')
    ax.set_title(
        'Dimensionality Reduced Data for NN (using {})'.format(reducer_name))

    # For each number of components, find the best classifier results
    results = pd.DataFrame(search.cv_results_)
    results.to_csv(
        path_or_buf='Figs/05_{}_cluster_nn.csv'.format(reducer_name))
    best_clfs = results.groupby(components_col).apply(
        lambda g: g.nlargest(1, 'mean_test_score'))
    best_clfs.plot(x=components_col, y='mean_test_score',
                   yerr='std_test_score', legend=False, ax=ax)
    ax.set_ylabel('Classification accuracy (val)')
    ax.set_xlabel('n_components')
    plt.tight_layout()
    plt.savefig('Figs/05_{}_cluster_nn'.format(reducer_name))


np.random.seed(0)
X, y, data = getCreditCardData('./Data/ccdefault.xls')

reducer = KMeans(random_state=0)
cluster_nn(X, y, reducer, 'kmeans')

reducer = custGMM(GaussianMixture(reg_covar=1.0, random_state=0))
cluster_nn(X, y, reducer, 'gaussian_mix')
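# custGMM is this repo's wrapper around GaussianMixture; a minimal sketch of
# what such a wrapper has to provide so the mixture can act as a pipeline
# reducer (GaussianMixture exposes predict_proba but no transform, unlike
# KMeans). GMMTransformer below is illustrative, not the actual custGMM:
from sklearn.base import BaseEstimator, TransformerMixin


class GMMTransformer(BaseEstimator, TransformerMixin):
    """Expose a fitted GaussianMixture's responsibilities as features."""

    def __init__(self, gmm):
        self.gmm = gmm

    def fit(self, X, y=None):
        self.gmm.fit(X)
        return self

    def transform(self, X):
        # one column per mixture component: posterior cluster probabilities
        return self.gmm.predict_proba(X)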
    plt.plot(range(1, n + 1), test_mean_acc1, 'r',
             label='Test Accuracy - {}'.format(data_name))
    plt.fill_between(range(1, n + 1),
                     test_mean_acc1 - 1 * test_std_acc1,
                     test_mean_acc1 + 1 * test_std_acc1, alpha=0.10)
    plt.plot(range(1, n + 1), train_mean_acc1, 'm',
             label='Training Accuracy - {}'.format(data_name))
    plt.fill_between(range(1, n + 1),
                     train_mean_acc1 - 1 * train_std_acc1,
                     train_mean_acc1 + 1 * train_std_acc1, alpha=0.10)
    # label the lines directly so the shaded bands from fill_between are not
    # picked up by the legend in place of the accuracy curves
    plt.legend()
    plt.ylabel('Accuracy')
    plt.xlabel('Decision Tree Depth')
    plt.tight_layout()
    plt.savefig('Figs/DT-depth-{}'.format(data_name))
    plt.clf()


if __name__ == "__main__":
    np.random.seed(0)
    test_size = 0.2

    X_train1, X_test1, y_train1, y_test1 = getCreditCardData(
        path='./Data/ccdefault.xls', test_size=test_size)
    X_train2, X_test2, y_train2, y_test2 = getWineData(
        path='./Data/winequality-white.csv', test_size=test_size)

    DT(X_train1, X_test1, y_train1, y_test1, 'Credit Card Default', 0.8, 0.9)
    DT(X_train2, X_test2, y_train2, y_test2, 'Wine', 0.4, 1.01)
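# For reference, a minimal sketch of the depth sweep inside DT() that would
# produce the arrays plotted above (test_mean_acc1, test_std_acc1, ...),
# assuming cross-validated DecisionTreeClassifier accuracy at each depth;
# the helper name and cv settings are illustrative, not the repo's exact code:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


def dt_depth_sweep(X_train, y_train, n=20, cv=5):
    """Mean/std cross-validation accuracy for tree depths 1..n."""
    means, stds = [], []
    for depth in range(1, n + 1):
        clf = DecisionTreeClassifier(max_depth=depth, random_state=0)
        scores = cross_val_score(clf, X_train, y_train, cv=cv)
        means.append(scores.mean())
        stds.append(scores.std())
    return np.array(means), np.array(stds)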