def dr(X, y, savedir, ds):
    """Run the dimensionality-reduction experiments and return the reducers.

    Four stages run in order: PCA, ICA (swept over component counts and
    selected by highest average kurtosis), Gaussian random projection
    (swept over component counts, repeated 10 times, selected by lowest
    reconstruction error) and the ``A3.rfselect`` feature selector.

    Files written, all named ``<savedir>/<ds>-<suffix>``:
    ``pca.png``, ``pca-ev.csv``, ``pca-ev-ratio.csv``, ``pca-ev.png``,
    ``ica-kurtosis.png``, ``rp-reconstruction.csv`` and
    ``rp-reconstruction.png``.

    Args:
        X: Feature matrix; must expose ``shape`` with the column count at
            index 1.
        y: Targets, passed through unchanged to the ``A3`` helpers.
        savedir: Directory prefix for every output file.
        ds: Dataset name used in the output file names. The values
            ``'musk'`` and ``'shoppers'`` additionally trigger a manual
            ``n_components`` override on the random projection.

    Returns:
        ``[pca, ica, rp, rf]``: the ``'pca'``, ``'fastica'``,
        ``'gaussianrandomprojection'`` and ``'randomforest'`` steps taken
        from the pipelines built by the ``A3`` helpers.

    Raises:
        ValueError: If ``X`` has 10 or fewer columns, because the ICA and
            random-projection sweeps would then have nothing to try.
    """
    # Every sweep below tries 10, 20, ... components, strictly below the
    # number of columns. Build the range once so the ICA sweep, the RP sweep
    # and the CSV column labels can never drift apart.
    component_counts = range(10, X.shape[1], 10)
    if not component_counts:
        # Without this check the sweeps are skipped, `ica`/`rp` stay None and
        # the function dies later with an opaque AttributeError after having
        # already written the PCA outputs.
        raise ValueError(
            'dr() sweeps n_components over range(10, n_features, 10) and '
            'needs more than 10 features; got {}'.format(X.shape[1]))

    # How many times the random-projection sweep is repeated.
    n_rp_runs = 10

    # First do pca
    pca_pipe = A3.pca(X, y)
    pca = pca_pipe.named_steps['pca']
    # NOTE(review): nothing is plotted in this function before this savefig,
    # so A3.pca presumably draws the figure saved here -- confirm.
    plt.savefig('{}/{}-pca.png'.format(savedir, ds))
    np.savetxt('{}/{}-pca-ev.csv'.format(savedir, ds),
               pca.explained_variance_)
    np.savetxt('{}/{}-pca-ev-ratio.csv'.format(savedir, ds),
               pca.explained_variance_ratio_)
    plt.close('all')

    # Cumulative explained-variance ratio against number of components.
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('n_components')
    plt.ylabel('explained variance (%)')
    plt.savefig('{}/{}-pca-ev.png'.format(savedir, ds))
    plt.close('all')

    reconstruction_error = A3.recon_error(pca, X)
    logging.info('PCA reconstruction error: {}'.format(reconstruction_error))

    # second ICA: keep the model whose transformed data has the highest
    # average kurtosis.
    ica = None
    max_kurtosis = -np.inf
    kurt_per_comp = []
    for n_components in component_counts:
        ica_pipe = A3.ica(X, y, n_components)
        kurt = A3.avg_kurtosis(ica_pipe.transform(X))
        kurt_per_comp.append(kurt)
        logging.info('ICA {} average kurtosis: {}'.format(n_components, kurt))
        if kurt > max_kurtosis:
            ica = ica_pipe.named_steps['fastica']
            max_kurtosis = kurt
    logging.info('ICA max kurtosis {} with {} components'.format(
        max_kurtosis, ica.components_.shape[0]))

    plt.plot(component_counts, kurt_per_comp)
    plt.xlabel('n_components')
    plt.ylabel('mean kurtosis')
    plt.savefig('{}/{}-ica-kurtosis.png'.format(savedir, ds))
    # Close the figure like every other section does, so it is not kept
    # alive while the random-projection sweep runs.
    plt.close('all')

    # RP: repeat the whole sweep several times and keep the projection with
    # the lowest reconstruction error seen across all runs.
    logging.info('Starting randomized projection...')
    rp_errors = []  # best error of each run, plotted below
    rp = None
    best_rp_err = np.inf
    reconstruction_errors = []  # one row per run, one column per count
    for rp_run in range(n_rp_runs):
        logging.info('RP iteration {}'.format(rp_run))
        best_run = np.inf
        run_errors = []
        for n_components in component_counts:
            rp_pipe = A3.random_projection(X, y, n_components)
            candidate = rp_pipe.named_steps['gaussianrandomprojection']
            err = A3.recon_error(candidate, X)
            run_errors.append(err)
            logging.info('RP {} components reconstruction error: {}'.format(
                n_components, err))
            if err < best_rp_err:
                rp = candidate
                best_rp_err = err
            if err < best_run:
                best_run = err
        reconstruction_errors.append(run_errors)
        rp_errors.append(best_run)

    pd.DataFrame(reconstruction_errors, columns=component_counts).to_csv(
        '{}/{}-rp-reconstruction.csv'.format(savedir, ds))

    # Manually set random projection
    # NOTE(review): set_params only changes the constructor parameter; the
    # already-fitted projection (and the n_components_ logged below)
    # presumably still reflects the sweep winner until the estimator is
    # re-fit -- confirm this is what callers expect.
    if ds == 'musk':
        rp.set_params(n_components=50)
    elif ds == 'shoppers':
        rp.set_params(n_components=30)

    plt.figure()
    plt.plot(range(n_rp_runs), rp_errors)
    plt.xlabel('iteration')
    plt.ylabel('reconstruction error')
    plt.savefig('{}/{}-rp-reconstruction.png'.format(savedir, ds))
    plt.close('all')
    logging.info('RP best n_components: {}'.format(rp.n_components_))

    # TODO: fourth dimension reduction
    rf_pipe = A3.rfselect(X, y)
    rf = rf_pipe.named_steps['randomforest']

    return [pca, ica, rp, rf]