def getPCAProjections(input_filename, explained_variance=0.99):
    """Run PCA on the Kushmerick ad data and write the derived CSV files.

    Pipeline:
      1. Run PCA, keeping enough components to explain
         `explained_variance` of the variance, and project the input
         data onto those components (pca.pcaAdData).
      2. Normalize the projected data (pca.normalizeData).
      3. Rank the attributes by their correlation with the output and
         write a copy of the normalized data reordered by that ranking
         (pca.rankByCorrelationWithOutcomes, pca.reorderMatrix).
      4. Write the correlation index returned by the ranking step.

    Parameters:
      - input_filename: path of the preprocessed data CSV.
      - explained_variance: fraction of variance the retained PCA
        components must explain. Defaults to 0.99. The value, as a
        rounded percentage, is embedded in every output file name.

    Returns a dict of output file paths:
      'pca':   data projected onto the PCA components
      'norm':  PCA data normalized to std deviation 1
      'corr':  normalized data sorted by correlation with output
      'index': the correlation index produced by the ranking step
    """
    # Single-argument print() emits the same text on Python 2 and 3.
    print('getPCAProjections: %s' % (input_filename,))

    # e.g. 0.99 -> 'pca099'; all output names share this root.
    root_name = 'pca%03d' % round(explained_variance * 100.0)
    pca_filename = csv.makeCsvPath(root_name)
    pca_norm_filename = csv.makeCsvPath(root_name + '.norm')
    pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr')
    corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx')

    pca.pcaAdData(explained_variance, input_filename, pca_filename)
    pca.normalizeData(pca_filename, pca_norm_filename)
    sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename)

    def reorder(in_cells):
        # Callback for csv.modifyCsvRaw: apply the correlation ranking.
        return pca.reorderMatrix(in_cells, sort_order)

    csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)
    csv.writeCsv(corr_index_filename, corr_index)

    return {
        'pca': pca_filename,
        'norm': pca_norm_filename,
        'corr': pca_norm_corr_filename,
        'index': corr_index_filename,
    }
# Script driver: describe the numeric libraries in use, then run the
# PCA -> normalize -> sort-by-correlation pipeline on the preprocessed data.
describe(numpy)
describe(scipy)
describe(mdp)
describe(bimdp)

# Developer toggle: flip to True to run the covariance self-test.
if False:
    covTest()
    # doTests()

explained_variance = 0.99
# Round before converting to int: a product that lands just below an
# integer (e.g. 0.57 * 100.0) would otherwise be truncated to the wrong
# percentage in the file names.
ev = str(int(round(explained_variance * 100.0)))

# Output names encode the variance threshold. The commented-out
# assignments these replace used csv.headered_name_pca,
# csv.headered_name_pca_norm and csv.headered_name_pca_corr instead.
pca_filename = csv.makeCsvPath('pca' + ev)
pca_norm_filename = csv.makeCsvPath('pca' + ev + '.norm')
pca_norm_corr_filename = csv.makeCsvPath('pca' + ev + '.norm.corr')

# Stage 1: PCA projection of the preprocessed data.
pcaAdData(explained_variance, csv.headered_name_pp, pca_filename)

# Stage 2: normalize the projected data.
normalizeData(pca_filename, pca_norm_filename)

# Stage 3: reorder the normalized data by correlation with the outcomes.
# NOTE(review): getPCAProjections unpacks two values
# (sort_order, corr_index) from pca.rankByCorrelationWithOutcomes;
# confirm the value returned here is the bare sort order that
# reorderMatrix expects.
sort_order = rankByCorrelationWithOutcomes(pca_norm_filename)


def reorder(in_cells):
    # Callback for csv.modifyCsvRaw: apply the correlation ranking.
    return reorderMatrix(in_cells, sort_order)


csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)