Esempio n. 1
0
def getPCAProjections(input_filename, explained_variance=0.99):
    """Run the PCA pipeline on the Kushmerick ad data and return the output paths.

    The pipeline runs four steps in order, each writing a CSV file:

    1. pca.pcaAdData: project the input data onto the top PCA components,
       stopping when there are enough components to explain
       `explained_variance`.
    2. pca.normalizeData: normalize the projected data.
    3. pca.rankByCorrelationWithOutcomes + pca.reorderMatrix: reorder the
       normalized attributes by their correlation with the output.
    4. csv.writeCsv: save the correlation index returned by the ranking step.

    Args:
        input_filename: path of the preprocessed data file.
        explained_variance: fraction of variance the retained PCA components
            must explain. Defaults to 0.99, the value that used to be
            hard-coded. It is also encoded in the output file names as a
            zero-padded percentage (0.99 -> 'pca099').

    Returns:
        A dict of output file paths:
            'pca':   data projected onto the PCA components
            'norm':  PCA data normalized to standard deviation 1
            'corr':  normalized data sorted by correlation with the output
            'index': correlation index produced by the ranking step
    """
    # A single pre-formatted string prints identically under Python 2 and 3.
    print('getPCAProjections: %s' % (input_filename,))

    # NOTE: `csv` is presumably the project's own CSV helper module (it must
    # provide makeCsvPath/modifyCsvRaw/writeCsv), not the standard library csv.
    root_name = 'pca%03d' % round(explained_variance * 100.0)
    pca_filename = csv.makeCsvPath(root_name)
    pca_norm_filename = csv.makeCsvPath(root_name + '.norm')
    pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr')
    corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx')

    pca.pcaAdData(explained_variance, input_filename, pca_filename)
    pca.normalizeData(pca_filename, pca_norm_filename)

    sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename)

    def reorder(in_cells):
        """Apply the correlation-based ordering to the cells of the CSV."""
        return pca.reorderMatrix(in_cells, sort_order)

    csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)
    csv.writeCsv(corr_index_filename, corr_index)

    return {
        'pca': pca_filename,
        'norm': pca_norm_filename,
        'corr': pca_norm_corr_filename,
        'index': corr_index_filename,
    }
Esempio n. 2
0
 # Script-level driver for the PCA pipeline. The enclosing context (and the
 # definitions of describe, covTest, pcaAdData, normalizeData,
 # rankByCorrelationWithOutcomes and reorderMatrix) is outside this excerpt.

 # describe() is called once per numeric library in use; presumably it
 # reports information about each module -- TODO confirm.
 describe(numpy)
 describe(scipy)
 describe(mdp)
 describe(bimdp)
 
 # Manual toggle: change False to True to run covTest().
 if False:
     covTest()
 #doTests()
 
 # Variance threshold handed to pcaAdData() below.
 explained_variance = 0.99
 # The threshold as a whole-number percentage string ('99' for 0.99); it is
 # embedded in each output file name.
 ev = str(int(explained_variance*100.0))
 # pca_filename = csv.headered_name_pca
 pca_filename = csv.makeCsvPath('pca' + ev)
 #pca_norm_filename = csv.headered_name_pca_norm
 pca_norm_filename = csv.makeCsvPath('pca' + ev + '.norm')
 #pca_norm_corr_filename = csv.headered_name_pca_corr
 pca_norm_corr_filename = csv.makeCsvPath('pca' + ev + '.norm.corr')
 
 # Each stage below sits behind its own `if True:` so it can be switched off
 # by hand; presumably a skipped stage relies on its output file already
 # existing from an earlier run -- verify before disabling one.

 # Stage 1: run PCA with csv.headered_name_pp as input and pca_filename as
 # the target (argument roles inferred from the names -- TODO confirm).
 if True:
     pcaAdData(explained_variance, csv.headered_name_pp, pca_filename)
     
 # Stage 2: normalize the stage 1 file into pca_norm_filename.
 if True:
     normalizeData(pca_filename, pca_norm_filename)    
 
 # Stage 3: rank the normalized data by correlation with the outcomes, then
 # rewrite the CSV with its cells reordered accordingly.
 if True:
     # NOTE(review): the return value is used directly as the sort order;
     # confirm rankByCorrelationWithOutcomes does not return a tuple here.
     sort_order = rankByCorrelationWithOutcomes(pca_norm_filename)
     # Callback given to csv.modifyCsvRaw; it closes over sort_order.
     def reorder(in_cells):
         return reorderMatrix(in_cells, sort_order)
     csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)