Exemple #1
0
def getPCAProjections(input_filename, explained_variance=0.99):
	"""Run PCA on the Kushmerick ad data and derive normalized and ranked variants.

	Every output is a CSV file whose name starts with 'pca' followed by the
	explained variance as a zero-padded percentage (e.g. 'pca099' for 0.99).
	Steps:
	  1. pca.pcaAdData projects the input data onto PCA components, using
	     explained_variance as the threshold.
	  2. pca.normalizeData normalizes the projected data.
	  3. pca.rankByCorrelationWithOutcomes ranks the normalized attributes;
	     the columns are then reordered with pca.reorderMatrix.

	- input_filename : preprocessed data
	- explained_variance : variance threshold passed to pca.pcaAdData
	  (defaults to 0.99, the value that used to be hard-coded)
	- Returns a dict of output file paths:
		'pca'  : data projected onto PCA components
		'norm' : pca data normalized to std deviation 1
		'corr' : normalized data sorted by correlation with output
		'index': the correlation index returned by
		         pca.rankByCorrelationWithOutcomes
	"""
	print('getPCAProjections: %s' % (input_filename,))
	root_name = 'pca%03d' % int(round(explained_variance * 100.0))
	pca_filename = csv.makeCsvPath(root_name)
	pca_norm_filename = csv.makeCsvPath(root_name + '.norm')
	pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr')
	corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx')

	pca.pcaAdData(explained_variance, input_filename, pca_filename)
	pca.normalizeData(pca_filename, pca_norm_filename)

	sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename)

	def reorder(in_cells):
		# Callback for csv.modifyCsvRaw: put columns into ranked order.
		return pca.reorderMatrix(in_cells, sort_order)

	csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)
	csv.writeCsv(corr_index_filename, corr_index)

	return {
		'pca': pca_filename,
		'norm': pca_norm_filename,
		'corr': pca_norm_corr_filename,
		'index': corr_index_filename,
	}
Exemple #2
0
def selectAttibutesGA():
	"""Search for good attribute subsets of increasing size.

	Reads the matrix named by csv.headered_name_pca_corr; the attribute count
	is the row width minus one. For subset sizes 5, 10, 15, ... below the
	attribute count, each round:
	  - reorders the matrix columns with the ordering produced by the
	    previous round (identity ordering for the first round) and writes it,
	  - runs findBestOfSize on the reordered matrix,
	  - derives the next ordering from those results with orderByResults,
	  - unless is_testing is set, copies the csv and results files of the
	    first result to the per-size 'best' and 'results' paths.
	"""
	matrix = csv.readCsvRaw(csv.headered_name_pca_corr)
	attribute_count = len(matrix[0]) - 1
	# Same for every round, so computed once.
	trial_count = max(100, attribute_count * 2)
	column_order = list(range(attribute_count))

	for subset_size in range(5, attribute_count, 5):
		suffix = '%03d' % subset_size
		matrix_path = csv.makeCsvPath('subset.matrix' + suffix)
		results_path = csv.makePath('subset.results' + suffix)
		best_path = csv.makeCsvPath('subset.best' + suffix)
		summary_path = csv.makeCsvPath('subset.summary' + suffix)

		reordered = pca.reorderMatrix(matrix, column_order)
		csv.writeCsv(matrix_path, reordered)

		round_results = findBestOfSize(reordered, subset_size, trial_count, summary_path)
		column_order = orderByResults(round_results, attribute_count)

		if is_testing:
			continue
		top = round_results[0]
		shutil.copyfile(top['csv'], best_path)
		shutil.copyfile(top['results'], results_path)
Exemple #3
0
def testBySize(incrementing_hidden):
	"""Test MLP results on matrix by number of left side columns.

	Reads the 'subset.matrix035' CSV and runs testMatrixMLP on the leftmost
	num_columns columns for num_columns = 30, 40, ... below the row width.
	One line per run is written to the summary CSV, and the csv/results files
	of the most accurate run so far are copied to the '.best' and '.results'
	outputs.

	- incrementing_hidden : if true, the number of hidden neurons is scaled
	  with the column count (13 per 30 columns) and the output names get an
	  '.inc' suffix; otherwise 13 hidden neurons are used for every run.
	- Returns the list of per-run dicts (keys 'num', 'accuracy', 'csv',
	  'results', 'duration') sorted by decreasing accuracy.
	"""
	start_num_columns = 30
	delta_num_columns = 10
	opts = '-M 0.5 -L 0.3 -x 4 -H '
	num_hidden = 13

	csv_matrix_name = csv.makeCsvPath('subset.matrix035')
	base_name = 'number.attributes'
	if incrementing_hidden:
		base_name = base_name + '.inc'
	csv_results_name = csv.makePath(base_name + '.results')
	csv_summary_name = csv.makeCsvPath(base_name + '.summary')
	csv_best_name = csv.makeCsvPath(base_name + '.best')

	matrix = csv.readCsvRaw(csv_matrix_name)
	print('testBySize %s %s %s %s' % (len(matrix), start_num_columns, delta_num_columns, len(matrix[0])))

	best_accuracy = 0.0
	# The original appended to a list named 'summary' but printed and returned
	# an undefined 'results'; a single list is used throughout now.
	results = []
	csv_summary = open(csv_summary_name, 'w')
	try:
		for num_columns in range(start_num_columns, len(matrix[0]), delta_num_columns):
			columns = list(range(num_columns))
			if incrementing_hidden:
				num_hidden = int(float(num_columns) * 13.0 / 30.0)
			accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, opts + str(num_hidden))
			results.append({'num': num_columns, 'accuracy': accuracy, 'csv': temp_csv,
				'results': temp_results, 'duration': duration})
			results.sort(key=lambda r: -r['accuracy'])

			# Progress report: this run, then the three best runs so far.
			print('%s : %s %s %s seconds' % (num_columns, accuracy, len(results), int(duration)))
			for i, rr in enumerate(results[:3]):
				print('     %s : %s %s %s' % (i, rr['accuracy'], rr['num'], int(rr['duration'])))

			summary_row = [num_columns, accuracy, duration, temp_csv, temp_results]
			csv_summary.write(','.join([str(e) for e in summary_row]) + '\n')
			# Flush after every run so partial results survive an interrupted sweep.
			csv_summary.flush()

			if accuracy > best_accuracy:
				best_accuracy = accuracy
				shutil.copyfile(temp_csv, csv_best_name)
				shutil.copyfile(temp_results, csv_results_name)
	finally:
		csv_summary.close()

	return results
Exemple #4
0
def testByNumberHidden(csv_matrix_name, output_basename, num_columns, num_cv = 4):
	"""Test MLP results on matrix by number of neurons in hidden layer.

	Runs testMatrixMLP on the leftmost num_columns columns for hidden-layer
	sizes starting at min(10, num_columns-1) and increasing by 10 while below
	num_columns. One line per run is written to the summary CSV, and the
	csv, results and model files of the most accurate run so far are copied
	to the '.best', '.results' and '.model' outputs.

	- csv_matrix_name : path of the matrix CSV to read
	- output_basename : base name from which all output paths are built
	- num_columns : number of leftmost columns of matrix to test
	- num_cv : number of cross-validation rounds
	- Returns a dict of output paths with keys 'summary', 'best', 'results'
	  and 'model'.
	"""
	start_num_hidden = min(10, num_columns - 1)
	delta_num_hidden = 10

	results_name = csv.makePath(output_basename + '.results')
	model_name = csv.makePath(output_basename + '.model')
	csv_summary_name = csv.makeCsvPath(output_basename + '.summary')
	csv_best_name = csv.makeCsvPath(output_basename + '.best')

	matrix = csv.readCsvRaw(csv_matrix_name)
	print('testByNumberHidden %s %s %s %s' % (len(matrix), start_num_hidden, delta_num_hidden, num_columns))

	best_accuracy = 0.0
	results = []
	csv_summary = open(csv_summary_name, 'w')
	try:
		for num_hidden in range(start_num_hidden, num_columns, delta_num_hidden):
			columns = list(range(num_columns))
			accuracy, temp_csv, temp_results, duration = testMatrixMLP(
				matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv))
			results.append({'num': num_hidden, 'accuracy': accuracy, 'csv': temp_csv,
				'results': temp_results, 'duration': duration})
			results.sort(key=lambda r: -r['accuracy'])

			# Progress report: this run, then the five best runs so far.
			print('%s : %s %s %s seconds' % (num_hidden, accuracy, len(results), int(duration)))
			for i, rr in enumerate(results[:5]):
				print('     %s : %s %s %s' % (i, rr['accuracy'], rr['num'], int(rr['duration'])))

			summary = [num_hidden, accuracy, duration, temp_csv, temp_results]
			csv_summary.write(','.join([str(e) for e in summary]) + '\n')
			# Flush after every run so partial results survive an interrupted sweep.
			csv_summary.flush()

			if accuracy > best_accuracy:
				best_accuracy = accuracy
				shutil.copyfile(temp_csv, csv_best_name)
				shutil.copyfile(temp_results, results_name)
				shutil.copyfile(outnameToModelname(temp_results), model_name)
	finally:
		# The original never closed the summary file.
		csv_summary.close()

	return {'summary':csv_summary_name, 'best':csv_best_name, 'results':results_name, 'model':model_name}
Exemple #5
0
def testCostMatrix(num_columns, num_cv = 4):
	"""Test MLP results with a range of false positive costs.

	Reads the 'subset.matrix035' CSV and runs testMatrixMLP on the leftmost
	num_columns columns with 5 hidden neurons, once for each false positive
	cost in 1, 3, 5, 7, 9. The cost is passed to makeWekaOptions as the
	map {'True': 1.0, 'False': cost}. One line per run is written to the
	summary CSV, and the csv/results files of the most accurate run so far
	are copied to the '.best' and '.results' outputs.

	- num_columns : number of leftmost columns of matrix to test
	- num_cv : number of cross-validation rounds
	- Returns the list of per-run dicts (keys 'cost', 'accuracy', 'csv',
	  'results', 'duration') sorted by decreasing accuracy.
	"""
	num_hidden = 5

	csv_matrix_name = csv.makeCsvPath('subset.matrix035')
	base_name = 'cost.col' + str(num_columns) + '.x' + str(num_cv)
	csv_results_name = csv.makePath(base_name + '.results')
	csv_summary_name = csv.makeCsvPath(base_name + '.summary')
	csv_best_name = csv.makeCsvPath(base_name + '.best')

	matrix = csv.readCsvRaw(csv_matrix_name)
	print('testCostMatrix %s %s %s' % (len(matrix), num_hidden, num_columns))

	best_accuracy = 0.0
	results = []
	csv_summary = open(csv_summary_name, 'w')
	try:
		for false_positive_cost in range(1, 11, 2):
			columns = list(range(num_columns))
			costs_map = {'True':1.0, 'False':float(false_positive_cost)}
			accuracy, temp_csv, temp_results, duration = testMatrixMLP(
				matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv, costs_map))
			results.append({'cost': false_positive_cost, 'accuracy': accuracy, 'csv': temp_csv,
				'results': temp_results, 'duration': duration})
			results.sort(key=lambda r: -r['accuracy'])

			# Progress report: this run, then the five best runs so far.
			print('%s : %s %s %s seconds' % (false_positive_cost, accuracy, len(results), int(duration)))
			for i, rr in enumerate(results[:5]):
				print('     %s : %s %s %s' % (i, rr['accuracy'], rr['cost'], int(rr['duration'])))

			# First column is the swept cost. The original wrote the constant
			# num_hidden here, so the rows could not be told apart.
			summary = [false_positive_cost, accuracy, duration, temp_csv, temp_results]
			csv_summary.write(','.join([str(e) for e in summary]) + '\n')
			# Flush after every run so partial results survive an interrupted sweep.
			csv_summary.flush()

			if accuracy > best_accuracy:
				best_accuracy = accuracy
				shutil.copyfile(temp_csv, csv_best_name)
				shutil.copyfile(temp_results, csv_results_name)
	finally:
		# The original never closed the summary file.
		csv_summary.close()

	return results
Exemple #6
0
		csv_line = ','.join([str(e) for e in summary])
		csv_results.write(csv_line + '\n')
		csv_results.flush()
		if accuracy > best_accuracy:
			best_accuracy = accuracy
			shutil.copyfile(temp_csv, csv_best_name)
			shutil.copyfile(temp_results, csv_results_name)
		
	return results

#
#
# Data files used in this test.
# The active paths below are all built with csv.makeCsvPath.

# Raw input data - do not modify this file.
raw_name = csv.makeCsvPath('ad1')
# Input data with header. Needs to be generated once.
headered_name = csv.makeCsvPath('header')
# Input data with header and pre-processing.
headered_name_pp = csv.makeCsvPath('header.pp')
# The three names below are currently disabled (commented out).
# PCA on headered_name_pp
#headered_name_pca = csv.makePath('header.pp.pca')
# PCA data normalized to stdev == 1
#headered_name_pca_norm = csv.makePath('header.pp.pca.norm')
# PCA data normalized to stdev == 1, ordered by correlation with outcome
#headered_name_pca_corr = csv.makePath('header.pp.pca.norm.corr_order')

# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
	
	# The dumpEnv() call is switched off by the constant-false condition.
	if False:
		dumpEnv()
Exemple #7
0
        
     
if __name__=='__main__':
    describe(numpy)
    describe(scipy)
    describe(mdp)
    describe(bimdp)
    
    if False:
        covTest()
    #doTests()
    
    explained_variance = 0.99
    ev = str(int(explained_variance*100.0))
    # pca_filename = csv.headered_name_pca
    pca_filename = csv.makeCsvPath('pca' + ev)
    #pca_norm_filename = csv.headered_name_pca_norm
    pca_norm_filename = csv.makeCsvPath('pca' + ev + '.norm')
    #pca_norm_corr_filename = csv.headered_name_pca_corr
    pca_norm_corr_filename = csv.makeCsvPath('pca' + ev + '.norm.corr')
    
    if True:
        pcaAdData(explained_variance, csv.headered_name_pp, pca_filename)
        
    if True:
        normalizeData(pca_filename, pca_norm_filename)    
    
    if True:
        sort_order = rankByCorrelationWithOutcomes(pca_norm_filename)
        def reorder(in_cells):
            return reorderMatrix(in_cells, sort_order)