def getPCAProjections(input_filename): """ Run PCA on the Kushmerick ad data Stop when there are sufficient PCA components to explain threshold_variance Project input data onto the top PCA components that explain threshold_variance Normalize this data Sort attributes by their correlation with output - input_filename : prepocessed data - Output 'pca': data projected onto PCA components 'norm': pca data normalized to std deviation 1 'corr': normalized data sorted by correlation with output 'index': """ print 'getPCAProjections:', input_filename explained_variance = 0.99 root_name = 'pca%03d' % round(explained_variance * 100.0) pca_filename = csv.makeCsvPath(root_name) pca_norm_filename = csv.makeCsvPath(root_name + '.norm') pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr') corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx') pca.pcaAdData(explained_variance, input_filename, pca_filename) pca.normalizeData(pca_filename, pca_norm_filename) sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename) def reorder(in_cells): return pca.reorderMatrix(in_cells, sort_order) csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder) csv.writeCsv(corr_index_filename, corr_index) return {'pca': pca_filename, 'norm':pca_norm_filename, 'corr':pca_norm_corr_filename, 'index':corr_index_filename}
def selectAttibutesGA():
    """Search for the best-performing attribute subsets of increasing size.

    Reads the PCA, correlation-ordered matrix, then for subset sizes
    5, 10, 15, ... runs findBestOfSize() trials on the matrix reordered by
    the previous round's best result.  For each size the best subset's CSV
    and results files are copied aside (skipped when `is_testing` is set).

    NOTE(review): the public name keeps the original 'Attibutes' spelling so
    existing callers are not broken.
    """
    matrix = csv.readCsvRaw(csv.headered_name_pca_corr)
    num_attributes = len(matrix[0]) - 1    # last column is the outcome
    # Loop-invariant: was recomputed on every iteration in the original;
    # a dead `if False:` experiment branch was also removed.
    num_trials = max(100, num_attributes * 2)
    sort_order = [i for i in range(num_attributes)]    # start with identity order
    for num_subset in range(5, num_attributes, 5):
        csv_matrix_name  = csv.makeCsvPath('subset.matrix'  + ('%03d' % num_subset))
        csv_results_name = csv.makePath('subset.results' + ('%03d' % num_subset))
        csv_best_name    = csv.makeCsvPath('subset.best'    + ('%03d' % num_subset))
        csv_summary_name = csv.makeCsvPath('subset.summary' + ('%03d' % num_subset))
        ordered_matrix = pca.reorderMatrix(matrix, sort_order)
        csv.writeCsv(csv_matrix_name, ordered_matrix)
        results = findBestOfSize(ordered_matrix, num_subset, num_trials, csv_summary_name)
        # Feed the best ordering found at this size into the next round.
        sort_order = orderByResults(results, num_attributes)
        if not is_testing:
            # Keep the winning subset's data and Weka results for this size.
            shutil.copyfile(results[0]['csv'], csv_best_name)
            shutil.copyfile(results[0]['results'], csv_results_name)
def testBySize(incrementing_hidden): "Test MLP results on matrix by number of left side columns" start_num_columns = 30 delta_num_columns = 10 opts = '-M 0.5 -L 0.3 -x 4 -H ' num_hidden = 13 csv_matrix_name = csv.makeCsvPath('subset.matrix035') base_name = 'number.attributes' if incrementing_hidden: base_name = base_name + '.inc' csv_results_name = csv.makePath(base_name + '.results') csv_summary_name = csv.makeCsvPath(base_name + '.summary') csv_best_name = csv.makeCsvPath(base_name + '.best') matrix = csv.readCsvRaw(csv_matrix_name) num_attribs = len(matrix[0])-1 # last column is category print 'testBySize', len(matrix), start_num_columns, delta_num_columns, len(matrix[0]) best_accuracy = 0.0 summary = [] csv_summary = file(csv_summary_name, 'w') for num_columns in range(start_num_columns, len(matrix[0]), delta_num_columns): columns = [i for i in range(num_columns)] if incrementing_hidden: num_hidden = int(float(num_columns)*13.0/30.0) accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, opts + str(num_hidden)) r = {'num':num_columns, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration} summary.append(r) summary.sort(key = lambda r: -r['accuracy']) if True: print num_columns, ':', accuracy, len(results), int(duration), 'seconds' for i in range(min(3,len(results))): rr = results[i] print ' ',i, ':', rr['accuracy'], rr['num'], int(rr['duration']) summary_row = [num_columns, accuracy, duration, temp_csv, temp_results] csv_line = ','.join([str(e) for e in summary_row]) csv_summary.write(csv_line + '\n') csv_summary.flush() if accuracy > best_accuracy: best_accuracy = accuracy shutil.copyfile(temp_csv, csv_best_name) shutil.copyfile(temp_results, csv_results_name) return results
def testByNumberHidden(csv_matrix_name, output_basename, num_columns, num_cv = 4): """Test MLP results on matrix by number of neurons in hidden layer num_columns is number of leftmost columns of matrix to test num_cv is the number of cross-validation rounds """ start_num_hidden = min(10, num_columns-1) delta_num_hidden = 10 results_name = csv.makePath(output_basename + '.results') model_name = csv.makePath(output_basename + '.model') csv_summary_name = csv.makeCsvPath(output_basename + '.summary') csv_best_name = csv.makeCsvPath(output_basename + '.best') matrix = csv.readCsvRaw(csv_matrix_name) num_attribs = len(matrix[0])-1 # last column is category print 'testByNumberHidden', len(matrix), start_num_hidden, delta_num_hidden, num_columns best_accuracy = 0.0 results = [] csv_summary = file(csv_summary_name, 'w') for num_hidden in range(start_num_hidden, num_columns, delta_num_hidden): columns = [i for i in range(num_columns)] accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv)) r = {'num':num_hidden, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration} results.append(r) results.sort(key = lambda r: -r['accuracy']) if True: print num_hidden, ':', accuracy, len(results), int(duration), 'seconds' for i in range(min(5,len(results))): rr = results[i] print ' ',i, ':', rr['accuracy'], rr['num'], int(rr['duration']) summary = [num_hidden, accuracy, duration, temp_csv, temp_results] csv_line = ','.join([str(e) for e in summary]) csv_summary.write(csv_line + '\n') csv_summary.flush() if accuracy > best_accuracy: best_accuracy = accuracy shutil.copyfile(temp_csv, csv_best_name) shutil.copyfile(temp_results, results_name) shutil.copyfile(outnameToModelname(temp_results), model_name) return {'summary':csv_summary_name, 'best':csv_best_name, 'results':results_name, 'model':model_name}
def testCostMatrix(num_columns, num_cv = 4): """Test MLP results with a range of false positive costs """ num_hidden = 5 csv_matrix_name = csv.makeCsvPath('subset.matrix035') base_name = 'cost.col' + str(num_columns) + '.x' + str(num_cv) csv_results_name = csv.makePath(base_name + '.results') csv_summary_name = csv.makeCsvPath(base_name + '.summary') csv_best_name = csv.makeCsvPath(base_name + '.best') matrix = csv.readCsvRaw(csv_matrix_name) num_attribs = len(matrix[0])-1 # last column is category print 'testCostMatrix', len(matrix), num_hidden, num_columns best_accuracy = 0.0 results = [] csv_results = file(csv_summary_name, 'w') for false_positive_cost in range(1, 11, 2): columns = [i for i in range(num_columns)] costs_map = {'True':1.0, 'False':float(false_positive_cost)} accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv, costs_map)) r = {'cost':false_positive_cost, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration} results.append(r) results.sort(key = lambda r: -r['accuracy']) if True: print false_positive_cost, ':', accuracy, len(results), int(duration), 'seconds' for i in range(min(5,len(results))): rr = results[i] print ' ',i, ':', rr['accuracy'], rr['cost'], int(rr['duration']) summary = [num_hidden, accuracy, duration, temp_csv, temp_results] csv_line = ','.join([str(e) for e in summary]) csv_results.write(csv_line + '\n') csv_results.flush() if accuracy > best_accuracy: best_accuracy = accuracy shutil.copyfile(temp_csv, csv_best_name) shutil.copyfile(temp_results, csv_results_name) return results
# NOTE(review): the statements down to 'return results' appear to be an
# orphaned duplicate of testCostMatrix()'s loop tail, left over from file
# extraction/chunking — they reference loop-local names (summary, accuracy,
# temp_csv, csv_results) and are only valid inside that function body.
# Indentation reconstructed; confirm against the original file.
    csv_line = ','.join([str(e) for e in summary])
    csv_results.write(csv_line + '\n')
    csv_results.flush()
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        shutil.copyfile(temp_csv, csv_best_name)
        shutil.copyfile(temp_results, csv_results_name)
    return results

#
#
# Data files used in this test

# Input data - Don't touch this
raw_name = csv.makeCsvPath('ad1')
# Input data with header. Needs to be generated once
headered_name = csv.makeCsvPath('header')
# Input data with header and pre-processing
headered_name_pp = csv.makeCsvPath('header.pp')
# PCA on headered_name_pp
#headered_name_pca = csv.makePath('header.pp.pca')
# PCA data normalized to stdev == 1
#headered_name_pca_norm = csv.makePath('header.pp.pca.norm')
# PCA data normalized to stdev == 1, ordered by correlation with outcome
#headered_name_pca_corr = csv.makePath('header.pp.pca.norm.corr_order')

if __name__ == '__main__':
    if False:    # disabled debug dump of the environment
        dumpEnv()
# Script entry point: describe the numeric libraries in use, then run PCA on
# the preprocessed ad data, normalize it, and rank attributes by correlation
# with the outcomes.  NOTE(review): this looks like the pca module's own
# __main__ block (it calls pcaAdData etc. unqualified) — confirm which file
# this chunk belongs to.
if __name__ == '__main__':
    describe(numpy)
    describe(scipy)
    describe(mdp)
    describe(bimdp)
    if False:    # disabled covariance self-test
        covTest()
    #doTests()
    explained_variance = 0.99    # fraction of total variance PCA must explain
    ev = str(int(explained_variance * 100.0))    # e.g. '99', used in file names
    # pca_filename = csv.headered_name_pca
    pca_filename = csv.makeCsvPath('pca' + ev)
    #pca_norm_filename = csv.headered_name_pca_norm
    pca_norm_filename = csv.makeCsvPath('pca' + ev + '.norm')
    #pca_norm_corr_filename = csv.headered_name_pca_corr
    pca_norm_corr_filename = csv.makeCsvPath('pca' + ev + '.norm.corr')
    # Each stage is individually switchable for re-runs.
    if True:
        pcaAdData(explained_variance, csv.headered_name_pp, pca_filename)
    if True:
        normalizeData(pca_filename, pca_norm_filename)
    if True:
        sort_order = rankByCorrelationWithOutcomes(pca_norm_filename)
    def reorder(in_cells):
        # Reorder the columns of a raw CSV cell matrix by correlation rank.
        return reorderMatrix(in_cells, sort_order)