Example #1
def selectAttibutesGA():
	"Select attribute subsets of increasing size, re-ordering the attributes by the results of each pass"
	matrix = csv.readCsvRaw(csv.headered_name_pca_corr)
	num_attributes = len(matrix[0])-1
	if False:
		num_subset = 5
		num_trials = max(100, num_attributes*2)
		results = findBestOfSize(matrix, num_subset, num_trials)
		order = orderByResults(results,num_attributes)
	if True:
		sort_order = [i for i in range(num_attributes)]
		for num_subset in range(5, num_attributes, 5):
			num_trials = max(100, num_attributes*2)
			csv_matrix_name  = csv.makeCsvPath('subset.matrix' +('%03d'%num_subset))
			csv_results_name = csv.makePath('subset.results'+('%03d'%num_subset))
			csv_best_name    = csv.makeCsvPath('subset.best'   +('%03d'%num_subset))
			csv_summary_name  = csv.makeCsvPath('subset.summary'+('%03d'%num_subset))
		
			ordered_matrix = pca.reorderMatrix(matrix, sort_order)
			csv.writeCsv(csv_matrix_name, ordered_matrix)
			
			results = findBestOfSize(ordered_matrix, num_subset, num_trials, csv_summary_name)
			
			sort_order = orderByResults(results,num_attributes)
			#c_x = results[0].columns + [-1]      # include outcome
			#sub_matrix = [[row[i] for i in c_x] for row in ordered_matrix]
			#csv.writeCsv(csv_best_name,sub_matrix, )
			if not is_testing:
				shutil.copyfile(results[0]['csv'],csv_best_name)
				shutil.copyfile(results[0]['results'],csv_results_name)
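
The helpers findBestOfSize, orderByResults and pca.reorderMatrix are defined elsewhere in the project and are not shown. A minimal sketch of what reorderMatrix could look like, assuming it moves the attribute columns into the given order while keeping the outcome column last (the name and behaviour here are assumptions, not the project's implementation):

def reorderMatrixSketch(matrix, sort_order):
    # Reorder the attribute columns according to sort_order and keep the
    # outcome column (last) in place.
    return [[row[i] for i in sort_order] + [row[-1]] for row in matrix]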
Example #2
def rankByCorrelationWithOutcomes(in_fn):
    "Rank each attribute by its correlation with the outcome"
    print 'rankByCorrelationWithOutcomes:', in_fn
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix(in_cells)
    
    name_map = {'nonad.':0.0, 'ad.':1.0}
    def strToFloat(s):
        return name_map[s.strip()]
    
    # last column is the ad category; convert the other columns to floats
    in_data = [[float(e) for e in row[:-1]] for row in in_cells[1:]]
    print 'in_data', len(in_data), len(in_data[0])
    raw_outcomes = [strToFloat(row[-1]) for row in in_cells[1:]]
  
    print 'outcomes', len(raw_outcomes) #,len(raw_outcomes[0])
    values = array(in_data)
    outcomes = array(raw_outcomes)
    
    def correlationWithOutcome(column):
        return correlation(column, outcomes)
   
    # http://www.scipy.org/Numpy_Example_List#head-528347f2f13004fc0081dce432e81b87b3726a33
    corr_with_outcomes = apply_along_axis(correlationWithOutcome,0,values)
    # print 'corr_with_outcomes', corr_with_outcomes
    corr_index = [(i,c) for i,c in enumerate(corr_with_outcomes)]
    # print corr_index
    corr_index.sort(key = lambda x: -abs(x[1])) 
    # print corr_index
    sort_order = [x[0] for x in corr_index]
    #print sort_order
    return (sort_order, corr_index)
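
The correlation helper used above comes from elsewhere in the project. A self-contained sketch of the same ranking idea using only numpy, with np.corrcoef standing in for that helper and made-up data for illustration:

import numpy as np

def rankColumnsByAbsCorrelation(values, outcomes):
    # Correlate each column with the outcome vector and rank the columns
    # by absolute correlation, largest first.
    corrs = [np.corrcoef(values[:, i], outcomes)[0, 1] for i in range(values.shape[1])]
    corr_index = sorted(enumerate(corrs), key=lambda x: -abs(x[1]))
    sort_order = [i for i, _ in corr_index]
    return sort_order, corr_index

values = np.array([[1.0, 0.2], [2.0, 0.4], [3.0, 0.1], [4.0, 0.3]])
outcomes = np.array([0.0, 0.0, 1.0, 1.0])
print rankColumnsByAbsCorrelation(values, outcomes)[0]   # [0, 1]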
Example #3
def wekaToPredict(weka_filename):
    """ Convert a Weka formatted .cvs file to a Google
        Predict .csv file by moving class from last 
        column to first
    """
    parts = os.path.splitext(weka_filename)
    predict_filename = parts[0] + '.gp' + parts[1]
    print 'wekaToPredict:', weka_filename,'=>', predict_filename
    weka = csv.readCsvRaw(weka_filename)
    predict = [[w[-1]] + w[:-1] for w in weka]
    csv.writeCsv(predict_filename, predict) 
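
Note that csv here is a project module, not the standard library. A sketch of the same last-column-to-first move using only the standard library csv module (file names are hypothetical):

import csv as std_csv

def wekaToPredictStdlib(weka_filename, predict_filename):
    # Move the class column (last) to the front of every row.
    with open(weka_filename, 'rb') as f:
        rows = list(std_csv.reader(f))
    moved = [[row[-1]] + row[:-1] for row in rows]
    with open(predict_filename, 'wb') as f:
        std_csv.writer(f).writerows(moved)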
Example #4
def preprocess(raw_name, headered_name, headered_name_pp):
	"""	Add headers and pre-process the raw Kushmerick  data. 
		This needs to be done once.
		- raw_name is the Kushmerick data that is input
		- headered_name is the name of CSV file with headers that is created
		- headered_name_pp is the name of the file created by preprocessing headered_name
	"""
	print 'preprocess', raw_name, '=>', headered_name, '=>', headered_name_pp
	header = csv.makeHeader()
	data = csv.readCsvRaw(raw_name)
    
	hdata = [header] + data
	assert(len(hdata)==len(data)+1)
	csv.validateMatrix(hdata)

	#swapMatrixColumn(data, 3, -1)
	csv.writeCsv(headered_name, hdata)
	h2data = csv.readCsvRaw(headered_name)
    
	csv.replaceMissingValues(hdata)
	csv.writeCsv(headered_name_pp, hdata)
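
csv.replaceMissingValues is not shown in these examples. One plausible sketch, assuming missing values are marked with '?' and are replaced by the mean of the remaining values in that column (this is an assumption about the helper, not its actual implementation):

def replaceMissingValuesSketch(matrix):
    # matrix[0] is the header row; data rows are modified in place.
    for col in range(len(matrix[0])):
        try:
            present = [float(row[col]) for row in matrix[1:] if row[col].strip() != '?']
        except ValueError:
            continue    # non-numeric column (e.g. the ad./nonad. class) is left alone
        if not present:
            continue
        mean = sum(present) / len(present)
        for row in matrix[1:]:
            if row[col].strip() == '?':
                row[col] = str(mean)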
Example #5
def testBySize(incrementing_hidden):
	"Test MLP results on matrix by number of left side columns"
	
	start_num_columns = 30 
	delta_num_columns = 10
	opts = '-M 0.5 -L 0.3 -x 4 -H '
	num_hidden = 13
		
	csv_matrix_name = csv.makeCsvPath('subset.matrix035')	
	base_name = 'number.attributes'
	if incrementing_hidden:
		base_name = base_name + '.inc'
	csv_results_name = csv.makePath(base_name + '.results')
	csv_summary_name = csv.makeCsvPath(base_name + '.summary')
	csv_best_name    = csv.makeCsvPath(base_name + '.best')
	
	matrix  = csv.readCsvRaw(csv_matrix_name)
	num_attribs = len(matrix[0])-1  # last column is category
	print 'testBySize', len(matrix), start_num_columns, delta_num_columns, len(matrix[0])
	
	best_accuracy = 0.0
	summary = []
	csv_summary = file(csv_summary_name, 'w')
	
	for num_columns in range(start_num_columns, len(matrix[0]), delta_num_columns):
		columns = [i for i in range(num_columns)]
		if incrementing_hidden:
			num_hidden = int(float(num_columns)*13.0/30.0)
		accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, opts + str(num_hidden))
		r = {'num':num_columns, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration}
		summary.append(r)
		summary.sort(key = lambda r: -r['accuracy'])
		if True:
			print num_columns, ':',  accuracy, len(summary), int(duration), 'seconds'
			for i in range(min(3,len(summary))):
				rr = summary[i]
				print '    ',i, ':', rr['accuracy'], rr['num'], int(rr['duration'])
		summary_row = [num_columns, accuracy, duration, temp_csv, temp_results]
		csv_line = ','.join([str(e) for e in summary_row])
		csv_summary.write(csv_line + '\n')
		csv_summary.flush()
		if accuracy > best_accuracy:
			best_accuracy = accuracy
			shutil.copyfile(temp_csv, csv_best_name)
			shutil.copyfile(temp_results, csv_results_name)
		
	return summary
Example #6
def normalizeData(in_fn, out_fn):
    """ Normalize ad data to equal std dev
        in_fn : read input data from this csv file
        out_fn : write output data to this csv file
    """
    print 'normalizeData:', in_fn, '=>', out_fn
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix2(in_cells)
  
    # Remove the header row on top and the category column on the right
    in_data = [[float(e.strip()) for e in row[:-1]] for row in in_cells[1:3280]]
    print 'data', len(in_data), len(in_data[0])
        
    out_data = normalizeMatrix(in_data)
    print 'out_data', len(out_data), len(out_data[0])  
    
    out_cells = [in_cells[0]] + [out_data[i-1] + [in_cells[i][-1]] for i in range(1,len(in_cells))]  
    csv.writeCsv(out_fn, out_cells)
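
normalizeMatrix is defined elsewhere. A sketch consistent with the docstring above, scaling every column to unit standard deviation with numpy (the zero-variance guard is an addition, not part of the original helper):

import numpy as np

def normalizeMatrixSketch(in_data):
    # Divide each column by its standard deviation; columns with zero
    # variance are left unchanged to avoid division by zero.
    a = np.array(in_data, dtype=float)
    std = a.std(axis=0)
    std[std == 0.0] = 1.0
    return (a / std).tolist()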
Example #7
def testByNumberHidden(csv_matrix_name, output_basename, num_columns, num_cv = 4):
	"""Test MLP results on matrix by number of neurons in hidden layer
		num_columns is the number of leftmost columns of the matrix to test
		num_cv is the number of cross-validation rounds
	"""
	
	start_num_hidden = min(10, num_columns-1) 
	delta_num_hidden = 10
	
	results_name = csv.makePath(output_basename + '.results')
	model_name   = csv.makePath(output_basename + '.model')
	csv_summary_name = csv.makeCsvPath(output_basename + '.summary')
	csv_best_name    = csv.makeCsvPath(output_basename + '.best')
	
	matrix  = csv.readCsvRaw(csv_matrix_name)
	num_attribs = len(matrix[0])-1  # last column is category
	print 'testByNumberHidden', len(matrix), start_num_hidden, delta_num_hidden, num_columns
	
	best_accuracy = 0.0
	results = []
	csv_summary = file(csv_summary_name, 'w')
	
	for num_hidden in range(start_num_hidden, num_columns, delta_num_hidden):
		columns = [i for i in range(num_columns)]
		accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv))
		r = {'num':num_hidden, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration}
		results.append(r)
		results.sort(key = lambda r: -r['accuracy'])
		if True:
			print num_hidden, ':',  accuracy, len(results), int(duration), 'seconds'
			for i in range(min(5,len(results))):
				rr = results[i]
				print '    ',i, ':', rr['accuracy'], rr['num'], int(rr['duration'])
		summary = [num_hidden, accuracy, duration, temp_csv, temp_results]
		csv_line = ','.join([str(e) for e in summary])
		csv_summary.write(csv_line + '\n')
		csv_summary.flush()
		if accuracy > best_accuracy:
			best_accuracy = accuracy
			shutil.copyfile(temp_csv, csv_best_name)
			shutil.copyfile(temp_results, results_name)
			shutil.copyfile(outnameToModelname(temp_results), model_name)
			
	return {'summary':csv_summary_name, 'best':csv_best_name, 'results':results_name, 'model':model_name}
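
makeWekaOptions is not shown. Judging from the literal option string '-M 0.5 -L 0.3 -x 4 -H 13' used in testBySize above, a sketch might look like the following (the argument order and the cost-matrix handling are assumptions):

def makeWekaOptionsSketch(learning_rate, momentum, num_hidden, num_cv, costs_map=None):
    # Build a Weka MultilayerPerceptron option string:
    # -L learning rate, -M momentum, -x cross-validation folds, -H hidden units.
    opts = '-M %s -L %s -x %d -H %d' % (momentum, learning_rate, num_cv, num_hidden)
    # How costs_map is turned into a Weka cost matrix is not shown in these
    # examples, so it is omitted here.
    return opts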
Example #8
def testCostMatrix(num_columns, num_cv = 4):
	"""Test MLP results with a range of false positive costs
	"""
	
	num_hidden = 5 
	
	csv_matrix_name = csv.makeCsvPath('subset.matrix035')	
	base_name = 'cost.col' + str(num_columns) + '.x' + str(num_cv) 
	csv_results_name = csv.makePath(base_name + '.results')
	csv_summary_name = csv.makeCsvPath(base_name + '.summary')
	csv_best_name    = csv.makeCsvPath(base_name + '.best')
	
	matrix  = csv.readCsvRaw(csv_matrix_name)
	num_attribs = len(matrix[0])-1  # last column is category
	print 'testCostMatrix', len(matrix), num_hidden, num_columns
	
	best_accuracy = 0.0
	results = []
	csv_results = file(csv_summary_name, 'w')
	
	for false_positive_cost in range(1, 11, 2):
		columns = [i for i in range(num_columns)]
		costs_map = {'True':1.0, 'False':float(false_positive_cost)}
		accuracy, temp_csv, temp_results, duration = testMatrixMLP(matrix, columns, makeWekaOptions(0.3, 0.5, num_hidden, num_cv, costs_map))
		r = {'cost':false_positive_cost, 'accuracy':accuracy, 'csv':temp_csv, 'results':temp_results, 'duration':duration}
		results.append(r)
		results.sort(key = lambda r: -r['accuracy'])
		if True:
			print false_positive_cost, ':',  accuracy, len(results), int(duration), 'seconds'
			for i in range(min(5,len(results))):
				rr = results[i]
				print '    ',i, ':', rr['accuracy'], rr['cost'], int(rr['duration'])
		summary = [false_positive_cost, accuracy, duration, temp_csv, temp_results]
		csv_line = ','.join([str(e) for e in summary])
		csv_results.write(csv_line + '\n')
		csv_results.flush()
		if accuracy > best_accuracy:
			best_accuracy = accuracy
			shutil.copyfile(temp_csv, csv_best_name)
			shutil.copyfile(temp_results, csv_results_name)
		
	return results
Example #9
def pcaAdData(threshold_variance, in_filename, out_filename):
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto these PCA components
        - in_filename : input data read from this CSV file
        - out_filename : output data written to this CSV file
    """
    h2data = csv.readCsvRaw(in_filename)
    csv.validateMatrix(h2data)
    
    # Boolean data are columns 3 to second last
    bool_data = [[float(e) for e in v[3:-1]] for v in h2data[1:]]
    print 'bool_data', len(bool_data), len(bool_data[0])
    x = array(bool_data)
    
    # Find the output dimension (#basis vectors) required to explain
    # threshold_variance
    print 'output_dim, explained_variance, time(sec)' 
    for odim in range(50, len(x[0]), 50):
        start_time = time.clock()
        pcanode = mdp.nodes.PCANode(svd=True, output_dim = odim, dtype='float64')
        pcanode.train(x)
        p = pcanode.get_projmatrix()
        d = pcanode.output_dim
        print '%10d' % d, ',',
        v = pcanode.explained_variance
        print '%15.03f' % v, ',',
        print '%6.1f' % (time.clock() - start_time)
        if v >= threshold_variance:
            break
    #print '-----------------------------1'
    print 'p', len(p), len(p[0]) 
    #print '-----------------------------2'
    # Project the data onto the PCA components
    xfd = dot(x, p)    
    pca = [[x for x in row] for row in xfd]
    print 'pca', len(pca), len(pca[0])    
    pca_header = ['pca_%03d' % i for i in range(len(pca[0]))]
    header = h2data[0][:3] + pca_header + [h2data[0][-1]]
    num_data = [h2data[i+1][:3] + pca[i] + [h2data[i+1][-1]] for i in range(len(h2data)-1)] 
    data = [header] + num_data   
    csv.writeCsv(out_filename, data)
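
mdp.nodes.PCANode does the heavy lifting above. A self-contained sketch of the same idea, keeping enough principal components to explain a variance threshold and then projecting, using only numpy SVD and random data for illustration:

import numpy as np

def pcaProjectSketch(x, threshold_variance):
    # Centre the data, take the SVD, keep enough components to explain
    # threshold_variance of the total variance, then project onto them.
    centred = x - x.mean(axis=0)
    u, s, vt = np.linalg.svd(centred, full_matrices=False)
    explained = np.cumsum(s ** 2) / np.sum(s ** 2)
    n_components = min(int(np.searchsorted(explained, threshold_variance)) + 1, len(s))
    p = vt[:n_components].T    # projection matrix, analogous to pcanode.get_projmatrix()
    return np.dot(centred, p), explained[n_components - 1]

x = np.random.rand(200, 50)
projected, variance = pcaProjectSketch(x, 0.9)
print projected.shape, variance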
Example #10
def preprocessSoybeanData():
	""" Pre-process the Soybean data set downloaded from http://archive.ics.uci.edu/ml/machine-learning-databases/soybean/
	"""

	""" Read the data files """
	training_data = csv.readCsvRaw(os.path.join(dir, training_file))
	test_data = csv.readCsvRaw(os.path.join(dir, test_file))
	
	""" Combined data file """
	combined_data = test_data + training_data
	print 'combined data', len(combined_data), len(combined_data[0])

	""" Random data file where the percentage of each class and attribute
		matches the combined data """
	random_data = getRandomData(combined_data)

	""" Find the duplicate instances in each data set
		The number of duplicates in random_data provides an estimate
		of the number of duplicates that would occur in the real
		data sets by pure chance """
	training_duplicates = getDuplicates(training_data)
	print 'training_duplicates =', len(training_duplicates)
	test_duplicates = getDuplicates(test_data)
	print 'test_duplicates =', len(test_duplicates)
	combined_duplicates = getDuplicates(combined_data)
	print 'combined_duplicates =', len(combined_duplicates)
	random_duplicates = getDuplicates(random_data)
	duplicates_warning = '*** Data files should not contain duplicates!' if len(random_duplicates) == 0 else ''
	print 'random_duplicates =', len(random_duplicates), duplicates_warning
	
	""" Remove duplicate instances within each data set 
		We know removing duplicates is valid if len(random_duplicates) is zero """
	filtered_training_data = removeDuplicates(training_data, training_duplicates, False)
	filtered_test_data = removeDuplicates(test_data, test_duplicates, False)
	filtered_combined_data = removeDuplicates(combined_data, combined_duplicates, False)
	filtered_random_data = removeDuplicates(random_data, random_duplicates, False)

	""" Remove the instances in duplicate-free test data that duplicate instances 
		in duplicate-free training data """
	all_duplicates = getDuplicates(filtered_training_data + filtered_test_data)
	filtered_test_data = removeDuplicates(filtered_test_data, all_duplicates, True)

	""" Sanity check """
	assert(len(filtered_test_data) + len(filtered_training_data) + len(combined_duplicates) == len(combined_data))

	""" Write out the intermediate .csv files with duplicates marked for debugging """
	csv.writeCsv(appendDescription(dir, training_file, 'sorted'), markDuplicates(training_data))
	csv.writeCsv(appendDescription(dir, test_file, 'sorted'), markDuplicates(test_data))
	csv.writeCsv(appendDescription(dir, combined_file, 'sorted'), markDuplicates(combined_data))
	csv.writeCsv(appendDescription(dir, random_file, 'sorted'), markDuplicates(random_data))

	""" Read the names of the classes and attributes from downloaded files """
	classes = parseClasses(os.path.join(dir, classes_file))
	attrs = parseAttrs(os.path.join(dir, attrs_file))

	""" Add class and attribute names to original data, for comparison with filter data """
	original_named_training_data = applyAttrs(training_data, attrs)
	original_named_test_data = applyAttrs(test_data, attrs)
	original_named_combined_data = applyAttrs(combined_data, attrs)
	
	""" Add class and attribute names to filtered data """
	named_training_data = applyAttrs(filtered_training_data, attrs)
	named_test_data = applyAttrs(filtered_test_data, attrs)
	named_combined_data = applyAttrs(filtered_combined_data, attrs)
	named_random_data = applyAttrs(filtered_random_data, attrs)
	
	""" Get the class distribution """
	class_distribution_training = getClassDistribution(named_training_data)
	class_distribution_test = getClassDistribution(named_test_data)
	class_distribution_combined = getClassDistribution(named_combined_data)

	named_training_data = removeClassesWithFewerInstances(named_training_data, class_distribution_training,2)

	""" Create a header row for the .csv file """
	header = makeHeaderRow(attrs)

	""" Write out the .csv files """
	
	csv.writeCsv(appendDescription(dir, training_file, 'distribution'), dictToMatrix(class_distribution_training), ['Class', 'Number'])
	
	csv.writeCsv(appendDescription(dir, training_file, 'orig'), named_training_data, header)
	csv.writeCsv(appendDescription(dir, test_file, 'orig'), named_test_data, header)
	csv.writeCsv(appendDescription(dir, combined_file, 'orig'), named_combined_data, header)

	csv.writeCsv(appendDescription(dir, training_file, 'named'), original_named_training_data, header)
	csv.writeCsv(appendDescription(dir, test_file, 'named'), original_named_test_data, header)
	csv.writeCsv(appendDescription(dir, combined_file, 'named'), original_named_combined_data, header)

	""" Write out the .arff files """
	writeArff(buildPath(dir, training_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_training_data)
	writeArff(buildPath(dir, test_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_test_data)
	writeArff(buildPath(dir, combined_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_combined_data)
	
	writeArff(buildPath(dir, training_file, '.arff'), 'soybean', classes, attrs, named_training_data)
	writeArff(buildPath(dir, test_file, '.arff'), 'soybean', classes, attrs, named_test_data)
	writeArff(buildPath(dir, combined_file, '.arff'), 'soybean', classes, attrs, named_combined_data)
	writeArff(buildPath(dir, random_file, '.arff'), 'soybean', classes, attrs, named_random_data)
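
getDuplicates and removeDuplicates come from the same project and are not shown. A minimal sketch under the assumption that a duplicate is a row identical to one seen earlier, and that the third argument of removeDuplicates chooses between dropping only the repeats (False) and dropping every matching row (True):

def getDuplicatesSketch(data):
    # Return every occurrence after the first of each repeated row.
    seen, duplicates = set(), []
    for row in data:
        key = tuple(row)
        if key in seen:
            duplicates.append(row)
        else:
            seen.add(key)
    return duplicates

def removeDuplicatesSketch(data, duplicates, remove_all):
    # If remove_all, drop every row that matches a duplicate; otherwise keep
    # the first occurrence and drop only the later repeats.
    duplicate_keys = set(tuple(row) for row in duplicates)
    kept, seen = [], set()
    for row in data:
        key = tuple(row)
        if key in duplicate_keys and (remove_all or key in seen):
            continue
        seen.add(key)
        kept.append(row)
    return kept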