Example #1
0
def rankByCorrelationWithOutcomes(in_fn):
    "Rank each attribute by its correlation with the outcome"
    print 'rankByCorrelationWithOutcomes:', in_fn
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix(in_cells)
    
    name_map = {'nonad.':0.0, 'ad.':1.0}
    def strToFloat(s):
        return name_map[s.strip()]
    
    #last column is ad categories. normalize other columns
    in_data = [[float(e) for e in row[:-1]] for row in in_cells[1:]]
    print 'in_data', len(in_data), len(in_data[0])
    raw_outcomes = [strToFloat(row[-1]) for row in in_cells[1:]]
  
    print 'outcomes', len(raw_outcomes) #,len(raw_outcomes[0])
    values = array(in_data)
    outcomes = array(raw_outcomes)
    
    def correlationWithOutcome(column):
        return correlation(column, outcomes)
   
    # http://www.scipy.org/Numpy_Example_List#head-528347f2f13004fc0081dce432e81b87b3726a33
    corr_with_outcomes = apply_along_axis(correlationWithOutcome,0,values)
    # print 'corr_with_outcomes', corr_with_outcomes
    corr_index = [(i,c) for i,c in enumerate(corr_with_outcomes)]
    # print corr_index
    corr_index.sort(key = lambda x: -abs(x[1])) 
    # print corr_index
    sort_order = [x[0] for x in corr_index]
    #print sort_order
    return (sort_order, corr_index)
Example #2
0
def pcaAdData(theshold_variance, in_filename, out_filename):   
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto these PCA components
        - in_filename : input data read from this CSV file
        - out_filename : output data written to this CSV file
    """
    h2data = csv.readCsvRaw(in_filename)
    csv.validateMatrix(h2data)
    
    # Boolean data are columns 3 to second last
    bool_data = [[float(e) for e in v[3:-1]] for v in h2data[1:]]
    print 'bool_data', len(bool_data), len(bool_data[0])
    x = array(bool_data)
    
    # Find the output dimension (#basis vectors) required to explain
    # threshold_variance
    print 'output_dim, explained_variance, time(sec)' 
    for odim in range(50, len(x[0]), 50):
        start_time = time.clock()
        pcanode = mdp.nodes.PCANode(svd=True, output_dim = odim, dtype='float64')
        pcanode.train(x)
        p = pcanode.get_projmatrix()
        d = pcanode.output_dim
        print '%10d' % d, ',',
        v = pcanode.explained_variance
        print '%15.03f' % v, ',',
        print '%6.1f' % (time.clock() - start_time)
        if v >= theshold_variance:
            break
    #print '-----------------------------1'
    print 'p', len(p), len(p[0]) 
    #print '-----------------------------2'
    # Project out data onto PCA components    
    xfd = dot(x, p)    
    pca = [[x for x in row] for row in xfd]
    print 'pca', len(pca), len(pca[0])    
    pca_header = ['pca_%03d' % i for i in range(len(pca[0]))]
    header = h2data[0][:3] + pca_header + [h2data[0][-1]]
    num_data = [h2data[i+1][:3] + pca[i] + [h2data[i+1][-1]] for i in range(len(h2data)-1)] 
    data = [header] + num_data   
    csv.writeCsv(out_filename, data)
Example #3
0
def preprocess(raw_name, headered_name, headered_name_pp):
	"""	Add headers and pre-process the raw Kushmerick  data. 
		This needs to be done once.
		- raw_name is the Kushmerick data that is input
		- headered_name is the name of CSV file with headers that is created
		- headered_name_pp is the named a file created by preprocessing header name that is created
	"""
	print 'preprocess', raw_name, '=>', headered_name, '=>', headered_name_pp
	header = csv.makeHeader()
	data = csv.readCsvRaw(raw_name)
    
	hdata = [header] + data
	assert(len(hdata)==len(data)+1)
	csv.validateMatrix(hdata)

	#swapMatrixColumn(data, 3, -1)
	csv.writeCsv(headered_name, hdata)
	h2data = csv.readCsvRaw(headered_name)
    
	csv.replaceMissingValues(hdata)
	csv.writeCsv(headered_name_pp, hdata)