def rankByCorrelationWithOutcomes(in_fn):
    """ Rank each attribute by its correlation with the outcome

        The first row of the CSV file is skipped as a header. In every other
        row the last column is the outcome label ('ad.' or 'nonad.',
        surrounding whitespace ignored) and all preceding columns are parsed
        as float attribute values.

        - in_fn : name of the CSV file to read

        Returns the tuple (sort_order, corr_index)
        - corr_index : list of (column index, correlation) pairs ordered by
          decreasing absolute correlation. Pairs whose correlation is NaN
          are placed after all the others.
        - sort_order : the column indexes of corr_index, in the same order

        Raises KeyError for an outcome label other than 'ad.' or 'nonad.'
        and ValueError for an attribute cell that float() cannot parse.
    """
    print('rankByCorrelationWithOutcomes: %s' % (in_fn,))
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix(in_cells)

    name_map = {'nonad.': 0.0, 'ad.': 1.0}
    rows = in_cells[1:]     # everything after the header row

    # Last column is the ad category, the other columns are the attributes
    in_data = [[float(e) for e in row[:-1]] for row in rows]
    print('in_data %d %d' % (len(in_data), len(in_data[0])))
    raw_outcomes = [name_map[row[-1].strip()] for row in rows]
    print('outcomes %d' % len(raw_outcomes))
    values = array(in_data)
    outcomes = array(raw_outcomes)

    def correlationWithOutcome(column):
        return correlation(column, outcomes)

    # Axis 0 => correlationWithOutcome is called once per attribute column
    corr_with_outcomes = apply_along_axis(correlationWithOutcome, 0, values)
    corr_index = list(enumerate(corr_with_outcomes))

    def rankKey(entry):
        # Strongest correlation first. A NaN key would make the sort order
        # unreliable for every entry, so NaN correlations are pushed to the
        # end instead. (x != x is true only for NaN.)
        # NOTE(review): NaN presumably arises for constant columns, but that
        # depends on correlation(), which is defined elsewhere - confirm.
        strength = abs(entry[1])
        if strength != strength:
            return (1, 0.0)
        return (0, -strength)

    # list.sort is stable, so equal strengths keep their column order
    corr_index.sort(key=rankKey)
    sort_order = [index for index, _ in corr_index]
    return (sort_order, corr_index)
def pcaAdData(theshold_variance, in_filename, out_filename):
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain
        theshold_variance
        Project input data onto these PCA components
        - theshold_variance : stop as soon as the explained variance reported
            by the PCA node is >= this value (the misspelt name is kept so
            that keyword callers continue to work)
        - in_filename : input data read from this CSV file
        - out_filename : output data written to this CSV file

        The first row of the input is a header. Columns 0-2 and the last
        column are copied to the output unchanged; the columns in between
        are replaced by the PCA components, named pca_000, pca_001, ...

        Output dimensions are tried in steps of 50, finishing with the full
        number of boolean columns, so a projection is always produced even
        when there are 50 or fewer boolean columns.
    """
    h2data = csv.readCsvRaw(in_filename)
    csv.validateMatrix(h2data)

    # Boolean data are columns 3 to second last
    bool_data = [[float(e) for e in row[3:-1]] for row in h2data[1:]]
    print('bool_data %d %d' % (len(bool_data), len(bool_data[0])))
    x = array(bool_data)
    num_columns = len(x[0])

    # time.clock does not exist on newer Pythons, so only look it up when
    # time.perf_counter is not available
    timer = getattr(time, 'perf_counter', None) or time.clock

    # Candidate output dimensions (#basis vectors). The full dimension is
    # always the last candidate so the loop below runs at least once.
    candidate_dims = list(range(50, num_columns, 50)) + [num_columns]

    # Find the output dimension required to explain theshold_variance
    print('output_dim, explained_variance, time(sec)')
    for odim in candidate_dims:
        start_time = timer()
        pcanode = mdp.nodes.PCANode(svd=True, output_dim=odim, dtype='float64')
        pcanode.train(x)
        p = pcanode.get_projmatrix()
        d = pcanode.output_dim
        v = pcanode.explained_variance
        print('%10d , %15.03f , %6.1f' % (d, v, timer() - start_time))
        if v >= theshold_variance:
            break

    print('p %d %d' % (len(p), len(p[0])))

    # Project our data onto the PCA components
    # NOTE(review): this multiplies the raw data by the projection matrix
    # without first subtracting the mean of the training data, which
    # pcanode.execute() is understood to do - confirm this is intended.
    xfd = dot(x, p)
    pca = [list(row) for row in xfd]
    print('pca %d %d' % (len(pca), len(pca[0])))

    pca_header = ['pca_%03d' % i for i in range(len(pca[0]))]
    header = h2data[0][:3] + pca_header + [h2data[0][-1]]
    # pca has exactly one row for each data row of h2data
    num_data = [row[:3] + pca_row + [row[-1]]
                for row, pca_row in zip(h2data[1:], pca)]
    csv.writeCsv(out_filename, [header] + num_data)
def preprocess(raw_name, headered_name, headered_name_pp):
    """ Add headers and pre-process the raw Kushmerick data.
        This needs to be done once.
        - raw_name is the Kushmerick data file that is read
        - headered_name is the name of the CSV file that is created by
            putting a header row in front of the raw data
        - headered_name_pp is the name of the CSV file that is created by
            replacing the missing values in the headered data
    """
    print('preprocess %s => %s => %s'
          % (raw_name, headered_name, headered_name_pp))
    header = csv.makeHeader()
    data = csv.readCsvRaw(raw_name)
    hdata = [header] + data
    csv.validateMatrix(hdata)
    csv.writeCsv(headered_name, hdata)

    # The return value of replaceMissingValues is not used, so it is relied
    # on to update hdata in place before the second file is written
    csv.replaceMissingValues(hdata)
    csv.writeCsv(headered_name_pp, hdata)