def main():
    """Combine train.dat and test.dat, encode the features, write *_NN.dat.

    Both inputs are comma-separated files whose first row is a header and
    whose first column is a row index; both are discarded. The last
    remaining column is treated as the label, everything before it as
    features. Train and test rows are stacked so that the features of both
    sets go through a single ``cat2bool`` call, then split again at the
    original train/test boundary and written with ``dataset2file``.

    Relies on ``categorical``, ``cat2bool`` and ``dataset2file`` being
    defined elsewhere in the module.
    """
    train_path = 'train.dat'
    test_path = 'test.dat'

    # Binary mode is what the Python 2 csv module expects.
    with open(train_path, 'rb') as handle:
        reader = csv.reader(handle, delimiter=",")
        next(reader, None)  # first row is the header row
        train_rows = [fields[1:] for fields in reader]  # drop row indices
    n_train = len(train_rows)

    with open(test_path, 'rb') as handle:
        reader = csv.reader(handle, delimiter=",")
        next(reader, None)  # same header assumed, so skip it as well
        test_rows = [fields[1:] for fields in reader]  # drop row indices

    table = np.array(train_rows + test_rows)
    features = table[:, :-1]
    labels = table[:, -1]
    labels = labels.reshape((len(labels), 1))  # column vector for hstack

    # NOTE(review): cat2bool presumably expands the columns flagged in
    # `categorical` into indicator columns of the given dtype -- confirm.
    features = cat2bool(features, categorical, 'int')

    print(features[:10, :])
    print(features.shape)
    print(labels.shape)

    # The first n_train rows came from the training file.
    train_block = np.hstack((features[:n_train, :], labels[:n_train]))
    test_block = np.hstack((features[n_train:, :], labels[n_train:]))
    print(train_block.shape)
    print(test_block.shape)

    dataset2file(train_block, [], 'train_NN.dat')
    dataset2file(test_block, [], 'test_NN.dat')
def main():
    """Prepare train_NN.dat / test_NN.dat from train.dat / test.dat.

    Steps:
      1. Read both CSV files, skipping the header row and the leading
         row-index column of each.
      2. Stack train and test rows into one array; the last column holds
         the labels, the rest the features.
      3. Pass the features through ``cat2bool`` together with the
         module-level ``categorical`` description.
      4. Split the result back at the train/test boundary and hand each
         part to ``dataset2file``.

    ``categorical``, ``cat2bool`` and ``dataset2file`` come from elsewhere
    in the module.
    """

    def load_body(path):
        # Data rows of the CSV at `path`: header row skipped, first
        # (row-index) column removed. 'rb' follows the Python 2 csv usage.
        with open(path, 'rb') as source:
            records = list(csv.reader(source, delimiter=","))
        return [record[1:] for record in records[1:]]

    train_part = load_body('train.dat')
    test_part = load_body('test.dat')
    split_at = len(train_part)  # rows before this index are training rows

    combined = np.array(train_part + test_part)
    inputs = combined[:, :-1]   # data
    targets = combined[:, -1]   # labels
    targets = targets.reshape((len(targets), 1))

    # presumably converts categorical columns to 'int' indicator columns;
    # verify against cat2bool
    inputs = cat2bool(inputs, categorical, 'int')
    print(inputs[:10, :])
    print(inputs.shape)
    print(targets.shape)

    train_out = np.hstack((inputs[:split_at, :], targets[:split_at]))
    test_out = np.hstack((inputs[split_at:, :], targets[split_at:]))
    print(train_out.shape)
    print(test_out.shape)

    dataset2file(train_out, [], 'train_NN.dat')
    dataset2file(test_out, [], 'test_NN.dat')
def main():
    """Re-encode train.csv / test.csv and write them out as .dat files.

    Every row of train.csv is used as-is (no header row is skipped); its
    last column is the label. Rows of test.csv get a ``'?'`` appended as a
    placeholder label. The combined table is passed through
    ``normalizedata`` and then rewritten cell by cell:

    * ``'NaN'`` / ``'nan'`` cells become ``'NA'``;
    * in columns flagged by ``categorical``, each distinct value is
      replaced by a single letter (``a``-``z`` then ``A``-``Z``, assigned
      in order of first appearance within the column).

    Finally the table is split back into its train and test parts, the
    training part goes through ``changelabels(..., ('0', '1'), ('-', '+'))``
    and both parts are written with ``dataset2file`` to ``train.dat`` and
    ``test.dat``.

    Exits via ``sys.exit`` when the training file has no rows or when a
    categorical column has more than 52 distinct values.

    Relies on ``categorical``, ``normalizedata``, ``changelabels`` and
    ``dataset2file`` being defined elsewhere in the module.
    """
    ## Load data
    f_train = 'train.csv'
    f_test = 'test.csv'

    # 'rb' follows the Python 2 csv-module convention used by this file.
    with open(f_train, 'rb') as csvfile:
        datSet_total = list(csv.reader(csvfile, delimiter=","))
    n_train = len(datSet_total)
    if n_train == 0:
        # Without a first row the feature count cannot be determined.
        sys.exit('No rows found in %s' % f_train)
    n_feat = len(datSet_total[0]) - 1  # last column contains labels

    with open(f_test, 'rb') as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            row.append('?')  # ? stands for unknown label
            datSet_total.append(row)

    n_data = len(datSet_total)
    print('Features: %d' % n_feat)
    print('n_data: %d' % n_data)
    print('n_train: %d' % n_train)
    print('n_test: %d' % (n_data - n_train))

    # Replacement symbols for categorical levels via ASCII codes:
    # 'a'-'z' (97-122) followed by 'A'-'Z' (65-90).
    remap = list(range(97, 122 + 1)) + list(range(65, 90 + 1))

    datSet_total = np.array(datSet_total)
    # NOTE(review): normalizedata is opaque here; the loop below assumes
    # its result is still indexable as [row][column] -- confirm.
    datSet_total = normalizedata(datSet_total, categorical)

    for i in range(n_feat):
        is_categorical = categorical[i]
        if is_categorical:
            print('Feature %d is categorical' % i)
        else:
            print('Feature %d is continuous' % i)

        unique_vals = []  # distinct raw values of this column, in order seen
        for d in range(n_data):
            elem_cur = datSet_total[d][i]
            if elem_cur == 'NaN' or elem_cur == 'nan':
                datSet_total[d][i] = 'NA'
            elif is_categorical:
                if elem_cur not in unique_vals:
                    if len(unique_vals) >= len(remap):
                        # No symbol left for another level.
                        sys.exit('Too many unique values')
                    unique_vals.append(elem_cur)
                datSet_total[d][i] = chr(remap[unique_vals.index(elem_cur)])

    ## Split data again in test and train data
    datSet_train = datSet_total[:n_train]
    datSet_test = datSet_total[n_train:]
    # Only the training part has real labels to translate; the test part
    # carries the '?' placeholders.
    datSet_train = changelabels(datSet_train, ('0', '1'), ('-', '+'))

    for fname, subset in ((f_train, datSet_train), (f_test, datSet_test)):
        # Output file: same base name as the input, '.dat' extension.
        fname_out = os.path.splitext(fname)[0] + '.dat'
        dataset2file(subset, [], fname_out)