# Rank-gauss normalization of the mean-imputed numeric features:
# rank -> scale into (-lim, lim) -> erfinv -> standardize, so each column
# becomes approximately Gaussian. Train and test are transformed jointly so
# both splits share one mapping.
# NOTE(review): `train_num` and `lim` are defined earlier in this script,
# outside this chunk; `lim` is presumably slightly below 1 so erfinv stays
# finite -- confirm.
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Concatenate both splits so ranks are computed over all rows.
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        values = minmax_scale(values, feature_range=(-lim, lim))

        # Make gaussian
        values = scale(erfinv(values))

        # Split the jointly transformed column back into train / test rows.
        train_num_enc[:, col] = values[:train_num.shape[0]]
        test_num_enc[:, col] = values[train_num.shape[0]:]

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names come from the 'numeric' part while the data is
# 'numeric_mean' -- presumably identical column names; confirm.
Dataset.save_part_features('numeric_mean_rank_norm', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_rank_norm=train_num_enc).save('train')
Dataset(numeric_mean_rank_norm=test_num_enc).save('test')

print("Done.")
# Hand-crafted interaction / ratio features for the loan dataset.
# NOTE(review): `all` is the combined train+test DataFrame built earlier in
# this script, outside this chunk (it shadows the builtin all());
# `train_cat` also comes from earlier in the script.
custom = pd.DataFrame()

# Income relative to the requested loan amount.
custom["inc/lAmount"] = all.ApplicantIncome / all.LoanAmount
custom["prop_area_LAmount"] = all.Property_Area * all.LoanAmount
custom["chist_loanamnt"] = all.Credit_History * all.LoanAmount
custom["chist_loanamnt_term"] = all.Credit_History * all.LoanAmount / all.Loan_Amount_Term
custom["allIncome"] = all.ApplicantIncome + all.CoapplicantIncome
# Household income left after the per-term installment.
custom["netIncome"] = custom["allIncome"] - (all.LoanAmount / all.Loan_Amount_Term)
custom["depend_chist"] = all.Dependents * all.Credit_History
custom["married_chist"] = all.Credit_History * all.Married

# Standardize all custom features jointly over train+test.
all_scaled = scale(custom)
#all_scaled = custom

# Split back into train / test by the train row count.
train_custom = all_scaled[:train_cat.shape[0]]
test_custom = all_scaled[train_cat.shape[0]:]

print(train_cat.head())
print(test_custom.shape)
print(test_cat.shape)
print(list(custom.columns))

# # train_cat_enc = sp.hstack(train_cat_enc, format='csr')
# test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('custom', list(custom.columns))
Dataset(custom=train_custom).save('train')
Dataset(custom=test_custom).save('test')
# # print("Done.")
# Dense SVD features: standardize the numerics, stack with the dummy-encoded
# categoricals, and project the combined train+test matrix with TruncatedSVD.
# NOTE(review): `n_components` and the hstack/vstack/scale/TruncatedSVD
# imports are defined earlier in this script, outside this chunk.
train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print "Combining data..."

# Numerics scaled in float64 for precision, stored back as float32 to keep
# the combined matrix small.
all_data = hstack((scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32), vstack((train_cat, test_cat))))

# Free the per-part arrays before the expensive fit.
del train_num, train_cat, test_num, test_cat

print "Fitting svd..."

svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)

print "Explained variance ratio: %.5f" % np.sum(svd.explained_variance_ratio_)

print "Saving..."

Dataset.save_part_features('svd', ['svd%d' % i for i in xrange(n_components)])
Dataset(svd=res[:train_cnt]).save('train')
Dataset(svd=res[train_cnt:]).save('test')

print "Done."
# NOTE(review): this chunk is the tail of a rare-value dummy-encoding script;
# it opens mid-expression -- the matching
# `train_cat_enc.append(sp.csr_matrix((train_cat[:, col] == val)...reshape(`
# is above this view, as are the loops over categorical columns (`col`,
# `cat`) and values (`val`) and the frequency test that feeds the `else`.
# Indentation below is reconstructed -- confirm against the full script.
                    (train_cat.shape[0], 1))))
            test_cat_enc.append(
                sp.csr_matrix(
                    (test_cat[:, col] == val).astype(np.uint8).reshape(
                        (test_cat.shape[0], 1))))
        else:
            # Infrequent values are pooled into one per-column indicator.
            train_rares += (train_cat[:, col] == val).astype(np.uint8)
            test_rares += (test_cat[:, col] == val).astype(np.uint8)

    # Emit the pooled rare indicator only when it fires in both splits.
    if train_rares.sum() > 0 and test_rares.sum() > 0:
        features.append('%s_rare' % cat)
        train_cat_enc.append(
            sp.csr_matrix(train_rares.reshape((train_cat.shape[0], 1))))
        test_cat_enc.append(
            sp.csr_matrix(test_rares.reshape((test_cat.shape[0], 1))))

    pbar.update(1)

print "Created %d dummy vars" % len(features)

print "Saving..."

# Concatenate the per-value sparse columns into one CSR matrix per split.
train_cat_enc = sp.hstack(train_cat_enc, format='csr')
test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('categorical_dummy', features)
Dataset(categorical_dummy=train_cat_enc).save('train')
Dataset(categorical_dummy=test_cat_enc).save('test')

print "Done."
# Box-Cox transform of the skewed mean-imputed numeric columns.
# Train and test are transformed jointly so both splits share one lambda.
# FIX: removed a stray debug `print(values)` that dumped the full
# concatenated column array on every loop iteration.
train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Only unskew clearly right-skewed columns; the +1 shift satisfies
        # boxcox's strictly-positive input requirement for zero entries.
        if skew(values) > 0.25:
            values_enc, lam = boxcox(values + 1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            # Column is not skewed enough -- pass it through unchanged.
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names come from the 'numeric' part while the data is
# 'numeric_mean' -- presumably identical column names; confirm.
Dataset.save_part_features('numeric_mean_boxcox', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_boxcox=train_num_enc).save('train')
Dataset(numeric_mean_boxcox=test_num_enc).save('test')

print("Done.")
import pandas as pd from util import Dataset for name in ['train', 'test']: print "Processing %s..." % name idx = Dataset.load_part(name, 'id') # Load parts numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'), columns=Dataset.get_part_features('numeric_lin'), index=idx) numeric_lin = pd.DataFrame( Dataset.load_part(name, 'numeric_lin'), columns=Dataset.get_part_features('numeric_lin'), index=idx) # Build features df = pd.DataFrame(index=idx) #df['cont14'] = numeric['cont14'] df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1'] # Save column names if name == 'train': Dataset.save_part_features('manual', list(df.columns)) Dataset(manual=df.values).save(name) print "Done."
import pandas as pd import numpy as np from util import Dataset for name in ['train', 'test']: print "Processing %s..." % name data = pd.read_csv('../input/lin_%s.csv' % name) # Save column names if name == 'train': num_columns = [c for c in data.columns if c.startswith('cont')] Dataset.save_part_features('numeric_lin', num_columns) Dataset(numeric_lin=data[num_columns].values.astype(np.float32)).save(name) print "Done."
# NOTE(review): this chunk is the tail of a rare-value dummy-encoding script
# for the manually-cleaned categoricals; it opens mid-expression -- the
# matching `train_cat_enc.append(sp.csr_matrix((train_cat[:, col] == val)
# ...reshape(` is above this view, as are the loops over columns (`col`,
# `cat`) and values (`val`). Indentation below is reconstructed -- confirm.
                    (train_cat.shape[0], 1))))
            test_cat_enc.append(
                sp.csr_matrix(
                    (test_cat[:, col] == val).astype(np.uint8).reshape(
                        (test_cat.shape[0], 1))))
        else:
            # Infrequent values are pooled into one per-column indicator.
            train_rares += (train_cat[:, col] == val).astype(np.uint8)
            test_rares += (test_cat[:, col] == val).astype(np.uint8)

    # Keep the pooled rare indicator only when it fires in both splits.
    if train_rares.sum() > 0 and test_rares.sum() > 0:
        features.append('%s_rare' % cat)
        train_cat_enc.append(
            sp.csr_matrix(train_rares.reshape((train_cat.shape[0], 1))))
        test_cat_enc.append(
            sp.csr_matrix(test_rares.reshape((test_cat.shape[0], 1))))

    pbar.update(1)

print("Created %d dummy vars" % len(features))

print("Saving...")

# Concatenate the per-value sparse columns into one CSR matrix per split.
train_cat_enc = sp.hstack(train_cat_enc, format='csr')
test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('cat_man_dummy', features)
Dataset(cat_man_dummy=train_cat_enc).save('train')
Dataset(cat_man_dummy=test_cat_enc).save('test')

print("Done.")
# log1p-transform the mean-imputed numeric loan features, train+test jointly.
# NOTE(review): `train_num` is built earlier in this script, outside this
# chunk, the same way `test_num` is built below.
idx = Dataset.load_part("test", 'id')
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)

all_nData = train_num.append(test_num)
print(all_nData.head())

all_num_norm = pd.DataFrame()

# log1p compresses the long right tails of the income / amount columns.
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = (np.log1p(all_nData.LoanAmount))
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

# Split back into train / test by the train row count.
train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]

print(train_num.head())
print(test_custom.shape)
print(test_num.shape)
print(list(all_num_norm.columns))

# # train_cat_enc = sp.hstack(train_cat_enc, format='csr')
# test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('num_log1', list(all_num_norm.columns))
Dataset(num_log1=train_custom).save('train')
Dataset(num_log1=test_custom).save('test')
# # print("Done.")
train_num = Dataset.load_part('train', 'numeric') test_num = Dataset.load_part('test', 'numeric') print "Scaling..." numeric = pd.DataFrame(np.vstack((train_num, test_num)), columns=Dataset.get_part_features('numeric')) df = pd.DataFrame(index=numeric.index) df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"])) df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"])) df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"])) df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"])) df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"])) df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"])) df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"])) df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0000.1) df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0000.1) df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0000.1) df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0000.1) df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122)**0.25 print "Saving..." Dataset.save_part_features('numeric_unskew', list(df.columns)) Dataset(numeric_unskew=df.values[:train_num.shape[0]]).save('train') Dataset(numeric_unskew=df.values[train_num.shape[0]:]).save('test') print "Done."
import numpy as np
import pandas as pd

from tqdm import tqdm

from util import Dataset

# Frequency (count) encoding of the mode-imputed categorical columns: each
# category value is replaced by its number of occurrences over the combined
# train+test data, so both splits share one mapping.
# FIX: added the missing numpy/pandas imports and removed a stray debug
# `print(train_cat_counts)` that dumped the whole matrix to stdout.
print("Loading data...")

train_cat = Dataset.load_part('train', 'categorical_mode')
test_cat = Dataset.load_part('test', 'categorical_mode')

train_cat_counts = np.zeros(train_cat.shape, dtype=np.float32)
test_cat_counts = np.zeros(test_cat.shape, dtype=np.float32)

with tqdm(total=train_cat.shape[1], desc=' Counting', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        train_series = pd.Series(train_cat[:, col])
        test_series = pd.Series(test_cat[:, col])

        # Counts computed jointly over both splits.
        counts = pd.concat((train_series, test_series)).value_counts()

        train_cat_counts[:, col] = train_series.map(counts).values
        test_cat_counts[:, col] = test_series.map(counts).values

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names come from the 'categorical' part while the data
# is 'categorical_mode' -- presumably identical column names; confirm.
Dataset.save_part_features('categorical_counts', Dataset.get_part_features('categorical'))
Dataset(categorical_counts=train_cat_counts).save('train')
Dataset(categorical_counts=test_cat_counts).save('test')

print("Done.")
# RBF-kernel similarities to MiniBatchKMeans cluster centers, computed on the
# combined scaled-numeric + dummy-categorical matrix, for several cluster
# counts.
# NOTE(review): train_num/test_num/train_cat/test_cat, `gamma`, and the
# hstack/vstack/scale/MiniBatchKMeans imports are defined earlier in this
# script, outside this chunk.
print "Combining data..."

# Numerics scaled in float64 for precision, stored back as float32.
all_data = hstack((scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32), vstack((train_cat, test_cat))))

for n_clusters in [25, 50, 75, 100, 200]:
    part_name = 'cluster_rbf_%d' % n_clusters

    print "Finding %d clusters..." % n_clusters

    # Seed depends on n_clusters: deterministic per run, distinct per size.
    kmeans = MiniBatchKMeans(n_clusters, random_state=17 * n_clusters + 11, n_init=5)
    kmeans.fit(all_data)

    print "Transforming data..."

    # transform() returns center distances; exp(-gamma * d) maps them to RBF
    # similarities in (0, 1].
    cluster_rbf = np.exp(-gamma * kmeans.transform(all_data))

    print "Saving..."

    Dataset.save_part_features(
        part_name,
        ['cluster_rbf_%d_%d' % (n_clusters, i) for i in xrange(n_clusters)])
    Dataset(**{part_name: cluster_rbf[:train_num.shape[0]]}).save('train')
    Dataset(**{part_name: cluster_rbf[train_num.shape[0]:]}).save('test')

print "Done."
# Label-encode the manually-cleaned categorical columns and persist the
# manual parts plus the encoded target.
# NOTE(review): train_cleaned/test_cleaned, cat_columns/num_columns and the
# `target`/`id` column-name variables are defined earlier in this script,
# outside this chunk (`id` shadows the builtin id()).
# NOTE(review): a fresh LabelEncoder is fit on train and on test separately;
# if the splits contain different category sets the integer codes will not
# line up between train and test -- verify this is intended.
train_cleaned.loc[:, cat_columns] = train_cleaned[cat_columns].apply(
    LabelEncoder().fit_transform)
test_cleaned.loc[:, cat_columns] = test_cleaned[cat_columns].apply(
    LabelEncoder().fit_transform)

Dataset(cat_manual=train_cleaned[cat_columns].values).save("train")
Dataset(num_manual=train_cleaned[num_columns].values.astype(np.float32)).save(
    "train")
Dataset(id=train_cleaned[id]).save("train")

Dataset(cat_manual=test_cleaned[cat_columns].values).save("test")
Dataset(num_manual=test_cleaned[num_columns].values.astype(np.float32)).save(
    "test")
Dataset(id=test_cleaned[id]).save("test")

# Reuse the canonical feature-name lists for the manual parts.
Dataset.save_part_features('cat_manual',
                           Dataset.get_part_features('categorical_mode'))
Dataset.save_part_features('num_manual',
                           Dataset.get_part_features('numeric_mean'))

# Encode the target labels (train only) and store the class names alongside.
le = LabelEncoder()
le.fit(train_cleaned[target])
print(le.transform(train_cleaned[target]))

Dataset(target=le.transform(train_cleaned[target])).save("train")
Dataset(target_labels=le.classes_).save("train")

# # train_cat_enc = sp.hstack(train_cat_enc, format='csr')
# test_cat_enc = sp.hstack(test_cat_enc, format='csr')
# Dataset.save_part_features('custom', list(custom.columns))
# Dataset(custom=train_custom).save('train')
# Save column names num_columns = [ 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term' ] cat_columns = [ c for c in data.columns if (c not in [*num_columns, target, id]) ] cData_mode = data[cat_columns].copy() cData_mode.Credit_History.fillna(1.0, inplace=True) cData_mode.fillna("X", inplace=True) cData_mode = cData_mode.apply(LabelEncoder().fit_transform) Dataset.save_part_features('categorical_na_new', Dataset.get_part_features('categorical_mode')) Dataset(categorical_na_new=cData_mode.values).save(name) Dataset(id=data[id]).save(name) if target in data.columns: le = LabelEncoder() le.fit(data[target]) print(le.transform(data[target])) Dataset(target=le.transform(data[target])).save(name) Dataset(target_labels=le.classes_).save(name) print("Done.")
print "Loading data..." train_cat = Dataset.load_part('train', 'categorical') test_cat = Dataset.load_part('test', 'categorical') train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8) test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8) with tqdm(total=train_cat.shape[1], desc=' Encoding', unit='cols') as pbar: for col in xrange(train_cat.shape[1]): values = np.hstack((train_cat[:, col], test_cat[:, col])) values = np.unique(values) values = sorted(values, key=lambda x: (len(x), x)) encoding = dict(zip(values, range(len(values)))) train_cat_enc[:, col] = pd.Series(train_cat[:, col]).map(encoding).values test_cat_enc[:, col] = pd.Series(test_cat[:, col]).map(encoding).values pbar.update(1) print "Saving..." Dataset.save_part_features('categorical_encoded', Dataset.get_part_features('categorical')) Dataset(categorical_encoded=train_cat_enc).save('train') Dataset(categorical_encoded=test_cat_enc).save('test') print "Done."
# # all_s["app_s"] = all_s["ApplicantIncome"] * all_s["Self_Employed"] # all_s["ci_s"] = all_s["CoapplicantIncome"] * all_s["Self_Employed"] # all_s["la_s"] = all_s["LoanAmount"] * all_s["Self_Employed"] # all_s["lat_s"] = all_s["Loan_Amount_Term"] * all_s["Self_Employed"] features_to_drop = ['Gender', 'Married', 'Dependents','Education', 'Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'] all_filtered = all_s.drop(features_to_drop,axis=1) print(train.columns) # ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', # 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', # 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'] train_cust = all_filtered[:ntrain] test_cust = all_filtered[ntrain:] print(all_s.columns) # # train_cat_enc = sp.hstack(train_cat_enc, format='csr') # test_cat_enc = sp.hstack(test_cat_enc, format='csr') Dataset.save_part_features('fSelect', list(all_filtered.columns)) Dataset(fSelect=train_cust.values).save('train') Dataset(fSelect=test_cust.values).save('test') # # print("Done.")
# Load one split, record the canonical column lists (train only), then build
# mode-imputed categoricals and mean/median-imputed numerics.
# NOTE(review): `name` comes from an enclosing per-split loop outside this
# chunk, and the chunk ends mid-script (numData_median is still unfilled).
data = pd.read_csv('input/%s.csv' % name)

target = "Loan_Status"
id = "Loan_ID"  # NOTE(review): shadows the builtin id()

# Save column names
if name == 'train':
    num_columns = [
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'
    ]
    cat_columns = [
        c for c in data.columns if (c not in [*num_columns, target, id])
    ]

    print(cat_columns)
    print(num_columns)

    Dataset.save_part_features('categorical_mode', cat_columns)
    Dataset.save_part_features('numeric_mean', num_columns)
    Dataset.save_part_features('numeric_median', num_columns)

# Impute categorical NAs with the per-column mode, then label-encode.
cData_mode = data[cat_columns].copy()
# NOTE(review): DataFrame.ix is deprecated/removed in modern pandas -- this
# requires an old pandas version (.iloc[0] is the modern spelling).
cat_mode = cData_mode.mode().ix[0]
cData_mode.fillna(cat_mode, inplace=True)
cData_mode = cData_mode.apply(LabelEncoder().fit_transform)
#print(cat_data.isnull().sum())

# Mean-imputed numerics.
numData_mean = data[num_columns].copy()
num_mean = numData_mean.mean()
numData_mean.fillna(num_mean, inplace=True)

numData_median = data[num_columns].copy()
# Pairwise sums of the mean-imputed numeric columns (every 2-combination),
# standardized jointly over train+test.
# NOTE(review): `train_num` is loaded earlier in this script, outside this
# chunk, and the feature names come from the 'numeric' part -- presumably the
# same columns as 'numeric_mean'; confirm.
test_num = Dataset.load_part('test', 'numeric_mean')

ntrain = train_num.shape[0]
train_test = np.vstack([train_num, test_num])

num_features = Dataset.get_part_features('numeric')

num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]

        # FIX: list.index() is already 0-based; the original subtracted 1,
        # pairing each feature name with the wrong column (and wrapping the
        # first feature around to the last column via index -1).
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])] +
                             train_test[:, num_features.index(comb[1])])
        print('Combining Columns:', feat)

print("Saving...")
print(num_comb_df.shape)

Dataset.save_part_features('numeric_comb', list(num_comb_df.columns))

# FIX: scale before slicing -- the original took the train/test slices first
# and then reassigned num_comb_df = scale(...), so the standardized values
# were computed but silently discarded.
scaled = pd.DataFrame(scale(num_comb_df), columns=num_comb_df.columns)
train_num_comb = scaled[:ntrain]
test_num_comb = scaled[ntrain:]

Dataset(numeric_comb=train_num_comb.values).save('train')
Dataset(numeric_comb=test_num_comb.values).save('test')

print("Done.")
import numpy as np

from sklearn.preprocessing import scale

from util import Dataset

# Standardize the mean-imputed numeric features. The scaler statistics are
# computed on the stacked train+test matrix so both splits share them.
print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

print("Scaling...")

n_train = train_num.shape[0]
all_scaled = scale(np.vstack((train_num, test_num)))

print("Saving...")

# NOTE(review): feature names are taken from the 'numeric' part --
# presumably identical to 'numeric_mean'; confirm.
Dataset.save_part_features('numeric_mean_scaled',
                           Dataset.get_part_features('numeric'))
Dataset(numeric_mean_scaled=all_scaled[:n_train]).save('train')
Dataset(numeric_mean_scaled=all_scaled[n_train:]).save('test')

print("Done.")
import pandas as pd from util import Dataset for name in ['train', 'test']: print "Processing %s..." % name num = pd.DataFrame(Dataset.load_part(name, 'numeric'), columns=Dataset.get_part_features('numeric')) df = pd.DataFrame(index=num.index) df['diff_1_6'] = num['cont1'] - num['cont6'] df['diff_1_9'] = num['cont1'] - num['cont9'] df['diff_1_10'] = num['cont1'] - num['cont10'] df['diff_6_9'] = num['cont6'] - num['cont9'] df['diff_6_10'] = num['cont6'] - num['cont10'] df['diff_6_11'] = num['cont6'] - num['cont11'] df['diff_6_12'] = num['cont6'] - num['cont12'] df['diff_6_13'] = num['cont6'] - num['cont13'] df['diff_7_11'] = num['cont7'] - num['cont11'] df['diff_7_12'] = num['cont7'] - num['cont12'] df['diff_11_12'] = num['cont11'] - num['cont12'] if name == 'train': Dataset.save_part_features('numeric_combinations', list(df.columns)) Dataset(numeric_combinations=df.values).save(name) print "Done."
import pandas as pd import numpy as np from util import Dataset for name in ['train', 'test']: print "Processing %s..." % name data = pd.read_csv('../input/%s.csv.zip' % name) # Save column names if name == 'train': cat_columns = [c for c in data.columns if c.startswith('cat')] num_columns = [c for c in data.columns if c.startswith('cont')] Dataset.save_part_features('categorical', cat_columns) Dataset.save_part_features('numeric', num_columns) Dataset(categorical=data[cat_columns].values).save(name) Dataset(numeric=data[num_columns].values.astype(np.float32)).save(name) Dataset(id=data['id']).save(name) if 'loss' in data.columns: Dataset(loss=data['loss']).save(name) print "Done."