def extract_feature_names(preset):
    """Return the flat list of feature names a model preset consumes.

    Parameters
    ----------
    preset : dict
        May contain:
        * 'features'    -- list of dataset part names; each is expanded to its
          individual feature names via ``Dataset.get_part_features``.
        * 'predictions' -- list whose items are either a single prediction
          name (used as-is) or a list of prediction sources (collapsed into a
          sequentially numbered synthetic name ``pred_<i>``).

    Returns
    -------
    list of str
        Part features first, then prediction names, in preset order.
    """
    names = []

    # Expand each dataset part into its individual feature names.
    for part in preset.get('features', []):
        names += Dataset.get_part_features(part)

    # Bagged predictions (given as a list of sources) get one synthetic
    # name each; plain string entries are kept verbatim.
    pred_idx = 1
    for pred in preset.get('predictions', []):
        if isinstance(pred, list):  # isinstance, not `type(pred) is list`
            names.append('pred_%d' % pred_idx)
            pred_idx += 1
        else:
            names.append(pred)

    return names
import pandas as pd

from util import Dataset

# Column-number pairs whose pairwise differences are exported as features.
# Presumably hand-picked from a correlation analysis -- TODO confirm.
DIFF_PAIRS = [
    (1, 6), (1, 9), (1, 10),
    (6, 9), (6, 10), (6, 11), (6, 12), (6, 13),
    (7, 11), (7, 12),
    (11, 12),
]

for name in ['train', 'test']:
    # FIX: Python 2 `print` statements replaced with print() calls for
    # consistency with the rest of the codebase (works on 2 and 3).
    print("Processing %s..." % name)

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))

    df = pd.DataFrame(index=num.index)

    # diff_<a>_<b> = cont<a> - cont<b> for every configured pair.
    for a, b in DIFF_PAIRS:
        df['diff_%d_%d' % (a, b)] = num['cont%d' % a] - num['cont%d' % b]

    # Feature names only need to be stored once; rows differ per split.
    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print("Done.")
# NOTE(review): relies on names imported earlier in this file (np, skew,
# boxcox, tqdm, Dataset) -- this chunk is not the file top.
train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Fit the transform on train+test jointly so both splits share
        # the same box-cox lambda.
        values = np.hstack((train_num[:, col], test_num[:, col]))
        # FIX: removed stray debug `print(values)` that dumped every full
        # column array to stdout on each iteration.

        sk = skew(values)
        # Only de-skew clearly right-skewed columns; +1 keeps the
        # box-cox input strictly positive.
        if sk > 0.25:
            values_enc, lam = boxcox(values + 1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names are copied from the 'numeric' part -- assumes
# 'numeric_mean' has the identical column order; confirm.
Dataset.save_part_features('numeric_mean_boxcox', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_boxcox=train_num_enc).save('train')
Dataset(numeric_mean_boxcox=test_num_enc).save('test')

print("Done.")
from keras.optimizers import SGD, Adam, Adadelta
from keras.callbacks import ModelCheckpoint
from keras import regularizers
#from keras_util import ExponentialMovingAverage, batch_generator

from statsmodels.regression.quantile_regression import QuantReg
from pylightgbm.models import GBMRegressor

from scipy.stats import boxcox
from bayes_opt import BayesianOptimization

from util import Dataset, load_prediction, hstack

# Feature names of the saved 'categorical' part, used by code further down
# this file.
categoricals = Dataset.get_part_features('categorical')


# NOTE(review): BaseEstimator is not imported in this chunk -- presumably
# `from sklearn.base import BaseEstimator` appears earlier in the file; verify.
class DenseTransformer(BaseEstimator):
    """Sklearn-style transformer that converts a sparse matrix to dense."""

    def transform(self, X, y=None, **fit_params):
        # scipy sparse -> dense numpy matrix; `y`/`fit_params` are ignored,
        # kept only for pipeline-interface compatibility.
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        # Standard fit-then-transform composition.
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        # Stateless transformer: nothing to learn.
        return self


class BaseAlgo(object):
# Save column names
# NOTE(review): this chunk uses `data`, `name`, `target` and `id` defined
# earlier in the file (outside this view); `id` shadows the builtin.
num_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Everything that is neither numeric, the target, nor the id column is
# treated as categorical.
cat_columns = [c for c in data.columns if c not in [*num_columns, target, id]]

cData_mode = data[cat_columns].copy()

# Credit_History is numerically coded; missing values default to 1.0
# (presumably the mode -- confirm).
cData_mode.Credit_History.fillna(1.0, inplace=True)
# Remaining categoricals get an explicit "missing" level before encoding.
cData_mode.fillna("X", inplace=True)
cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

# NOTE(review): feature names copied from the 'categorical_mode' part --
# assumes identical column order; confirm.
Dataset.save_part_features('categorical_na_new', Dataset.get_part_features('categorical_mode'))
Dataset(categorical_na_new=cData_mode.values).save(name)

Dataset(id=data[id]).save(name)

if target in data.columns:
    le = LabelEncoder()
    le.fit(data[target])
    # FIX: removed stray debug `print(le.transform(data[target]))` that
    # dumped the whole encoded target array to stdout.
    Dataset(target=le.transform(data[target])).save(name)
    Dataset(target_labels=le.classes_).save(name)

print("Done.")
import itertools

# FIX: np and pd were used below but never imported in this chunk (which
# starts the file); added the missing imports.
import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.preprocessing import scale
from tqdm import tqdm

from util import Dataset

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

ntrain = train_num.shape[0]
train_test = np.vstack([train_num, test_num])

# NOTE(review): feature names come from the 'numeric' part while the data is
# 'numeric_mean' -- assumes identical column order; confirm.
num_features = Dataset.get_part_features('numeric')

num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]
        # BUG FIX: list.index() is already 0-based -- the original code
        # subtracted 1, which summed the wrong columns (and column -1,
        # i.e. the last one, for the first feature).
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])]
                             + train_test[:, num_features.index(comb[1])])
        print('Combining Columns:', feat)

print("Saving...")

print(num_comb_df.shape)
import numpy as np
import scipy.sparse as sp
import pandas as pd

from sklearn.preprocessing import scale
from tqdm import tqdm

from util import Dataset

print("Loading data...")

# Rebuild the train dataframe from its saved parts, labelled with the
# stored feature names and indexed by row id.
idx = Dataset.load_part("train", 'id')

train_cat = pd.DataFrame(Dataset.load_part("train", 'categorical_mode'), columns=Dataset.get_part_features('categorical_mode'), index=idx)
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)
# Side-by-side concat works because both frames share the id index.
train = pd.concat([train_cat, train_num], axis=1)

# Same reconstruction for the test split.
idx = Dataset.load_part("test", 'id')

test_cat = pd.DataFrame(Dataset.load_part("test", 'categorical_mode'), columns=Dataset.get_part_features('categorical_mode'), index=idx)
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)
test = pd.concat([test_cat, test_num], axis=1)
# NOTE(review): this chunk relies on names defined earlier in the file
# (train_num, lim, np, rankdata, minmax_scale, erfinv, scale, tqdm, Dataset)
# -- it is not the file top.
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Rank-gauss transform, fitted jointly on train+test per column.
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        # `lim` is presumably slightly below 1 so erfinv stays finite -- confirm.
        values = minmax_scale(values, feature_range=(-lim, lim))

        # Make gaussian
        values = scale(erfinv(values))

        # Split the jointly transformed column back into train / test rows.
        train_num_enc[:, col] = values[:train_num.shape[0]]
        test_num_enc[:, col] = values[train_num.shape[0]:]

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names copied from the 'numeric' part -- assumes the
# same column order as 'numeric_mean'; confirm.
Dataset.save_part_features('numeric_mean_rank_norm', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_rank_norm=train_num_enc).save('train')
Dataset(numeric_mean_rank_norm=test_num_enc).save('test')

print("Done.")
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd

from sklearn.preprocessing import scale
from tqdm import tqdm

from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)

idx = Dataset.load_part("test", 'id')
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)

# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (same row order: train then test).
all_nData = pd.concat([train_num, test_num])

print(all_nData.head())

# log1p-compress the heavy-tailed monetary / term columns.
all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = np.log1p(all_nData.LoanAmount)
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

# Split back into train / test by row count (concat preserved the order).
train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
# NOTE(review): relies on names imported earlier in the file (np, pd,
# itertools, tqdm, Dataset) -- this chunk is not the file top.
print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')
train_cat = Dataset.load_part('train', 'categorical_dummy')
test_cat = Dataset.load_part('test', 'categorical_dummy')

ntrain = train_num.shape[0]

# Dense numerics + densified one-hot categoricals, stacked train-over-test
# so every pairwise product is computed on the full dataset at once.
train = np.hstack([train_num, train_cat.toarray()])
test = np.hstack([test_num, test_cat.toarray()])
train_test = np.vstack([train, test])
# FIX: removed stray debug prints of train_test[0] and num_features.

num_features = Dataset.get_part_features('numeric_mean')
cat_features = Dataset.get_part_features('categorical_dummy')
num_features.extend(cat_features)

num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]
        # BUG FIX: list.index() is already 0-based -- the original code
        # subtracted 1, which multiplied the wrong columns (and column -1,
        # i.e. the last one, for the first feature).
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])]
                             * train_test[:, num_features.index(comb[1])])
        print('Combining Columns:', feat)

print("Saving...")
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    # FIX: Python 2 `print` statements replaced with print() calls for
    # consistency with the rest of the codebase (works on 2 and 3).
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts
    # BUG FIX: the 'numeric' part was labelled with the 'numeric_lin'
    # feature names; label it with its own names.  (Currently only the
    # commented-out cont14 feature reads from it.)
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'), columns=Dataset.get_part_features('numeric'), index=idx)
    numeric_lin = pd.DataFrame(Dataset.load_part(name, 'numeric_lin'), columns=Dataset.get_part_features('numeric_lin'), index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names (only once, on the train pass)
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")