def extract_feature_names(preset):
    """Return the flat list of feature column names for a model preset.

    Each entry of preset['features'] is a stored part name that expands to
    its individual column names. Each entry of preset['predictions'] is
    either a plain name (used as-is) or a list of fold predictions, which
    gets a synthetic sequential name 'pred_1', 'pred_2', ...
    """
    names = []

    # Expand stored feature parts into their individual column names.
    for part in preset.get('features', []):
        names += Dataset.get_part_features(part)

    # FIX: use isinstance instead of `type(pred) is list` so list
    # subclasses are handled too.
    pred_idx = 1
    for pred in preset.get('predictions', []):
        if isinstance(pred, list):
            names.append('pred_%d' % pred_idx)
            pred_idx += 1
        else:
            names.append(pred)

    return names
import pandas as pd

from util import Dataset

# Build hand-picked pairwise differences of correlated continuous columns
# for both splits.
for name in ['train', 'test']:
    # FIX: converted Python 2 `print` statements to print() calls —
    # the rest of this file uses the Python 3 form.
    print("Processing %s..." % name)

    # Load the raw numeric part as a labelled frame.
    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    # One diff column per (a, b) pair: cont<a> - cont<b>.
    pairs = [(1, 6), (1, 9), (1, 10), (6, 9), (6, 10), (6, 11),
             (6, 12), (6, 13), (7, 11), (7, 12), (11, 12)]
    for a, b in pairs:
        df['diff_%d_%d' % (a, b)] = num['cont%d' % a] - num['cont%d' % b]

    # Feature names are identical across splits; save them once.
    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print("Done.")
# Ejemplo n.º 3  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
# Box-Cox transform right-skewed numeric columns, fitted jointly on
# train+test so both splits share the same transform.
train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Combined train+test distribution for this column.
        # FIX: removed stray debug `print(values)` — it dumped every
        # column's raw values and mangled the tqdm progress output.
        values = np.hstack((train_num[:, col], test_num[:, col]))
        sk = skew(values)

        # Only de-skew clearly right-skewed columns; the +1 keeps the
        # Box-Cox input strictly positive.
        if sk > 0.25:
            values_enc, lam = boxcox(values + 1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)

print("Saving...")

# NOTE(review): feature names are taken from the 'numeric' part while the
# data comes from 'numeric_mean' — confirm the two share the same columns.
Dataset.save_part_features('numeric_mean_boxcox', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_boxcox=train_num_enc).save('train')
Dataset(numeric_mean_boxcox=test_num_enc).save('test')

print("Done.")
from keras.optimizers import SGD, Adam, Adadelta
from keras.callbacks import ModelCheckpoint
from keras import regularizers
#from keras_util import ExponentialMovingAverage, batch_generator

from statsmodels.regression.quantile_regression import QuantReg

from pylightgbm.models import GBMRegressor

from scipy.stats import boxcox

from bayes_opt import BayesianOptimization

from util import Dataset, load_prediction, hstack

categoricals = Dataset.get_part_features('categorical')


class DenseTransformer(BaseEstimator):
    """Stateless pipeline step that converts sparse matrices to dense."""

    def fit(self, X, y=None, **fit_params):
        """No-op fit; present only for scikit-learn API compatibility."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the dense matrix form of sparse input X."""
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and then densify X."""
        return self.fit(X, y, **fit_params).transform(X)


class BaseAlgo(object):
# Ejemplo n.º 5  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
    # NOTE(review): this fragment is indented inside a loop whose header is
    # not visible in this chunk; `data`, `name`, `target` and `id` come from
    # that enclosing scope. Column names suggest a loan-prediction dataset —
    # confirm against the original script.

    # Save column names

    # The four numeric application columns; everything else (minus the
    # target and id columns) is treated as categorical.
    num_columns = [
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term'
    ]
    cat_columns = [
        c for c in data.columns if (c not in [*num_columns, target, id])
    ]

    # Impute: missing Credit_History becomes 1.0, every other categorical
    # NA becomes the placeholder label "X".
    cData_mode = data[cat_columns].copy()
    cData_mode.Credit_History.fillna(1.0, inplace=True)
    cData_mode.fillna("X", inplace=True)

    # Label-encode each categorical column independently.
    cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

    # NOTE(review): saves part 'categorical_na_new' but reuses the feature
    # names of 'categorical_mode' — confirm the column sets match.
    Dataset.save_part_features('categorical_na_new',
                               Dataset.get_part_features('categorical_mode'))
    Dataset(categorical_na_new=cData_mode.values).save(name)

    Dataset(id=data[id]).save(name)

    # Only the train split carries the target: encode it and keep the
    # label <-> class mapping for later decoding of predictions.
    if target in data.columns:
        le = LabelEncoder()
        le.fit(data[target])
        print(le.transform(data[target]))
        Dataset(target=le.transform(data[target])).save(name)
        Dataset(target_labels=le.classes_).save(name)

print("Done.")
from scipy.stats import skew, boxcox
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

import itertools

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')
ntrain = train_num.shape[0]

# Stack train over test so every combined feature is built identically
# for both splits.
train_test = np.vstack([train_num, test_num])
# NOTE(review): names come from the 'numeric' part while the data comes
# from 'numeric_mean' — confirm both have the same column order.
num_features = Dataset.get_part_features('numeric')
num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc='  Transforming',
          unit='cols') as pbar:
    # One new feature per unordered pair: the sum of the two columns.
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]

        # BUG FIX: list.index() is already 0-based; the original code
        # subtracted 1, pairing each name with the wrong column and
        # wrapping the first feature around to the last column (index -1).
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])] +
                             train_test[:, num_features.index(comb[1])])
        print('Combining Columns:', feat)

print("Saving...")
print(num_comb_df.shape)
# Ejemplo n.º 7  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")


def _part_frame(split, part, index):
    """Load a stored part as a DataFrame labelled with its saved columns."""
    return pd.DataFrame(Dataset.load_part(split, part),
                        columns=Dataset.get_part_features(part),
                        index=index)


# Assemble the train frame: mode-imputed categoricals next to
# mean-imputed numerics, indexed by row id.
idx = Dataset.load_part("train", 'id')

train_cat = _part_frame("train", 'categorical_mode', idx)
train_num = _part_frame("train", 'numeric_mean', idx)

train = pd.concat([train_cat, train_num], axis=1)

# Same assembly for the test split.
idx = Dataset.load_part("test", 'id')

test_cat = _part_frame("test", 'categorical_mode', idx)
test_num = _part_frame("test", 'numeric_mean', idx)

test = pd.concat([test_cat, test_num], axis=1)
# NOTE(review): from here on this looks like a separate, pasted-in
# "rank gauss" snippet: `rankdata`, `minmax_scale`, `erfinv` and `lim`
# are not defined anywhere in this chunk, and `train_num` is still the
# DataFrame built above (so `train_num[:, col]` would raise). A matching
# `train_num = Dataset.load_part('train', 'numeric_mean')` reload appears
# to be missing — confirm against the original script.
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        # Transform train+test jointly so both splits share one mapping.
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        values = minmax_scale(values, feature_range=(-lim, lim))

        # Make gaussian
        values = scale(erfinv(values))

        train_num_enc[:, col] = values[:train_num.shape[0]]
        test_num_enc[:, col] = values[train_num.shape[0]:]

        pbar.update(1)

print("Saving...")

# NOTE(review): names come from the 'numeric' part while the data comes
# from 'numeric_mean' — confirm the columns match.
Dataset.save_part_features('numeric_mean_rank_norm', Dataset.get_part_features('numeric'))
Dataset(numeric_mean_rank_norm=train_num_enc).save('train')
Dataset(numeric_mean_rank_norm=test_num_enc).save('test')

print("Done.")
# Ejemplo n.º 9  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')

train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)

idx = Dataset.load_part("test", 'id')

test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)


# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported equivalent (same row order).
all_nData = pd.concat([train_num, test_num])
print(all_nData.head())

# log1p to tame the right skew of the income/amount columns.
all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = np.log1p(all_nData.LoanAmount)
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

# Split back into the original train/test row ranges.
train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
# Ejemplo n.º 10  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_cat = Dataset.load_part('train', 'categorical_dummy')
test_cat = Dataset.load_part('test', 'categorical_dummy')
ntrain = train_num.shape[0]

# Dense numeric block next to the densified dummy-encoded categoricals.
train = np.hstack([train_num, train_cat.toarray()])
test = np.hstack([test_num, test_cat.toarray()])


# Stack train over test so every combined feature is built identically.
train_test = np.vstack([train, test])
print(train_test[0])
num_features = Dataset.get_part_features('numeric_mean')
cat_features = Dataset.get_part_features('categorical_dummy')

# Combined name list mirrors the hstack column order above.
num_features.extend(cat_features)
print(num_features)
num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    # One new feature per unordered pair: the product of the two columns.
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]
        # BUG FIX: list.index() is already 0-based; the original '- 1'
        # multiplied the wrong columns and wrapped the first feature
        # around to the last column (index -1).
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0])] *
                             train_test[:, num_features.index(comb[1])])
        print('Combining Columns:', feat)


print("Saving...")
# Ejemplo n.º 11  (snippet separator from the source page; commented out —
# the bare text is not valid Python)
# 0
import pandas as pd

from util import Dataset

# Build the hand-crafted 'manual' feature part for both splits.
for name in ['train', 'test']:
    # FIX: converted Python 2 `print` statements to print() calls —
    # the rest of this file uses the Python 3 form.
    print("Processing %s..." % name)

    idx = Dataset.load_part(name, 'id')

    # Load parts
    # FIX: the 'numeric' part was being labelled with the 'numeric_lin'
    # feature names; label each part with its own saved names.
    numeric = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                           columns=Dataset.get_part_features('numeric'),
                           index=idx)
    numeric_lin = pd.DataFrame(
        Dataset.load_part(name, 'numeric_lin'),
        columns=Dataset.get_part_features('numeric_lin'),
        index=idx)

    # Build features
    df = pd.DataFrame(index=idx)
    #df['cont14'] = numeric['cont14']
    df['cont_1_9_diff'] = numeric_lin['cont9'] - numeric_lin['cont1']

    # Save column names (identical across splits, so save once)
    if name == 'train':
        Dataset.save_part_features('manual', list(df.columns))

    Dataset(manual=df.values).save(name)

print("Done.")