Ejemplo n.º 1
0
from lib.porto.feature_type import get_cat_features_idx
from lib.scoring.gini import gini_normalized
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, FunctionTransformer
from sklearn.model_selection import train_test_split
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# Wall-clock reference captured before any work is done
# (presumably used for elapsed-time reporting further down -- confirm).
start = datetime.now()
cfg = get_config()
logger = get_logger()

logger.info("Loading training data into X and y...")
train = load_file()
y = train['target']
X = train.drop('target', axis=1)
X['bias'] = 1  # constant column appended to every row
# Looked up after 'bias' is added, so the result reflects X's final columns.
cat_columns = get_cat_features_idx(X)

logger.info("Preprocessing Data (Impute, Encode)...")
# Three preprocessing steps, applied in order:
#   impute -- entries equal to -1 are replaced by the column's most frequent
#             value
#   encode -- one-hot encode the columns listed in cat_columns; categories
#             unseen during fit are ignored instead of raising
#   dense  -- turn the encoder's sparse output into a dense matrix
pipe = Pipeline([
    ('impute', Imputer(missing_values=-1, strategy="most_frequent")),
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    ('dense', FunctionTransformer(lambda sparse: sparse.todense(),
                                  accept_sparse=True)),
])
Ejemplo n.º 2
0
from lib.data import load_file, convert_columns_to_int, make_missing_zero
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features, get_cat_features_idx
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from lib.scoring.gini import gini_normalized

logger = get_logger()

# Load the training set and separate the label column from the features.
train = load_file()
y = train['target']
X = train.drop('target', axis=1)

# Per the original note, make_missing_zero bumps all values up by 1 so that
# the missing marker becomes zero (helper is defined elsewhere -- confirm).
cat_columns = get_cat_features_idx(X)
X = make_missing_zero(X, cat_columns)

# Pipeline: one-hot encode the categorical columns, convert the sparse result
# to a dense matrix, then fit the estimator held in the 'model' step.
pipe = Pipeline([
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    ('to_dense', FunctionTransformer(lambda sparse: sparse.todense(),
                                     accept_sparse=True)),
    ('model', LogisticRegression()),
])
# Candidate estimators to substitute into the 'model' step.
param_grid = dict(model=[GaussianNB(), LogisticRegression()])
Ejemplo n.º 3
0
from lib.data import load_file, convert_columns_to_int
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Logger obtained from the project's get_logger helper.
logger = get_logger()

# target columns / features: load the "chi2" file; n_best() below reads its
# 'chi2' and 'feature' columns to pick the feature subset.
chi2_df = load_file("chi2")


def n_best(chdf, n=15):
    """Return the names of the ``n`` features with the highest chi2 score.

    Args:
        chdf: DataFrame with a ``'chi2'`` score column and a ``'feature'``
            name column.
        n: Number of top-ranked features to keep (default 15).

    Returns:
        The ``'feature'`` Series limited to the first ``n`` rows after
        sorting by ``'chi2'`` in descending order. Original index labels
        are preserved.
    """
    # Named ``ranked`` rather than ``sorted`` so the builtin is not shadowed.
    ranked = chdf.sort_values('chi2', axis=0, ascending=False)
    # Explicit positional slice: always "first n rows", whatever the index.
    return ranked['feature'].iloc[:n]


# Names of the 20 highest-chi2 features; these become the model inputs.
columns = n_best(chi2_df, n=20)

# Load the training set and pass the binary/categorical columns plus the
# target through convert_columns_to_int.
train = load_file()
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)
y = train['target']
X = train[columns]

# Standardise the selected features, then fit a logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
Ejemplo n.º 4
0
from lib.data import load_file, convert_columns_to_int
from lib.submit import write_submission_file
from lib.logger import get_logger
from lib.porto.feature_type import get_bin_cat_features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict, GridSearchCV
from lib.scoring.gini import gini_normalized

logger = get_logger()

# Load the training set and pass the binary/categorical columns plus the
# target through convert_columns_to_int.
train = load_file()
bit_columns = get_bin_cat_features(train)
bit_columns.append('target')
train = convert_columns_to_int(train, bit_columns)
y = train['target']
X = train.drop('target', axis=1)

# Standardise every feature, then fit Gaussian naive Bayes.
pipe = Pipeline([
    ('transform', StandardScaler()),
    ('model', GaussianNB()),
])

# Empty grid: nothing is tuned, so the search scores the pipeline as
# configured, using ROC AUC.
param_grid = dict()

model = GridSearchCV(pipe, param_grid, scoring='roc_auc')
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))

# Out-of-fold predictions from the search estimator; keep only the second
# column (index 1) of predict_proba.
results = cross_val_predict(model, X, y, method='predict_proba')[:, 1]