Example #1
0
def run_training():
    """Run the preprocessing steps on the Titanic training split.

    Reads ``titanic.csv``, holds out 20% of the rows (``random_state=0``),
    then fits and applies four preprocessors from ``pp`` to the training
    features one after another, printing ``head()`` of the frame before the
    first step, after the third step and after the last one.

    No estimator is fitted here and nothing is returned; the held-out
    split and the targets are not used.
    """
    frame = pd.read_csv('titanic.csv')
    features = frame.drop('survived', axis=1)
    target = frame['survived']
    X_train, _X_test, _y_train, _y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)

    print(X_train.head())

    # Build every transformer up front, then apply them in order.
    categorical_imputer = pp.CategoricalImputer(
        variables=config.CATEGORICAL_VARS)
    numerical_imputer = pp.NumericalImputer(
        variables=config.NUMERICAL_VARS_WITH_NA)
    first_letter = pp.ExtractFirstLetter(variables=config.CABIN)
    categorical_encoder = pp.CategoricalEncoder(
        variables=config.CATEGORICAL_VARS)

    for step in (categorical_imputer, numerical_imputer, first_letter):
        X_train = step.fit_transform(X_train)

    # Preview the frame just before the categorical encoding is applied.
    print()
    print(X_train.head())
    print()

    X_train = categorical_encoder.fit_transform(X_train)
    print(X_train.head())
Example #2
0
def pipeline(config):
    """Build the preprocessing and classification pipeline.

    Args:
        config: Object exposing a ``variables`` attribute from which
            ``drop_features``, ``numerical_vars_from_numerical``,
            ``categorical_vars`` and ``categorical_label_extraction``
            are read.

    Returns:
        A ``Pipeline`` (not fitted here) whose steps run in the order
        listed below, ending with ``MinMaxScaler`` and
        ``LogisticRegression``.
    """
    variables = config.variables

    steps = [
        ('drop_fatures',
         pp.DropUnecessaryFeatures(
             variables_to_drop=variables.drop_features)),
        ('categorical_to_numerical',
         pp.CategoricalToNumerical(
             variables=variables.numerical_vars_from_numerical)),
        ('numerical_imputer', pp.NumericalImputer()),
        ('categorical_imputer',
         pp.CategoricalImputer(variables=variables.categorical_vars)),
        # Disabled step: 'temporal_variable' (pp.TemporalVariableEstimator
        # with config.TEMPORAL_VARS / config.DROP_FEATURES).
        ('label extraction',
         pp.LabelExtraction(
             variables=variables.categorical_label_extraction)),
        ('rare label encoder',
         pp.RareLabelCategoricalEncoder(
             tol=0.01, variables=variables.categorical_vars)),
        ('categorical_encoder',
         pp.CategoricalEncoder(variables=variables.categorical_vars)),
        # Disabled steps: 'feature hashing'
        # (FeatureHasher(n_features=10, input_type='string')) and
        # 'log_transformer' (pp.LogTransformer()).
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression()),
    ]

    return Pipeline(steps)
# Numerical columns passed to the log transformer.
NUMERICALS_LOG_VARS = [
    'LotFrontage',
    '1stFlrSF',
    'GrLivArea',
]

# Numerical columns with missing values in the train set.
NUMERICAL_VARS_WITH_NA = [
    'LotFrontage',
]

# Categorical columns to encode.
CATEGORICAL_VARS = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# Preprocessing steps followed by a Lasso regressor.
# Step names are runtime keys and are kept exactly as spelled.
price_pipe = Pipeline([
    (
        'categorical_imputer',
        pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA),
    ),
    (
        'numerical_inputer',
        pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA),
    ),
    (
        'temporal_variable',
        # NOTE(review): `reference_variable` receives the same
        # TEMPORAL_VARS value as `variables` -- confirm this is intended.
        pp.TemporalVariableEstimator(
            variables=TEMPORAL_VARS,
            reference_variable=TEMPORAL_VARS,
        ),
    ),
    (
        'rare_label_encoder',
        pp.RareLabelCategoricalEncoder(
            tol=0.01,
            variables=CATEGORICAL_VARS,
        ),
    ),
    (
        'categorical_encoder',
        pp.CategoricalEncoder(variables=CATEGORICAL_VARS),
    ),
    (
        'log_transformer',
        pp.LogTransformer(variables=NUMERICALS_LOG_VARS),
    ),
    (
        'drop_features',
        pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES),
    ),
    ('scaler', MinMaxScaler()),
    ('Linear_model', Lasso(alpha=0.005, random_state=0)),
])
Example #4
0
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config


titanic_pipe = Pipeline(
    # complete with the list of steps from the preprocessors file
    # and the list of variables from the config
    [
        ('missing_indicator',
        	pp.MissingIndicator(variables=config.NUMERICAL_VARS)),

        ('categorical_imputer',
            pp.CategoricalImputer(variables=config.CATEGORICAL_VARS)),
         
        ('numerical_inputer',
            pp.NumericalImputer(variables=config.NUMERICAL_VARS)),

        ('extract_firstletter',
        	pp.ExtractFirstLetter(variables=config.CABIN)),

        ('rare_label_encoder',
        	pp.RareLabelCategoricalEncoder(
        		tol=0.05,
        		variables=config.CATEGORICAL_VARS)),

        ('categorical_encoder',
            pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import utils as ut

config = ut.read_config_file('config.yaml')

titanic_pipe = Pipeline(
    # complete with the list of steps from the preprocessors file
    # and the list of variables from the config
    [('categorical_imputer',
      pp.CategoricalImputer(
          variables=config[2]['Feature_Groups'].get('categorical_vars'))),
     ('missing_indicator',
      pp.MissingIndicator(
          variables=config[2]['Feature_Groups'].get('numerical_to_impute'))),
     ('numerical_imputer',
      pp.NumericalImputer(
          variables=config[2]['Feature_Groups'].get('numerical_to_impute'))),
     ('cabin_variable',
      pp.ExtractFirstLetter(
          variables=config[2]['Feature_Groups'].get('categorical_vars')[1])),
     ('rare_label_encoder',
      pp.RareLabelCategoricalEncoder(
          tol=0.05,
          variables=config[2]['Feature_Groups'].get('categorical_vars'))),
     ('categorical_encoder',
      pp.CategoricalEncoder(
          variables=config[2]['Feature_Groups'].get('categorical_vars'))),
Example #6
0
from sklearn.pipeline import Pipeline

import preprocessors as pp

# Categorical columns handed to the preprocessing steps.
CATEGORICAL_VARS = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# Identifier string for this pipeline.
PIPELINE_NAME = 'lasso_regression'

# Single-step pipeline: a CategoricalImputer over CATEGORICAL_VARS.
price_pipe = Pipeline(
    steps=[
        (
            'categorical_imputer',
            pp.CategoricalImputer(variables=CATEGORICAL_VARS),
        ),
    ],
)
Example #7
0
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config

# Preprocessors from `pp`, configured with the variable lists in `config`,
# followed by standard scaling and a logistic-regression model.
titanic_pipe = Pipeline(
    steps=[
        (
            'categorical_imputer',
            pp.CategoricalImputer(config.CATEGORICAL_VARS),
        ),
        (
            'missing_indicator',
            pp.MissingIndicator(config.NUMERICAL_VARS),
        ),
        (
            'numerical_imputer',
            pp.NumericalImputer(config.NUMERICAL_VARS),
        ),
        (
            'cabin_extractor',
            pp.ExtractFirstLetter(config.CABIN),
        ),
        (
            'rare_labels',
            pp.RareLabelCategoricalEncoder(
                tol=0.05,
                variables=config.CATEGORICAL_VARS,
            ),
        ),
        (
            'categorical_encoder',
            pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
        ),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
Example #8
0
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_union
from xgboost.sklearn import XGBRegressor

# %%

# Single-step pipeline: an ItemsSelector configured with
# config.DISCRETE_VARS.
discrete_vars_pipeline = Pipeline(
    steps=[
        (
            'discrete_vars_select',
            pp.ItemsSelector(features=config.DISCRETE_VARS),
        ),
    ],
)

# Categorical branch: every step is configured with
# config.CATEGORICAL_VARS and runs in the order listed.
cat_vars_pipeline = Pipeline(
    steps=[
        (
            'cat_vars_select',
            pp.ItemsSelector(features=config.CATEGORICAL_VARS),
        ),
        (
            'categorical_imputer',
            pp.CategoricalImputer(features=config.CATEGORICAL_VARS),
        ),
        (
            'other_label_encoder',
            pp.OtherLabelCategoricalEncoder(
                features=config.CATEGORICAL_VARS),
        ),
        (
            'categorical_encoder',
            pp.CategoricalEncoder(features=config.CATEGORICAL_VARS),
        ),
    ],
)

# Numerical branch: every step is configured with config.NUMERICAL_VARS
# and runs in the order listed.
numerical_vars_pipeline = Pipeline(
    steps=[
        (
            'numerical_vars_select',
            pp.ItemsSelector(features=config.NUMERICAL_VARS),
        ),
        (
            'numerical_imputer',
            pp.NumericalImputer(features=config.NUMERICAL_VARS),
        ),
        (
            'power_transformer',
            pp.SelectedFeaturesPowerTransformer(
                features=config.NUMERICAL_VARS),
        ),
    ],
)

temporal_vars_pipeline = Pipeline([