Example #1
0
def pipeline(config):
    """Build the preprocessing + logistic-regression pipeline.

    Args:
        config: Settings object exposing a ``variables`` attribute that
            provides ``drop_features``, ``numerical_vars_from_numerical``,
            ``categorical_vars`` and ``categorical_label_extraction``.

    Returns:
        A new, unfitted ``Pipeline`` ending in ``MinMaxScaler`` followed by
        ``LogisticRegression``.
    """
    feature_cfg = config.variables

    # Step names are string literals handed to Pipeline; they are reproduced
    # exactly (including the 'drop_fatures' spelling) so that anything
    # addressing a step by name keeps working.
    steps = [
        (
            'drop_fatures',
            pp.DropUnecessaryFeatures(
                variables_to_drop=feature_cfg.drop_features,
            ),
        ),
        (
            'categorical_to_numerical',
            pp.CategoricalToNumerical(
                variables=feature_cfg.numerical_vars_from_numerical,
            ),
        ),
        ('numerical_imputer', pp.NumericalImputer()),
        (
            'categorical_imputer',
            pp.CategoricalImputer(variables=feature_cfg.categorical_vars),
        ),
        # Not enabled: a 'temporal_variable' step
        # (pp.TemporalVariableEstimator).
        (
            'label extraction',
            pp.LabelExtraction(
                variables=feature_cfg.categorical_label_extraction,
            ),
        ),
        (
            'rare label encoder',
            pp.RareLabelCategoricalEncoder(
                tol=0.01,
                variables=feature_cfg.categorical_vars,
            ),
        ),
        (
            'categorical_encoder',
            pp.CategoricalEncoder(variables=feature_cfg.categorical_vars),
        ),
        # Not enabled: a 'feature hashing' step (FeatureHasher) and a
        # 'log_transformer' step (pp.LogTransformer).
        ('scaler', MinMaxScaler()),
        ('classifier', LogisticRegression()),
    ]

    return Pipeline(steps)
# Numerical features passed to the log transformer.
NUMERICALS_LOG_VARS = [
    'LotFrontage',
    '1stFlrSF',
    'GrLivArea',
]

# Numerical features that have missing values in the training set.
NUMERICAL_VARS_WITH_NA = [
    'LotFrontage',
]

# Categorical features passed to the rare-label and categorical encoders.
CATEGORICAL_VARS = [
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'MasVnrType',
    'BsmtQual',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
]

# Pipeline bound to ``price_pipe``: the project transformers from ``pp`` run
# in the order listed, followed by MinMaxScaler and a Lasso estimator
# (alpha=0.005, random_state=0).
price_pipe = Pipeline([
    (
        'categorical_imputer',
        pp.CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA),
    ),
    (
        # Step key spelling ('inputer') is kept: it is a runtime key.
        'numerical_inputer',
        pp.NumericalImputer(variables=NUMERICAL_VARS_WITH_NA),
    ),
    (
        # NOTE(review): `variables` and `reference_variable` both receive
        # TEMPORAL_VARS -- confirm this is intended rather than a separate
        # reference column.
        'temporal_variable',
        pp.TemporalVariableEstimator(
            variables=TEMPORAL_VARS,
            reference_variable=TEMPORAL_VARS,
        ),
    ),
    (
        'rare_label_encoder',
        pp.RareLabelCategoricalEncoder(
            tol=0.01,
            variables=CATEGORICAL_VARS,
        ),
    ),
    (
        'categorical_encoder',
        pp.CategoricalEncoder(variables=CATEGORICAL_VARS),
    ),
    (
        'log_transformer',
        pp.LogTransformer(variables=NUMERICALS_LOG_VARS),
    ),
    (
        'drop_features',
        pp.DropUnecessaryFeatures(variables_to_drop=DROP_FEATURES),
    ),
    ('scaler', MinMaxScaler()),
    ('Linear_model', Lasso(alpha=0.005, random_state=0)),
])
Example #3
0

# Pipeline bound to ``titanic_pipe``: transformers from ``pp`` configured
# with the variable lists on ``config``, then StandardScaler and
# LogisticRegression(C=0.0005, random_state=0).
titanic_pipe = Pipeline([
    (
        'missing_indicator',
        pp.MissingIndicator(variables=config.NUMERICAL_VARS),
    ),
    (
        'categorical_imputer',
        pp.CategoricalImputer(variables=config.CATEGORICAL_VARS),
    ),
    (
        # Step key spelling ('inputer') is kept: it is a runtime key.
        'numerical_inputer',
        pp.NumericalImputer(variables=config.NUMERICAL_VARS),
    ),
    (
        'extract_firstletter',
        pp.ExtractFirstLetter(variables=config.CABIN),
    ),
    (
        'rare_label_encoder',
        pp.RareLabelCategoricalEncoder(
            tol=0.05,
            variables=config.CATEGORICAL_VARS,
        ),
    ),
    (
        'categorical_encoder',
        pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
    ),
    ('scaler', StandardScaler()),
    ('Linear_model', LogisticRegression(C=0.0005, random_state=0)),
])
import preprocessors as pp
import utils as ut

config = ut.read_config_file('config.yaml')

# The variable lists live in the 'Feature_Groups' mapping found at index 2
# of the loaded config; look them up once and reuse them below.
_feature_groups = config[2]['Feature_Groups']
_categorical_vars = _feature_groups.get('categorical_vars')
_numerical_to_impute = _feature_groups.get('numerical_to_impute')

# Pipeline bound to ``titanic_pipe``: transformers from ``pp``, then
# StandardScaler and LogisticRegression(C=0.0005, random_state=0).
titanic_pipe = Pipeline([
    (
        'categorical_imputer',
        pp.CategoricalImputer(variables=_categorical_vars),
    ),
    (
        'missing_indicator',
        pp.MissingIndicator(variables=_numerical_to_impute),
    ),
    (
        'numerical_imputer',
        pp.NumericalImputer(variables=_numerical_to_impute),
    ),
    (
        # Only the second entry of the categorical list is passed here --
        # presumably the cabin column, given the step name; confirm against
        # config.yaml.
        'cabin_variable',
        pp.ExtractFirstLetter(variables=_categorical_vars[1]),
    ),
    (
        'rare_label_encoder',
        pp.RareLabelCategoricalEncoder(
            tol=0.05,
            variables=_categorical_vars,
        ),
    ),
    (
        'categorical_encoder',
        pp.CategoricalEncoder(variables=_categorical_vars),
    ),
    ('scaler', StandardScaler()),
    ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
])
Example #5
0
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config

# Pipeline bound to ``titanic_pipe``: transformers from ``pp`` configured
# with the variable lists on ``config``, then StandardScaler and
# LogisticRegression(C=0.0005, random_state=0).
# Arguments to the ``pp`` classes are passed positionally, so their order
# must match each class's constructor signature.
titanic_pipe = Pipeline(
    steps=[
        (
            'categorical_imputer',
            pp.CategoricalImputer(config.CATEGORICAL_VARS),
        ),
        (
            'missing_indicator',
            pp.MissingIndicator(config.NUMERICAL_VARS),
        ),
        (
            'numerical_imputer',
            pp.NumericalImputer(config.NUMERICAL_VARS),
        ),
        (
            'extract_first_letter',
            pp.ExtractFirstLetter(config.CABIN),
        ),
        (
            'rare_label_categorical_encoder',
            pp.RareLabelCategoricalEncoder(0.05, config.CATEGORICAL_VARS),
        ),
        (
            'categorical_encoder',
            pp.CategoricalEncoder(config.CATEGORICAL_VARS),
        ),
        ('scaler', StandardScaler()),
        ('linear_model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
Example #6
0
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
from config import *


# Pipeline bound to ``titanic_pipe``: transformers from ``pp`` configured
# with the module-level variable lists, then StandardScaler and
# LogisticRegression(C=0.0005, random_state=0).
titanic_pipe = Pipeline(
    steps=[
        (
            'categorical_imputer',
            pp.CategoricalImputer(variables=CATEGORICAL_VARS),
        ),
        (
            'missing_indicator',
            pp.MissingIndicator(variables=NUMERICAL_VARS),
        ),
        (
            'numerical_imputer',
            pp.NumericalImputer(variables=NUMERICAL_VARS),
        ),
        (
            'extract_first_letter',
            pp.ExtractFirstLetter(variables=CABIN_VAR),
        ),
        (
            'rare_label_encoding',
            pp.RareLabelCategoricalEncoder(
                tol=0.05,
                variables=CATEGORICAL_VARS,
            ),
        ),
        (
            'categorical_encoding',
            pp.CategoricalEncoder(variables=CATEGORICAL_VARS),
        ),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(C=0.0005, random_state=0)),
    ],
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import preprocessors as pp
import config


# Pipeline bound to ``titanic_pipe``: transformers from ``pp``, then
# StandardScaler and LogisticRegression(C=0.005, random_state=0).
# ``verbose=True`` is passed through to Pipeline.
titanic_pipe = Pipeline(
    steps=[
        (
            # The column name is given literally here rather than read from
            # ``config``.
            'Extract_First_Letter',
            pp.ExtractFirstLetter(variables=['cabin']),
        ),
        (
            'Numerical_Imputer',
            pp.NumericalImputer(variables=config.NUMERICAL_VARS),
        ),
        (
            'Categorical_Imputer',
            pp.CategoricalImputer(variables=config.CATEGORICAL_VARS),
        ),
        (
            # No ``tol`` is passed, so the encoder's own default applies.
            'Rare_Label_Categorical_Encoder',
            pp.RareLabelCategoricalEncoder(
                variables=config.CATEGORICAL_VARS,
            ),
        ),
        (
            'Categorical_Encoder',
            pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
        ),
        ('scaler', StandardScaler()),
        ('Linear_model', LogisticRegression(C=0.005, random_state=0)),
    ],
    verbose=True,
)