    def test_evaluate_model(self):
        # create a sample model
        my_model = Model(model_type='linreg')
        my_model.setup_feature_target(TestModel.x, TestModel.y)

        score = my_model.evaluate_model(my_model.x, my_model.y)

        self.assertAlmostEqual(score.mean(), 0.65, 1,
                               msg='Unexpected score')
    def test_setup_feature_target(self):
        # create a sample model
        my_model = Model(model_type='linreg')
        my_model.setup_feature_target(TestModel.x, TestModel.y)

        self.assertEqual(my_model.x.shape, TestModel.x.shape,
                         msg='Shape mismatch!')
        self.assertEqual(my_model.y.shape, TestModel.y.shape,
                         msg='Shape mismatch!')
    def test_setup_model(self):
        my_model = Model(model_type='linreg')
        mock_model = LinearRegression()
        self.assertTrue(type(my_model._model) == type(mock_model),
                        msg='Mismatch in initialized '
                            'model {}'.format(mock_model))

        my_model = Model(model_type='lasso')
        mock_model = Lasso()
        self.assertTrue(type(my_model._model) == type(mock_model),
                        msg='Mismatch in initialized '
                            'model {}'.format(mock_model))
    def test_fit(self):
        # create a sample model
        my_model = Model(model_type='linreg')

        # only call the fit to update all the variables
        my_model.fit(TestModel.x, TestModel.y)

        self.assertAlmostEqual(my_model._model.coef_.max(), 1.45, 1,
                               msg='Maximum coefficient mismatch'
                                   ' for the fitted model')
        self.assertEqual(my_model._model.coef_.shape[0], 14,
                         msg='Wrong number of model coefficients!')
    def test_train_test_split(self):
        # create a sample model
        my_model = Model(model_type='linreg')

        # assign x and y values
        my_model.x = TestModel.x
        my_model.y = TestModel.y

        # run the train test split
        my_model.train_test_split_()

        # check for the sample data split on default values
        self.assertEqual(my_model.x_train.shape, (1432, 14),
                         msg='Mismatch in training test proportions')
        self.assertEqual(my_model.x_test.shape, (614, 14),
                         msg='Mismatch in training test proportions')
        self.assertEqual(my_model.x_train.shape[0], my_model.y_train.shape[0],
                         msg='Mismatch in feature and target training sizes')
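
# A minimal usage sketch of the Model wrapper exercised by the tests above,
# assuming a feature DataFrame `x` and a single-column target DataFrame `y`
# (both hypothetical here) and that Model is importable from the fentool
# package. It mirrors the calls made in the tests and is illustrative only:
#
#     model = Model(model_type='linreg')      # wraps sklearn's LinearRegression
#     model.setup_feature_target(x, y)        # attach feature and target sets
#     model.fit(x, y)                         # train/test split, then fit
#     scores = model.evaluate_model(model.x, model.y)  # cross-validation scores
#     print(scores.mean())
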
class Fentool(object):
    """ Fentool feature engineering tool

    Parameters
    ----------
    sup_learning_type: String
              Determines the type of supervised learning; should be
              'regression' or 'classification'. Currently only regression
              is supported.

    model_type: String
              Sets the type of "regression" or "classification" model.
              Currently only regression is implemented; the values of the
              model type should be 'linreg', 'lasso', 'lassocv', 'ridge',
              'ridgecv', 'rfr', 'svr'.

    encoder_type: String
             Sets the type of encoding for the data sets. Currently one-hot
             and ordinal encoding are available. The values should be
             'one-hot' or 'ordinal'.

    input_treatment: String
             Sets the type of treatment for the input (feature set).
             Currently only normalization (minmax) and standardization
             (mean and standard deviation) are implemented. The values
             should be 'normalize' or 'standardize'.

    output_treatment: String
             Sets the type of treatment for the output (target).
             Currently only normalization (minmax) and standardization
             (mean and standard deviation) are implemented. The values
             should be 'normalize' or 'standardize'.

    time_series: bool, Default=False
             Flag evaluating if the problem is a time series problem.
             Currently Fentool does not have support for time series.

    fillna: String
             Method to remove or replace nans, nulls, etc. Should be
             None, "drop", "mean" or "zeros".

    test_size: Float, Default=0.3
             Sets the size of the test set for evaluating the model.

    null_tol_ratio: Float, Default=0.8
             A value that determines the maximum tolerance for fentool
             to handle datasets with many null values. Must be between
             0 and 1.

    null_warn_ratio: Float, Default=0.3
            A value that determines the lower threshold for fentool to
            give warnings with datasets containing many null values.
            Must be between 0 and 1.
    """
    def __init__(self,
                 sup_learning_type='regression',
                 model_type='linreg',
                 encoder_type=None,
                 input_treatment=None,
                 output_treatment=None,
                 time_series=False,
                 fillna='drop',
                 test_size=0.3,
                 null_tol_ratio=0.8,
                 null_warn_ratio=0.3,
                 **kwargs):

        self.sup_learning_type = sup_learning_type
        self.model_type = model_type
        self.encoder_type = encoder_type
        self.input_treatment = input_treatment
        self.output_treatment = output_treatment
        self.time_series = time_series
        self.fillna = fillna
        self.test_size = test_size
        self.null_tol_ratio = null_tol_ratio
        self.null_warn_ratio = null_warn_ratio
        self.target = []
        self.model = []
        self.df = pd.DataFrame()
        self.x = pd.DataFrame()
        self.y = pd.DataFrame()
        self.x_trans = pd.DataFrame()
        self.y_trans = pd.DataFrame()
        self.validate_inputs()
        self.setup_model(**kwargs)

    def validate_inputs(self):
        """ This function validates the inputs given to Fentool
        """

        # check for the type of supervised learning algorithm
        if self.sup_learning_type != 'regression':
            raise ValueError('Currently supporting only "regression" '
                             'models.')

        # check for the supported model types for assessing feature eng
        # effectiveness
        if self.model_type not in ('linreg', 'lasso', 'lassocv', 'ridge',
                                   'ridgecv', 'rfr', 'svr'):
            raise ValueError('Not supported model type {} '.format(
                self.model_type))

        # check for encoder types
        if self.encoder_type is not None:
            if self.encoder_type not in ('one-hot', 'ordinal'):
                raise ValueError('Not supported encoder type {} '.format(
                    self.encoder_type))

        # validate the input treatment
        if self.input_treatment is not None:
            if self.input_treatment not in ('normalize', 'standardize'):
                raise ValueError('Input treatment not supported!')

        # validate the output treatment
        if self.output_treatment is not None:
            if self.output_treatment not in ('normalize', 'standardize'):
                raise ValueError('Output treatment not supported!')

        # time series problems are not supported yet
        if self.time_series is not False:
            raise ValueError('Time series support is not included yet')

        # validate the fill null method
        if self.fillna is not None:
            if self.fillna not in ('drop', 'zeros', 'mean'):
                raise ValueError('Not supported fill null method {} '.format(
                    self.fillna))

        if (self.test_size > 1.0) or (self.test_size < .0):
            raise ValueError('The test size value should be between 0 and 1')

        if (self.null_tol_ratio > 1.0) or (self.null_tol_ratio < .0):
            raise ValueError('The null_tol_ratio should be between 0 and 1')

        if (self.null_warn_ratio > 1.0) or (self.null_warn_ratio < .0):
            raise ValueError('The null_warn_ratio should be between 0 and 1')

    def clean_nans(self):
        """ Function to clean up nan values
        """

        # drop nans
        if self.fillna == 'drop':
            self.df.dropna(inplace=True)

        # TODO: include methods to better handle nans for categorical vars
        # replace nans by zeros
        elif self.fillna == 'zeros':
            self.df.fillna(0, inplace=True)

        # replace nans by the mean value of each column
        elif self.fillna == 'mean':
            self.df.fillna(self.df.mean(), inplace=True)

    def setup_dataframe(self, df, target):
        """ Function to set up the dataframe

        Parameters
        ----------
        df: pd.DataFrame
           The complete dataframe, including the target column.

        target: String
           Determines the column name of the target variable in the given
           dataframe.
        """

        # create a copy
        self.df = df.copy()

        # count the null values and compute their ratio to the number of rows
        num_null = sum(self.df.isnull().values.ravel())
        ratio_null = num_null / self.df.shape[0]

        # raise an error if the ratio of nulls is above a certain threshold
        if ratio_null > self.null_tol_ratio:
            raise ValueError("Dataframe has a high null to"
                             " value ratio: {}".format(round(ratio_null, 2)))

        # raise a warning regardless if there are some nulls
        elif ratio_null > self.null_warn_ratio:
            warnings.warn(
                "Data set has a null "
                "ratio of {}, treating with the given method "
                "'{}'".format(round(ratio_null, 2), self.fillna), Warning)

        # run the clean nans function
        if self.fillna is not None:
            self.clean_nans()

        # set up feature and target sets
        self.x = self.df.drop(columns=target)
        self.y = pd.DataFrame(self.df[target])

    def setup_model(self, **kwargs):
        """ Function to call the model class wrapper Model

        Parameters
        ----------
        kwargs:
           Used to pass exclusive model inputs specific to each model.
        """

        # build the model attr
        self.model = Model(model_type=self.model_type,
                           test_size=self.test_size,
                           time_series=self.time_series,
                           **kwargs)

    def feature_encoder(self):
        """ Function to encode the feature and target sets
        """

        # encode the feature set
        if self.encoder_type is not None:
            enc = Encoder(encoder_type=self.encoder_type)
            self.x = enc.fit_transform(self.x)

        # check to see if the target is a category variable
        if self.y.iloc[:, 0].dtype.name == 'category':
            self.y = pd.DataFrame(self.y.iloc[:, 0].cat.codes)

    def feature_transform(self):
        """ Function to transform the feature and target sets with the
        available transformer classes.
""" # check fot the give feature engineering setting if self.input_treatment == 'normalize': trans_input = Minmax() trans_input.fit(self.x) self.x_trans = trans_input.transform(self.x) elif self.input_treatment == 'standardize': trans_input = Standard() trans_input.fit(self.x) self.x_trans = trans_input.transform(self.x) else: self.x_trans = self.x.copy() # check for the required target transformation setting if self.output_treatment == 'normalize': trans_output = Minmax() trans_output.fit(self.y) self.y_trans = trans_output.transform(self.y) elif self.input_treatment == 'standardize': trans_output = Standard() trans_output.fit(self.y) self.y_trans = trans_output.transform(self.y) else: self.y_trans = self.y.copy() def prepar_fit_data(self, df, target): """ wrapper for preparing data sets for fit method Parameters ---------- df: pd.DataFrame The complete dataframe before. target: String Determines the column name of the target variable in the given dataframe. """ # clean the data and setup feature and target sets self.setup_dataframe(df, target) # encode the feature and target sets self.feature_encoder() # transform the feature and target sets. self.feature_transform() def fit(self, df, target): """ Wrapper for model fit method Parameters ---------- df: pd.DataFrame The complete dataframe before. target: String Determines the column name of the target variable in the given dataframe. """ # prepare the data for the fit method self.prepar_fit_data(df.copy(), target) # fit the model with the transformed data self.model.fit(self.x_trans, self.y_trans) def evaluate_model(self, df, target, n_splits=10, metric='r2', shuffle=True): """ Function to evaluate the model using cross validation Parameters ---------- df: pd.DataFrame The complete dataframe before. target: String Determines the column name of the target variable in the given dataframe. n_splits: Integer, Default=10 Determines the number of fold for cross-validation metric: String, Defaul='r2' The metric to evaluate the model using cross-validation. shuffle: bool, default=True Determines if the training data needs to be shuffled before training. Returns ------- score: double, contains a dictionary with the different scores of the model from different cross validation folds. """ self.prepar_fit_data(df.copy(), target) score = self.model.evaluate_model(self.x_trans, self.y_trans, n_splits=n_splits, metric=metric, shuffle=shuffle) return score @staticmethod def model_compare(models, input_trans, output_trans, encoder_types, df, target, n_splits=10, metric='r2'): """ Wrapper to run a number of feature comninations Parameters ---------- models: String, list/tuple Contains the models where the user needs to evaluate the best feature settings. input_trans: String, list/tuple Contains the user settings for the required feature set transformation output_trans: String, list/tuple Contains the user settings for the required target set transformation encoder_types: String, list/tuple Contains the user settings for the required encoding method of the non-numerical varaibles. df: pd.DataFrame The complete dataframe before. target: String Determines the column name of the target variable in the given dataframe. n_splits: Integer, Default=10 Determines the number of fold for cross-validation metric: String, Defaul='r2' The metric to evaluate the model using cross-validation. Returns ------- Scores: pd.DataFrame A dataframe with different user cases and their corresponding scoring. 
""" # loop over case names scores = pd.DataFrame() for modl in models: for enc in encoder_types: for in_trans in input_trans: for out_trans in output_trans: case = Fentool(encoder_type=enc, model_type=modl, input_treatment=in_trans, output_treatment=out_trans) case_name = modl + '_' + enc + '_in' + \ str(in_trans)[:4] + '_out' + str(out_trans)[:4] scores[case_name] = case.evaluate_model( df=df, target=target, n_splits=n_splits, metric=metric) return scores