Beispiel #1
0
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=False):
        """
        Build a SupervisedModelTrainer around a prepared copy of the data.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
            predicted_column (str): The name of the prediction column
            model_type (str): the trainer type - 'classification' or 'regression'
            impute (bool): True to impute data (mean of numeric columns and mode of
                categorical ones). False to drop rows that contain any null values.
            grain_column (str): The name of the grain column
            verbose (bool): Set to true for verbose output. Defaults to False.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Two pipelines are built: one honoring the caller's `impute` choice for
        # training, and one that always imputes for prediction time. Without
        # imputation at prediction time, rows containing nulls would be removed
        # and would end up with no predictions.
        training_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=impute)
        scoring_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=True)

        # Fit both pipelines on the raw data; keep the transformed frame
        # produced by the training pipeline.
        prepared_frame = training_pipeline.fit_transform(dataframe)
        _ = scoring_pipeline.fit_transform(dataframe)

        # Delegate the heavy lifting to the advanced trainer.
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            dataframe=prepared_frame,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # The always-imputing pipeline is the one saved for prediction.
        self._advanced_trainer.pipeline = scoring_pipeline

        # Split the data into train and test sets.
        self._advanced_trainer.train_test_split()

        # Record categorical levels (excluding grain/target columns).
        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
Beispiel #2
0
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=True,
                 imputeStrategy='MeanMedian'):
        """
        Create a SupervisedModelTrainer.

        Warns the user about high cardinality features (such as IDs or other
        unique identifiers) and low cardinality features (a column where all
        values are equal) before training.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to true for verbose output. Defaults to True.

            imputeStrategy (str): Imputation strategy handed to the pipeline.
                Defaults to 'MeanMedian'.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # One pipeline honors the caller's imputation choice for training; a
        # second always imputes so that prediction-time rows with missing
        # numeric values are not dropped (dropped rows would leave gaps in
        # the predictions).
        training_pipeline = hcai_pipelines.full_pipeline(
            model_type,
            predicted_column,
            grain_column,
            impute=impute,
            verbose=True,
            imputeStrategy=imputeStrategy)

        scoring_pipeline = hcai_pipelines.full_pipeline(
            model_type,
            predicted_column,
            grain_column,
            impute=True,
            verbose=False,
            imputeStrategy=imputeStrategy)

        # Cardinality sanity checks: warn the user, then proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Push the raw data through both preparation pipelines.
        prepared_frame = training_pipeline.fit_transform(dataframe)
        _ = scoring_pipeline.fit_transform(dataframe)

        # Hand everything to the advanced trainer.
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=training_pipeline,
            dataframe=prepared_frame,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # The always-imputing pipeline is the one kept for prediction.
        self._advanced_trainer.pipeline = scoring_pipeline

        # Split the data into train and test sets.
        self._advanced_trainer.train_test_split()

        # Record categorical levels (excluding grain/target columns).
        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
Beispiel #3
0
class SupervisedModelTrainer(object):
    """Train supervised models.

    This class trains models using several common classifiers and regressors and
    reports appropriate metrics.
    """
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=True,
                 imputeStrategy='MeanMedian'):
        """
        Set up a SupervisedModelTrainer.

        Helps the user by checking for high cardinality features (such as IDs or
        other unique identifiers) and low cardinality features (a column where
        all values are equal).

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to true for verbose output. Defaults to True.

            imputeStrategy (str): Imputation strategy passed to the pipeline.
                Defaults to 'MeanMedian'.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't
        # impute, then some rows on the prediction
        # data frame will be removed, which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type,
                                                predicted_column,
                                                grain_column,
                                                impute=impute,
                                                verbose=True,
                                                imputeStrategy=imputeStrategy)

        prediction_pipeline = hcai_pipelines.full_pipeline(
            model_type,
            predicted_column,
            grain_column,
            impute=True,
            verbose=False,
            imputeStrategy=imputeStrategy)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the always-imputing pipeline to the parent class for prediction
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])

    @property
    def clean_dataframe(self):
        """Return the dataframe after the preparation pipeline (imputation and such)."""
        return self._advanced_trainer.dataframe

    def random_forest(self, feature_importance_limit=15, save_plot=False):
        """Train a random forest model and print out the model performance metrics.

        Args:
            feature_importance_limit (int): The maximum number of features to show 
            in the feature importance plot

            save_plot (bool): For the feature importance plot, True to save plot 
            (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model, or None when the
            model type is neither 'classification' nor 'regression'.
        """
        # Compare strings by value (==), not identity (`is`) — the identity
        # comparison only worked by accident through CPython string interning.
        if self._advanced_trainer.model_type == 'classification':
            return self.random_forest_classification(
                feature_importance_limit=feature_importance_limit,
                save_plot=save_plot)
        elif self._advanced_trainer.model_type == 'regression':
            return self.random_forest_regression()

    @trainer_output
    def knn(self):
        """Train a knn model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.knn(scoring_metric='roc_auc',
                                          hyperparameter_grid=None,
                                          randomized_search=True)

    @trainer_output
    def random_forest_regression(self):
        """Train a random forest regression model and print out the 
        model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.random_forest_regressor(
            trees=200,
            scoring_metric='neg_mean_squared_error',
            randomized_search=True)

    @trainer_output
    def random_forest_classification(self,
                                     feature_importance_limit=15,
                                     save_plot=False):
        """Train a random forest classification model, print performance metrics 
        and show a feature importance plot.

        Args:
            feature_importance_limit (int): The maximum number of features to 
            show in the feature importance plot

            save_plot (bool): For the feature importance plot, True to save 
            plot (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        model = self._advanced_trainer.random_forest_classifier(
            trees=200, scoring_metric='roc_auc', randomized_search=True)

        # Save or show the feature importance graph
        hcai_tsm.plot_rf_features_from_tsm(
            model,
            self._advanced_trainer.x_train,
            feature_limit=feature_importance_limit,
            save=save_plot)

        return model

    @trainer_output
    def logistic_regression(self):
        """Train a logistic regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.logistic_regression(
            randomized_search=False)

    @trainer_output
    def linear_regression(self):
        """Train a linear regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.linear_regression(
            randomized_search=False)

    @trainer_output
    def lasso_regression(self):
        """Train a lasso regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.lasso_regression(randomized_search=False)

    @trainer_output
    def ensemble(self):
        """Train an ensemble model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.

        Raises:
            ValueError: If the model type is neither 'classification' nor
                'regression'.
        """
        # TODO consider making a scoring parameter (which will necessitate some more logic
        # String comparison uses == (value), not `is` (identity).
        if self._advanced_trainer.model_type == 'classification':
            metric = 'roc_auc'
            model = self._advanced_trainer.ensemble_classification(
                scoring_metric=metric)
        elif self._advanced_trainer.model_type == 'regression':
            # TODO stub
            metric = 'neg_mean_squared_error'
            model = self._advanced_trainer.ensemble_regression(
                scoring_metric=metric)
        else:
            # Previously an unknown model type fell through to an opaque
            # NameError on `metric`; fail loudly and clearly instead.
            raise ValueError('Unknown model type: {}'.format(
                self._advanced_trainer.model_type))

        print('Based on the scoring metric {}, '
              'the best algorithm found is: {}'.format(metric,
                                                       model.algorithm_name))

        return model

    @property
    def advanced_features(self):
        """Return the underlying AdvancedSupervisedModelTrainer instance. For 
        advanced users only."""
        return self._advanced_trainer
    def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True):
        """
        Create a SupervisedModelTrainer.

        Warns the user about high cardinality features (such as IDs or other
        unique identifiers) and low cardinality features (a column where all
        values are equal) before training.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to true for verbose output. Defaults to True.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # The training pipeline honors the caller's `impute` flag; the
        # prediction pipeline always imputes so rows with missing numerics
        # are not dropped at prediction time (dropped rows would have no
        # predictions).
        prep_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=impute, verbose=True)
        predict_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=True, verbose=False)

        # Cardinality checks: warn the user and allow them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Push the raw data through both preparation pipelines.
        prepared = prep_pipeline.fit_transform(dataframe)
        _ = predict_pipeline.fit_transform(dataframe)

        # Delegate to the advanced trainer.
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=prep_pipeline,
            dataframe=prepared,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Keep the always-imputing pipeline for prediction.
        self._advanced_trainer.pipeline = predict_pipeline

        # Split the data into train and test sets.
        self._advanced_trainer.train_test_split()

        # Record categorical levels (excluding grain/target columns).
        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
class SupervisedModelTrainer(object):
    """Train supervised models.

    This class trains models using several common classifiers and regressors and
    reports appropriate metrics.
    """

    def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True):
        """
        Set up a SupervisedModelTrainer.

        Helps the user by checking for high cardinality features (such as IDs or
        other unique identifiers) and low cardinality features (a column where
        all values are equal).

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to true for verbose output. Defaults to True.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't 
        # impute, then some rows on the prediction
        # data frame will be removed, which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=impute,
                                                verbose=True)

        prediction_pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=True,
                                                           verbose=False)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the always-imputing pipeline to the parent class for prediction
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])

    @property
    def clean_dataframe(self):
        """Return the dataframe after the preparation pipeline (imputation and such)."""
        return self._advanced_trainer.dataframe

    def random_forest(self, feature_importance_limit=15, save_plot=False):
        """Train a random forest model and print out the model performance metrics.

        Args:
            feature_importance_limit (int): The maximum number of features to show 
            in the feature importance plot

            save_plot (bool): For the feature importance plot, True to save plot 
            (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model, or None when the
            model type is neither 'classification' nor 'regression'.
        """
        # Compare strings by value (==), not identity (`is`) — the identity
        # comparison only worked by accident through CPython string interning.
        if self._advanced_trainer.model_type == 'classification':
            return self.random_forest_classification(feature_importance_limit=feature_importance_limit,
                                                     save_plot=save_plot)
        elif self._advanced_trainer.model_type == 'regression':
            return self.random_forest_regression()

    @trainer_output
    def knn(self):
        """Train a knn model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.knn(
            scoring_metric='roc_auc',
            hyperparameter_grid=None,
            randomized_search=True)

    @trainer_output
    def random_forest_regression(self):
        """Train a random forest regression model and print out the 
        model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.random_forest_regressor(
            trees=200,
            scoring_metric='neg_mean_squared_error',
            randomized_search=True)

    @trainer_output
    def random_forest_classification(self, feature_importance_limit=15, save_plot=False):
        """Train a random forest classification model, print performance metrics 
        and show a feature importance plot.

        Args:
            feature_importance_limit (int): The maximum number of features to 
            show in the feature importance plot

            save_plot (bool): For the feature importance plot, True to save 
            plot (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        model = self._advanced_trainer.random_forest_classifier(
            trees=200,
            scoring_metric='roc_auc',
            randomized_search=True)

        # Save or show the feature importance graph
        hcai_tsm.plot_rf_features_from_tsm(
            model,
            self._advanced_trainer.x_train,
            feature_limit=feature_importance_limit,
            save=save_plot)

        return model

    @trainer_output
    def logistic_regression(self):
        """Train a logistic regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.logistic_regression(
            randomized_search=False)

    @trainer_output
    def linear_regression(self):
        """Train a linear regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.linear_regression(randomized_search=False)

    @trainer_output
    def lasso_regression(self):
        """Train a lasso regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.lasso_regression(randomized_search=False)

    @trainer_output
    def ensemble(self):
        """Train an ensemble model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.

        Raises:
            ValueError: If the model type is neither 'classification' nor
                'regression'.
        """
        # TODO consider making a scoring parameter (which will necessitate some more logic
        # String comparison uses == (value), not `is` (identity).
        if self._advanced_trainer.model_type == 'classification':
            metric = 'roc_auc'
            model = self._advanced_trainer.ensemble_classification(scoring_metric=metric)
        elif self._advanced_trainer.model_type == 'regression':
            # TODO stub
            metric = 'neg_mean_squared_error'
            model = self._advanced_trainer.ensemble_regression(scoring_metric=metric)
        else:
            # Previously an unknown model type fell through to an opaque
            # NameError on `metric`; fail loudly and clearly instead.
            raise ValueError('Unknown model type: {}'.format(self._advanced_trainer.model_type))

        print('Based on the scoring metric {}, '
              'the best algorithm found is: {}'.format(metric, model.algorithm_name))

        return model

    @property
    def advanced_features(self):
        """Return the underlying AdvancedSupervisedModelTrainer instance. For 
        advanced users only."""
        return self._advanced_trainer
Beispiel #6
0
class SupervisedModelTrainer(object):
    """
    This class helps create a model using several common classifiers and regressors, both of which report appropriate
    metrics.
    """
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=False):
        """
        Build a SupervisedModelTrainer from raw training data.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
            predicted_column (str): The name of the prediction column
            model_type (str): the trainer type - 'classification' or 'regression'
            impute (bool): True to impute data (mean of numeric columns and mode of
                categorical ones). False to drop rows that contain any null values.
            grain_column (str): The name of the grain column
            verbose (bool): Set to true for verbose output. Defaults to False.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Train-time pipeline honors the caller's `impute` flag; the
        # predict-time pipeline always imputes, because dropping rows with
        # missing numerics at prediction time would leave those rows without
        # predictions.
        train_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=impute)
        predict_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=True)

        # Fit both pipelines; keep the transformed training frame.
        transformed = train_pipeline.fit_transform(dataframe)
        _ = predict_pipeline.fit_transform(dataframe)

        # All real work is delegated to the advanced trainer.
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            dataframe=transformed,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # The always-imputing pipeline is the one retained for prediction.
        self._advanced_trainer.pipeline = predict_pipeline

        # Split the data into train and test sets.
        self._advanced_trainer.train_test_split()

        # Record categorical levels (excluding grain/target columns).
        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])

    @property
    def clean_dataframe(self):
        """ The dataframe produced by the preparation pipeline (imputation and such). """
        prepared = self._advanced_trainer.dataframe
        return prepared

    def random_forest(self, save_plot=False):
        # TODO Convenience method. Probably not needed?
        """ Train a random forest model and print out the model performance metrics.

        Args:
            save_plot (bool): For the feature importance plot, True to save plot (will not display). False by default to
                display.

        Returns:
            TrainedSupervisedModel: A trained supervised model, or None when the
                model type is neither 'classification' nor 'regression'.
        """
        # Compare strings by value (==), not identity (`is`) — the identity
        # comparison only worked by accident through CPython string interning.
        if self._advanced_trainer.model_type == 'classification':
            return self.random_forest_classification(save_plot=save_plot)
        elif self._advanced_trainer.model_type == 'regression':
            return self.random_forest_regression()

    def knn(self):
        """ Train a knn model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('KNN'))

        # Delegate training to the advanced trainer.
        model = self._advanced_trainer.knn(
            scoring_metric='roc_auc',
            hyperparameter_grid=None,
            randomized_search=True)

        # Report metrics before handing the model back.
        model.print_training_results()

        return model

    def random_forest_regression(self):
        """ Train a random forest regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('Random Forest Regression'))

        # Delegate training to the advanced trainer.
        model = self._advanced_trainer.random_forest_regressor(
            trees=200,
            scoring_metric='neg_mean_squared_error',
            randomized_search=True)

        # Report metrics before handing the model back.
        model.print_training_results()

        return model

    def random_forest_classification(self, save_plot=False):
        """ Train a random forest classification model, print out performance metrics and show a feature importance plot.

        Args:
            save_plot (bool): For the feature importance plot, True to save plot (will not display). False by default to
                display.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('Random Forest Classification'))

        # Delegate training to the advanced trainer.
        model = self._advanced_trainer.random_forest_classifier(
            trees=200,
            scoring_metric='roc_auc',
            randomized_search=True)

        # Report metrics.
        model.print_training_results()

        # Either save the feature importance plot to disk or display it.
        hcai_tsm.plot_rf_features_from_tsm(
            model,
            self._advanced_trainer.x_train,
            save=save_plot)

        return model

    def logistic_regression(self):
        """ Train a logistic regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('Logistic Regression'))

        # Delegate training to the advanced trainer.
        model = self._advanced_trainer.logistic_regression(randomized_search=False)

        # Report metrics before handing the model back.
        model.print_training_results()

        return model

    def linear_regression(self):
        """Train a linear regression model and report its performance.

        No hyperparameter search is performed for this algorithm.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('Linear Regression'))

        # Fit without randomized hyperparameter search.
        model = self._advanced_trainer.linear_regression(
            randomized_search=False)

        # Report performance metrics to the console.
        model.print_training_results()

        return model

    def lasso_regression(self):
        """Train a lasso regression model and report its performance.

        No hyperparameter search is performed for this algorithm.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        print('\nTraining {}'.format('Lasso Regression'))

        # Fit without randomized hyperparameter search.
        model = self._advanced_trainer.lasso_regression(
            randomized_search=False)

        # Report performance metrics to the console.
        model.print_training_results()

        return model

    def ensemble(self):
        """ Train an ensemble model and print out the model performance metrics.

        Trains several algorithms and selects the best performer, scored with
        the metric appropriate to the trainer's model type (ROC AUC for
        classification, negative mean squared error for regression).

        Returns:
            TrainedSupervisedModel: A trained supervised model.

        Raises:
            ValueError: If the trainer's model type is neither
                'classification' nor 'regression'.
        """
        # TODO consider making a scoring parameter (which will necessitate some more logic
        model_name = 'ensemble {}'.format(self._advanced_trainer.model_type)
        print('\nTraining {}'.format(model_name))

        # Train the appropriate ensemble of models.
        # Use == (not `is`) for string comparison: identity of equal string
        # literals is an implementation detail and a SyntaxWarning on
        # Python 3.8+.
        if self._advanced_trainer.model_type == 'classification':
            metric = 'roc_auc'
            trained_model = self._advanced_trainer.ensemble_classification(
                scoring_metric=metric)
        elif self._advanced_trainer.model_type == 'regression':
            # TODO stub
            metric = 'neg_mean_squared_error'
            trained_model = self._advanced_trainer.ensemble_regression(
                scoring_metric=metric)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `metric`/`trained_model` below.
            raise ValueError('Unsupported model type: {}'.format(
                self._advanced_trainer.model_type))

        print(
            'Based on the scoring metric {}, the best algorithm found is: {}'.
            format(metric, trained_model.algorithm_name))

        # Display the model metrics
        trained_model.print_training_results()

        return trained_model

    @property
    def advanced_features(self):
        """The underlying AdvancedSupervisedModelTrainer instance.

        For advanced users only: exposes the lower-level training API that
        this class wraps.
        """
        return self._advanced_trainer
Beispiel #7
0
    def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True, imputeStrategy='MeanMode',
                 tunedRandomForest=False, numeric_columns_as_categorical=None ):
        """Set up a SupervisedModelTrainer.

        Warns the user about high cardinality features (such as IDs or other
        unique identifiers) and low cardinality features (columns where all
        values are equal), then runs the data through the preparation
        pipelines and builds the underlying advanced trainer.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe.
            predicted_column (str): The name of the prediction column.
            model_type (str): The trainer type - 'classification' or
                'regression'.
            impute (bool): If True, impute missing values; if False, drop rows
                that contain any null values. Defaults to True.
            grain_column (str): The name of the grain column.
            verbose (bool): Set to True for verbose output. Defaults to True.
            imputeStrategy (str): Technique used to impute missing values.
                With 'MeanMode' (default), columns of dtype object or category
                (assumed categorical) are imputed with the column mode and all
                other columns (assumed continuous) with the column mean. With
                'RandomForest', categorical columns are imputed using a
                RandomForestClassifier and continuous columns using a
                RandomForestRegressor.
            tunedRandomForest (bool): If True, the random forest imputers are
                tuned using grid search and K-fold cross validation. Note:
                imputation may then take much longer, depending on the size of
                the dataframe and the number of columns with missing values.
                Defaults to False.
            numeric_columns_as_categorical (list of str): Names of columns
                that are numeric (int/float) in the dataframe but are
                categorical by nature (for example a 'JobCode' column with
                levels 1-6, which pandas stores as float when it has missing
                values). Listed columns are imputed with the column mode
                instead of the mean, and their final dtype is numpy.object.
                Defaults to None.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Two pipelines are built: one honoring the caller's impute choice for
        # training, and one that always imputes for prediction. Without
        # imputation at prediction time, rows with nulls would be dropped from
        # the prediction frame, producing missing predictions.
        training_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=impute,
            verbose=True, imputeStrategy=imputeStrategy,
            tunedRandomForest=tunedRandomForest,
            numeric_columns_as_categorical=numeric_columns_as_categorical)

        scoring_pipeline = hcai_pipelines.full_pipeline(
            model_type, predicted_column, grain_column, impute=True,
            verbose=False, imputeStrategy=imputeStrategy,
            tunedRandomForest=tunedRandomForest,
            numeric_columns_as_categorical=numeric_columns_as_categorical)

        # Warn (but do not stop) on high/low cardinality columns.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipelines.
        prepared_dataframe = training_pipeline.fit_transform(dataframe)
        _ = scoring_pipeline.fit_transform(dataframe)

        # Instantiate the advanced trainer on the prepared data.
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=training_pipeline,
            dataframe=prepared_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # The always-imputing pipeline is the one saved for scoring new data.
        self._advanced_trainer.pipeline = scoring_pipeline

        # Split the data into train and test sets.
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
Beispiel #8
0
class SupervisedModelTrainer(object):
    """Train supervised models.

    This class trains models using several common classifiers and regressors and
    reports appropriate metrics.
    """

    def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True, imputeStrategy='MeanMode',
                 tunedRandomForest=False, numeric_columns_as_categorical=None ):
        """Set up a SupervisedModelTrainer.

        Warns the user about high cardinality features (such as IDs or other
        unique identifiers) and low cardinality features (columns where all
        values are equal), then runs the data through the preparation
        pipelines and builds the underlying advanced trainer.

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe.
            predicted_column (str): The name of the prediction column.
            model_type (str): The trainer type - 'classification' or
                'regression'.
            impute (bool): If True, impute missing values; if False, drop rows
                that contain any null values. Defaults to True.
            grain_column (str): The name of the grain column.
            verbose (bool): Set to True for verbose output. Defaults to True.
            imputeStrategy (str): Technique used to impute missing values.
                With 'MeanMode' (default), columns of dtype object or category
                (assumed categorical) are imputed with the column mode and all
                other columns (assumed continuous) with the column mean. With
                'RandomForest', categorical columns are imputed using a
                RandomForestClassifier and continuous columns using a
                RandomForestRegressor.
            tunedRandomForest (bool): If True, the random forest imputers are
                tuned using grid search and K-fold cross validation. Note:
                imputation may then take much longer, depending on the size of
                the dataframe and the number of columns with missing values.
                Defaults to False.
            numeric_columns_as_categorical (list of str): Names of columns
                that are numeric (int/float) in the dataframe but are
                categorical by nature (for example a 'JobCode' column with
                levels 1-6, which pandas stores as float when it has missing
                values). Listed columns are imputed with the column mode
                instead of the mean, and their final dtype is numpy.object.
                Defaults to None.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't
        # impute, then some rows on the prediction data frame will be removed,
        # which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=impute,
                                                verbose=True, imputeStrategy=imputeStrategy, tunedRandomForest=tunedRandomForest,
                                                numeric_columns_as_categorical=numeric_columns_as_categorical)

        prediction_pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=True,
                                                           verbose=False, imputeStrategy=imputeStrategy, tunedRandomForest=tunedRandomForest,
                                                           numeric_columns_as_categorical=numeric_columns_as_categorical)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the always-imputing pipeline to the parent class; it is the one
        # used at prediction time.
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])

    @property
    def clean_dataframe(self):
        """Return the dataframe after the preparation pipeline (imputation and such)."""
        return self._advanced_trainer.dataframe

    def random_forest(self, feature_importance_limit=15, save_plot=False):
        """Train a random forest model and print out the model performance metrics.

        Dispatches to classification or regression based on this trainer's
        model type.

        Args:
            feature_importance_limit (int): The maximum number of features to
                show in the feature importance plot (classification only).
            save_plot (bool): For the feature importance plot, True to save
                the plot (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model.

        Raises:
            ValueError: If the model type is neither 'classification' nor
                'regression'.
        """
        # Use == (not `is`) for string comparison: identity of equal string
        # literals is an implementation detail and a SyntaxWarning on
        # Python 3.8+.
        if self._advanced_trainer.model_type == 'classification':
            return self.random_forest_classification(feature_importance_limit=feature_importance_limit,
                                                     save_plot=save_plot)
        elif self._advanced_trainer.model_type == 'regression':
            return self.random_forest_regression()
        else:
            # Fail loudly instead of silently returning None.
            raise ValueError('Unsupported model type: {}'.format(
                self._advanced_trainer.model_type))

    @trainer_output
    def knn(self):
        """Train a knn model and print out the model performance metrics.

        Uses randomized hyperparameter search scored on ROC AUC.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.knn(
            scoring_metric='roc_auc',
            hyperparameter_grid=None,
            randomized_search=True)

    @trainer_output
    def random_forest_regression(self):
        """Train a random forest regression model and print out the
        model performance metrics.

        Fits 200 trees via randomized hyperparameter search scored on
        negative mean squared error.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.random_forest_regressor(
            trees=200,
            scoring_metric='neg_mean_squared_error',
            randomized_search=True)

    @trainer_output
    def random_forest_classification(self, feature_importance_limit=15, save_plot=False):
        """Train a random forest classification model, print performance metrics
        and show a feature importance plot.

        Fits 200 trees via randomized hyperparameter search scored on ROC AUC.

        Args:
            feature_importance_limit (int): The maximum number of features to
                show in the feature importance plot.
            save_plot (bool): For the feature importance plot, True to save
                the plot (will not display). False by default to display.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        model = self._advanced_trainer.random_forest_classifier(
            trees=200,
            scoring_metric='roc_auc',
            randomized_search=True)

        # Save or show the feature importance graph
        hcai_tsm.plot_rf_features_from_tsm(
            model,
            self._advanced_trainer.x_train,
            feature_limit=feature_importance_limit,
            save=save_plot)

        return model

    @trainer_output
    def logistic_regression(self):
        """Train a logistic regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.logistic_regression(
            randomized_search=False)

    @trainer_output
    def linear_regression(self):
        """Train a linear regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.linear_regression(randomized_search=False)

    @trainer_output
    def lasso_regression(self):
        """Train a lasso regression model and print out the model performance metrics.

        Returns:
            TrainedSupervisedModel: A trained supervised model.
        """
        return self._advanced_trainer.lasso_regression(randomized_search=False)

    @trainer_output
    def ensemble(self):
        """Train a ensemble model and print out the model performance metrics.

        Trains several algorithms and selects the best performer, scored with
        the metric appropriate to the model type (ROC AUC for classification,
        negative mean squared error for regression).

        Returns:
            TrainedSupervisedModel: A trained supervised model.

        Raises:
            ValueError: If the model type is neither 'classification' nor
                'regression'.
        """
        # TODO consider making a scoring parameter (which will necessitate some more logic
        # Use == (not `is`) for string comparison; see random_forest().
        if self._advanced_trainer.model_type == 'classification':
            metric = 'roc_auc'
            model = self._advanced_trainer.ensemble_classification(scoring_metric=metric)
        elif self._advanced_trainer.model_type == 'regression':
            # TODO stub
            metric = 'neg_mean_squared_error'
            model = self._advanced_trainer.ensemble_regression(scoring_metric=metric)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `metric`/`model` below.
            raise ValueError('Unsupported model type: {}'.format(
                self._advanced_trainer.model_type))

        print('Based on the scoring metric {}, '
              'the best algorithm found is: {}'.format(metric, model.algorithm_name))

        return model

    @property
    def advanced_features(self):
        """Return the underlying AdvancedSupervisedModelTrainer instance. For
        advanced users only."""
        return self._advanced_trainer