Example #1
    @classmethod
    def setUpClass(cls):
        cls.df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        cls.df.drop(columns_to_remove, axis=1, inplace=True)

        np.random.seed(42)
        clean_regression_df = pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        clean_classification_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        cls.regression_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(REGRESSION,
                                    REGRESION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True), clean_regression_df,
            REGRESSION, REGRESION_PREDICTED_COLUMN)

        cls.classification_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True),
            clean_classification_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
Example #2
class TestMetricValidation(unittest.TestCase):
    # TODO: this is pretty spartan testing, only covering the happy path on binary classification
    def setUp(self):
        df = hcai_datasets.load_diabetes()

        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)
        self.classification_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)

    def test_validate_score_metric_for_number_of_classes(self):
        self.assertTrue(self.classification_trainer.validate_score_metric_for_number_of_classes('pr_auc'))
        self.assertTrue(self.classification_trainer.validate_score_metric_for_number_of_classes('roc_auc'))
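The validator exercised above is not shown in these snippets; a minimal sketch of what a check like validate_score_metric_for_number_of_classes plausibly does (an assumption, not the healthcareai source) is:

def validate_score_metric_for_number_of_classes(metric, number_of_classes):
    """Sketch: roc_auc and pr_auc only make sense for binary targets."""
    binary_only_metrics = {'roc_auc', 'pr_auc'}
    if metric in binary_only_metrics and number_of_classes != 2:
        # Hypothetical error type; the library presumably raises HealthcareAIError
        raise ValueError('{} is only valid for binary classification'.format(metric))
    return True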
Example #3
class TestLogisticRegression(unittest.TestCase):
    def setUp(self):
        df = hcai_datasets.load_diabetes()

        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION,
                                           CLASSIFICATION_PREDICTED_COLUMN,
                                           GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)

        self.classification_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True), clean_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)

        self.classification_trainer.train_test_split(random_seed=0)
        self.lr = self.classification_trainer.logistic_regression(
            randomized_search=False)

    def test_logistic_regression_no_tuning(self):
        self.assertIsInstance(self.lr, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.5, 0.8, self.lr.metrics['roc_auc'])
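test_helpers.assertBetween is a project helper not shown in these snippets; a plausible minimal implementation (an assumption) would be:

def assertBetween(test_case, low, high, value):
    """Fail the test unless low <= value <= high (sketch)."""
    test_case.assertTrue(
        low <= value <= high,
        'Expected {} to be between {} and {}'.format(value, low, high))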
Example #4
class TestLogisticRegression(unittest.TestCase):
    def setUp(self):
        df = hcai_datasets.load_diabetes()

        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)

        self.classification_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)

        self.classification_trainer.train_test_split(random_seed=0)
        self.lr = self.classification_trainer.logistic_regression(randomized_search=False)

    def test_logistic_regression_no_tuning(self):
        self.assertIsInstance(self.lr, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.5, 0.8, self.lr.metrics['roc_auc'])
Example #5
    def setUp(self):
        df = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        self.trainer = AdvancedSupervisedModelTrainer(clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)
Example #6
class TestRandomForestClassification(unittest.TestCase):
    def setUp(self):
        df = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION,
                                           CLASSIFICATION_PREDICTED_COLUMN,
                                           GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        self.trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True), clean_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)

    def test_random_forest_no_tuning(self):
        rf = self.trainer.random_forest_classifier(trees=200,
                                                   randomized_search=False)
        self.assertIsInstance(rf, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.8, 0.97, rf.metrics['roc_auc'])

    def test_random_forest_tuning(self):
        rf = self.trainer.random_forest_classifier(randomized_search=True)
        self.assertIsInstance(rf, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.7, 0.97, rf.metrics['roc_auc'])

    def test_random_forest_tuning_2_column_raises_error(self):
        df_raw = hcai_datasets.load_diabetes()
        # Select only specific columns
        df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION,
                                           CLASSIFICATION_PREDICTED_COLUMN,
                                           GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True), clean_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        self.assertRaises(HealthcareAIError,
                          trainer.random_forest_classifier,
                          trees=200,
                          randomized_search=True)
Example #7
    def test_random_forest_tuning_2_column_raises_error(self):
        df_raw = hcai_datasets.load_diabetes()
        # Select only specific columns
        df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)
        trainer = AdvancedSupervisedModelTrainer(clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        self.assertRaises(HealthcareAIError, trainer.random_forest_classifier, trees=200, randomized_search=True)
Example #8
    def test_random_forest_tuning_2_column_raises_error(self):
        df_raw = hcai_datasets.load_diabetes()
        # Select only specific columns
        df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)
        trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        self.assertRaises(HealthcareAIError, trainer.random_forest_classifier, trees=200, randomized_search=True)
Example #9
    def setUp(self):
        df = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        self.trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)
Example #10
class TestRandomForestClassification(unittest.TestCase):
    def setUp(self):
        df = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        self.trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)

    def test_random_forest_no_tuning(self):
        rf = self.trainer.random_forest_classifier(trees=200, randomized_search=False)
        self.assertIsInstance(rf, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.8, 0.97, rf.metrics['roc_auc'])

    def test_random_forest_tuning(self):
        rf = self.trainer.random_forest_classifier(randomized_search=True)
        self.assertIsInstance(rf, TrainedSupervisedModel)
        test_helpers.assertBetween(self, 0.7, 0.97, rf.metrics['roc_auc'])

    def test_random_forest_tuning_2_column_raises_error(self):
        df_raw = hcai_datasets.load_diabetes()
        # Select only specific columns
        df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)
        trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        self.assertRaises(HealthcareAIError, trainer.random_forest_classifier, trees=200, randomized_search=True)
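Note that the assertRaises call used throughout these tests can also be written as a context manager, which reads a little more naturally:

        with self.assertRaises(HealthcareAIError):
            trainer.random_forest_classifier(trees=200, randomized_search=True)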
Example #11
def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, and dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of the two.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, unlike scikit-learn
    #   transformers, which return numpy arrays
    # from sklearn.pipeline import Pipeline
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    classification_trainer.train_test_split()
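    # Under the hood this presumably performs a conventional holdout split; a
    # rough standalone equivalent in plain scikit-learn (a sketch, not
    # healthcare.ai internals) would be:
    # from sklearn.model_selection import train_test_split
    # train_df, test_df = train_test_split(
    #     clean_training_dataframe, test_size=0.2, random_state=0)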

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']
    }

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)
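    # A randomized search like this is conceptually scikit-learn's
    # RandomizedSearchCV; a standalone sketch over the same grid (an
    # illustration, not the healthcare.ai implementation):
    # from sklearn.model_selection import RandomizedSearchCV
    # from sklearn.neighbors import KNeighborsClassifier
    # knn_search = RandomizedSearchCV(
    #     KNeighborsClassifier(),
    #     param_distributions=knn_hyperparameters,
    #     n_iter=10,  # mirrors number_iteration_samples
    #     scoring='accuracy')
    # Calling knn_search.fit(X, y) would then sample and score 10 combinations.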

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]
    }
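    # A more conventional random forest search space (illustrative values, not
    # tuned for this dataset) would vary depth and split sizes instead:
    # random_forest_hyperparameters = {
    #     'n_estimators': [100, 200, 500],
    #     'max_depth': [None, 10, 30],
    #     'min_samples_split': [2, 5, 10]
    # }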

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(trained_random_forest,
                                       classification_trainer.x_train,
                                       save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels, keyed by a name of your choice
    custom_ensemble = {
        'KNN': classification_trainer.knn(
            hyperparameter_grid=knn_hyperparameters,
            randomized_search=False,
            scoring_metric='roc_auc'),
        'Logistic Regression': classification_trainer.logistic_regression(),
        'Random Forest Classifier': classification_trainer.random_forest_classifier(
            randomized_search=False,
            scoring_metric='roc_auc')
    }

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc', trained_model_by_name=custom_ensemble)
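    # Conceptually this resembles scikit-learn's soft-voting ensemble; a plain
    # scikit-learn sketch (an assumption, not healthcare.ai internals):
    # from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    # from sklearn.linear_model import LogisticRegression
    # from sklearn.neighbors import KNeighborsClassifier
    # voting = VotingClassifier(
    #     estimators=[('knn', KNeighborsClassifier()),
    #                 ('lr', LogisticRegression()),
    #                 ('rf', RandomForestClassifier())],
    #     voting='soft')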

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC and PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))


if __name__ == '__main__':
    main()