コード例 #1
0
ファイル: test_trainer.py プロジェクト: Aylr/healthcareai-py
    def setUpClass(cls):
        """Create the trainers shared by the tests of this class.

        Loads the diabetes sample data, drops the ``PatientID`` column and
        stores three trainers on the class: one classification trainer and
        two regression trainers (``impute=True`` and ``impute=False``).
        """
        diabetes = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        diabetes.drop(['PatientID'], axis=1, inplace=True)

        cls.classification_trainer = SupervisedModelTrainer(
            dataframe=diabetes,
            predicted_column='ThirtyDayReadmitFLG',
            model_type='classification',
            impute=True,
            grain_column='PatientEncounterID',
            verbose=False)

        # The two regression trainers differ only in their impute flag;
        # they are created in the same order as the attribute names below.
        regression_variants = (
            ('regression_trainer', True),
            ('regression_trainer_impute_false', False),
        )
        for attribute_name, impute_flag in regression_variants:
            trainer = SupervisedModelTrainer(diabetes,
                                             'SystolicBPNBR',
                                             'regression',
                                             grain_column='PatientEncounterID',
                                             impute=impute_flag,
                                             verbose=False)
            setattr(cls, attribute_name, trainer)
コード例 #2
0
    def setUpClass(cls):
        """Prepare cleaned dataframes and advanced trainers for the test class.

        The diabetes sample data is stored on ``cls.df`` (without
        ``PatientID``), run through the full pipeline once per model type,
        and the results are handed to two ``AdvancedSupervisedModelTrainer``
        instances stored on the class.
        """
        cls.df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        cls.df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)

        regression_spec = (REGRESSION, REGRESION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME)
        classification_spec = (CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME)

        regression_data = pipelines.full_pipeline(
            *regression_spec, impute=True).fit_transform(cls.df)
        classification_data = pipelines.full_pipeline(
            *classification_spec, impute=True).fit_transform(cls.df)

        # The regression trainer receives a fresh, unfitted pipeline
        regression_pipeline = pipelines.full_pipeline(*regression_spec, impute=True)
        cls.regression_trainer = AdvancedSupervisedModelTrainer(
            regression_pipeline,
            regression_data,
            REGRESSION,
            REGRESION_PREDICTED_COLUMN)

        # NOTE(review): unlike the regression trainer, the first argument here
        # is the *result* of fit_transform rather than the pipeline itself.
        # This looks unintentional - confirm against the trainer's signature.
        classification_first_argument = pipelines.full_pipeline(
            *classification_spec, impute=True).fit_transform(cls.df)
        cls.classification_trainer = AdvancedSupervisedModelTrainer(
            classification_first_argument,
            classification_data,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
コード例 #3
0
    def setUpClass(cls):
        """Build one classification and two regression trainers on the class.

        All three trainers are created from the same diabetes sample
        dataframe after the ``PatientID`` column has been removed.
        """
        sample_data = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        sample_data.drop(['PatientID'], axis=1, inplace=True)

        # Options that are identical for every trainer created below
        shared_options = {'grain_column': 'PatientEncounterID', 'verbose': False}

        cls.classification_trainer = SupervisedModelTrainer(
            dataframe=sample_data,
            predicted_column='ThirtyDayReadmitFLG',
            model_type='classification',
            impute=True,
            **shared_options)

        cls.regression_trainer = SupervisedModelTrainer(
            sample_data,
            'SystolicBPNBR',
            'regression',
            impute=True,
            **shared_options)

        cls.regression_trainer_impute_false = SupervisedModelTrainer(
            sample_data,
            'SystolicBPNBR',
            'regression',
            impute=False,
            **shared_options)
コード例 #4
0
ファイル: test_trainer.py プロジェクト: Aylr/healthcareai-py
    def test_impute_false_nan_data(self):
        """A model trained with impute=False returns one prediction per input row."""
        # Train the linear regression model with impute = False
        model = self.regression_trainer_impute_false.linear_regression()

        # Load a new df for predicting
        fresh_data = hcai_datasets.load_diabetes()
        predictions = model.make_predictions(fresh_data)

        # The prediction output must have as many rows as the input dataframe
        self.assertEqual(len(predictions), len(fresh_data))
コード例 #5
0
ファイル: test_trainer.py プロジェクト: Aylr/healthcareai-py
    def test_linear_regression_raises_error_on_missing_columns(self):
        """Predicting on a dataframe that lacks ``GenderFLG`` raises HealthcareAIError.

        The model is trained through ``self.regression_trainer``, so no
        separate training dataframe is loaded in this test.
        """
        # Train the linear regression model
        trained_linear_model = self.regression_trainer.linear_regression()

        # Load a new df for predicting and drop a column that the model expects
        prediction_df = hcai_datasets.load_diabetes()
        prediction_df.drop('GenderFLG', axis=1, inplace=True)

        # Making predictions without the expected column must fail
        with self.assertRaises(HealthcareAIError):
            trained_linear_model.make_predictions(prediction_df)
コード例 #6
0
    def test_impute_false_nan_data(self):
        """Row count of the predictions matches the row count of the input data."""
        # Train the linear regression model with impute = False
        trainer = self.regression_trainer_impute_false
        linear_model = trainer.linear_regression()

        # Load a new df for predicting
        input_df = hcai_datasets.load_diabetes()

        # Compare the number of predicted rows with the number of input rows
        predicted_row_count = len(linear_model.make_predictions(input_df))
        self.assertEqual(predicted_row_count, len(input_df))
コード例 #7
0
    def test_linear_regression_raises_error_on_missing_columns(self):
        """Predicting on a dataframe that lacks ``GenderFLG`` raises HealthcareAIError.

        The model is trained through ``self.regression_trainer``, so no
        separate training dataframe is loaded in this test.
        """
        # Train the linear regression model
        trained_linear_model = self.regression_trainer.linear_regression()

        # Load a new df for predicting and drop a column that the model expects
        prediction_df = hcai_datasets.load_diabetes()
        prediction_df.drop('GenderFLG', axis=1, inplace=True)

        # Making predictions without the expected column must fail
        with self.assertRaises(HealthcareAIError):
            trained_linear_model.make_predictions(prediction_df)
コード例 #8
0
    def setUp(self):
        """Create ``self.trainer`` from pipeline-cleaned data and split train/test."""
        raw_data = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        raw_data.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        cleaning_pipeline = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True)
        prepared_data = cleaning_pipeline.fit_transform(raw_data)

        self.trainer = AdvancedSupervisedModelTrainer(
            prepared_data,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)
コード例 #9
0
    def setUpClass(cls):
        """Train a linear and a logistic model and cache prediction outputs.

        The trained models, a prediction dataframe and every output format
        produced from the linear model are stored as class attributes for
        the assertions in the test methods.
        """
        training_data = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        training_data.drop(['PatientID'], axis=1, inplace=True)

        regression_trainer = SupervisedModelTrainer(
            training_data,
            'SystolicBPNBR',
            'regression',
            impute=True,
            grain_column='PatientEncounterID')
        classification_trainer = SupervisedModelTrainer(
            training_data,
            'ThirtyDayReadmitFLG',
            'classification',
            impute=True,
            grain_column='PatientEncounterID')

        # Train the models (linear regression first, then logistic regression)
        linear_model = regression_trainer.linear_regression()
        cls.trained_linear_model = linear_model
        cls.trained_lr = classification_trainer.logistic_regression()

        # Load a new df for predicting, again without PatientID
        prediction_data = hcai_datasets.load_diabetes()
        cls.prediction_df = prediction_data
        prediction_data.drop(['PatientID'], axis=1, inplace=True)

        # Create various outputs from the linear model
        cls.predictions = linear_model.make_predictions(prediction_data)
        cls.factors = linear_model.make_factors(
            prediction_data, number_top_features=3)
        cls.predictions_with_3_factors = linear_model.make_predictions_with_k_factors(
            prediction_data, number_top_features=3)
        cls.original_with_predictions_3_factors = linear_model.make_original_with_predictions_and_factors(
            prediction_data, number_top_features=3)
        cls.catalyst_dataframe = linear_model.create_catalyst_dataframe(
            prediction_data)
コード例 #10
0
    def setUpClass(cls):
        """Fit the shared models once and precompute the outputs under test.

        Stores on the class: the trained linear and logistic models, the
        prediction dataframe, and the predictions / factors / combined
        outputs generated by the linear model.
        """
        source_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        source_df.drop(['PatientID'], axis=1, inplace=True)

        # Keyword arguments common to both trainers
        trainer_options = dict(impute=True, grain_column='PatientEncounterID')

        regression_trainer = SupervisedModelTrainer(
            source_df, 'SystolicBPNBR', 'regression', **trainer_options)
        classification_trainer = SupervisedModelTrainer(
            source_df, 'ThirtyDayReadmitFLG', 'classification', **trainer_options)

        # Train the models
        cls.trained_linear_model = regression_trainer.linear_regression()
        cls.trained_lr = classification_trainer.logistic_regression()

        # Load a new df for predicting
        cls.prediction_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        cls.prediction_df.drop(['PatientID'], axis=1, inplace=True)

        # Create various outputs
        model = cls.trained_linear_model
        frame = cls.prediction_df
        top_features = 3

        cls.predictions = model.make_predictions(frame)
        cls.factors = model.make_factors(frame, number_top_features=top_features)
        cls.predictions_with_3_factors = model.make_predictions_with_k_factors(
            frame, number_top_features=top_features)
        cls.original_with_predictions_3_factors = model.make_original_with_predictions_and_factors(
            frame, number_top_features=top_features)
        cls.catalyst_dataframe = model.create_catalyst_dataframe(frame)
コード例 #11
0
    def setUp(self):
        """Create ``self.trainer`` from a fresh pipeline plus cleaned data, then split."""
        raw_data = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        raw_data.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)

        def new_pipeline():
            # Each call returns a new pipeline configured for classification
            return pipelines.full_pipeline(
                CLASSIFICATION,
                CLASSIFICATION_PREDICTED_COLUMN,
                GRAIN_COLUMN_NAME,
                impute=True)

        prepared_data = new_pipeline().fit_transform(raw_data)

        # The trainer gets its own unfitted pipeline alongside the cleaned data
        self.trainer = AdvancedSupervisedModelTrainer(
            new_pipeline(),
            prepared_data,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)
コード例 #12
0
    def test_random_foarest_tuning_2_column_raises_error(self):
        """Randomized-search random forest on a three-column frame raises HealthcareAIError."""
        # Select only specific columns from the sample data
        selected_columns = ['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']
        narrow_df = hcai_datasets.load_diabetes()[selected_columns]

        np.random.seed(42)
        cleaning_pipeline = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True)
        prepared_df = cleaning_pipeline.fit_transform(narrow_df)

        trainer = AdvancedSupervisedModelTrainer(
            prepared_df,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
        trainer.train_test_split()

        with self.assertRaises(HealthcareAIError):
            trainer.random_forest_classifier(trees=200, randomized_search=True)
コード例 #13
0
    def test_random_foarest_tuning_2_column_raises_error(self):
        """Randomized-search random forest on a three-column frame raises HealthcareAIError."""
        full_df = hcai_datasets.load_diabetes()
        # Select only specific columns
        subset_df = full_df[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        pipeline_arguments = (
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
        )
        prepared_df = pipelines.full_pipeline(
            *pipeline_arguments, impute=True).fit_transform(subset_df)

        # The trainer is given a second, unfitted pipeline plus the cleaned data
        trainer_pipeline = pipelines.full_pipeline(*pipeline_arguments, impute=True)
        trainer = AdvancedSupervisedModelTrainer(
            trainer_pipeline,
            prepared_df,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        with self.assertRaises(HealthcareAIError):
            trainer.random_forest_classifier(trees=200, randomized_search=True)
コード例 #14
0
def main():
    """Train regression models on the diabetes sample data and save one of them.

    Steps: load the data, drop ``PatientID``, build a regression trainer for
    ``SystolicBPNBR``, print the head of the cleaned dataframe, train a
    linear regression and a random forest model, then save the linear model.
    """
    # Load the diabetes sample data
    diabetes_df = hcai_datasets.load_diabetes()

    # ## Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # diabetes_df = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    diabetes_df.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
    trainer_settings = {
        'dataframe': diabetes_df,
        'predicted_column': 'SystolicBPNBR',
        'model_type': 'regression',
        'grain_column': 'PatientEncounterID',
        'impute': True,
        'verbose': False,
    }
    regression_trainer = SupervisedModelTrainer(**trainer_settings)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    cleaned_preview = regression_trainer.clean_dataframe.head()
    print(cleaned_preview)

    # Step 2: train some models

    # Train and evaluate linear regression model
    linear_model = regression_trainer.linear_regression()

    # Train and evaluate random forest model
    random_forest_model = regression_trainer.random_forest_regression()

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl')
    # Note the file you saved and that will be used in example_regression_2.py
    linear_model.save()
コード例 #15
0
    def setUp(self):
        """Create a regression trainer and attach an ``undecorated_lr`` method to it.

        ``undecorated_lr`` calls ``linear_regression(randomized_search=False)``
        directly on the trainer's ``_advanced_trainer``.
        """
        sample_data = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        sample_data.drop(['PatientID'], axis=1, inplace=True)

        trainer = SupervisedModelTrainer(sample_data,
                                         'SystolicBPNBR',
                                         'regression',
                                         grain_column='PatientEncounterID',
                                         impute=True,
                                         verbose=False)
        self.regression_trainer = trainer

        def undecorated_lr(bound_trainer):
            return bound_trainer._advanced_trainer.linear_regression(randomized_search=False)

        # Bind the plain function to this trainer instance via the descriptor
        # protocol so it can be called like a regular method.
        trainer.undecorated_lr = undecorated_lr.__get__(trainer, trainer.__class__)
コード例 #16
0
    def setUpClass(cls):
        """Store the sample dataframe and two advanced trainers on the class.

        The data is cleaned once per model type with the full pipeline and
        the cleaned frames are passed to the regression and classification
        ``AdvancedSupervisedModelTrainer`` instances.
        """
        cls.df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        cls.df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)

        def new_pipeline(model_type, target_column):
            # Build a fresh full pipeline with imputation enabled
            return pipelines.full_pipeline(
                model_type,
                target_column,
                GRAIN_COLUMN_NAME,
                impute=True)

        regression_data = new_pipeline(
            REGRESSION, REGRESION_PREDICTED_COLUMN).fit_transform(cls.df)
        classification_data = new_pipeline(
            CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN).fit_transform(cls.df)

        # Regression trainer: unfitted pipeline + cleaned data
        cls.regression_trainer = AdvancedSupervisedModelTrainer(
            new_pipeline(REGRESSION, REGRESION_PREDICTED_COLUMN),
            regression_data,
            REGRESSION,
            REGRESION_PREDICTED_COLUMN)

        # NOTE(review): the first argument below is a fit_transform *result*,
        # whereas the regression trainer above receives the pipeline object.
        # This looks unintentional - confirm against the trainer's signature.
        cls.classification_trainer = AdvancedSupervisedModelTrainer(
            new_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN).fit_transform(cls.df),
            classification_data,
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
コード例 #17
0
def main():
    """Train, plot, compare and save classification models on the diabetes sample data.

    Steps: load the data, drop ``PatientID``, build a classification trainer
    for ``ThirtyDayReadmitFLG``, train KNN, logistic regression and random
    forest models (showing ROC and PR plots for each), draw ROC and PR
    comparison plots, then save the random forest model.
    """
    # Load the diabetes sample data
    diabetes_df = hcai_datasets.load_diabetes()

    # ## Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # diabetes_df = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    diabetes_df.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
    trainer_settings = {
        'dataframe': diabetes_df,
        'predicted_column': 'ThirtyDayReadmitFLG',
        'model_type': 'classification',
        'grain_column': 'PatientEncounterID',
        'impute': True,
        'verbose': False,
    }
    classification_trainer = SupervisedModelTrainer(**trainer_settings)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    cleaned_preview = classification_trainer.clean_dataframe.head()
    print(cleaned_preview)

    # Step 2: train some models

    # Train a KNN model, then view its ROC and PR plots
    knn_model = classification_trainer.knn()
    knn_model.roc_plot()
    knn_model.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # knn_model.roc()
    # knn_model.pr()

    # Train a logistic regression model, then view its ROC and PR plots
    logistic_model = classification_trainer.logistic_regression()
    logistic_model.roc_plot()
    logistic_model.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # logistic_model.roc()
    # logistic_model.pr()

    # Train a random forest model and view the feature importance plot
    forest_model = classification_trainer.random_forest(save_plot=False)
    # View the ROC and PR plots
    forest_model.roc_plot()
    forest_model.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # forest_model.roc()
    # forest_model.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [knn_model, logistic_model, forest_model]

    # Create a ROC plot and then a PR plot that compare them.
    for comparison_type in ('ROC', 'PR'):
        tsm_plots.tsm_classification_comparison_plots(
            trained_supervised_models=models_to_compare,
            plot_type=comparison_type,
            save=False)

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
    # Note the file you saved and that will be used in example_classification_2.py
    forest_model.save()
コード例 #18
0
def main():
    """Load a saved model, print predictions in four formats and write one to CSV.

    Steps: load the diabetes sample data, drop ``PatientID``, load a saved
    model from ``'your_filename_here.pkl'``, show its ROC plot and ROC data,
    print predictions, factors, predictions with factors and the original
    data with predictions and factors, then save the predictions with
    factors to ``'ClinicalPredictions.csv'``.
    """
    # Load the diabetes sample data
    new_data = hcai_datasets.load_diabetes()

    # Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE SystolicBPNBR is null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # new_data = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    new_data.drop(['PatientID'], axis=1, inplace=True)

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
    # Note the file you saved in example_classification_1.py and set that here.
    model = hcai_io_utilities.load_saved_model('your_filename_here.pkl')

    # Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
    model.roc_plot()
    roc_data = model.roc()
    print(roc_data)
    # print(model.column_names)
    # print(model.grain_column)
    # print(model.prediction_column)

    # # Make predictions. Please note that there are four different formats you can choose from. All are shown
    #    here, though you only need one.

    # ## Get predictions
    plain_predictions = model.make_predictions(new_data)
    print('\n\n-------------------[ Predictions ]----------------------------------------------------\n')
    print(plain_predictions.head())

    # ## Get the important factors
    top_factors = model.make_factors(new_data, number_top_features=3)
    print('\n\n-------------------[ Factors ]----------------------------------------------------\n')
    print(top_factors.head())

    # ## Get predictions with factors
    predictions_and_factors = model.make_predictions_with_k_factors(
        new_data,
        number_top_features=3)
    print('\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n')
    print(predictions_and_factors.head())

    # ## Get original dataframe with predictions and factors
    everything_combined = model.make_original_with_predictions_and_factors(
        new_data,
        number_top_features=3)
    print('\n\n-------------------[ Original + predictions + factors ]-------------------------------------------\n')
    print(everything_combined.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # Save results to csv
    predictions_and_factors.to_csv('ClinicalPredictions.csv')
コード例 #19
0
def main():
    """Walk through the advanced trainer workflow on the diabetes sample data.

    Steps, in order: load the data and drop ``PatientID``; clean it with the
    built-in full pipeline; create an ``AdvancedSupervisedModelTrainer`` and
    split train/test; train a KNN and a random forest classifier using
    randomized search over custom hyperparameter grids; plot random forest
    feature importance; train a custom ensemble of KNN, logistic regression
    and random forest; draw ROC and PR comparison plots; finally print the
    random forest's ROC and PR data.
    """
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use built in data prep pipeline that does encoding, imputation, null filtering, dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, compared to scikit that
    #   return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']
    }

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]
    }

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(trained_random_forest,
                                       classification_trainer.x_train,
                                       save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels by a name of your choice
    # Note: the dictionary values are evaluated in order, so each model below is trained
    # while the dictionary is being built (KNN, then logistic regression, then random forest).
    custom_ensemble = {
        'KNN':
        classification_trainer.knn(hyperparameter_grid=knn_hyperparameters,
                                   randomized_search=False,
                                   scoring_metric='roc_auc'),
        'Logistic Regression':
        classification_trainer.logistic_regression(),
        'Random Forest Classifier':
        classification_trainer.random_forest_classifier(
            randomized_search=False, scoring_metric='roc_auc')
    }

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc', trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))
コード例 #20
0
 def test_load_diabetes(self):
     """The loaded diabetes dataframe has 1000 rows and 7 columns."""
     shape = ds.load_diabetes().shape
     expected_rows, expected_columns = 1000, 7
     self.assertEqual(expected_rows, shape[0])
     self.assertEqual(expected_columns, shape[1])
コード例 #21
0
 def test_load_diabetes(self):
     """Check the row and column counts of the diabetes sample data."""
     diabetes = ds.load_diabetes()
     # Rows are checked first, then columns
     for axis, expected_size in enumerate((1000, 7)):
         self.assertEqual(expected_size, diabetes.shape[axis])
コード例 #22
0
 def test_class_counter_on_many(self):
     """``PatientEncounterID`` contains 1000 unique values in the sample data."""
     diabetes = hcai_datasets.load_diabetes()
     unique_count = count_unique_elements_in_column(diabetes, 'PatientEncounterID')
     self.assertEqual(unique_count, 1000)
コード例 #23
0
 def test_class_counter_on_binary(self):
     """After dropping rows with missing values, ``ThirtyDayReadmitFLG`` has 2 unique values."""
     # Keep only rows without any missing value
     complete_rows = hcai_datasets.load_diabetes().dropna(axis=0, how='any')
     unique_count = count_unique_elements_in_column(complete_rows, 'ThirtyDayReadmitFLG')
     self.assertEqual(unique_count, 2)
コード例 #24
0
 def test_class_counter_on_binary(self):
     """The readmit flag column has exactly 2 unique values once incomplete rows are removed."""
     diabetes = hcai_datasets.load_diabetes()
     # Remove, in place, every row that contains at least one missing value
     diabetes.dropna(how='any', axis=0, inplace=True)
     self.assertEqual(
         count_unique_elements_in_column(diabetes, 'ThirtyDayReadmitFLG'),
         2)
コード例 #25
0
 def test_class_counter_on_many(self):
     """Counting unique ``PatientEncounterID`` values in the sample data yields 1000."""
     column_name = 'PatientEncounterID'
     self.assertEqual(
         count_unique_elements_in_column(hcai_datasets.load_diabetes(), column_name),
         1000)