Example #1
def main():
    # Load the included diabetes sample data
    prediction_dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # prediction_dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE SystolicBPNBR is null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(prediction_dataframe.head(5))

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl'
    # Note the file you saved in example_regression_1.py and set that here.
    trained_model = healthcareai.load_saved_model('2017-08-16T16-48-02_regression_LinearRegression.pkl')

    # Any saved models can be inspected for properties such as metrics, columns, etc. (More examples are in the docs)
    print(trained_model.metrics)
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # Making predictions from a saved model.
    # Please note that you will likely only need one of these prediction output types. Feel free to delete the others.

    # Make some predictions
    print('\n\n-------------------[ Predictions ]----------------------------------------------------\n')
    predictions = trained_model.make_predictions(prediction_dataframe)
    print(predictions.head())

    # Get the important factors
    print('\n\n-------------------[ Factors ]----------------------------------------------------\n')
    factors = trained_model.make_factors(prediction_dataframe, number_top_features=4)
    print(factors.head())

    # Get predictions + factors
    print('\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n')
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(prediction_dataframe)
    print(predictions_with_factors_df.head())

    # Get original dataframe + predictions + factors
    print('\n\n-------------------[ Original + predictions + factors ]--------------------------\n')
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe)
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # ## Save results to csv
    predictions.to_csv('ClinicalPredictions.csv')
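    # ## Save results to a database table instead (a minimal sketch: the engine builder is the
    # ## same helper referenced in the commented MSSQL load above, the table/schema names are
    # ## placeholders to replace, and pandas' to_sql performs the write)
    # engine = hcai_db.build_mssql_engine(server='localhost', database='SAM')
    # predictions.to_sql('ClinicalPredictions', engine, schema='dbo', if_exists='append', index=False)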
Example #2
def main():
    """Template script for using healthcareai to train a regression model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
    regression_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print(
        '\n\n-------------------[ Cleaned Dataframe ]--------------------------'
    )
    print(regression_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train and evaluate linear regression model
    trained_linear_model = regression_trainer.linear_regression()

    # Train and evaluate random forest model
    trained_random_forest = regression_trainer.random_forest_regression()

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl'
    # Note the filename that gets saved; it will be used in example_regression_2.py
    trained_linear_model.save()
Example #3
def main():
    """Template script for using healthcareai to train a regression model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
    regression_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    print(regression_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train and evaluate linear regression model
    trained_linear_model = regression_trainer.linear_regression()

    # Train and evaluate random forest model
    trained_random_forest = regression_trainer.random_forest_regression()

    # Train and evaluate a lasso model
    trained_lasso = regression_trainer.lasso_regression()
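
    # To compare the three trained regressors side by side, you can inspect each model's saved
    # metrics. This is a minimal sketch: the .metrics property is the same one used in the
    # prediction templates, and the exact metric keys depend on the model type.
    for name, model in [('linear regression', trained_linear_model),
                        ('random forest', trained_random_forest),
                        ('lasso', trained_lasso)]:
        print(name, model.metrics)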
Example #4
def main():
    """Template script for using healthcareai predict using a trained classification model."""
    # Load the included diabetes sample data
    prediction_dataframe = healthcareai.load_diabetes()

    # Uncomment the code below if advanced imputation was used in example_classification_1,
    # because the GenderFLG column was intentionally converted there to a numeric type to
    # demonstrate the numeric_columns_as_categorical feature.
    """
    prediction_dataframe['GenderFLG'].iloc[500:530] = np.NaN
    prediction_dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
    """

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # prediction_dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE ThirtyDayReadmitFLG is null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(prediction_dataframe.head(5))

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the file you saved in example_classification_1.py and set that here.
    trained_model = healthcareai.load_saved_model(
        '2018-10-09T13-53-44_classification_RandomForestClassifier_defaultImputation.pkl'
    )
    # trained_model = healthcareai.load_saved_model('2018-10-09T13-25-28_classification_RandomForestClassifier_advanceImputation.pkl')

    # Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
    trained_model.roc_plot()
    print(trained_model.roc())
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # # Make predictions. Please note that there are four different formats you can choose from. All are shown
    #    here, though you only need one.

    # ## Get predictions
    predictions = trained_model.make_predictions(prediction_dataframe)
    print(
        '\n\n-------------------[ Predictions ]----------------------------------------------------\n'
    )
    print(predictions.head())

    # ## Get the important factors
    factors = trained_model.make_factors(prediction_dataframe,
                                         number_top_features=3)
    print(
        '\n\n-------------------[ Factors ]----------------------------------------------------\n'
    )
    print(factors.head())

    # ## Get predictions with factors
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(
        prediction_dataframe, number_top_features=3)
    print(
        '\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n'
    )
    print(predictions_with_factors_df.head())

    # ## Get original dataframe with predictions and factors
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe, number_top_features=3)
    print(
        '\n\n-------------------[ Original + predictions + factors ]-------------------------------------------\n'
    )
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # Save results to csv
    predictions_with_factors_df.to_csv('ClinicalPredictions.csv')
Example #5
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, and dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of the two.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, unlike scikit-learn
    #   transformers, which return numpy arrays.
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']
    }

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]
    }

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(trained_random_forest,
                                       classification_trainer.x_train,
                                       feature_limit=20,
                                       save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels by a name of your choice
    custom_ensemble = {
        'KNN':
        classification_trainer.knn(hyperparameter_grid=knn_hyperparameters,
                                   randomized_search=False,
                                   scoring_metric='roc_auc'),
        'Logistic Regression':
        classification_trainer.logistic_regression(),
        'Random Forest Classifier':
        classification_trainer.random_forest_classifier(
            randomized_search=False, scoring_metric='roc_auc')
    }

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc', trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))
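
    # Once you are satisfied with one of these models, it can be saved for later prediction
    # just like in the simpler templates (a hedged sketch; uncomment to use).
    # trained_random_forest.save()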
Example #6
def main():
    # Load the included diabetes sample data
    prediction_dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # prediction_dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE ThirtyDayReadmitFLG is null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(prediction_dataframe.head(5))

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the file you saved in example_classification_1.py and set that here.
    trained_model = healthcareai.load_saved_model(
        '2017-08-16T16-45-57_classification_RandomForestClassifier.pkl')

    # Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
    trained_model.roc_plot()
    print(trained_model.roc())
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # # Make predictions. Please note that there are four different formats you can choose from. All are shown
    #    here, though you only need one.

    # ## Get predictions
    predictions = trained_model.make_predictions(prediction_dataframe)
    print(
        '\n\n-------------------[ Predictions ]----------------------------------------------------\n'
    )
    print(predictions.head())

    # ## Get the important factors
    factors = trained_model.make_factors(prediction_dataframe,
                                         number_top_features=3)
    print(
        '\n\n-------------------[ Factors ]----------------------------------------------------\n'
    )
    print(factors.head())

    # ## Get predictions with factors
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(
        prediction_dataframe, number_top_features=3)
    print(
        '\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n'
    )
    print(predictions_with_factors_df.head())

    # ## Get original dataframe with predictions and factors
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe, number_top_features=3)
    print(
        '\n\n-------------------[ Original + predictions + factors ]-------------------------------------------\n'
    )
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # Save results to csv
    predictions_with_factors_df.to_csv('ClinicalPredictions.csv')
Example #7
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, and dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of the two.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, unlike scikit-learn
    #   transformers, which return numpy arrays.
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']}

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10
    )

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]}

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10
    )

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(
        trained_random_forest,
        classification_trainer.x_train,
        feature_limit=20,
        save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels by a name of your choice
    custom_ensemble = {
        'KNN': classification_trainer.knn(
            hyperparameter_grid=knn_hyperparameters,
            randomized_search=False,
            scoring_metric='roc_auc'),
        'Logistic Regression': classification_trainer.logistic_regression(),
        'Random Forest Classifier': classification_trainer.random_forest_classifier(
            randomized_search=False,
            scoring_metric='roc_auc')}

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc',
        trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))
Example #8
def main():
    """Template script for using healthcareai to train a classification model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
    classification_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print(
        '\n\n-------------------[ Cleaned Dataframe ]--------------------------'
    )
    print(classification_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train a KNN model
    trained_knn = classification_trainer.knn()

    # View the ROC and PR plots
    trained_knn.roc_plot()
    trained_knn.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_knn.roc()
    # trained_knn.pr()

    # Train a logistic regression model
    trained_lr = classification_trainer.logistic_regression()

    # View the ROC and PR plots
    trained_lr.roc_plot()
    trained_lr.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_lr.roc()
    # trained_lr.pr()

    # Train a random forest model and view the feature importance plot
    trained_random_forest = classification_trainer.random_forest(
        save_plot=False)
    # View the ROC and PR plots
    trained_random_forest.roc_plot()
    trained_random_forest.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_random_forest.roc()
    # trained_random_forest.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_lr, trained_random_forest]

    # Create a ROC plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the filename that gets saved; it will be used in example_classification_2.py
    trained_random_forest.save()
Example #9
def main():
    """Template script for using healthcareai to train a classification model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
    classification_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)
    """
    The below code demonstrate the advance features for imputation of missing values.
    imputeStrategy: 
        'MeanMode': (default), Impute using mean and mode values of column
        'RandomForest': Impute missing values in RandomForest models.(Imputed values are much more realistic)
    
    tunedRandomForest:
        True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross 
              validation.
    
    numeric_columns_as_categorical :
        For example: GenderFLG (0,0,1,0,1,1 .... )
        So in normal case pandas by default will consider this column as numeric and missing values of this column 
        will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
        
        Thus to explicitly mention such  as categorical there is this option which can be used as below:
            numeric_columns_as_categorical = 'GenderFLG'
        Now imputation will be done by MODE value and final type of the column wil be np.object.
    """

    # Uncomment the code below to see advanced imputation in action.
    """
    # Create missing values in the GenderFLG column and convert it to a numeric type to
    # demonstrate the advanced imputation features (requires numpy as np and pandas as pd).
    dataframe['GenderFLG'].iloc[500:530] = np.NaN
    dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
    pd.options.mode.chained_assignment = None

    classification_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False,
        imputeStrategy='RandomForest',
        tunedRandomForest=True,
        numeric_columns_as_categorical='GenderFLG')
    """

    # Look at the first few rows of your dataframe after loading the data
    print(
        '\n\n-------------------[ Cleaned Dataframe ]--------------------------'
    )
    print(classification_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train a KNN model
    trained_knn = classification_trainer.knn()

    # View the ROC and PR plots
    trained_knn.roc_plot()
    '''
    fig = plt.gcf()
    fig.set_size_inches(6, 4)
    fig.savefig('KNN-ROC.png', dpi=100)
    '''

    trained_knn.pr_plot()
    '''
    fig = plt.gcf()
    fig.set_size_inches(6, 4)
    fig.savefig('KNN-PR.png', dpi=100)
    '''

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_knn.roc()
    # trained_knn.pr()

    # Train a logistic regression model
    trained_lr = classification_trainer.logistic_regression()

    # View the ROC and PR plots
    trained_lr.roc_plot()
    '''
    fig = plt.gcf()
    fig.set_size_inches(6, 4)
    fig.savefig('LR-ROC.png', dpi=100)  
    '''

    trained_lr.pr_plot()
    '''
    fig = plt.gcf()
    fig.set_size_inches(6, 4)
    fig.savefig('LR-PR.png', dpi=100)
    '''
    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_lr.roc()
    # trained_lr.pr()

    # Train a random forest model and view the feature importance plot
    trained_random_forest = classification_trainer.random_forest(
        save_plot=False)
    # View the ROC and PR plots
    trained_random_forest.roc_plot()
    trained_random_forest.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_random_forest.roc()
    # trained_random_forest.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_lr, trained_random_forest]

    # Create a ROC plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)
Example #10
def main():
    """Template script for using healthcareai predict using a trained regression model."""
    # Load the included diabetes sample data
    prediction_dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # prediction_dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE SystolicBPNBR is null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(prediction_dataframe.head(5))

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl'
    # Note the file you saved in example_regression_1.py and set that here.
    trained_model = healthcareai.load_saved_model('2017-08-16T16-48-02_regression_LinearRegression.pkl')

    # Any saved models can be inspected for properties such as metrics, columns, etc. (More examples are in the docs)
    print(trained_model.metrics)
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # Making predictions from a saved model.
    # Please note that you will likely only need one of these prediction output types. Feel free to delete the others.

    # Make some predictions
    print('\n\n-------------------[ Predictions ]----------------------------------------------------\n')
    predictions = trained_model.make_predictions(prediction_dataframe)
    print(predictions.head())

    # Get the important factors
    print('\n\n-------------------[ Factors ]----------------------------------------------------\n')
    factors = trained_model.make_factors(prediction_dataframe, number_top_features=4)
    print(factors.head())

    # Get predictions + factors
    print('\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n')
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(prediction_dataframe)
    print(predictions_with_factors_df.head())

    # Get original dataframe + predictions + factors
    print('\n\n-------------------[ Original + predictions + factors ]--------------------------\n')
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe)
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # ## Save results to csv
    predictions.to_csv('ClinicalPredictions.csv')
Example #11
import healthcareai
from healthcareai.common import feature_availability_profiler

dataframe = healthcareai.load_diabetes()

# To profile feature availability, call the profiler function on a dataframe that has admit
# and last-load datetime columns. The keyword names below follow the healthcareai docs but
# should be treated as assumptions; the diabetes sample data does not include these columns.
# feature_availability_profiler.feature_availability_profiler(
#     dataframe, admit_col_name='AdmitDTS', last_load_col_name='LastLoadDTS')
Example #12
def main():
    """Template script for using healthcareai to train a regression model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))
    
    # Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
    regression_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)
    
    
    """
    The below code demonstrate the advance features for imputation of missing values.
    imputeStrategy: 
        'MeanMode': (default), Impute using mean and mode values of column
        'RandomForest': Impute missing values in RandomForest models. (Imputed values are much more realistic)
    
    tunedRandomForest:
        True: ML to be used for imputation of missing values are tuned using grid search and K-fold cross 
              validation.
    
    numeric_columns_as_categorical :
        For example: GenderFLG (0,0,1,0,1,1 .... )
        So in normal case pandas by default will consider this column as numeric and missing values of this column 
        will be imputed using MEAN value (ex. 0.78 or 1.46 ....).
        
        Thus to explicitly mention such  as categorical there is this option which can be used as below:
            numeric_columns_as_categorical = 'GenderFLG'
        Now imputation will be done by MODE value and final type of the column wil be np.object.
    """
    
    # Uncomment the code below to see advanced imputation in action.
    """
    # Create missing values in the GenderFLG column and convert it to a numeric type to
    # demonstrate the advanced imputation features (requires numpy as np and pandas as pd).
    dataframe['GenderFLG'].iloc[500:530] = np.NaN
    dataframe['GenderFLG'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)
    pd.options.mode.chained_assignment = None

    regression_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False,
        imputeStrategy='RandomForest',
        tunedRandomForest=True,
        numeric_columns_as_categorical='GenderFLG')
    """
    
    

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    print(regression_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train and evaluate linear regression model
    trained_linear_model = regression_trainer.linear_regression()

    # Train and evaluate random forest model
    trained_random_forest = regression_trainer.random_forest_regression()

    # Train and evaluate a lasso model
    trained_lasso = regression_trainer.lasso_regression()
Example #13
import requests
import healthcareai
import json

# URL of a locally running prediction service
url = 'http://localhost:5000/'

# Two alternative payload formats that are built here but not used by the request below:
# the full sample dataset serialized to JSON, and a single record as a JSON-encoded list.
df = healthcareai.load_diabetes().to_json()
pred = [3, 10001, 170, 191, 4, 'M']
pred = json.dumps(pred)

# Post a single record as JSON and print the service's response
r = requests.post(url,
                  json={
                      'PatientEncounterID': 1,
                      'PatientID': 2,
                      'SystolicBPNBR': 170,
                      'LDLNBR': 140,
                      'A1CNBR': 4.5,
                      'GenderFLG': 'M'
                  })
print(r.json())
Example #14
def main():
    """Template script for using healthcareai to train a classification model."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
    classification_trainer = healthcareai.SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    print(classification_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train a KNN model
    trained_knn = classification_trainer.knn()

    # View the ROC and PR plots
    trained_knn.roc_plot()
    trained_knn.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_knn.roc()
    # trained_knn.pr()

    # Train a logistic regression model
    trained_lr = classification_trainer.logistic_regression()

    # View the ROC and PR plots
    trained_lr.roc_plot()
    trained_lr.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_lr.roc()
    # trained_lr.pr()

    # Train a random forest model and view the feature importance plot
    trained_random_forest = classification_trainer.random_forest(save_plot=False)
    # View the ROC and PR plots
    trained_random_forest.roc_plot()
    trained_random_forest.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_random_forest.roc()
    # trained_random_forest.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_lr, trained_random_forest]

    # Create a ROC plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the filename that gets saved; it will be used in example_classification_2.py
    trained_random_forest.save()
Example #15
from flask import Flask, jsonify
import pandas as pd
import healthcareai

app = Flask(__name__)

# Load the previously saved/trained model into a variable
trained_model = healthcareai.load_saved_model(
    '2020-02-22T19-17-50_classification_RandomForestClassifier.pkl')

# The data to make predictions on; it could also arrive as JSON, but here the simple sample dataset is used
prediction_dataframe = healthcareai.load_diabetes()


@app.route("/")
def hello():
    return "Hello Friend!"


@app.route("/predict")
def predict():
    # Run the incoming data through the model
    predictions = trained_model.make_predictions(prediction_dataframe)
    # Convert the result to JSON (one dict per row) and return it to the caller
    print(predictions)
    return jsonify({'predictions': predictions.to_dict(orient='records')})


if __name__ == '__main__':
    app.run(debug=True)
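
# A minimal client for the service above (a sketch assuming Flask's default port 5000):
# start the app, then from another process run:
#
#     import requests
#     print(requests.get('http://localhost:5000/predict').json())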
Example #16
def main():
    """Template script for using healthcareai predict using a trained classification model."""
    # Load the included diabetes sample data
    prediction_dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # prediction_dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE ThirtyDayReadmitFLG is null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(prediction_dataframe.head(5))

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the file you saved in example_classification_1.py and set that here.
    trained_model = healthcareai.load_saved_model('2017-08-16T16-45-57_classification_RandomForestClassifier.pkl')

    # Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
    trained_model.roc_plot()
    print(trained_model.roc())
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # # Make predictions. Please note that there are four different formats you can choose from. All are shown
    #    here, though you only need one.

    # ## Get predictions
    predictions = trained_model.make_predictions(prediction_dataframe)
    print('\n\n-------------------[ Predictions ]----------------------------------------------------\n')
    print(predictions.head())

    # ## Get the important factors
    factors = trained_model.make_factors(prediction_dataframe, number_top_features=3)
    print('\n\n-------------------[ Factors ]----------------------------------------------------\n')
    print(factors.head())

    # ## Get predictions with factors
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(prediction_dataframe,
                                                                                number_top_features=3)
    print('\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n')
    print(predictions_with_factors_df.head())

    # ## Get original dataframe with predictions and factors
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe, number_top_features=3)
    print('\n\n-------------------[ Original + predictions + factors ]-------------------------------------------\n')
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # Save results to csv
    predictions_with_factors_df.to_csv('ClinicalPredictions.csv')