def train_ml_model(X_train, y_train, alg, groups, output_folderpath, random_state):
    hyperparams = {}
    hyperparams['algorithm'] = alg
    ml_classifier = SupervisedClassifier(classes=[0, 1], hyperparams=hyperparams)
    status = ml_classifier.train(X_train, y_train, groups=groups)
    return ml_classifier
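# A minimal call sketch for the wrapper above. Note that, as written, it
# ignores output_folderpath and random_state and discards the train() status.
# Argument values here are illustrative placeholders; the 'random-forest'
# algorithm string appears elsewhere in this codebase.
ml_classifier = train_ml_model(X_train, y_train, alg='random-forest',
                               groups=pat_ids_train,
                               output_folderpath='/tmp/model-out',  # unused by the wrapper
                               random_state=0)                      # unused by the wrapper
y_pred = ml_classifier.predict(X_test)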
def __init__(self, classes, hyperparams):
    if hyperparams['bifurcation_strategy'] not in BifurcatedSupervisedClassifier.SUPPORTED_BIFURCATION_STRATEGIES:
        raise ValueError('Bifurcation strategy %s not supported.' %
                         hyperparams['bifurcation_strategy'])

    self._classes = classes
    self._hyperparams = hyperparams
    # Note that if we don't pass copies of hyperparams, then we won't
    # be able to change hyperparams independently in the two classifiers.
    self._sc_true = SupervisedClassifier(classes, hyperparams.copy())
    self._sc_false = SupervisedClassifier(classes, hyperparams.copy())
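# The .copy() calls above are the crux: dicts are passed by reference in
# Python, so without copies the two inner classifiers would share (and
# mutate) a single hyperparams dict. A minimal standalone sketch of the
# failure mode, using plain dicts with no repo dependencies:
shared = {'algorithm': 'random-forest'}
alias = shared                 # alias, not a copy: both names point at one dict
alias['algorithm'] = 'decision-tree'
print(shared['algorithm'])     # 'decision-tree' -- the "other" dict changed too

independent = shared.copy()    # shallow copy: independent top-level keys
independent['algorithm'] = 'regress-and-round'
print(shared['algorithm'])     # still 'decision-tree' -- unaffected
# Caveat: dict.copy() is shallow, so nested mutable values would still be shared.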
def test_train_and_predict(self):
    # Load data set.
    X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                  columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'])
    y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
    random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
    expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
    expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
    expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
    expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
    expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

    # Generate train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Iterate through SUPPORTED_ALGORITHMS.
    for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        log.info('Testing %s classifier...' % algorithm)

        # Train model.
        hyperparams = {'algorithm': algorithm, 'random_state': random_state}
        # Default to stochastic search for expensive algorithms.
        if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
            # Test ability to force hyperparam values.
            hyperparams['max_depth'] = 2
            hyperparams['n_estimators'] = 5
            hyperparams['min_samples_leaf'] = 1
            hyperparams['min_samples_split'] = 0.2
        else:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        classifier = SupervisedClassifier([0, 1], hyperparams)
        classifier.train(X_train, y_train)

        # Test str().
        expected_str = expected_str_by_algorithm[algorithm]
        actual_str = str(classifier)
        self.assertEqual(expected_str, actual_str)

        # Test hyperparameters.
        expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
        actual_hyperparams = classifier.hyperparams()
        self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

        # Test model parameters.
        expected_params = expected_params_by_algorithm[algorithm]
        actual_params = classifier.params()
        self.assertEqualDict(expected_params, actual_params)

        # Test model description.
        expected_description = expected_descriptions_by_algorithm[algorithm]
        actual_description = classifier.description()
        self.assertEqual(expected_description, actual_description)

        # Test prediction values.
        expected_y_pred = expected_y_pred_by_algorithm[algorithm]
        log.debug('expected_y_pred: %s' % expected_y_pred)
        actual_y_pred = classifier.predict(X_test)
        log.debug('actual_y_pred: %s' % actual_y_pred)
        self.assertEqualList(expected_y_pred, actual_y_pred)
def run(self):
    file_organizer = Syst.FileOrganizerLocal(
        working_folderpath=self.working_folderpath)

    raw_matrix_train, raw_matrix_test = Utils.split_rows(self.input_matrix)

    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=self.ylabel)

    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[('impute_features', Clas.FeatureImputer()),
               ('remove_features', Clas.FeatureRemover()),
               ('select_features', Clas.Select_Features())])

    X_train_processed = feature_processing_pipeline.fit_transform(
        X_train_raw, y_train)

    predictor = SupervisedClassifier(
        classes=[0, 1],
        hyperparams={
            'algorithm': 'random-forest',
            'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
            'max_iter': 1024
        })
    status = predictor.train(X_train_processed, column_or_1d(y_train))

    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=self.ylabel)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath())

    # TODO
    from scripts.LabTestAnalysis.lab_statistics.stats_utils import get_confusion_metrics
    from sklearn.metrics import roc_auc_score
    AUC = roc_auc_score(y_test, y_test_pred_proba)
    sensitivity, specificity, LR_p, LR_n, PPV, NPV = get_confusion_metrics(
        actual_labels=y_test.values, predict_probas=y_test_pred_proba,
        threshold=0.5)
    print("AUC: %s, sensitivity: %s, specificity: %s, LR_p: %s, LR_n: %s, PPV: %s, NPV: %s."
          % (AUC, sensitivity, specificity, LR_p, LR_n, PPV, NPV))
def setUp(self):
    log.level = logging.ERROR

    # Use simple classifier and test case for testing non-ROC analyses.
    X = RANDOM_10_TEST_CASE['X']
    y = RANDOM_10_TEST_CASE['y']
    self._list_classifier = ListPredictor([0, 1])
    self._lc_analyzer = ClassifierAnalyzer(self._list_classifier, X, y)

    # Use ml classifier and complex test case.
    X = RANDOM_100_TEST_CASE['X']
    y = RANDOM_100_TEST_CASE['y']
    # Generate train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=123456789)
    # Train logistic regression model.
    hyperparams = {
        'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
        'random_state': 123456789
    }
    self._ml_classifier = SupervisedClassifier([0, 1], hyperparams)
    self._ml_classifier.train(X_train, column_or_1d(y_train))
    self._ml_analyzer = ClassifierAnalyzer(self._ml_classifier, X_test, y_test)
def test_init(self):
    # Test unspecified algorithm.
    classifier = SupervisedClassifier([0, 1])
    self.assertEqual(classifier.algorithm(),
                     SupervisedClassifier.LOGISTIC_REGRESSION)

    # Test unsupported algorithm.
    with self.assertRaises(ValueError):
        hyperparams = {'algorithm': 'foo'}
        SupervisedClassifier([0, 1], hyperparams)

    # Confirm specified algorithm selection.
    hyperparams = {'algorithm': SupervisedClassifier.DECISION_TREE}
    classifier = SupervisedClassifier([0, 1], hyperparams)
    self.assertEqual(classifier.algorithm(), SupervisedClassifier.DECISION_TREE)
class TestClassifierAnalyzer(MedInfoTestCase):
    def setUp(self):
        log.level = logging.ERROR

        # Use simple classifier and test case for testing non-ROC analyses.
        X = RANDOM_10_TEST_CASE['X']
        y = RANDOM_10_TEST_CASE['y']
        self._list_classifier = ListPredictor([0, 1])
        self._lc_analyzer = ClassifierAnalyzer(self._list_classifier, X, y)

        # Use ml classifier and complex test case.
        X = RANDOM_100_TEST_CASE['X']
        y = RANDOM_100_TEST_CASE['y']
        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=123456789)
        # Train logistic regression model.
        hyperparams = {
            'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
            'random_state': 123456789
        }
        self._ml_classifier = SupervisedClassifier([0, 1], hyperparams)
        self._ml_classifier.train(X_train, column_or_1d(y_train))
        self._ml_analyzer = ClassifierAnalyzer(self._ml_classifier, X_test, y_test)

    def tearDown(self):
        test_dir = os.path.dirname(os.path.abspath(__file__))
        # Clean up the actual report file.
        try:
            actual_report_name = 'actual-list-classifier.report'
            actual_report_path = '/'.join([test_dir, actual_report_name])
            os.remove(actual_report_path)
        except OSError:
            pass
        # Clean up the actual precision-recall plot.
        try:
            actual_plot_name = 'actual-precision-recall-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass
        # Clean up the actual roc plot.
        try:
            actual_plot_name = 'actual-roc-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass
        # Clean up the actual precision at k plot.
        try:
            actual_plot_name = 'actual-precision-at-k-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass

    def _assert_fuzzy_equality(self, expected, actual):
        # Use the absolute relative difference; a signed difference would
        # trivially pass whenever actual < expected.
        rel_diff = abs(actual - expected) / expected
        self.assertTrue(rel_diff < 0.1)

    def test_score_accuracy(self):
        # Test accuracy.
        expected_accuracy = RANDOM_10_TEST_CASE['accuracy']
        actual_accuracy = self._lc_analyzer.score()
        self.assertEqual(expected_accuracy, actual_accuracy)

        # Test accuracy.
        expected_accuracy = RANDOM_100_TEST_CASE['accuracy']
        actual_accuracy = self._ml_analyzer.score()
        self.assertEqual(expected_accuracy, actual_accuracy)

        # Test bootstrapped CIs.
        actual_accuracy, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            ci=0.95, n_bootstrap_iter=1000)
        self.assertEqual(expected_accuracy, actual_accuracy)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['accuracy']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['accuracy']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_recall(self):
        # Test recall.
        expected_recall = RANDOM_10_TEST_CASE['recall']
        actual_recall = self._lc_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE)
        self.assertEqual(expected_recall, actual_recall)

        # Test recall.
        expected_recall = RANDOM_100_TEST_CASE['recall']
        actual_recall = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE)
        self.assertEqual(expected_recall, actual_recall)

        # Test bootstrapped CIs.
        actual_recall, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE, ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_recall, actual_recall)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['recall']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['recall']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_precision(self):
        # Test precision.
        expected_precision = RANDOM_10_TEST_CASE['precision']
        actual_precision = self._lc_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE)
        self.assertEqual(expected_precision, actual_precision)

        # Test precision.
        expected_precision = RANDOM_100_TEST_CASE['precision']
        actual_precision = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE)
        self.assertEqual(expected_precision, actual_precision)

        # Test bootstrapped CIs.
        actual_precision, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE, ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_precision, actual_precision)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['precision']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['precision']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_f1(self):
        # Test F1 score.
        expected_f1 = RANDOM_10_TEST_CASE['f1']
        actual_f1 = self._lc_analyzer.score(metric=ClassifierAnalyzer.F1_SCORE)
        self.assertEqual(expected_f1, actual_f1)

        # Test f1.
        expected_f1 = RANDOM_100_TEST_CASE['f1']
        actual_f1 = self._ml_analyzer.score(metric=ClassifierAnalyzer.F1_SCORE)
        self.assertEqual(expected_f1, actual_f1)

        # Test bootstrapped CIs.
        actual_f1, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.F1_SCORE, ci=0.95, n_bootstrap_iter=1000)
        self.assertEqual(expected_f1, actual_f1)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['f1']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['f1']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_average_precision(self):
        # Test average precision.
        expected_average_precision = RANDOM_100_TEST_CASE['average_precision']
        actual_average_precision = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.AVERAGE_PRECISION_SCORE)
        self.assertEqual(expected_average_precision, actual_average_precision)

        # Test bootstrapped CIs.
        actual_average_precision, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.AVERAGE_PRECISION_SCORE, ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_average_precision, actual_average_precision)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['average_precision']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['average_precision']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_roc_auc(self):
        # Test roc_auc.
        expected_roc_auc = RANDOM_100_TEST_CASE['roc_auc']
        actual_roc_auc = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.ROC_AUC_SCORE)
        self.assertEqual(expected_roc_auc, actual_roc_auc)

        # Test bootstrapped CIs.
        actual_roc_auc, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.ROC_AUC_SCORE, ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_roc_auc, actual_roc_auc)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['roc_auc']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['roc_auc']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_precision_at_k(self):
        # Test precision at K.
        prev_precision = 1.0
        for k in range(1, 20):
            actual_precision_at_k = self._ml_analyzer.score(
                metric=ClassifierAnalyzer.PRECISION_AT_K_SCORE, k=k)
            expected_precision_at_k = RANDOM_100_TEST_CASE['precision_at_k'][k]
            self.assertEqual(expected_precision_at_k, actual_precision_at_k)

        # Test bootstrapped CIs.
        actual_precision_at_k, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_AT_K_SCORE, k=10, ci=0.95,
            n_bootstrap_iter=1000)
        expected_precision_at_k = RANDOM_100_TEST_CASE['precision_at_k'][10]
        self.assertEqual(expected_precision_at_k, actual_precision_at_k)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['precision_at_k']['lower'][10]
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['precision_at_k']['upper'][10]
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_percent_predictably_positive(self):
        # Test percent predictably positive.
        expected_ppp = RANDOM_100_TEST_CASE['percent_predictably_positive']
        actual_ppp = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PERCENT_PREDICTABLY_POSITIVE)
        self.assertEqual(expected_ppp, actual_ppp)

        # Test bootstrapped CIs.
        actual_ppp, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PERCENT_PREDICTABLY_POSITIVE, ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_ppp, actual_ppp)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['percent_predictably_positive']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['percent_predictably_positive']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_plot_precision_recall_curve(self):
        # Compute precision-recall curve.
        precision_recall_curve = self._ml_analyzer.compute_precision_recall_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-precision-recall-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])
        self._ml_analyzer.plot_precision_recall_curve('Precision-Recall Curve',
                                                      actual_plot_path)
        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_plot_roc_curve(self):
        # Compute ROC curve.
        roc_curve = self._ml_analyzer.compute_roc_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-roc-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])
        self._ml_analyzer.plot_roc_curve('ROC', actual_plot_path)
        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_plot_precision_at_k_curve(self):
        # Compute precision-at-k curve.
        k_vals, precision_vals = self._ml_analyzer.compute_precision_at_k_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-precision-at-k-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])
        self._ml_analyzer.plot_precision_at_k_curve('Precision at K',
                                                    actual_plot_path)
        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_build_report(self):
        # Build report.
        expected_report = RANDOM_100_TEST_CASE['report']
        actual_report = self._ml_analyzer.build_report()[0]
        log.debug('expected_report: %s' % expected_report)
        log.debug('actual_report: %s' % actual_report)
        assert_frame_equal(expected_report, actual_report)

        # Build bootstrapped report.
        expected_report = RANDOM_100_TEST_CASE['ci']['report']
        actual_report = self._ml_analyzer.build_report(ci=0.95)[0]
        assert_frame_equal(expected_report, actual_report)

        # Build paths for expected and actual report.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_report_name = 'actual-list-classifier.report'
        actual_report_path = '/'.join([test_dir, actual_report_name])

        # Write the report.
        self._ml_analyzer.write_report(actual_report_path)
        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)
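# Taken together, the tests above document the ClassifierAnalyzer scoring
# API: score() defaults to accuracy, accepts a metric constant, and returns
# a (value, lower_ci, upper_ci) tuple when ci/n_bootstrap_iter are given.
# A minimal usage sketch under those assumptions; trained_classifier,
# X_test, and y_test are placeholders:
analyzer = ClassifierAnalyzer(trained_classifier, X_test, y_test)
accuracy = analyzer.score()  # defaults to accuracy
roc_auc, lower_ci, upper_ci = analyzer.score(
    metric=ClassifierAnalyzer.ROC_AUC_SCORE, ci=0.95, n_bootstrap_iter=1000)
analyzer.plot_roc_curve('ROC', 'roc-plot.png')
report_df = analyzer.build_report(ci=0.95)[0]
analyzer.write_report('classifier.report')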
def _train_predictor(self):
    self._predictor = SupervisedClassifier(
        algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
    self._predictor.train(self._X_train, column_or_1d(self._y_train))
class ConditionMortalityPredictor:
    def __init__(self, condition, num_patients, icd_list=None, use_cache=None):
        self._condition = condition
        self._num_patients = num_patients
        self._icd_list = icd_list
        self._FEATURES_TO_REMOVE = [
            'index_time', 'death_date', 'Death.post', 'Death.postTimeDays',
            'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays'
        ]
        self._eliminated_features = list()

        self._build_cmm_names()

        # Only rebuild the raw feature matrix if there is no cached copy.
        if use_cache is None:
            self._build_raw_feature_matrix()

        print('Processing raw feature matrix...')
        self._process_raw_feature_matrix()
        print('Training predictor...')
        self._train_predictor()
        print('Testing predictor...')
        self._test_predictor()

    def _build_cmm_names(self):
        slugified_condition = "-".join(self._condition.split())
        self._build_cmm_name_raw(slugified_condition, self._num_patients)
        self._build_cmm_name_processed(slugified_condition, self._num_patients)

    def _build_cmm_name_raw(self, slugified_condition, num_patients):
        template = '%s-mortality-matrix-%d-pat-raw.tab'
        self._cmm_name_raw = template % (slugified_condition, num_patients)

    def _build_cmm_name_processed(self, slugified_condition, num_patients):
        template = '%s-mortality-matrix-%d-pat-processed.tab'
        self._cmm_name_processed = template % (slugified_condition, num_patients)

    def _build_raw_feature_matrix(self):
        self._cmm = ConditionMortalityMatrix(self._condition,
                                             self._num_patients,
                                             self._cmm_name_raw,
                                             self._icd_list)

    def _process_raw_feature_matrix(self):
        # Read raw CMM.
        self._fm_io = FeatureMatrixIO()
        print('Reading raw matrix...')
        self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

        # Add and remove features to _cmm_processed.
        self._fmt = FeatureMatrixTransform()
        self._fmt.set_input_matrix(self._cmm_raw)
        print('Adding features...')
        self._add_features()
        print('Imputing data...')
        self._impute_data()
        self._remove_features()
        self._fmt.drop_duplicate_rows()
        self._cmm_processed = self._fmt.fetch_matrix()

        # Divide _cmm_processed into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        self._train_test_split()
        print('Selecting features...')
        self._select_features()

        # Write output to new matrix.
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        self._cmm_processed = train.append(test)
        header = self._build_processed_matrix_header()
        self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                             self._cmm_name_processed, header)

    def _build_processed_matrix_header(self):
        # FeatureMatrixFactory and FeatureMatrixIO expect a list of strings,
        # one per header line.
        header = list()
        # <file_name.tab>
        file_name = self._cmm_name_processed
        header.append(file_name)
        # Created: <timestamp>
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        header.append('Created: %s' % timestamp)
        header.append('Source: %s' % __name__)
        # Command: ConditionMortalityPredictor(...)
        if self._icd_list:
            command = 'ConditionMortalityPredictor(%s, %s, %s)' % \
                (self._condition, self._num_patients, self._icd_list)
        else:
            command = 'ConditionMortalityPredictor(%s, %s)' % \
                (self._condition, self._num_patients)
        header.append('Command: %s' % command)
        header.append('')
        header.append('Overview:')
        line = 'This file is a post-processed version of %s.' % self._cmm_name_raw
        header.append(line)
        line = 'The outcome label is I(0<=Death.postTimeDays<=28), which is a boolean indicator'
        header.append(line)
        line = 'for whether the patient given by pat_id passed away within 28 days'
        header.append(line)
        line = 'of the time index represented by a given row.'
        header.append(line)
        line = 'This matrix is the result of the following processing steps on the raw matrix:'
        header.append(line)
        line = '  (1) Imputing missing values with the mean value of each column.'
        header.append(line)
        line = '  (2) Manually removing low-information features:'
        header.append(line)
        line = '      %s' % str(self._FEATURES_TO_REMOVE)
        header.append(line)
        line = '  (3) Algorithmically selecting the top 100 features via recursive feature elimination.'
        header.append(line)
        line = '      The following features were eliminated:'
        header.append(line)
        # List all features with rank >100.
        line = '      %s' % str(self._eliminated_features)
        header.append(line)
        header.append('')
        line = 'Each row represents a decision point (proxied by clinical order).'
        header.append(line)
        line = "Each row contains fields summarizing the patient's demographics,"
        header.append(line)
        line = 'inpatient admit date, prior vitals, and prior lab results.'
        header.append(line)
        line = "Most cells in matrix represent a count statistic for an event's"
        header.append(line)
        line = "occurrence or a difference between an event's time and index_time."
        header.append(line)
        header.append('')
        header.append('Fields:')
        header.append('  pat_id - ID # for patient in the STRIDE data set.')
        header.append('  index_time - time at which clinical decision was made.')
        header.append('  death_date - if patient died, date on which they died.')
        header.append('  AdmitDxDate.[clinical_item] - admit diagnosis, pegged to admit date.')
        header.append("  Birth.preTimeDays - patient's age in days.")
        header.append('  [Male|Female].pre - is patient male/female (binary)?')
        header.append('  [RaceX].pre - is patient race [X]?')
        header.append('  Team.[specialty].[clinical_item] - specialist added to treatment team.')
        header.append('  Comorbidity.[disease].[clinical_item] - disease added to problem list.')
        header.append('  ___.[flowsheet] - measurements for flowsheet biometrics.')
        header.append('    Includes BP_High_Systolic, BP_Low_Diastolic, FiO2,')
        header.append('    Glasgow Coma Scale Score, Pulse, Resp, Temp, and Urine.')
        header.append('  ___.[lab_result] - lab component results.')
        header.append('    Included standard components: WBC, HCT, PLT, NA, K, CO2, BUN,')
        header.append('    CR, TBIL, ALB, CA, LAC, ESR, CRP, TNI, PHA, PO2A, PCO2A,')
        header.append('    PHV, PO2V, PCO2V')
        header.append('')
        header.append('  [clinical_item] fields may have the following suffixes:')
        header.append('    ___.pre - how many times has this occurred before order_time?')
        header.append('    ___.pre.Xd - how many times has this occurred within X days before index_time?')
        header.append('    ___.preTimeDays - how many days before order_time was last occurrence?')
        header.append('')
        header.append('  [flowsheet] and [lab_result] fields may have the following suffixes:')
        header.append('    ___.X_Y.count - # of result values between X and Y days of index_time.')
        header.append('    ___.X_Y.countInRange - # of result values in normal range.')
        header.append('    ___.X_Y.min - minimum result value.')
        header.append('    ___.X_Y.max - maximum result value.')
        header.append('    ___.X_Y.median - median result value.')
        header.append('    ___.X_Y.std - standard deviation of result values.')
        header.append('    ___.X_Y.first - first result value.')
        header.append('    ___.X_Y.last - last result value.')
        header.append('    ___.X_Y.diff - difference between penultimate and proximate values.')
        header.append('    ___.X_Y.slope - slope between penultimate and proximate values.')
        header.append('    ___.X_Y.proximate - closest result value to order_time.')
        header.append('    ___.X_Y.firstTimeDays - time between first and order_time.')
        header.append('    ___.X_Y.lastTimeDays - time between last and order_time.')
        header.append('    ___.X_Y.proximateTimeDays - time between proximate and order_time.')

        return header

    def _train_predictor(self):
        self._predictor = SupervisedClassifier(
            algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
        self._predictor.train(self._X_train, column_or_1d(self._y_train))

    def _train_test_split(self):
        y = pd.DataFrame(
            self._cmm_processed.pop('I(0<=Death.postTimeDays<=28)'))
        # Without this line, sklearn complains about the format of y.
        # "DataConversionWarning: A column-vector y was passed when a 1d array
        # was expected. Please change the shape of y to (n_samples, ), for
        # example using ravel()."
        # Note that this turns y into a numpy array, so need to cast back.
        # y = y.values.ravel()
        X = self._cmm_processed
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
            X, y, shuffle=False)

    def _impute_data(self):
        # Impute missing values with mean value.
        for feature in self._cmm_raw.columns.values:
            if feature in self._FEATURES_TO_REMOVE:
                continue
            # If all values are null, just remove the feature.
            # Otherwise, imputation will fail (there's no mean value),
            # and sklearn will ragequit.
            if self._cmm_raw[feature].isnull().all():
                self._fmt.remove_feature(feature)
                self._eliminated_features.append(feature)
            # Only try to impute if some of the values are null.
            elif self._cmm_raw[feature].isnull().any():
                # TODO(sbala): Impute all time features with non-mean value.
                self._fmt.impute(feature)

    def _add_features(self):
        # Add threshold feature indicating whether death date
        # is within 28 days of index time.
        self._fmt.add_threshold_feature('Death.postTimeDays',
                                        lower_bound=0, upper_bound=28)

    def _remove_features(self):
        # Prune obviously unhelpful fields.
        # In theory, FeatureSelector should be able to prune these, but no
        # reason not to help it out a little bit.
        for feature in self._FEATURES_TO_REMOVE:
            self._fmt.remove_feature(feature)

    def _select_features(self):
        # Use FeatureSelector to prune all but the top 1% of variables.
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                             problem=FeatureSelector.CLASSIFICATION)

        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(0.01 * len(self._X_train.columns.values))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        self._feature_ranks = fs.compute_ranks()
        for i in range(len(self._feature_ranks)):
            if self._feature_ranks[i] > num_features_to_select:
                self._eliminated_features.append(self._X_train.columns[i])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)

    def _test_predictor(self):
        self._accuracy = self._predictor.compute_accuracy(
            self._X_test, self._y_test)

    def predict(self, X):
        return self._predictor.predict(X)

    def summarize(self):
        summary_lines = list()

        # Condition: condition
        condition = self._condition
        line = 'Condition: %s' % condition
        summary_lines.append(line)

        # Algorithm: SupervisedClassifier(algorithm)
        algorithm = 'SupervisedClassifier(REGRESS_AND_ROUND)'
        line = 'Algorithm: %s' % algorithm
        summary_lines.append(line)

        # Train/Test Size: training_size, test_size
        training_size = self._X_train.shape[0]
        test_size = self._X_test.shape[0]
        line = 'Train/Test Size: %s/%s' % (training_size, test_size)
        summary_lines.append(line)

        # Model: sig_features
        coefs = self._predictor.coefs()
        cols = self._X_train.columns
        sig_features = [(coefs[cols.get_loc(f)], f) for f in cols.values
                        if coefs[cols.get_loc(f)] > 0]
        linear_model = ' + '.join('%s*%s' % (weight, feature)
                                  for weight, feature in sig_features)
        line = 'Model: logistic(%s)' % linear_model
        summary_lines.append(line)

        # Baseline Episode Mortality: episode_mortality
        counts = self._y_test[self._y_test.columns[0]].value_counts()
        line = 'Baseline Episode Mortality: %s/%s' % (counts[1], test_size)
        summary_lines.append(line)

        # AUC: auc
        auc = self._predictor.compute_roc_auc(self._X_test, self._y_test)
        line = 'AUC: %s' % auc
        summary_lines.append(line)

        # Accuracy: accuracy
        line = 'Accuracy: %s' % self._accuracy
        summary_lines.append(line)

        return '\n'.join(summary_lines)
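# A minimal usage sketch for ConditionMortalityPredictor; the condition
# string and patient count are hypothetical placeholders. Construction
# builds (or reuses) the raw matrix, processes it, and trains/tests the
# predictor, so summarize() is ready immediately afterwards.
predictor = ConditionMortalityPredictor('pneumonia', 1000)
print(predictor.summarize())       # condition, model, train/test sizes, AUC, accuracy
y_pred = predictor.predict(X_new)  # X_new: a matrix with the selected feature columns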
class BifurcatedSupervisedClassifier:
    BIFURCATION = 'bifurcation'
    EQUAL = '=='
    LTE = '<='
    GTE = '>='
    SUPPORTED_BIFURCATION_STRATEGIES = [EQUAL, GTE, LTE]

    def __init__(self, classes, hyperparams):
        if hyperparams['bifurcation_strategy'] not in BifurcatedSupervisedClassifier.SUPPORTED_BIFURCATION_STRATEGIES:
            raise ValueError('Bifurcation strategy %s not supported.' %
                             hyperparams['bifurcation_strategy'])

        self._classes = classes
        self._hyperparams = hyperparams
        # Note that if we don't pass copies of hyperparams, then we won't
        # be able to change hyperparams independently in the two classifiers.
        self._sc_true = SupervisedClassifier(classes, hyperparams.copy())
        self._sc_false = SupervisedClassifier(classes, hyperparams.copy())

    def __repr__(self):
        bs = self._build_bifurcation_str()
        classes_str = str(self._classes)
        hyperparams_str = ("hyperparams={'algorithm': %s, 'bifurcator': %s, "
                           "'bifurcation_strategy': %s, 'bifurcation_value': %s, "
                           "'random_state': %s}") % (
            self._hyperparams['algorithm'], self._hyperparams['bifurcator'],
            self._hyperparams['bifurcation_strategy'],
            self._hyperparams['bifurcation_value'],
            self._hyperparams['random_state'])
        s = "BifurcatedSupervisedClassifier(%s, %s)" % (classes_str,
                                                        hyperparams_str)
        return s

    __str__ = __repr__

    def _build_bifurcation_str(self):
        args = (self._hyperparams['bifurcator'],
                self._hyperparams['bifurcation_strategy'],
                self._hyperparams['bifurcation_value'])
        return '%s %s %s' % args

    def fetch_bifurcation_masks(self, X):
        log.debug('bifurcator: %s' % self._hyperparams['bifurcator'])
        log.debug('bifurcation_strategy: %s' %
                  self._hyperparams['bifurcation_strategy'])
        log.debug('bifurcation_value: %s' %
                  self._hyperparams['bifurcation_value'])
        # Compare strategies with ==, not "is": identity checks on strings
        # only work by accident of interning.
        if self._hyperparams['bifurcation_strategy'] == BifurcatedSupervisedClassifier.EQUAL:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) == self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) != self._hyperparams['bifurcation_value']
        elif self._hyperparams['bifurcation_strategy'] == BifurcatedSupervisedClassifier.LTE:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) <= self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) > self._hyperparams['bifurcation_value']
        elif self._hyperparams['bifurcation_strategy'] == BifurcatedSupervisedClassifier.GTE:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) >= self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) < self._hyperparams['bifurcation_value']

        log.debug('X[%s].value_counts(): %s' %
                  (self._hyperparams['bifurcator'],
                   X[self._hyperparams['bifurcator']].value_counts()))
        log.debug('true_mask.value_counts(): %s' % true_mask.value_counts())
        log.debug('false_mask.value_counts(): %s' % false_mask.value_counts())

        return true_mask, false_mask

    def description(self):
        args = (self._hyperparams['algorithm'].upper().replace('-', '_'),
                self._build_bifurcation_str(), self._sc_true.description(),
                self._sc_false.description())
        return 'BIFURCATED_%s(%s, true=%s, false=%s)' % args

    def hyperparams(self):
        hyperparams = {
            'model_true': self._sc_true.hyperparams(),
            'model_false': self._sc_false.hyperparams()
        }
        return hyperparams

    def params(self):
        params = {
            'bifurcator': self._hyperparams['bifurcator'],
            'bifurcation_strategy': self._hyperparams['bifurcation_strategy'],
            'bifurcation_value': self._hyperparams['bifurcation_value'],
            'model_true': self._sc_true.description(),
            'model_false': self._sc_false.description()
        }
        return params
    def train(self, X_train, y_train):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_train)

        # Train sc_true.
        X_train_true = X_train[true_mask]
        y_train_true = y_train[true_mask]
        status_true = self._sc_true.train(X_train_true, y_train_true)
        if status_true == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_true

        # Train sc_false.
        X_train_false = X_train[false_mask]
        y_train_false = y_train[false_mask]
        status_false = self._sc_false.train(X_train_false, y_train_false)
        if status_false == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_false

        return SupervisedClassifier.TRAINED

    def _stitch_disjoint_row(self, row):
        if pd.isnull(row['y_pred_true']):
            val = row['y_pred_false']
        else:
            val = row['y_pred_true']
        return val

    def _stitch_prob_0(self, row):
        if pd.isnull(row['y_pred_prob_true_0']):
            val = row['y_pred_prob_false_0']
        else:
            val = row['y_pred_prob_true_0']
        return val

    def _stitch_prob_1(self, row):
        if pd.isnull(row['y_pred_prob_true_1']):
            val = row['y_pred_prob_false_1']
        else:
            val = row['y_pred_prob_true_1']
        return val

    def _predict_label_or_probability(self, X_test, probability=None):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        if probability:
            y_pred_true = self._sc_true.predict_probability(X_test_true)
        else:
            y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        if probability:
            y_pred_false = self._sc_false.predict_probability(X_test_false)
        else:
            y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        if probability:
            column_names = ['y_pred_true_0', 'y_pred_true_1']
        else:
            column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                                   columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)
        if probability:
            column_names = ['y_pred_false_0', 'y_pred_false_1']
        else:
            column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(
            y_pred_false_df, how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)
        y_pred = mask_plus_true_plus_false['y_pred'].values

        return y_pred

    def predict(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                                   columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)
        column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(
            y_pred_false_df, how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)
        y_pred = mask_plus_true_plus_false['y_pred'].values

        return y_pred

    def predict_probability(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_prob_true = self._sc_true.predict_probability(X_test_true)
        log.debug('y_pred_prob_true: %s' % y_pred_prob_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_prob_false = self._sc_false.predict_probability(X_test_false)
        log.debug('y_pred_prob_false: %s' % y_pred_prob_false)

        # Stitch results.
        column_names = ['y_pred_prob_true_0', 'y_pred_prob_true_1']
        y_pred_prob_true_df = DataFrame(y_pred_prob_true,
                                        index=X_test_true.index,
                                        columns=column_names)
        log.debug('y_pred_prob_true_df: %s' % y_pred_prob_true_df)
        column_names = ['y_pred_prob_false_0', 'y_pred_prob_false_1']
        y_pred_prob_false_df = DataFrame(y_pred_prob_false,
                                         index=X_test_false.index,
                                         columns=column_names)
        log.debug('y_pred_prob_false_df: %s' % y_pred_prob_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_prob_true_df, how='left',
                                            left_index=True, right_index=True)
        composite = mask_plus_true.merge(y_pred_prob_false_df, how='left',
                                         left_index=True, right_index=True)
        composite['y_pred_prob_0'] = composite.apply(self._stitch_prob_0, axis=1)
        composite['y_pred_prob_1'] = composite.apply(self._stitch_prob_1, axis=1)
        log.debug('composite: %s' % composite)
        y_pred_prob = composite[['y_pred_prob_0', 'y_pred_prob_1']].values
        log.debug(y_pred_prob)

        return y_pred_prob
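# A minimal usage sketch grounded in the hyperparameter keys the class
# reads ('bifurcator', 'bifurcation_strategy', 'bifurcation_value', plus
# the keys passed through to the two inner SupervisedClassifiers). The
# bifurcator column name below is an illustrative placeholder:
hyperparams = {
    'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
    'bifurcator': 'Male.pre',  # column whose value routes each row
    'bifurcation_strategy': BifurcatedSupervisedClassifier.EQUAL,
    'bifurcation_value': 1,
    'random_state': 123456789
}
bsc = BifurcatedSupervisedClassifier([0, 1], hyperparams)
if bsc.train(X_train, y_train) == SupervisedClassifier.TRAINED:
    y_pred = bsc.predict(X_test)                   # one model per subpopulation
    y_pred_prob = bsc.predict_probability(X_test)  # shape (n_samples, 2)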
def run_one_lab_local(lab, lab_type, data_source, version, random_state=0):
    '''
    Input:
    :return:
    '''
    # X_train_raw, y_train = [[1], [2]], [1, 2]
    # X_test_raw, y_test = [[3], [4]], [3, 4]

    file_organizer = syst.FileOrganizerLocal(lab=lab, lab_type=lab_type,
                                             data_source=data_source,
                                             version=version)
    raw_matrix = file_organizer.get_raw_matrix()
    y_label = 'all_components_normal'

    # TODO: later, split on pat_ids.
    raw_matrix_train, raw_matrix_test = Utils.split_rows(raw_matrix)

    patIds_train = raw_matrix_train['pat_id'].values.tolist()

    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=y_label)

    redundant_features = ['proc_code', 'num_components',
                          'num_normal_components', 'abnormal_panel']
    id_features = ['pat_id', 'order_proc_id', 'order_time']
    numeric_features = X_train_raw.columns[~X_train_raw.columns.isin(
        [y_label] + redundant_features + id_features)]

    # Check that the remaining features are all numeric.
    assert X_train_raw[numeric_features].select_dtypes(
        exclude=['object']).shape == X_train_raw[numeric_features].shape

    features_by_type = {'redundant_features': redundant_features,
                        'id_features': id_features,
                        'numeric_features': numeric_features,
                        'y_label': y_label}

    # (1) Feature Impute: imputation of some numerical values depends on
    #     prior stats of the same patient, so certain auxiliary columns are
    #     still useful.
    # (2) Feature Remove: remove auxiliary columns.
    # (3) Feature Selection: only select from numerical columns.
    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[
            ('impute_features', Cls.FeatureImputer()),
            ('remove_features', Cls.FeatureRemover(
                features_to_remove=Config.features_to_remove)),
            ('select_features', Cls.Select_Features(
                random_state=random_state, features_by_type=features_by_type))
        ]
    )
    # feature_engineering_pipeline.set_params()
    X_train_processed = feature_processing_pipeline.fit_transform(X_train_raw,
                                                                  y_train)

    hyperparams = {}
    hyperparams['algorithm'] = 'random-forest'
    predictor = SupervisedClassifier(classes=[0, 1], hyperparams=hyperparams)
    # Automatically takes care of tuning hyperparameters via stochastic search.
    status = predictor.train(X_train_processed, column_or_1d(y_train),
                             groups=patIds_train)
    logging.info('status: %s' % status)  # logging.INFO is a level, not a function

    # Test set.
    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=y_label)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    # predict_probability returns an (n_samples, 2) array; keep the
    # positive-class column, as in the run() pipeline above.
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath(alg=hyperparams['algorithm']))
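# The groups=patIds_train argument above is what keeps hyperparameter tuning
# honest: all rows from one patient should land in the same CV fold, so
# tuning scores aren't inflated by patient-level leakage. A standalone sketch
# of the same idea with plain scikit-learn (illustrative, not this repo's API):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_score

cv = GroupKFold(n_splits=5)
scores = cross_val_score(RandomForestClassifier(random_state=0),
                         X_train_processed, column_or_1d(y_train),
                         groups=patIds_train, cv=cv, scoring='roc_auc')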