Example 1
def train_ml_model(X_train, y_train, alg, groups, output_folderpath,
                   random_state):
    hyperparams = {}
    hyperparams['algorithm'] = alg
    hyperparams['random_state'] = random_state

    ml_classifier = SupervisedClassifier(classes=[0, 1],
                                         hyperparams=hyperparams)

    status = ml_classifier.train(X_train, y_train, groups=groups)
    return ml_classifier
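
A rough usage sketch for the helper above (the toy DataFrame, group labels, and output folder are illustrative assumptions, not values from the original code):

# Hedged usage sketch for train_ml_model; toy data and paths are placeholders.
import pandas as pd

X_train = pd.DataFrame({'x1': [0.1, 0.4, 0.3, 0.9], 'x2': [1, 0, 1, 0]})
y_train = pd.Series([0, 1, 0, 1])
groups = ['patA', 'patA', 'patB', 'patB']  # e.g. one group label per patient

classifier = train_ml_model(X_train, y_train, alg='random-forest',
                            groups=groups,
                            output_folderpath='/tmp/model_out',
                            random_state=123456789)
y_pred = classifier.predict(X_train)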
Example 2
    def __init__(self, classes, hyperparams):
        if hyperparams[
                'bifurcation_strategy'] not in BifurcatedSupervisedClassifier.SUPPORTED_BIFURCATION_STRATEGIES:
            raise ValueError('Bifurcation strategy %s not supported.' %
                             hyperparams['bifurcation_strategy'])

        self._classes = classes
        self._hyperparams = hyperparams

        # Note that if we don't pass copies of hyperparams, then we won't
        # be able to change hyperparams independently in the two classifiers.
        self._sc_true = SupervisedClassifier(classes, hyperparams.copy())
        self._sc_false = SupervisedClassifier(classes, hyperparams.copy())
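
The .copy() calls above matter because Python passes the same dict object by reference; a minimal, self-contained illustration of the aliasing pitfall (not taken from the original code):

# Without a copy, two "independent" hyperparam dicts are actually one object.
shared = {'max_depth': 2}
alias = shared               # no copy: both names point to the same dict
independent = shared.copy()  # shallow copy: separate top-level dict

alias['max_depth'] = 10
print(shared['max_depth'])       # 10 -- the alias mutated the shared dict
print(independent['max_depth'])  # 2  -- the copy is unaffected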
Example 3
    def test_train_and_predict(self):
        # Load data set.
        X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                      columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'])
        y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
        random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
        expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
        expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
        expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
        expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
        expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        # Iterate through SUPPORTED_ALGORITHMS.
        for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
            log.info('Testing %s classifier...' % algorithm)
            # Train model.
            hyperparams = {'algorithm': algorithm, 'random_state': random_state}
            # Default to stochastic search for expensive algorithms.
            if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
                # Test ability to force hyperparam values.
                hyperparams['max_depth'] = 2
                hyperparams['n_estimators'] = 5
                hyperparams['min_samples_leaf'] = 1
                hyperparams['min_samples_split'] = 0.2
            else:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            classifier = SupervisedClassifier([0, 1], hyperparams)
            classifier.train(X_train, y_train)

            # Test str().
            expected_str = expected_str_by_algorithm[algorithm]
            actual_str = str(classifier)
            self.assertEqual(expected_str, actual_str)

            # Test hyperparameters.
            expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
            actual_hyperparams = classifier.hyperparams()
            self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

            # Test model parameters.
            expected_params = expected_params_by_algorithm[algorithm]
            actual_params = classifier.params()
            self.assertEqualDict(expected_params, actual_params)

            # Test model description.
            expected_description = expected_descriptions_by_algorithm[algorithm]
            actual_description = classifier.description()
            self.assertEqual(expected_description, actual_description)

            # Test prediction values.
            expected_y_pred = expected_y_pred_by_algorithm[algorithm]
            log.debug('expected_y_pred: %s' % expected_y_pred)
            actual_y_pred = classifier.predict(X_test)
            log.debug('actual_y_pred: %s' % actual_y_pred)
            self.assertEqualList(expected_y_pred, actual_y_pred)
Example 4
    def run(self):
        file_organizer = Syst.FileOrganizerLocal(
            working_folderpath=self.working_folderpath)

        raw_matrix_train, raw_matrix_test = Utils.split_rows(self.input_matrix)

        X_train_raw, y_train = Utils.split_Xy(raw_matrix_train,
                                              ylabel=self.ylabel)

        feature_processing_pipeline = Pipeline(
            memory=None,  # file_organizer.cached_pipeline_filepath,
            steps=[('impute_features', Clas.FeatureImputer()),
                   ('remove_features', Clas.FeatureRemover()),
                   ('select_features', Clas.Select_Features())])
        X_train_processed = feature_processing_pipeline.fit_transform(
            X_train_raw, y_train)

        predictor = SupervisedClassifier(
            classes=[0, 1],
            hyperparams={
                'algorithm': 'random-forest',
                'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
                'max_iter': 1024
            })

        status = predictor.train(X_train_processed, column_or_1d(y_train))

        X_test_raw, y_test = Utils.split_Xy(raw_matrix_test,
                                            ylabel=self.ylabel)
        X_test_processed = feature_processing_pipeline.transform(X_test_raw)
        y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

        res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
        res_df.to_csv(file_organizer.get_output_filepath())
        '''TODO'''
        from scripts.LabTestAnalysis.lab_statistics.stats_utils import get_confusion_metrics
        from sklearn.metrics import roc_auc_score

        AUC = roc_auc_score(y_test, y_test_pred_proba)

        sensitivity, specificity, LR_p, LR_n, PPV, NPV = get_confusion_metrics(
            actual_labels=y_test.values,
            predict_probas=y_test_pred_proba,
            threshold=0.5)
        print("AUC: %s, sensitivity: %s, specificity: %s, LR_p: %s, LR_n: %s, PPV: %s, NPV: %s:. " \
                % (AUC, sensitivity, specificity, LR_p, LR_n, PPV, NPV))
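
For reference, the metrics printed above can all be derived from a thresholded confusion matrix; the sketch below is an assumption about what get_confusion_metrics likely computes, not its actual implementation:

# Hedged sketch: derive sensitivity/specificity/LRs/PPV/NPV from a confusion
# matrix at a probability threshold (assumed behavior of get_confusion_metrics).
import numpy as np
from sklearn.metrics import confusion_matrix

def confusion_metrics_sketch(actual_labels, predict_probas, threshold=0.5):
    y_pred = (np.asarray(predict_probas) >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(actual_labels, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / float(tp + fn)         # true positive rate (recall)
    specificity = tn / float(tn + fp)         # true negative rate
    lr_p = sensitivity / (1.0 - specificity)  # positive likelihood ratio
    lr_n = (1.0 - sensitivity) / specificity  # negative likelihood ratio
    ppv = tp / float(tp + fp)                 # positive predictive value
    npv = tn / float(tn + fn)                 # negative predictive value
    return sensitivity, specificity, lr_p, lr_n, ppv, npv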
Example 5
    def setUp(self):
        log.level = logging.ERROR
        # Use simple classifier and test case for testing non-ROC analyses.
        X = RANDOM_10_TEST_CASE['X']
        y = RANDOM_10_TEST_CASE['y']
        self._list_classifier = ListPredictor([0, 1])
        self._lc_analyzer = ClassifierAnalyzer(self._list_classifier, X, y)

        # Use ml classifier and complex test case.
        X = RANDOM_100_TEST_CASE['X']
        y = RANDOM_100_TEST_CASE['y']
        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=123456789)
        # Train logistic regression model.
        hyperparams = {
            'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
            'random_state': 123456789
        }
        self._ml_classifier = SupervisedClassifier([0, 1], hyperparams)
        self._ml_classifier.train(X_train, column_or_1d(y_train))
        self._ml_analyzer = ClassifierAnalyzer(self._ml_classifier, X_test,
                                               y_test)
Example 6
    def test_init(self):
        # Test unspecified algorithm.
        classifier = SupervisedClassifier([0, 1])
        self.assertEqual(classifier.algorithm(),
                         SupervisedClassifier.LOGISTIC_REGRESSION)

        # Test unsupported algorithm.
        with self.assertRaises(ValueError):
            hyperparams = {'algorithm': 'foo'}
            SupervisedClassifier([0, 1], hyperparams)

        # Confirm specified algorithm selection.
        hyperparams = {'algorithm': SupervisedClassifier.DECISION_TREE}
        classifier = SupervisedClassifier([0, 1], hyperparams)
        self.assertEqual(classifier.algorithm(), SupervisedClassifier.DECISION_TREE)
Example 7
class TestClassifierAnalyzer(MedInfoTestCase):
    def setUp(self):
        log.level = logging.ERROR
        # Use simple classifier and test case for testing non-ROC analyses.
        X = RANDOM_10_TEST_CASE['X']
        y = RANDOM_10_TEST_CASE['y']
        self._list_classifier = ListPredictor([0, 1])
        self._lc_analyzer = ClassifierAnalyzer(self._list_classifier, X, y)

        # Use ml classifier and complex test case.
        X = RANDOM_100_TEST_CASE['X']
        y = RANDOM_100_TEST_CASE['y']
        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=123456789)
        # Train logistic regression model.
        hyperparams = {
            'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
            'random_state': 123456789
        }
        self._ml_classifier = SupervisedClassifier([0, 1], hyperparams)
        self._ml_classifier.train(X_train, column_or_1d(y_train))
        self._ml_analyzer = ClassifierAnalyzer(self._ml_classifier, X_test,
                                               y_test)

    def tearDown(self):
        test_dir = os.path.dirname(os.path.abspath(__file__))
        # Clean up the actual report file.
        try:
            actual_report_name = 'actual-list-classifier.report'
            actual_report_path = '/'.join([test_dir, actual_report_name])
            os.remove(actual_report_path)
        except OSError:
            pass

        # Clean up the actual precision-recall plot.
        try:
            actual_plot_name = 'actual-precision-recall-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass

        # Clean up the actual roc plot.
        try:
            actual_plot_name = 'actual-roc-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass

        # Clean up the actual precision at k plot.
        try:
            actual_plot_name = 'actual-precision-at-k-plot.png'
            actual_plot_path = '/'.join([test_dir, actual_plot_name])
            os.remove(actual_plot_path)
        except OSError:
            pass

    def _assert_fuzzy_equality(self, expected, actual):
        # Compare on absolute relative difference so under-estimates also fail.
        abs_diff = abs(actual - expected)
        rel_diff = abs_diff / expected
        self.assertTrue(rel_diff < 0.1)

    def test_score_accuracy(self):
        # Test accuracy.
        expected_accuracy = RANDOM_10_TEST_CASE['accuracy']
        actual_accuracy = self._lc_analyzer.score()
        self.assertEqual(expected_accuracy, actual_accuracy)

        # Test accuracy.
        expected_accuracy = RANDOM_100_TEST_CASE['accuracy']
        actual_accuracy = self._ml_analyzer.score()
        self.assertEqual(expected_accuracy, actual_accuracy)

        # Test bootstrapped CIs.
        actual_accuracy, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            ci=0.95, n_bootstrap_iter=1000)
        self.assertEqual(expected_accuracy, actual_accuracy)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['accuracy']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['accuracy']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_recall(self):
        # Test recall.
        expected_recall = RANDOM_10_TEST_CASE['recall']
        actual_recall = self._lc_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE)
        self.assertEqual(expected_recall, actual_recall)

        # Test recall.
        expected_recall = RANDOM_100_TEST_CASE['recall']
        actual_recall = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE)
        self.assertEqual(expected_recall, actual_recall)

        # Test bootstrapped CIs.
        actual_recall, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.RECALL_SCORE,
            ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_recall, actual_recall)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['recall']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['recall']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_precision(self):
        # Test precision.
        expected_precision = RANDOM_10_TEST_CASE['precision']
        actual_precision = self._lc_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE)
        self.assertEqual(expected_precision, actual_precision)

        # Test precision.
        expected_precision = RANDOM_100_TEST_CASE['precision']
        actual_precision = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE)
        self.assertEqual(expected_precision, actual_precision)

        # Test bootstrapped CIs.
        actual_precision, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_SCORE,
            ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_precision, actual_precision)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['precision']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['precision']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_f1(self):
        # Test F1 score.
        expected_f1 = RANDOM_10_TEST_CASE['f1']
        actual_f1 = self._lc_analyzer.score(metric=ClassifierAnalyzer.F1_SCORE)
        self.assertEqual(expected_f1, actual_f1)

        # Test f1.
        expected_f1 = RANDOM_100_TEST_CASE['f1']
        actual_f1 = self._ml_analyzer.score(metric=ClassifierAnalyzer.F1_SCORE)
        self.assertEqual(expected_f1, actual_f1)

        # Test bootstrapped CIs.
        actual_f1, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.F1_SCORE, ci=0.95, n_bootstrap_iter=1000)
        self.assertEqual(expected_f1, actual_f1)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['f1']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['f1']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_average_precision(self):
        # Test average precision.
        expected_average_precision = RANDOM_100_TEST_CASE['average_precision']
        actual_average_precision = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.AVERAGE_PRECISION_SCORE)
        self.assertEqual(expected_average_precision, actual_average_precision)

        # Test bootstrapped CIs.
        actual_average_precision, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.AVERAGE_PRECISION_SCORE,
            ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_average_precision, actual_average_precision)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['average_precision'][
            'lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['average_precision'][
            'upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_roc_auc(self):
        # Test roc_auc.
        expected_roc_auc = RANDOM_100_TEST_CASE['roc_auc']
        actual_roc_auc = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.ROC_AUC_SCORE)
        self.assertEqual(expected_roc_auc, actual_roc_auc)

        # Test bootstrapped CIs.
        actual_roc_auc, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.ROC_AUC_SCORE,
            ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_roc_auc, actual_roc_auc)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['roc_auc']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['roc_auc']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_precision_at_k(self):
        # Test precision at K.
        prev_precision = 1.0
        for k in range(1, 20):
            actual_precision_at_k = self._ml_analyzer.score(
                metric=ClassifierAnalyzer.PRECISION_AT_K_SCORE, k=k)
            expected_precision_at_k = RANDOM_100_TEST_CASE['precision_at_k'][k]
            self.assertEqual(expected_precision_at_k, actual_precision_at_k)

        # Test bootstrapped CIs.
        actual_precision_at_k, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PRECISION_AT_K_SCORE,
            k=10,
            ci=0.95,
            n_bootstrap_iter=1000)
        expected_precision_at_k = RANDOM_100_TEST_CASE['precision_at_k'][10]
        self.assertEqual(expected_precision_at_k, actual_precision_at_k)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci']['precision_at_k'][
            'lower'][10]
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci']['precision_at_k'][
            'upper'][10]
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_score_percent_predictably_positive(self):
        # Test percent predictably positive.
        expected_ppp = RANDOM_100_TEST_CASE['percent_predictably_positive']
        actual_ppp = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PERCENT_PREDICTABLY_POSITIVE)
        self.assertEqual(expected_ppp, actual_ppp)

        # Test bootstrapped CIs.
        actual_ppp, actual_lower_ci, actual_upper_ci = self._ml_analyzer.score(
            metric=ClassifierAnalyzer.PERCENT_PREDICTABLY_POSITIVE,
            ci=0.95,
            n_bootstrap_iter=1000)
        self.assertEqual(expected_ppp, actual_ppp)
        expected_lower_ci = RANDOM_100_TEST_CASE['ci'][
            'percent_predictably_positive']['lower']
        self.assertEqual(expected_lower_ci, actual_lower_ci)
        expected_upper_ci = RANDOM_100_TEST_CASE['ci'][
            'percent_predictably_positive']['upper']
        self.assertEqual(expected_upper_ci, actual_upper_ci)

    def test_plot_precision_recall_curve(self):
        # Compute precision-recall curve.
        precision_recall_curve = self._ml_analyzer.compute_precision_recall_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-precision-recall-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])

        self._ml_analyzer.plot_precision_recall_curve('Precision-Recall Curve',
                                                      actual_plot_path)

        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_plot_roc_curve(self):
        # Compute ROC curve.
        roc_curve = self._ml_analyzer.compute_roc_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-roc-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])

        self._ml_analyzer.plot_roc_curve('ROC', actual_plot_path)

        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_plot_precision_at_k_curve(self):
        # Compute precision at k curve.
        k_vals, precision_vals = self._ml_analyzer.compute_precision_at_k_curve()

        # Build paths for expected and actual plots.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_plot_name = 'actual-precision-at-k-plot.png'
        actual_plot_path = '/'.join([test_dir, actual_plot_name])

        self._ml_analyzer.plot_precision_at_k_curve('Precision at K',
                                                    actual_plot_path)

        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)

    def test_build_report(self):
        # Build report.
        expected_report = RANDOM_100_TEST_CASE['report']
        actual_report = self._ml_analyzer.build_report()[0]
        log.debug('expected_report: %s' % expected_report)
        log.debug('actual_report: %s' % actual_report)
        assert_frame_equal(expected_report, actual_report)

        # Build bootstrapped report.
        expected_report = RANDOM_100_TEST_CASE['ci']['report']
        actual_report = self._ml_analyzer.build_report(ci=0.95)[0]
        assert_frame_equal(expected_report, actual_report)

        # Build paths for expected and actual report.
        test_dir = os.path.dirname(os.path.abspath(__file__))
        actual_report_name = 'actual-list-classifier.report'
        actual_report_path = '/'.join([test_dir, actual_report_name])

        # Write the report.
        self._ml_analyzer.write_report(actual_report_path)

        # Not sure how to validate this at the moment, so just validate
        # that it actually passes.
        self.assertTrue(True)
Example 8
    def _train_predictor(self):
        self._predictor = SupervisedClassifier(
            algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
        self._predictor.train(self._X_train, column_or_1d(self._y_train))
Example 9
class ConditionMortalityPredictor:
    def __init__(self, condition, num_patients, icd_list=None, use_cache=None):
        self._condition = condition
        self._num_patients = num_patients
        self._icd_list = icd_list

        self._FEATURES_TO_REMOVE = [
            'index_time', 'death_date', 'Death.post', 'Death.postTimeDays',
            'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays'
        ]
        self._eliminated_features = list()

        self._build_cmm_names()
        if use_cache is None:
            self._build_raw_feature_matrix()
        print('Processing raw feature matrix...')
        self._process_raw_feature_matrix()
        print('Training predictor...')
        self._train_predictor()
        print('Testing predictor...')
        self._test_predictor()

    def _build_cmm_names(self):
        slugified_condition = "-".join(self._condition.split())
        self._build_cmm_name_raw(slugified_condition, self._num_patients)
        self._build_cmm_name_processed(slugified_condition, self._num_patients)

    def _build_cmm_name_raw(self, slugified_condition, num_patients):
        template = '%s-mortality-matrix-%d-pat-raw.tab'
        self._cmm_name_raw = template % (slugified_condition, num_patients)

    def _build_cmm_name_processed(self, slugified_condition, num_patients):
        template = '%s-mortality-matrix-%d-pat-processed.tab'
        self._cmm_name_processed = template % (slugified_condition,
                                               num_patients)

    def _build_raw_feature_matrix(self):
        self._cmm = ConditionMortalityMatrix(self._condition, \
            self._num_patients, self._cmm_name_raw, self._icd_list)

    def _process_raw_feature_matrix(self):
        # Read raw CMM.
        self._fm_io = FeatureMatrixIO()
        print('Reading raw matrix...')
        self._cmm_raw = self._fm_io.read_file_to_data_frame(self._cmm_name_raw)

        # Add and remove features to _cmm_processed.
        self._fmt = FeatureMatrixTransform()
        self._fmt.set_input_matrix(self._cmm_raw)
        print('Adding features...')
        self._add_features()
        print('Imputing data...')
        self._impute_data()
        self._remove_features()
        self._fmt.drop_duplicate_rows()
        self._cmm_processed = self._fmt.fetch_matrix()

        # Divide _cmm_processed into training and test data.
        # This must happen before feature selection so that we don't
        # accidentally learn information from the test data.
        self._train_test_split()
        print('Selecting features...')
        self._select_features()

        # Write output to new matrix.
        train = self._y_train.join(self._X_train)
        test = self._y_test.join(self._X_test)
        self._cmm_processed = train.append(test)

        header = self._build_processed_matrix_header()

        self._fm_io.write_data_frame_to_file(self._cmm_processed,
                                             self._cmm_name_processed, header)

    def _build_processed_matrix_header(self):
        # FeatureMatrixFactory and FeatureMatrixIO expect a list of strings.
        # Each comment below shows the corresponding line of the header.
        header = list()

        # <file_name.tab>
        file_name = self._cmm_name_processed
        header.append(file_name)
        # Created: <timestamp>
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        header.append('Created: %s' % timestamp)
        # Source: __name__
        header.append('Source: %s' % __name__)
        # Command: ConditionMortalityMatrix()
        if self._icd_list:
            command = 'ConditionMortalityPredictor(%s, %s, %s)' % \
                (self._condition, self._num_patients, self._icd_list)
        else:
            command = 'ConditionMortalityPredictor(%s, %s)' % \
                (self._condition, self._num_patients)
        header.append('Command: %s' % command)
        #
        header.append('')
        # Overview:
        header.append('Overview:')
        # This file is a processed version of ___.
        line = 'This file is a post-processed version of %s.' % self._cmm_name_raw
        header.append(line)
        # The outcome label is ___, which is a boolean indicator
        line = 'The outcome label is I(0<=Death.postTimeDays<=28), which is a boolean indicator'
        header.append(line)
        # for whether the patient given by pat_id passed away within 28 days
        line = 'for whether the patient given by pat_id passed away within 28 days'
        header.append(line)
        # of the time index represented by a given row.
        line = 'of the time index represented by a given row.'
        header.append(line)
        # This matrix is the result of the following processing steps on the raw matrix:
        line = 'This matrix is the result of the following processing steps on the raw matrix:'
        header.append(line)
        #   (1) Imputing missing values with the mean value of each column.
        line = '  (1) Imputing missing values with the mean value of each column.'
        header.append(line)
        #   (2) Manually removing low-information features:
        line = '  (2) Manually removing low-information features:'
        header.append(line)
        #       ___
        line = '      %s' % str(self._FEATURES_TO_REMOVE)
        header.append(line)
        #   (3) Algorithmically selecting the top 100 features via recursive feature elimination.
        line = '  (3) Algorithmically selecting the top 100 features via recursive feature elimination.'
        header.append(line)
        #       The following features were eliminated.
        line = '      The following features were eliminated:'
        header.append(line)
        # List all features with rank >100.
        line = '        %s' % str(self._eliminated_features)
        header.append(line)
        #
        line = ''
        header.append(line)
        # Each row represents a decision point (proxied by clinical order).
        line = 'Each row represents a decision point (proxied by clinical order).'
        header.append(line)
        # Each row contains fields summarizing the patient's demographics,
        line = "Each row contains fields summarizing the patient's demographics"
        header.append(line)
        # inpatient admit date, prior vitals, and prior lab results.
        line = 'inpatient admit date, prior vitals, and prior lab results.'
        header.append(line)
        # Most cells in matrix represent a count statistic for an event's
        line = "Most cells in matrix represent a count statistic for an event's"
        header.append(line)
        # occurrence or a difference between an event's time and index_time.
        line = "occurrence or a difference between an event's time and index_time."
        header.append(line)
        #
        header.append('')
        # Fields:
        header.append('Fields:')
        #   pat_id - ID # for patient in the STRIDE data set.
        header.append('  pat_id - ID # for patient in the STRIDE data set.')
        #   index_time - time at which clinical decision was made.
        header.append(
            '  index_time - time at which clinical decision was made.')
        #   death_date - if patient died, date on which they died.
        header.append(
            '  death_date - if patient died, date on which they died.')
        #   AdmitDxDate.[clinical_item] - admit diagnosis, pegged to admit date.
        header.append(
            '  AdmitDxDate.[clinical_item] - admit diagnosis, pegged to admit date.'
        )
        #   Birth.preTimeDays - patient's age in days.
        header.append("  Birth.preTimeDays - patient's age in days.")
        #   [Male|Female].pre - is patient male/female (binary)?
        header.append('  [Male|Female].pre - is patient male/female (binary)?')
        #   [RaceX].pre - is patient race [X]?
        header.append('  [RaceX].pre - is patient race [X]?')
        #   Team.[specialty].[clinical_item] - specialist added to treatment team.
        header.append(
            '  Team.[specialty].[clinical_item] - specialist added to treatment team.'
        )
        #   Comorbidity.[disease].[clinical_item] - disease added to problem list.
        header.append(
            '  Comorbidity.[disease].[clinical_item] - disease added to problem list.'
        )
        #   ___.[flowsheet] - measurements for flowsheet biometrics.
        header.append(
            '  ___.[flowsheet] - measurements for flowsheet biometrics.')
        #       Includes BP_High_Systolic, BP_Low_Diastolic, FiO2,
        header.append('    Includes BP_High_Systolic, BP_Low_Diastolic, FiO2,')
        #           Glasgow Coma Scale Score, Pulse, Resp, Temp, and Urine.
        header.append(
            '      Glasgow Coma Scale Score, Pulse, Resp, Temp, and Urine.')
        #   ___.[lab_result] - lab component results.
        header.append('  ___.[lab_result] - lab component results.')
        #       Included standard components: WBC, HCT, PLT, NA, K, CO2, BUN,
        header.append(
            '    Included standard components: WBC, HCT, PLT, NA, K, CO2, BUN,'
        )
        #           CR, TBIL, ALB, CA, LAC, ESR, CRP, TNI, PHA, PO2A, PCO2A,
        header.append(
            '      CR, TBIL, ALB, CA, LAC, ESR, CRP, TNI, PHA, PO2A, PCO2A,')
        #           PHV, PO2V, PCO2V
        header.append('      PHV, PO2V, PCO2V')
        #
        header.append('')
        #   [clinical_item] fields may have the following suffixes:
        header.append(
            '  [clinical_item] fields may have the following suffixes:')
        #       ___.pre - how many times has this occurred before order_time?
        header.append(
            '    ___.pre - how many times has this occurred before order_time?'
        )
        #       ___.pre.Xd - how many times has this occurred within X days before index_time?
        header.append(
            '    ___.pre.Xd - how many times has this occurred within X days before index_time?'
        )
        #       ___.preTimeDays - how many days before order_time was last occurrence?
        header.append(
            '    ___.preTimeDays - how many days before order_time was last occurrence?'
        )
        #
        header.append('')
        #   [flowsheet] and [lab_result] fields may have the following suffixes:
        header.append(
            '  [flowsheet] and [lab_result] fields may have the following suffixes:'
        )
        #       ___.X_Y.count - # of result values between X and Y days of index_time.
        header.append(
            '    ___.X_Y.count - # of result values between X and Y days of index_time.'
        )
        #       ___.X_Y.countInRange - # of result values in normal range.
        header.append(
            '    ___.X_Y.countInRange - # of result values in normal range.')
        #       ___.X_Y.min - minimum result value.
        header.append('    ___.X_Y.min - minimum result value.')
        #       ___.X_Y.max - maximum result value.
        header.append('    ___.X_Y.max - maximum result value.')
        #       ___.X_Y.median - median result value.
        header.append('    ___.X_Y.median - median result value.')
        #       ___.X_Y.std - standard deviation of result values.
        header.append('    ___.X_Y.std - standard deviation of result values.')
        #       ___.X_Y.first - first result value.
        header.append('    ___.X_Y.first - first result value.')
        #       ___.X_Y.last - last result value.
        header.append('    ___.X_Y.last - last result value.')
        #       ___.X_Y.diff - difference between penultimate and proximate values.
        header.append(
            '    ___.X_Y.diff - difference between penultimate and proximate values.'
        )
        #       ___.X_Y.slope - slope between penultimate and proximate values.
        header.append(
            '    ___.X_Y.slope - slope between penultimate and proximate values.'
        )
        #       ___.X_Y.proximate - closest result value to order_time.
        header.append(
            '    ___.X_Y.proximate - closest result value to order_time.')
        #       ___.X_Y.firstTimeDays - time between first and order_time.
        header.append(
            '    ___.X_Y.firstTimeDays - time between first and order_time.')
        #       ___.X_Y.lastTimeDays - time between last and order_time.
        header.append(
            '    ___.X_Y.lastTimeDays - time between last and order_time.')
        #       ___.X_Y.proximateTimeDays - time between proximate and order_time.
        header.append(
            '    ___.X_Y.proximateTimeDays - time between proximate and order_time.'
        )

        return header

    def _train_predictor(self):
        self._predictor = SupervisedClassifier(
            algorithm=SupervisedClassifier.REGRESS_AND_ROUND)
        self._predictor.train(self._X_train, column_or_1d(self._y_train))

    def _train_test_split(self):
        y = pd.DataFrame(
            self._cmm_processed.pop('I(0<=Death.postTimeDays<=28)'))
        # Without this line, sklearn complains about the format of y.
        # "DataConversionWarning: A column-vector y was passed when a 1d array
        #   was expected. Please change the shape of y to (n_samples, ), for
        #   example using ravel()."
        # Note that this turns y into a numpy array, so need to cast back.
        # y = y.values.ravel()
        X = self._cmm_processed
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
            X, y, shuffle=False)

    def _impute_data(self):
        # Impute missing values with mean value.
        for feature in self._cmm_raw.columns.values:
            if feature in self._FEATURES_TO_REMOVE:
                continue
            # If all values are null, just remove the feature.
            # Otherwise, imputation will fail (there's no mean value),
            # and sklearn will ragequit.
            if self._cmm_raw[feature].isnull().all():
                self._fmt.remove_feature(feature)
                self._eliminated_features.append(feature)
            # Only try to impute if some of the values are null.
            elif self._cmm_raw[feature].isnull().any():
                # TODO(sbala): Impute all time features with non-mean value.
                self._fmt.impute(feature)

    def _add_features(self):
        # Add threshold feature indicating whether death date
        # is within 28 days of index time.
        self._fmt.add_threshold_feature('Death.postTimeDays',
                                        lower_bound=0,
                                        upper_bound=28)

    def _remove_features(self):
        # Prune obviously unhelpful fields.
        # In theory, FeatureSelector should be able to prune these, but no
        # reason not to help it out a little bit.
        for feature in self._FEATURES_TO_REMOVE:
            self._fmt.remove_feature(feature)

    def _select_features(self):
        # Use FeatureSelector to prune all but 100 variables.
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION, \
            problem=FeatureSelector.CLASSIFICATION)

        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(0.01 * len(self._X_train.columns.values))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        self._feature_ranks = fs.compute_ranks()
        for i in range(len(self._feature_ranks)):
            if self._feature_ranks[i] > num_features_to_select:
                self._eliminated_features.append(self._X_train.columns[i])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)

    def _test_predictor(self):
        self._accuracy = self._predictor.compute_accuracy(
            self._X_test, self._y_test)

    def predict(self, X):
        return self._predictor.predict(X)

    def summarize(self):
        summary_lines = list()

        # Condition: condition
        condition = self._condition
        line = 'Condition: %s' % condition
        summary_lines.append(line)

        # Algorithm: SupervisedClassifier(algorithm)
        algorithm = 'SupervisedClassifier(REGRESS_AND_ROUND)'
        line = 'Algorithm: %s' % algorithm
        summary_lines.append(line)

        # Train/Test Size: training_size, test_size
        training_size = self._X_train.shape[0]
        test_size = self._X_test.shape[0]
        line = 'Train/Test Size: %s/%s' % (training_size, test_size)
        summary_lines.append(line)

        # Model: sig_features
        coefs = self._predictor.coefs()
        cols = self._X_train.columns
        sig_features = [(coefs[cols.get_loc(f)], f) for f in cols.values
                        if coefs[cols.get_loc(f)] > 0]
        linear_model = ' + '.join('%s*%s' % (weight, feature)
                                  for weight, feature in sig_features)
        line = 'Model: logistic(%s)' % linear_model
        summary_lines.append(line)

        # Baseline Episode Mortality: episode_mortality
        counts = self._y_test[self._y_test.columns[0]].value_counts()
        line = 'Baseline Episode Mortality: %s/%s' % (counts[1], test_size)
        summary_lines.append(line)

        # AUC: auc
        auc = self._predictor.compute_roc_auc(self._X_test, self._y_test)
        line = 'AUC: %s' % auc
        summary_lines.append(line)
        # Accuracy: accuracy
        line = 'Accuracy: %s' % self._accuracy
        summary_lines.append(line)

        return '\n'.join(summary_lines)
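
A minimal usage sketch for the class above ('pneumonia' and 1000 are placeholder arguments; the STRIDE database environment the class reads from is assumed to be available):

# Hedged usage sketch; condition name and patient count are placeholders.
predictor = ConditionMortalityPredictor('pneumonia', 1000)
print(predictor.summarize())
# New rows with the same processed feature columns could then be scored with:
# y_pred = predictor.predict(X_new)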
Example 10
class BifurcatedSupervisedClassifier:
    BIFURCATION = 'bifurcation'
    EQUAL = '=='
    LTE = '<='
    GTE = '>='
    SUPPORTED_BIFURCATION_STRATEGIES = [EQUAL, GTE, LTE]

    def __init__(self, classes, hyperparams):
        if hyperparams[
                'bifurcation_strategy'] not in BifurcatedSupervisedClassifier.SUPPORTED_BIFURCATION_STRATEGIES:
            raise ValueError('Bifurcation strategy %s not supported.' %
                             hyperparams['bifurcation_strategy'])

        self._classes = classes
        self._hyperparams = hyperparams

        # Note that if we don't pass copies of hyperparams, then we won't
        # be able to change hyperparams independently in the two classifiers.
        self._sc_true = SupervisedClassifier(classes, hyperparams.copy())
        self._sc_false = SupervisedClassifier(classes, hyperparams.copy())

    def __repr__(self):
        bs = self._build_bifurcation_str()
        classes_str = str(self._classes)
        hyperparams_str = "hyperparams={'algorithm': %s, 'bifurcator': %s, 'bifurcation_strategy': %s, 'bifurcation_value': %s, 'random_state': %s}" % (
            self._hyperparams['algorithm'], self._hyperparams['bifurcator'],
            self._hyperparams['bifurcation_strategy'],
            self._hyperparams['bifurcation_value'],
            self._hyperparams['random_state'])
        s = "BifurcatedSupervisedClassifier(%s, %s)" % (classes_str,
                                                        hyperparams_str)
        return s

    __str__ = __repr__

    def _build_bifurcation_str(self):
        args = (self._hyperparams['bifurcator'],
                self._hyperparams['bifurcation_strategy'],
                self._hyperparams['bifurcation_value'])
        return '%s %s %s' % args

    def fetch_bifurcation_masks(self, X):
        log.debug('bifurcator: %s' % self._hyperparams['bifurcator'])
        log.debug('bifurcation_strategy: %s' %
                  self._hyperparams['bifurcation_strategy'])
        log.debug('bifurcation_value: %s' %
                  self._hyperparams['bifurcation_value'])
        # Compare strategy strings with ==, not "is" (identity is not
        # guaranteed for equal strings).
        if self._hyperparams[
                'bifurcation_strategy'] == BifurcatedSupervisedClassifier.EQUAL:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) == self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) != self._hyperparams['bifurcation_value']
        elif self._hyperparams[
                'bifurcation_strategy'] == BifurcatedSupervisedClassifier.LTE:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) <= self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) > self._hyperparams['bifurcation_value']
        elif self._hyperparams[
                'bifurcation_strategy'] == BifurcatedSupervisedClassifier.GTE:
            true_mask = X[self._hyperparams['bifurcator']].astype(
                float) >= self._hyperparams['bifurcation_value']
            false_mask = X[self._hyperparams['bifurcator']].astype(
                float) < self._hyperparams['bifurcation_value']

        log.debug('X[%s].value_counts(): %s' %
                  (self._hyperparams['bifurcator'],
                   X[self._hyperparams['bifurcator']].value_counts()))
        log.debug('true_mask.value_counts(): %s' % true_mask.value_counts())
        log.debug('false_mask.value_counts(): %s' % false_mask.value_counts())
        return true_mask, false_mask

    def description(self):
        args = (self._hyperparams['algorithm'].upper().replace('-', '_'),
                self._build_bifurcation_str(), self._sc_true.description(),
                self._sc_false.description())
        return 'BIFURCATED_%s(%s, true=%s, false=%s)' % args

    def hyperparams(self):
        hyperparams = {
            'model_true': self._sc_true.hyperparams(),
            'model_false': self._sc_false.hyperparams()
        }
        return hyperparams

    def params(self):
        params = {
            'bifurcator': self._hyperparams['bifurcator'],
            'bifurcation_strategy': self._hyperparams['bifurcation_strategy'],
            'bifurcation_value': self._hyperparams['bifurcation_value'],
            'model_true': self._sc_true.description(),
            'model_false': self._sc_false.description()
        }
        return params

    def train(self, X_train, y_train):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_train)

        # Train sc_true.
        X_train_true = X_train[true_mask]
        y_train_true = y_train[true_mask]
        status_true = self._sc_true.train(X_train_true, y_train_true)
        if status_true == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_true

        # Train sc_false.
        X_train_false = X_train[false_mask]
        y_train_false = y_train[false_mask]
        status_false = self._sc_false.train(X_train_false, y_train_false)
        if status_false == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_false

        return SupervisedClassifier.TRAINED

    def _stitch_disjoint_row(self, row):
        if pd.isnull(row['y_pred_true']):
            val = row['y_pred_false']
        else:
            val = row['y_pred_true']
        return val

    def _stitch_prob_0(self, row):
        if pd.isnull(row['y_pred_prob_true_0']):
            val = row['y_pred_prob_false_0']
        else:
            val = row['y_pred_prob_true_0']
        return val

    def _stitch_prob_1(self, row):
        if pd.isnull(row['y_pred_prob_true_1']):
            val = row['y_pred_prob_false_1']
        else:
            val = row['y_pred_prob_true_1']
        return val

    def _predict_label_or_probability(self, X_test, probability=None):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        if probability:
            y_pred_true = self._sc_true.predict_probability(X_test_true)
        else:
            y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        if probability:
            y_pred_false = self._sc_false.predict_probability(X_test_false)
        else:
            y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results. In probability mode the column names must match the
        # names expected by _stitch_prob_0/_stitch_prob_1.
        if probability:
            true_columns = ['y_pred_prob_true_0', 'y_pred_prob_true_1']
            false_columns = ['y_pred_prob_false_0', 'y_pred_prob_false_1']
        else:
            true_columns = ['y_pred_true']
            false_columns = ['y_pred_false']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                                   columns=true_columns)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                    columns=false_columns)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)
        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                            left_index=True, right_index=True)
        composite = mask_plus_true.merge(y_pred_false_df, how='left',
                                         left_index=True, right_index=True)
        if probability:
            composite['y_pred_prob_0'] = composite.apply(self._stitch_prob_0, axis=1)
            composite['y_pred_prob_1'] = composite.apply(self._stitch_prob_1, axis=1)
            log.debug('composite: %s' % composite)
            y_pred = composite[['y_pred_prob_0', 'y_pred_prob_1']].values
        else:
            composite['y_pred'] = composite.apply(self._stitch_disjoint_row, axis=1)
            log.debug('composite: %s' % composite)
            y_pred = composite['y_pred'].values

        return y_pred

    def predict(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index, \
                                    columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)
        column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index, \
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)
        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left', \
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(y_pred_false_df, \
                                how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)
        y_pred = mask_plus_true_plus_false['y_pred'].values

        return y_pred

    def predict_probability(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_prob_true = self._sc_true.predict_probability(X_test_true)
        log.debug('y_pred_prob_true: %s' % y_pred_prob_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_prob_false = self._sc_false.predict_probability(X_test_false)
        log.debug('y_pred_prob_false: %s' % y_pred_prob_false)

        # Stitch results.
        column_names = ['y_pred_prob_true_0', 'y_pred_prob_true_1']
        y_pred_prob_true_df = DataFrame(y_pred_prob_true, index=X_test_true.index, \
                                    columns=column_names)
        log.debug('y_pred_prob_true_df: %s' % y_pred_prob_true_df)
        column_names = ['y_pred_prob_false_0', 'y_pred_prob_false_1']
        y_pred_prob_false_df = DataFrame(y_pred_prob_false, index=X_test_false.index, \
                                    columns=column_names)
        log.debug('y_pred_prob_false_df: %s' % y_pred_prob_false_df)
        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_prob_true_df, how='left', \
                                            left_index=True, right_index=True)
        composite = mask_plus_true.merge(y_pred_prob_false_df, \
                                how='left', left_index=True, right_index=True)
        composite['y_pred_prob_0'] = composite.apply(self._stitch_prob_0,
                                                     axis=1)
        composite['y_pred_prob_1'] = composite.apply(self._stitch_prob_1,
                                                     axis=1)
        log.debug('composite: %s' % composite)
        y_pred_prob = composite[['y_pred_prob_0', 'y_pred_prob_1']].values
        log.debug(y_pred_prob)

        return y_pred_prob
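
Pulling the pieces together, a minimal sketch of how this class might be driven; the bifurcator column name, the bifurcation value, and the train/test frames are illustrative assumptions, while the hyperparameter keys are the ones read by the class above:

# Hedged usage sketch; X_train/X_test/y_train are assumed pandas objects with
# a binary 'Male.pre' column that serves as the bifurcator.
hyperparams = {
    'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
    'bifurcator': 'Male.pre',
    'bifurcation_strategy': BifurcatedSupervisedClassifier.EQUAL,
    'bifurcation_value': 1,
    'random_state': 123456789,
}
bsc = BifurcatedSupervisedClassifier(classes=[0, 1], hyperparams=hyperparams)
status = bsc.train(X_train, y_train)
if status == SupervisedClassifier.TRAINED:
    y_pred = bsc.predict(X_test)
    y_pred_proba = bsc.predict_probability(X_test)[:, 1]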
Example 11
def run_one_lab_local(lab, lab_type, data_source, version, random_state=0):
    '''

    Input:

    :return:
    '''

    # X_train_raw, y_train = [[1], [2]], [1, 2]
    # X_test_raw, y_test = [[3], [4]], [3, 4]
    file_organizer = syst.FileOrganizerLocal(lab=lab,
                                             lab_type=lab_type,
                                             data_source=data_source,
                                             version=version)

    raw_matrix = file_organizer.get_raw_matrix()

    y_label = 'all_components_normal'

    '''
    TODO: later on pat_ids
    '''
    raw_matrix_train, raw_matrix_test = Utils.split_rows(raw_matrix)

    patIds_train = raw_matrix_train['pat_id'].values.tolist()

    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=y_label)

    redundant_features = ['proc_code', 'num_components', 'num_normal_components', 'abnormal_panel']
    id_features = ['pat_id', 'order_proc_id', 'order_time']

    numeric_features = X_train_raw.columns[~X_train_raw.columns.isin([y_label]+redundant_features+id_features)]

    '''
    Check if the left features are all numeric
    '''
    assert X_train_raw[numeric_features].select_dtypes(exclude=['object']).shape == X_train_raw[numeric_features].shape

    features_by_type = {'redundant_features': redundant_features,
                        'id_features': id_features,
                        'numeric_features': numeric_features,
                        'y_label': y_label}

    '''
    (1) Feature Impute:
    Imputation of some numerical values depends on prior stats of the same
    patient, so certain auxiliary columns are still useful.

    (2) Feature Remove:
    Remove auxiliary columns.

    (3) Feature Selection:
    Only select from numerical columns.
    '''
    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[
            ('impute_features', Cls.FeatureImputer()),
            ('remove_features', Cls.FeatureRemover(features_to_remove=Config.features_to_remove)),
            ('select_features', Cls.Select_Features(random_state=random_state, features_by_type=features_by_type))
        ])

    # feature_engineering_pipeline.set_params()
    X_train_processed = feature_processing_pipeline.fit_transform(X_train_raw, y_train)

    hyperparams = {}
    hyperparams['algorithm'] = 'random-forest'
    predictor = SupervisedClassifier(classes=[0,1], hyperparams=hyperparams)

    '''
    Automatically takes care of tuning hyperparameters via stochastic-search
    '''
    status = predictor.train(X_train_processed, column_or_1d(y_train),
                             groups=patIds_train)

    logging.info('status: %s' % status)

    '''
    Test set
    '''
    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=y_label)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    # Keep only the positive-class probability so it fits a single DataFrame
    # column (cf. Example 4).
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test,
                           'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath(alg=hyperparams['algorithm']))