    def _train_test_split(self, processed_matrix, outcome_label):
        log.debug('outcome_label: %s' % outcome_label)
        y = pd.DataFrame(processed_matrix.pop(outcome_label))
        X = processed_matrix
        log.debug('X.columns: %s' % X.columns)
        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
            X, y, random_state=self._random_state)
    def _analyze_predictors_on_holdout(self):
        fm_io = FeatureMatrixIO()

        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)

        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)
        for algorithm in algorithms_to_test:
            log.info('analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])

            pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                              algorithm)

            predictor_path = self._build_model_dump_path(algorithm)

            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                # self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED

            SupervisedLearningPipeline._analyze_predictor_holdoutset(
                self, report_dir, pipeline_prefix)
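The holdout analysis above reloads a fitted predictor from disk with joblib. A minimal sketch of that persistence round-trip, using a generic scikit-learn estimator rather than the pipeline's SupervisedClassifier (the file name and estimator are illustrative):

import joblib
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
clf = LogisticRegression().fit(X, y)

# Persist the fitted model, then reload it the way the loop above does.
joblib.dump(clf, 'example-predictor.pkl')
restored = joblib.load('example-predictor.pkl')
print(restored.predict(X[:5]))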
Example #3
    def _filter_on_features(self, fmt, features_to_filter_on):
        # Filter out rows with unwanted values for the given features.
        for filter_feature in features_to_filter_on:
            feature = filter_feature.get('feature')
            value = filter_feature.get('value')
            self._num_rows = fmt.filter_on_feature(feature, value)
            log.debug('Removed rows where %s equals \'%s\'; %d rows remain.' %
                      (feature, str(value), self._num_rows))
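The filter specs iterated above are plain dicts with 'feature' and 'value' keys; a hypothetical payload (the feature names and values are illustrative, not taken from the pipeline's configs):

# Hypothetical filter spec: each entry drops rows where the feature equals the value.
features_to_filter_on = [
    {'feature': 'abnormal_panel', 'value': 1},
    {'feature': 'Sex.Unknown', 'value': 1},
]
# Inside the pipeline this would be passed as:
#   self._filter_on_features(fmt, features_to_filter_on)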
Example #4
    def _add_susc_features(self):
        for susc_name in self._susceptibility_names:
            log.debug('Adding %s feature...' % susc_name)
            self._factory.addClinicalItemFeatures([susc_name],
                                                  column='name',
                                                  label=susc_name,
                                                  features="pre")
Example #5
    def _add_features(self, fmt, features_to_add):
        # Expected format for features_to_add:
        # {
        #   'threshold': [{arg1, arg2, etc.}, ...]
        #   'indicator': [{arg1, arg2, etc.}, ...]
        #   'logarithm': [{arg1, arg2, etc.}, ...]
        # }
        indicator_features = features_to_add.get('indicator')
        threshold_features = features_to_add.get('threshold')
        logarithm_features = features_to_add.get('logarithm')
        change_features = features_to_add.get('change')

        if indicator_features:
            for feature in indicator_features:
                base_feature = feature.get('base_feature')
                boolean_indicator = feature.get('boolean_indicator')
                added_feature = fmt.add_indicator_feature(
                    base_feature, boolean_indicator)
                self._added_features.append(added_feature)

        if threshold_features:
            for feature in threshold_features:
                base_feature = feature.get('base_feature')
                lower_bound = feature.get('lower_bound')
                upper_bound = feature.get('upper_bound')
                added_feature = fmt.add_threshold_feature(
                    base_feature, lower_bound, upper_bound)
                self._added_features.append(added_feature)

        if logarithm_features:
            for feature in logarithm_features:
                base_feature = feature.get('base_feature')
                logarithm = feature.get('logarithm')
                # NOTE: the original called fmt.add_threshold_feature here,
                # which looks like a copy-paste slip; a logarithm transform
                # (assumed here to be fmt.add_logarithm_feature) is intended.
                added_feature = fmt.add_logarithm_feature(
                    base_feature, logarithm)
                self._added_features.append(added_feature)

        # TODO (raikens): right now, unchanged_yn is the only allowable name for a
        # change feature, which means at most one change_feature can be added
        if change_features:
            if len(change_features) > 1:
                raise ValueError(
                    "Adding multiple \'change\' type features is not yet supported"
                )

            for feature in change_features:
                feature_old = feature.get('feature_old')
                feature_new = feature.get('feature_new')
                method = feature.get('method')
                param = feature.get('param')
                added_feature = fmt.add_change_feature(method, param,
                                                       feature_old,
                                                       feature_new)
                self._added_features.append(added_feature)

                # sd method discards 300 rows for measuring sd
                if method == 'sd':
                    self._num_rows = self._num_rows - 300

        log.debug('self._added_features: %s' % self._added_features)
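A hypothetical features_to_add payload matching the format documented at the top of _add_features (the base feature names, bounds, and params are illustrative):

features_to_add = {
    'indicator': [{'base_feature': 'CR.pre', 'boolean_indicator': True}],
    'threshold': [{'base_feature': 'age', 'lower_bound': 18, 'upper_bound': 90}],
    'logarithm': [{'base_feature': 'WBC.pre', 'logarithm': 'natural_log'}],
    'change': [{'feature_old': 'CR.pre.prev', 'feature_new': 'CR.pre',
                'method': 'sd', 'param': 1.0}],
}
# Inside the pipeline: self._add_features(fmt, features_to_add)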
Example #6
    def _analyze_predictor(self, dest_dir, pipeline_prefix):
        analyzer = ClassifierAnalyzer(self._predictor, self._X_test, self._y_test)

        # Build names for output plots and report.
        direct_comparisons_name = 'direct_comparisons.csv' #'%s-direct-compare-results.csv' % pipeline_prefix
        precision_at_k_plot_name = '%s-precision-at-k-plot.png' % pipeline_prefix
        precision_recall_plot_name = '%s-precision-recall-plot.png' % pipeline_prefix
        roc_plot_name = '%s-roc-plot.png' % pipeline_prefix
        report_name = '%s-report.tab' % pipeline_prefix

        # Build paths.
        direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
        log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
        precision_at_k_plot_path = '/'.join([dest_dir, precision_at_k_plot_name])
        log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
        precision_recall_plot_path = '/'.join([dest_dir, precision_recall_plot_name])
        log.debug('precision_recall_plot_path: %s' % precision_recall_plot_path)
        roc_plot_path = '/'.join([dest_dir, roc_plot_name])
        log.debug('roc_plot_path: %s' % roc_plot_path)
        report_path = '/'.join([dest_dir, report_name])
        log.debug('report_path: %s' % report_path)

        # Build plot titles.
        roc_plot_title = 'ROC (%s)' % pipeline_prefix
        precision_recall_plot_title = 'Precision-Recall (%s)' % pipeline_prefix
        precision_at_k_plot_title = 'Precision @K (%s)' % pipeline_prefix

        # Write output.
        analyzer.output_direct_comparisons(direct_comparisons_path)
        analyzer.plot_roc_curve(roc_plot_title, roc_plot_path)
        analyzer.plot_precision_recall_curve(precision_recall_plot_title, precision_recall_plot_path)
        analyzer.plot_precision_at_k_curve(precision_at_k_plot_title, precision_at_k_plot_path)
        analyzer.write_report(report_path, ci=0.95)
Example #7
    def predict(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index, \
                                    columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)
        column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index, \
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)
        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left', \
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(y_pred_false_df, \
                                how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)
        y_pred = mask_plus_true_plus_false['y_pred'].values

        return y_pred
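The mask-and-merge stitching above can be sketched standalone: predictions made on two disjoint row subsets are reassembled into one vector aligned to the original index (data here is synthetic):

import pandas as pd

X_test = pd.DataFrame({'x': [5, 1, 7, 2]}, index=[10, 11, 12, 13])
true_mask = X_test['x'] > 3   # rows routed to the "true" classifier
false_mask = ~true_mask       # rows routed to the "false" classifier

y_pred_true = pd.Series([1, 1], index=X_test[true_mask].index)
y_pred_false = pd.Series([0, 0], index=X_test[false_mask].index)

# combine_first aligns on the index, so the disjoint predictions interleave
# back into X_test's row order: 1, 0, 1, 0.
y_pred = y_pred_true.combine_first(y_pred_false).reindex(X_test.index).values
print(y_pred)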
Example #8
    def _analyze_predictor_traindata(self, dest_dir):
        analyzer = ClassifierAnalyzer(self._predictor, self._X_train,
                                      self._y_train)
        direct_comparisons_name = 'direct_comparisons_train.csv'
        direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
        log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
        analyzer.output_direct_comparisons(direct_comparisons_path)
Example #9
    def _get_average_orders_per_patient(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        query = SQLQuery()
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  #TODO: add STRIDE component routine
            query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
            query.addFrom('stride_order_proc AS sop')
            query.addFrom('stride_order_results AS sor')
            query.addWhere('sop.order_proc_id = sor.order_proc_id')
            query.addWhereIn("proc_code", [self._lab_panel])
            components = self._get_components_in_lab_panel()
            query.addWhereIn("base_name", components)
            query.addGroupBy('pat_id')

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
            query.addSelect('COUNT(order_proc_id) AS num_orders')
            query.addFrom('labs')
            query.addWhereIn(self._varTypeInTable, [self._lab_var])
            components = self._get_components_in_lab_panel()
            query.addWhereIn("base_name", components)
            query.addGroupBy('pat_id')
        log.debug('Querying median orders per patient...')
        results = DBUtil.execute(query)
        order_counts = [row[1] for row in results]
        if len(order_counts) == 0:
            error_msg = '0 orders for lab "%s."' % self._lab_var
            log.critical(error_msg)
            raise Exception(error_msg)
            # sys.exit('[ERROR] %s' % error_msg)  # sxu: sys.exit raises SystemExit, which 'except Exception' does not catch, so raise instead.
        else:
            return numpy.median(order_counts)
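Roughly the statement the STRIDE branch above assembles, written out as raw SQL to make its intent explicit (the proc_code and base_name values are placeholders; the exact rendering depends on SQLQuery):

stride_orders_per_patient_sql = """
SELECT CAST(pat_id AS BIGINT) AS pat_id,
       COUNT(sop.order_proc_id) AS num_orders
FROM stride_order_proc AS sop, stride_order_results AS sor
WHERE sop.order_proc_id = sor.order_proc_id
  AND proc_code IN ('LABMETB')
  AND base_name IN ('NA', 'K', 'CR')
GROUP BY pat_id;
"""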
Example #10
    def __init__(self, classifier, X_test, y_test, random_state=None):
        # TODO(sbala): Make this API more flexible, so that it can work
        # with multi-label classifiers or binary classifiers whose
        # positive label != 1.
        PredictorAnalyzer.__init__(self, classifier, X_test, y_test,
                                   random_state)
        # If there is only one class in y_test, abort.
        classes = y_test[y_test.columns.values[0]].value_counts().index.values
        if len(classes) <= 1:
            sole_class = classes[0]
            log.debug('y_test only has samples of 1 class: %s' % sole_class)
            sys.exit(
                '[ERROR] ClassifierAnalyzer: y_test only has samples of 1 class: %s'
                % sole_class)

        self._y_pred_prob = DataFrame(
            classifier.predict_probability(X_test)[:, 1])
        log.debug('y_pred_prob[0].value_counts(): %s' %
                  self._y_pred_prob[0].value_counts())

        if random_state is None:
            self._random_state = np.random.RandomState(123456789)
        elif isinstance(random_state, int):
            self._random_state = np.random.RandomState(random_state)
        elif isinstance(random_state, np.random.RandomState):
            self._random_state = random_state
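The None/int/RandomState handling at the end of __init__ mirrors scikit-learn's own helper; a minimal sketch of the equivalent behavior:

import numpy as np
from sklearn.utils import check_random_state

# check_random_state accepts None, an int seed, or an existing RandomState
# and always hands back a RandomState, much like the branches above.
print(check_random_state(42).randint(0, 10, size=3))  # reproducible draws
rs = np.random.RandomState(123456789)
assert check_random_state(rs) is rs  # an existing RandomState passes through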
Example #11
    def _get_random_patient_list(self):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Get average number of results for this lab test per patient.
        avg_orders_per_patient = self._get_average_orders_per_patient()
        log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)
        # Based on average # of results, figure out how many patients we'd
        # need to get for a feature matrix of requested size.
        self._num_patients = int(numpy.max([self._num_requested_episodes / \
            avg_orders_per_patient, 1]))

        # Get self._num_patients random patients who have received the test.
        # TODO(sbala): Have option to feed in a seed for the randomness.
        query = SQLQuery()
        query.addSelect('pat_id')
        query.addFrom('stride_order_proc AS sop')
        query.addWhereIn('proc_code', [self._lab_panel])
        query.addOrderBy('RANDOM()')
        query.setLimit(self._num_patients)
        log.debug('Querying random patient list...')
        results = DBUtil.execute(query)

        # Get patient list.
        random_patient_list = [ row[0] for row in results ]

        return random_patient_list
Example #12
    def _get_components_in_lab_panel(self):
        # Initialize DB connection.
        cursor = self._connection.cursor()

        # Doing a single query results in a sequential scan through
        # stride_order_results. To avoid this, break the query into two steps.

        # First, get all the order_proc_ids for proc_code.

        query = SQLQuery()
        query.addSelect('order_proc_id')
        query.addFrom('stride_order_proc')
        query.addWhereIn('proc_code', [self._lab_panel])
        query.addGroupBy('order_proc_id')
        log.debug('Querying order_proc_ids for %s...' % self._lab_panel)
        results = DBUtil.execute(query)
        lab_order_ids = [row[0] for row in results]

        # Second, get all base_names from those orders.
        query = SQLQuery()
        query.addSelect('base_name')
        query.addFrom('stride_order_results')
        query.addWhereIn('order_proc_id', lab_order_ids)
        query.addGroupBy('base_name')
        log.debug('Querying base_names for order_proc_ids...')
        results = DBUtil.execute(query)
        components = [row[0] for row in results]

        return components
Example #13
    def __init__(
        self,
        change_params,
        lab_panel,
        num_episodes,
        use_cache=None,
        random_state=None,
        build_raw_only=False,
    ):
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                            use_cache, random_state)
        self._change_params = change_params
        self._change_params[
            'feature_old'] = self._lookup_previous_measurement_feature(
                self._var)
        log.debug('change_params: %s' % self._change_params)

        if build_raw_only:
            self._build_raw_feature_matrix()
            return

        else:
            self._build_raw_feature_matrix()
            self._build_processed_feature_matrix()
            self._train_and_analyze_predictors()
Example #14
    def test_train_and_predict(self):
        # Load data set.
        X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                      columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'])
        y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
        random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
        expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
        expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
        expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
        expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
        expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

        # Generate train/test split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

        # Iterate through SUPPORTED_ALGORITHMS.
        for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
            log.info('Testing %s classifier...' % algorithm)
            # Train model.
            hyperparams = {'algorithm': algorithm, 'random_state': random_state}
            # Default to stochastic search for expensive algorithms.
            if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
                # Test ability to force hyperparam values.
                hyperparams['max_depth'] = 2
                hyperparams['n_estimators'] = 5
                hyperparams['min_samples_leaf'] = 1
                hyperparams['min_samples_split'] = 0.2
            else:
                hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            classifier = SupervisedClassifier([0, 1], hyperparams)
            classifier.train(X_train, y_train)

            # Test str().
            expected_str = expected_str_by_algorithm[algorithm]
            actual_str = str(classifier)
            self.assertEqual(expected_str, actual_str)

            # Test hyperparameters.
            expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
            actual_hyperparams = classifier.hyperparams()
            self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

            # Test model parameters.
            expected_params = expected_params_by_algorithm[algorithm]
            actual_params = classifier.params()
            self.assertEqualDict(expected_params, actual_params)

            # Test model description.
            expected_description = expected_descriptions_by_algorithm[algorithm]
            actual_description = classifier.description()
            self.assertEqual(expected_description, actual_description)

            # Test prediction values.
            expected_y_pred = expected_y_pred_by_algorithm[algorithm]
            log.debug('expected_y_pred: %s' % expected_y_pred)
            actual_y_pred = classifier.predict(X_test)
            log.debug('actual_y_pred: %s' % actual_y_pred)
            self.assertEqualList(expected_y_pred, actual_y_pred)
Example #15
    def clear_stride_psql_tables():
        log.info('Clearing stride psql tables...')
        for params in list(STRIDE_LOADER_PARAMS.values()):
            psql_table = params['psql_table'] % TABLE_PREFIX
            log.debug('dropping table %s...' % psql_table)
            # load_stride_to_psql is not idempotent, so in case the schema
            # already existed, clear the table (avoid duplicate data).
            DBUtil.execute("DROP TABLE IF EXISTS %s CASCADE;" % psql_table)
Example #16
    def _add_med_features(self):
        # Adds all prior antibiotic use as features.
        for med_set in self._med_panel:
            med_label = med_set[0].split()[0]  # Takes name of antibiotic.
            log.debug('Adding %s medication features...' % med_label)
            self._factory.addClinicalItemFeatures(med_set,
                                                  column="description",
                                                  label="Med." + med_label,
                                                  features="pre")
Example #17
    def _remove_features(self, fmt, features_to_remove):
        # Prune manually identified features (i.e. ones that are obviously unhelpful).
        # In theory, FeatureSelector should be able to prune these, but no
        # reason not to help it out a little bit.
        for feature in features_to_remove:
            fmt.remove_feature(feature)
            self._removed_features.append(feature)

        log.debug('self._removed_features: %s' % self._removed_features)
Example #18
    def predict(self, X):
        y_predicted = list()
        for row in X.iterrows():
            prediction = self._predictions[self._index]
            y_predicted.append(prediction)
            self._index = (self._index + 1) % self._num_predictions

        log.debug('y_predicted: %s' % y_predicted)
        return DataFrame({'y_predicted': y_predicted})
    def _select_features(self,
                         problem,
                         percent_features_to_select,
                         algorithm,
                         features_to_keep=None):
        # Initialize FeatureSelector.
        fs = FeatureSelector(problem=problem,
                             algorithm=algorithm,
                             random_state=self._random_state)
        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(percent_features_to_select *
                                     len(self._X_train.columns.values))

        # Parse features_to_keep.
        if features_to_keep is None:
            features_to_keep = []

        # Select features.
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        feature_ranks = fs.compute_ranks()
        for i in range(len(feature_ranks)):
            if feature_ranks[i] > num_features_to_select:
                # If in features_to_keep, pretend it wasn't eliminated.
                if self._X_train.columns[i] not in features_to_keep:
                    self._eliminated_features.append(self._X_train.columns[i])

        # Hack: rather than making FeatureSelector handle the concept of
        # kept features, just copy the data here and add it back to the
        # transformed matrices.
        # Rather than looping, do this individually so that we can skip if
        # transformed X already has the feature.
        for feature in features_to_keep:
            kept_X_train_feature = self._X_train[[feature]].copy()
            log.debug('kept_X_train_feature.shape: %s' %
                      str(kept_X_train_feature.shape))
            self._X_train = fs.transform_matrix(self._X_train)
            if feature not in self._X_train:
                self._X_train = self._X_train.merge(kept_X_train_feature,
                                                    left_index=True,
                                                    right_index=True)

            kept_X_test_feature = self._X_test[[feature]].copy()
            log.debug('kept_X_test_feature.shape: %s' %
                      str(kept_X_test_feature.shape))
            self._X_test = fs.transform_matrix(self._X_test)
            if feature not in self._X_test:
                self._X_test = self._X_test.merge(kept_X_test_feature,
                                                  left_index=True,
                                                  right_index=True)

        if not features_to_keep:
            # Even if there are no features to keep, we still need to call
            # transform_matrix to drop the lowest-ranked features.
            self._X_train = fs.transform_matrix(self._X_train)
            self._X_test = fs.transform_matrix(self._X_test)
Example #20
    def _maybe_reshape_y(self, y):
        # If necessary, reshape y from (n_samples, 1) to (n_samples, )
        try:
            num_cols = y.shape[1]
            y = column_or_1d(y)
            log.debug('Reshaped y to 1d.')
        except IndexError:
            log.debug('Did not need to reshape y to 1d.')

        return y
Example #21
    def _bootstrap_score_ci(self, score_fn, ci, y_test, y_pred=None, y_pred_prob=None, n_bootstrap_iter=None, k=None, desired_precision=None):
        # Note that y_pred may either represent the predicted labels or the
        # predicted label probabilities. It's up to the caller to make the
        # right choice based on score_fn's expected input.
        if score_fn == self._score_precision_at_k:
            sample_score = score_fn(y_test, y_pred, y_pred_prob, k)
        elif score_fn == self._score_percent_predictably_positive:
            sample_score = score_fn(y_test, y_pred, y_pred_prob, desired_precision)
        elif score_fn in [average_precision_score, roc_auc_score]:
            sample_score = score_fn(y_test, y_pred_prob)
        else:
            sample_score = score_fn(y_test, y_pred)

        if n_bootstrap_iter is None:
            n_bootstrap_iter = 100
        # For consistency of results, seed random number generator with
        # fixed number.
        rng = self._random_state
        # Use bootstrap to compute CIs.
        bootstrap_scores = list()
        for i in range(0, n_bootstrap_iter):
            # Sample y_test and y_pred with replacement. (RandomState.randint
            # excludes the high value, so use len(y_test) to include every row.)
            indices = rng.randint(0, len(y_test), len(y_test))
            sample_y_test = np.array(y_test)[indices]
            if y_pred is not None: sample_y_pred = np.array(y_pred)[indices]
            if y_pred_prob is not None: sample_y_pred_prob = np.array(y_pred_prob)[indices]
            if len(np.unique(sample_y_test)) < 2:
                # We need at least one positive and one negative sample for ROC AUC
                # to be defined: reject the sample
                continue
            if score_fn == self._score_precision_at_k:
                score = score_fn(DataFrame(sample_y_test), DataFrame(sample_y_pred), DataFrame(sample_y_pred_prob), k)
            elif score_fn == self._score_percent_predictably_positive:
                score = score_fn(DataFrame(sample_y_test), DataFrame(sample_y_pred), DataFrame(sample_y_pred_prob), desired_precision)
            elif score_fn in [average_precision_score, roc_auc_score]:
                score = score_fn(sample_y_test, sample_y_pred_prob)
            else:
                score = score_fn(sample_y_test, sample_y_pred)
            bootstrap_scores.append(score)

        # Sort bootstrap scores to get CIs.
        bootstrap_scores.sort()
        log.debug('bootstrap_scores: %s' % bootstrap_scores)
        sorted_scores = np.array(bootstrap_scores)
        # May not be equal to n_bootstrap_iter if some samples were rejected
        num_bootstraps = len(sorted_scores)
        log.debug('sorted_scores: %s' % sorted_scores)

        ci_lower_bound_float = (1.0 - ci) / 2
        ci_lower_bound = sorted_scores[int(ci_lower_bound_float * num_bootstraps)]
        ci_upper_bound_float = ci + ci_lower_bound_float
        ci_upper_bound = sorted_scores[int(ci_upper_bound_float * num_bootstraps)]

        return sample_score, ci_lower_bound, ci_upper_bound
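The percentile bookkeeping at the end of _bootstrap_score_ci, sketched standalone for a plain accuracy score on synthetic labels (100 bootstrap iterations and a 0.95 CI, matching the defaults above and write_report):

import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(123456789)
y_test = rng.randint(0, 2, 200)
y_pred = np.where(rng.rand(200) < 0.8, y_test, 1 - y_test)  # ~80% accurate

bootstrap_scores = []
for _ in range(100):
    indices = rng.randint(0, len(y_test), len(y_test))  # resample with replacement
    bootstrap_scores.append(accuracy_score(y_test[indices], y_pred[indices]))

bootstrap_scores.sort()
ci = 0.95
lower = bootstrap_scores[int(((1.0 - ci) / 2) * len(bootstrap_scores))]
upper = bootstrap_scores[int((ci + (1.0 - ci) / 2) * len(bootstrap_scores))]
print(lower, upper)  # empirical 95% CI around the ~0.8 sample accuracy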
Example #22
    def _tune_hyperparams_regress_and_round(self, X, y):
        self._hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        log.info('Tuning hyperparams via %s...' % self._hyperparams['hyperparam_strategy'])
        # If not provided, search for best coef_max.
        if self._hyperparams.get('coef_max') is None:
            self._hyperparams['coef_max'] = self._tune_coef_max(X, y)

        # Round linear coefficients.
        self._round_coefs(self._hyperparams['coef_max'])
        log.debug('hyperparams: %s' % self.hyperparams())
        log.debug('params: %s' % self.params())
Example #23
    def _analyze_predictor_holdoutset(self, dest_dir, pipeline_prefix):
        slugified_var = '-'.join(self._var.split())
        holdout_path = dest_dir + '/../' + '%s-normality-matrix-%d-episodes-processed-holdout.tab' % (
            slugified_var, self._num_rows)
        fm_io = FeatureMatrixIO()
        processed_matrix = fm_io.read_file_to_data_frame(holdout_path)
        if self._isLabPanel:
            y_holdout = pd.DataFrame(
                processed_matrix.pop('all_components_normal'))
        else:
            y_holdout = pd.DataFrame(processed_matrix.pop('component_normal'))
        X_holdout = processed_matrix
        analyzer = ClassifierAnalyzer(self._predictor, X_holdout, y_holdout)
        train_label = 'holdoutset'

        # Build names for output plots and report.
        direct_comparisons_name = '%s-direct-compare-results-%s.csv' % (
            pipeline_prefix, train_label)
        precision_at_k_plot_name = '%s-precision-at-k-plot-%s.png' % (
            pipeline_prefix, train_label)
        precision_recall_plot_name = '%s-precision-recall-plot-%s.png' % (
            pipeline_prefix, train_label)
        roc_plot_name = '%s-roc-plot-%s.png' % (pipeline_prefix, train_label)
        report_name = '%s-report-%s.tab' % (pipeline_prefix, train_label)

        # Build paths.
        direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
        log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
        precision_at_k_plot_path = '/'.join(
            [dest_dir, precision_at_k_plot_name])
        log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
        precision_recall_plot_path = '/'.join(
            [dest_dir, precision_recall_plot_name])
        log.debug('precision_recall_plot_path: %s' %
                  precision_recall_plot_path)
        roc_plot_path = '/'.join([dest_dir, roc_plot_name])
        log.debug('roc_plot_path: %s' % roc_plot_path)
        report_path = '/'.join([dest_dir, report_name])
        log.debug('report_path: %s' % report_path)

        # Build plot titles.
        roc_plot_title = 'ROC (%s)' % pipeline_prefix
        precision_recall_plot_title = 'Precision-Recall (%s)' % pipeline_prefix
        precision_at_k_plot_title = 'Precision @K (%s)' % pipeline_prefix

        # Write output.
        analyzer.output_direct_comparisons(direct_comparisons_path)
        analyzer.plot_roc_curve(roc_plot_title, roc_plot_path)
        analyzer.plot_precision_recall_curve(precision_recall_plot_title,
                                             precision_recall_plot_path)
        analyzer.plot_precision_at_k_curve(precision_at_k_plot_title,
                                           precision_at_k_plot_path)
        analyzer.write_report(report_path, ci=0.95)
Example #24
    def _train_test_split(self, processed_matrix, outcome_label, columnToSplitOn='pat_id'):
        log.debug('outcome_label: %s' % outcome_label)
        all_possible_ids = sorted(set(processed_matrix[columnToSplitOn].values.tolist()))

        train_ids, test_ids = train_test_split(all_possible_ids, random_state=self._random_state)

        train_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin(train_ids)].copy()
        self._y_train = pd.DataFrame(train_matrix.pop(outcome_label))
        self._X_train = train_matrix

        test_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin(test_ids)].copy()
        self._y_test = pd.DataFrame(test_matrix.pop(outcome_label))
        self._X_test = test_matrix
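Splitting on pat_id so that no patient's rows land in both partitions, as above, can also be done with scikit-learn's GroupShuffleSplit; a rough standalone equivalent on a synthetic matrix (test_size=0.25 to match train_test_split's default):

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

matrix = pd.DataFrame({
    'pat_id':  [1, 1, 2, 2, 3, 3, 4, 4],
    'feature': [5, 6, 7, 8, 9, 10, 11, 12],
    'label':   [0, 1, 0, 1, 1, 0, 1, 0],
})

splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_idx, test_idx = next(splitter.split(matrix, groups=matrix['pat_id']))

train_matrix = matrix.iloc[train_idx].copy()
test_matrix = matrix.iloc[test_idx].copy()
# No pat_id appears in both partitions.
assert not set(train_matrix['pat_id']) & set(test_matrix['pat_id'])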
Example #25
    def build_stride_psql_schemata():
        schemata_dir = StrideLoader.fetch_psql_schemata_dir()
        for params in list(STRIDE_LOADER_PARAMS.values()):
            psql_table = params['psql_table'] % TABLE_PREFIX

            log.debug('loading %s schema...' % psql_table)

            # Open file, feed to DBUtil, and close file.
            schema_file_name = '.'.join([psql_table, 'schema.sql'])
            schema_file_path = os.path.join(schemata_dir, schema_file_name)
            schema_file = open(schema_file_path, 'r')
            DBUtil.runDBScript(schema_file)
            schema_file.close()
Example #26
    def build_stride_psql_indices():
        indices_dir = StrideLoader.fetch_psql_indices_dir()
        for params in list(STRIDE_LOADER_PARAMS.values()):
            psql_table = params['psql_table'] % TABLE_PREFIX

            # Open file, feed to DBUtil, and close file.
            indices_file_name = '.'.join([psql_table, 'indices.sql'])
            indices_file_path = os.path.join(indices_dir, indices_file_name)
            if os.path.exists(indices_file_path):
                log.debug('loading %s indices...' % psql_table)
                indices_file = open(indices_file_path, 'r')
                DBUtil.runDBScript(indices_file)
                indices_file.close()
Example #27
    def train(self, X, y, groups=None):

        self._groups = groups
        assert ('pat_id' not in X.columns)

        self._features = X.columns

        y = self._maybe_reshape_y(y)

        # Verify that there are enough samples (at least 10) of each class.
        value_counts = Series(y).value_counts()
        log.debug('y.value_counts(): %s' % value_counts)
        for class_label in self._classes:
            # If there aren't at least 10 samples of each class, exit gracefully.
            try:
                num_samples = value_counts[class_label]
                if num_samples < 10:
                    log.error('Insufficient samples (%s) of label %s.' %
                              (num_samples, class_label))
                    return SupervisedClassifier.INSUFFICIENT_SAMPLES
            except KeyError:
                log.error('Insufficient samples (0) of label %s.' %
                          class_label)
                return SupervisedClassifier.INSUFFICIENT_SAMPLES

        log.info('Training %s classifier...' % self._hyperparams['algorithm'])
        if self._hyperparams[
                'algorithm'] == SupervisedClassifier.DECISION_TREE:
            self._train_decision_tree(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
            self._train_logistic_regression(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.RANDOM_FOREST:
            self._train_random_forest(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
            self._train_regress_and_round(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
            self._train_adaboost(X, y)
        elif self._hyperparams[
                'algorithm'] == SupervisedClassifier.GAUSSIAN_NAIVE_BAYES:
            self._train_gaussian_naive_bayes(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.SVM:
            self._train_svm(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.XGB:
            self._train_xgb(X, y)
        elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
            self._train_nn(X, y)

        return SupervisedClassifier.TRAINED
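The algorithm dispatch above is a long if/elif chain; an equivalent dispatch-table sketch (the handler names here are illustrative stand-ins for the class's private _train_* methods, not its actual API):

def train_decision_tree(X, y):
    print('training decision tree...')

def train_random_forest(X, y):
    print('training random forest...')

# Map algorithm names to trainer callables once, then look them up.
TRAINERS = {
    'decision-tree': train_decision_tree,
    'random-forest': train_random_forest,
}

def train(algorithm, X, y):
    TRAINERS[algorithm](X, y)

train('random-forest', X=None, y=None)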
Example #28
    def build_virtual_clinical_item(item_sequence, virtual_id):
        """
        Simple wrapper around medinfo/cpoe/TripleAssociationAnalysis.
        """
        sequence_str = ' -> '.join([str(id) for id in item_sequence])
        log.debug('(%s) = (%s)' % (sequence_str, virtual_id))

        build_virtual_item_command = [
            'python', '-m', 'medinfo/cpoe/TripleAssociationAnalysis.py', '-s',
            ','.join([str(id) for id in item_sequence]), '-v',
            str(virtual_id)
        ]

        subprocess.call(build_virtual_item_command)
Example #29
    def build_composite_clinical_item(components, name, description,
                                      category_id):
        """
        Simple wrapper around medinfo/cpoe/DataManager.py
        """
        component_str = ','.join([str(id) for id in components])
        log.debug('(%s, %s, %s) = (%s)' % (name, description, category_id,
                                           component_str))
        composite_arg = '%s|%s|%s|%s' % (component_str, name, description,
                                         category_id)
        dm = DataManager()
        dm.main([
            'medinfo/cpoe/DataManager.py', '--compositeRelated', composite_arg
        ])
Example #30
    def load_stride_to_psql():
        # Build clean data files.
        StrideLoader.build_clean_csv_files()

        # Build psql schemata.
        StrideLoader.build_stride_psql_schemata()

        # Build paths to clean data files.
        clean_data_dir = StrideLoader.fetch_clean_data_dir()
        for raw_file in sorted(STRIDE_LOADER_PARAMS.keys()):
            params = STRIDE_LOADER_PARAMS[raw_file]

            # Build clean data file.
            clean_file = params['clean_file'] % TABLE_PREFIX
            log.info('loading %s...' % clean_file)
            clean_path = os.path.join(clean_data_dir, clean_file)

            # Uncompress data file.
            unzipped_clean_path = clean_path[:-3]
            with gzip.open(clean_path,
                           'rb') as f_in, open(unzipped_clean_path,
                                               'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

            # psql COPY data from clean files into DB.
            psql_table = params['psql_table'] % TABLE_PREFIX
            log.debug('stride/data/clean/%s ==> %s' % (clean_file, psql_table))
            # In some cases, two files going to the same table will have
            # non-identical column names. Pass these explicitly so that
            # psql knows which columns to try to fill from file.
            # Strip the newline character.
            with open(unzipped_clean_path, 'r') as f_in:
                columns = f_in.readline()[:-1]
            command = "COPY %s (%s) FROM '%s' WITH (FORMAT csv, HEADER);" % (
                psql_table, columns, unzipped_clean_path)
            DBUtil.execute(command)

            # Delete unzipped_clean_path.
            os.remove(unzipped_clean_path)

        # Run any one-off postprocessing transformations which all users
        # of the STRIDE database should receive. Defer any application-specific
        # transformations to other modules.
        StrideLoader.process_stride_psql_db()

        # Build indices.
        StrideLoader.build_stride_psql_indices()
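The COPY statement assembled in the loop above, spelled out for a hypothetical table and header line (the real values come from STRIDE_LOADER_PARAMS and the first line of each unzipped clean file):

psql_table = 'stride_order_proc'                       # illustrative table name
columns = 'order_proc_id,pat_id,proc_code,order_time'  # illustrative header line
unzipped_clean_path = '/tmp/stride_order_proc.csv'     # illustrative path

command = "COPY %s (%s) FROM '%s' WITH (FORMAT csv, HEADER);" % (
    psql_table, columns, unzipped_clean_path)
print(command)
# COPY stride_order_proc (order_proc_id,pat_id,proc_code,order_time) FROM '/tmp/stride_order_proc.csv' WITH (FORMAT csv, HEADER);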