def _analyze_predictors_on_holdout(self):
        """Analyze each supported algorithm's predictor on the hold-out set.

        For every algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS the
        dumped model is loaded from disk when its dump file exists, and the
        analysis itself is delegated to
        SupervisedLearningPipeline._analyze_predictor_holdoutset, which
        receives the per-algorithm report directory and file prefix.
        """
        # Copy, so the class-level list is never mutated through this name.
        algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)

        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)
        # Only the plain algorithms are evaluated; no 'bifurcated-*' variants
        # are added to the list.
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        for algorithm in algorithms_to_test:
            log.info('analyzing %s...' % algorithm)
            # Per-algorithm report directory. It is only named here, not
            # created by this method.
            report_dir = '/'.join([data_dir, algorithm])

            pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                              algorithm)

            predictor_path = self._build_model_dump_path(algorithm)

            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)

            # NOTE(review): when no dump exists for this algorithm,
            # self._predictor keeps whatever value it had before (possibly the
            # previous algorithm's model) -- confirm that this is intended.
            SupervisedLearningPipeline._analyze_predictor_holdoutset(
                self, report_dir, pipeline_prefix)
    def __init__(
        self,
        change_params,
        lab_panel,
        num_episodes,
        use_cache=None,
        random_state=None,
        build_raw_only=False,
    ):
        """Initialise the pipeline and run it.

        The raw feature matrix is always built. Unless ``build_raw_only`` is
        true, the processed matrix is built and the predictors are trained
        and analysed as well.

        ``change_params`` is stored on the instance and gains a
        ``'feature_old'`` entry (the caller's dict is modified in place).
        """
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                            use_cache, random_state)

        self._change_params = change_params
        previous_feature = self._lookup_previous_measurement_feature(self._var)
        self._change_params['feature_old'] = previous_feature
        log.debug('change_params: %s' % self._change_params)

        # The raw matrix is needed in both modes.
        self._build_raw_feature_matrix()
        if build_raw_only:
            return

        self._build_processed_feature_matrix()
        self._train_and_analyze_predictors()
Beispiel #3
0
    def __init__(self,
                 lab_panel,
                 microcultures,
                 num_episodes,
                 use_cache=None,
                 random_state=None):
        """Initialise the parent pipeline, then run all three pipeline stages.

        ``microcultures`` is stored as ``self.panel`` before the raw matrix,
        the processed matrix and the predictors are built, in that order.
        """
        SupervisedLearningPipeline.__init__(
            self, lab_panel, num_episodes, use_cache, random_state)

        self.panel = microcultures

        # Stage 1: raw matrix.
        self._build_raw_feature_matrix()
        # Stage 2: processed matrix.
        self._build_processed_feature_matrix()
        # Stage 3: training and analysis.
        self._train_and_analyze_predictors()
Beispiel #4
0
 def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None, isLabPanel=True,
              notUsePatIds=None, pat_batch_ind=None):
     """Initialise the pipeline and build only the raw feature matrix.

     ``notUsePatIds`` defaults to a fresh empty list per instance. The
     previous ``[]`` default was a single list object shared by every call
     that omitted the argument.
     """
     self.notUsePatIds = [] if notUsePatIds is None else notUsePatIds
     self.pat_batch_ind = pat_batch_ind
     self.usedPatIds = []
     SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state, isLabPanel)
     self._factory = FeatureMatrixFactory()
     self._build_raw_feature_matrix()
Beispiel #5
0
    def __init__(self,
                 lab_panel,
                 num_episodes,
                 use_cache=None,
                 random_state=None,
                 isLabPanel=True,
                 timeLimit=None,
                 notUsePatIds=None,
                 holdOut=False,
                 pat_batch_ind=None):
        """Build the raw matrix, then run the hold-out or the training flow.

        With ``holdOut`` true, the saved ``feat2imputed_dict.pkl`` is loaded,
        the hold-out processed matrix is built and the predictors are
        analysed on it. Otherwise the used patient ids are pickled, the
        processed matrix and the baseline results are built, and the
        constructor returns.
        """
        self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(
            self,
            lab_panel,
            num_episodes,
            use_cache,
            random_state,
            isLabPanel,
            timeLimit,
            holdOut,
            isLabNormalityPredictionPipeline=True)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        data_lab_folder = self._fetch_data_dir_path(
            inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            # For holdOut evaluation data, produce the raw matrix, pick
            # features according to the saved feat2imputed_dict.
            #
            # Pickle streams are binary, so the file is opened in 'rb' and
            # closed deterministically. NOTE: pickle.load executes code from
            # the file; only load dumps produced by this pipeline.
            with open(feat2imputed_dict_path, 'rb') as dict_file:
                self.feat2imputed_dict = pickle.load(dict_file)
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            # For training/validation data, record the pat_ids,
            # selected features and their imputed value correspondingly.
            used_ids_path = 'data/used_patient_set_%s.pkl' % self._var
            with open(used_ids_path, 'wb') as ids_file:
                pickle.dump(self.usedPatIds, ids_file,
                            pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP
            return

            # NOTE(review): everything below is unreachable because of the
            # `return` above, so feat2imputed_dict.pkl is never written and
            # no predictor is trained by this constructor. Kept as-is
            # (apart from the binary file mode) until the intent is
            # confirmed.
            # TODO: find better place to put the dict.pkl
            with open(feat2imputed_dict_path, 'wb') as dict_file:
                pickle.dump(self.feat2imputed_dict, dict_file,
                            pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()
    def _build_raw_feature_matrix(self):
        """Build the raw normality matrix and, outside hold-out mode, record
        the set of patient ids it contains in ``self.usedPatIds``.
        """
        raw_matrix_path = self._build_raw_matrix_path()
        SupervisedLearningPipeline._build_raw_feature_matrix(
            self, LabNormalityMatrix, raw_matrix_path)

        if self._holdOut:
            # Patient ids are only collected for training/validation data.
            return

        raw_matrix = FeatureMatrixIO().read_file_to_data_frame(raw_matrix_path)
        self.usedPatIds = set(raw_matrix['pat_id'].values)
Beispiel #7
0
    def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None,
                 timeLimit=None, notUsePatIds=None, holdOut=False, pat_batch_ind=None, includeLastNormality=True):
        """Build the raw matrix, optionally add a ``last_normality`` column,
        then run the hold-out or the training flow.

        With ``includeLastNormality`` true, the raw matrix file is re-read,
        sorted by ``pat_id`` and ``order_time``, given a ``last_normality``
        column holding the outcome label of the same patient's previous row
        (NaN for a patient's first row), and written back in place.

        With ``holdOut`` true, the saved ``feat2imputed_dict.pkl`` is loaded
        and the predictors are analysed on the hold-out matrix. Otherwise the
        used patient ids and the feat2imputed dict are pickled and the
        predictors are trained and analysed.
        """
        # self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self._holdOut = holdOut
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state,
                                            timeLimit, notUsePatIds)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        if LAB_TYPE == 'panel':
            self.ylabel = 'all_components_normal'
        else:
            self.ylabel = 'component_normal'

        self.includeLastNormality = includeLastNormality

        if self.includeLastNormality:
            # Same file is read and then overwritten, so build the path once.
            raw_matrix_path = 'data/' + lab_panel + '/%s-normality-matrix-raw.tab' % lab_panel
            fm_io = FeatureMatrixIO()
            df = fm_io.read_file_to_data_frame(raw_matrix_path)
            df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
            # A row inherits the previous row's label only when that row
            # belongs to the same patient; all other rows stay NaN. This
            # replaces a row-by-row loop built on the removed `DataFrame.ix`.
            same_patient_as_previous = df['pat_id'] == df['pat_id'].shift(1)
            df['last_normality'] = df[self.ylabel].shift(1).where(same_patient_as_previous)
            df.to_csv(raw_matrix_path, index=False, sep='\t')

        data_lab_folder = self._fetch_data_dir_path(inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            # For holdOut evaluation data, produce the raw matrix, pick
            # features according to the saved feat2imputed_dict.
            #
            # Pickle streams are binary, so the file is opened in 'rb' and
            # closed deterministically. NOTE: pickle.load executes code from
            # the file; only load dumps produced by this pipeline.
            with open(feat2imputed_dict_path, 'rb') as dict_file:
                self.feat2imputed_dict = pickle.load(dict_file)
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            # For training/validation data, record the pat_ids,
            # selected features and their imputed value correspondingly.
            with open('data/used_patient_set_%s.pkl' % self._var, 'wb') as ids_file:
                pickle.dump(self.usedPatIds, ids_file, pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP

            # TODO: find better place to put the dict.pkl
            with open(feat2imputed_dict_path, 'wb') as dict_file:
                pickle.dump(self.feat2imputed_dict, dict_file, pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()
Beispiel #8
0
 def _build_raw_matrix_path(self):
     """Return the raw normality matrix path, suffixed with the patient
     batch index when ``self.pat_batch_ind`` is truthy.
     """
     # A falsy batch index (None, 0, '') yields the un-suffixed file name.
     batch_suffix = '-' + str(self.pat_batch_ind) if self.pat_batch_ind else ''
     template = '%s-normality-matrix-%d-episodes-raw' + batch_suffix + '.tab'
     this_file = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_matrix_path(
         self, template, this_file)
Beispiel #9
0
 def _build_processed_matrix_path(self):
     processed_matrix_filename = '%s-normality-matrix-processed.tab' % self._var  #
     processed_matrix_path = os.path.join('data', self._var, processed_matrix_filename)  # TODO
     return processed_matrix_path
     if not self._holdOut:
         template = '%s-normality-matrix-processed.tab'
     else:
         template = '%s-normality-matrix-%d-episodes-processed-holdout.tab'
     pipeline_file_path = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_matrix_path(self, template, \
         pipeline_file_path)
Beispiel #10
0
 def _build_raw_matrix_path(self):
     raw_matrix_filename = '%s-normality-matrix-raw.tab' % self._var  #
     raw_matrix_filepath = os.path.join('data', self._var, raw_matrix_filename)  # TODO
     if not os.path.exists('data'):
         os.mkdir('data')
     if not os.path.exists(os.path.join('data', self._var)):
         os.mkdir(os.path.join('data', self._var))
     return raw_matrix_filepath
     if not self._holdOut:
         template = '%s-normality-matrix-raw.tab'
     else:
         template = '%s-normality-matrix-%d-episodes-raw-holdout.tab'
     pipeline_file_name = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_matrix_path(self, template, \
         pipeline_file_name)
    def _build_raw_matrix_path(self):
        """Return the path of the raw change matrix.

        The file name is built from ``self._var`` (whitespace runs replaced
        by '-') and ``self._num_rows``. The directory comes from the parent
        class's _fetch_data_dir_path, so the raw matrix lands in the lab
        test's directory rather than the subdirectory of one specific change
        definition and can be reused across change definitions.
        """
        this_file = inspect.getfile(inspect.currentframe())

        var_slug = '-'.join(self._var.split())
        file_name = '%s-change-matrix-%d-episodes-raw.tab' % (
            var_slug, self._num_rows)

        # Explicitly the parent implementation, not a subclass override.
        parent_data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, this_file)
        return '/'.join([parent_data_dir, file_name])
 def _build_raw_feature_matrix(self):
     """Build the raw change matrix via the parent pipeline, using
     LabChangeMatrix as the matrix class.
     """
     target_path = self._build_raw_matrix_path()
     SupervisedLearningPipeline._build_raw_feature_matrix(
         self, LabChangeMatrix, target_path)
 def _build_model_dump_path(self, algorithm):
     """Return the model dump path for ``algorithm``.

     The template passed to the parent keeps one leading '%s' placeholder
     unfilled; only the algorithm name is substituted here.
     """
     # '%' binds tighter than '+': format the suffix first, then prepend
     # the still-unfilled '%s'.
     algorithm_suffix = '-change-%s-model.pkl' % algorithm
     template = '%s' + algorithm_suffix
     this_file = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_model_dump_path(
         self, template, this_file)
    def _train_and_analyze_predictors(self):
        """Train (or load) and analyze one predictor per supported algorithm.

        For each algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        a dumped model is loaded from disk when its dump file exists,
        otherwise a predictor is trained via
        SupervisedLearningPipeline._train_predictor. Depending on the
        resulting status, either an error report is written to the
        per-algorithm report directory, or the predictor is analyzed, its
        report is appended to a meta report, and the predictor is dumped to
        disk. The meta report, if any, is written to the data directory at
        the end.
        """
        log.info('Training and analyzing predictors...')
        problem = SupervisedLearningPipeline.CLASSIFICATION
        # Accumulates the per-algorithm reports; stays None until the first
        # algorithm has been trained and analyzed successfully.
        meta_report = None
        fm_io = FeatureMatrixIO()

        # Build paths for output.
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = self._fetch_data_dir_path(pipeline_file_name)

        # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
        # This loop is currently a no-op: the append of the bifurcated
        # variants is commented out.
        for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
            pass  # TODO:(raikens) something in the BifurcatedSupervisedClassifier pipeline is crashing
            #algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        # Train and analyse algorithms.
        for algorithm in algorithms_to_test:
            log.info('Training and analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            log.debug('report_dir: %s' % report_dir)

            # Define hyperparams.
            hyperparams = {}
            hyperparams['algorithm'] = algorithm
            hyperparams[
                'hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            hyperparams['max_iter'] = 1024

            # If bifurcated algorithm, define bifurcator.
            if 'bifurcated' in algorithm:
                # bifurcator = LAB.pre == 0
                hyperparams['bifurcator'] = '%s.pre' % self._var
                hyperparams[
                    'bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
                hyperparams['bifurcation_value'] = 0
                hyperparams['bifurcated'] = True

            # Train classifier.
            predictor_path = self._build_model_dump_path(algorithm)
            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                self._features = self._X_train.columns
                # A model loaded from disk is treated as already trained.
                status = SupervisedClassifier.TRAINED
            else:
                status = SupervisedLearningPipeline._train_predictor(
                    self, problem, [0, 1], hyperparams)

            # If failed to train, write an error report.
            # Label counts of the first column of y_train / y_test; computed
            # for every algorithm but only used in the error report below.
            y_train_counts = self._y_train[
                self._y_train.columns[0]].value_counts()
            y_test_counts = self._y_test[
                self._y_test.columns[0]].value_counts()
            if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
                # Skip all analysis for this algorithm and build an error
                # report instead. Note that the code does not return here;
                # the loop continues with the next algorithm.
                algorithm_report = DataFrame(
                    {
                        'lab_panel': [self._var],
                        'algorithm': [algorithm],
                        'error': [status],
                        'y_train.value_counts()': [y_train_counts.to_dict()],
                        'y_test.value_counts()': [y_test_counts.to_dict()]
                    },
                    columns=[
                        'lab_panel', 'algorithm', 'error',
                        'y_train.value_counts()', 'y_test.value_counts()'
                    ])
                header = [
                    'LabChangePredictionPipeline("%s", %d)' %
                    (self._var, self._num_rows)
                ]
                # Write error report.
                fm_io.write_data_frame_to_file(algorithm_report, \
                    '/'.join([report_dir, '%s-change-prediction-report.tab' % (self._var)]), \
                    header)
            # If successfully trained, append to a meta report.
            elif status == SupervisedClassifier.TRAINED:
                pipeline_prefix = '%s-change-prediction-%s' % (self._var,
                                                               algorithm)
                SupervisedLearningPipeline._analyze_predictor(
                    self, report_dir, pipeline_prefix)
                # The per-algorithm report is read back from
                # '<report_dir>/<pipeline_prefix>-report.tab' (presumably
                # written by _analyze_predictor -- verify against the parent
                # class).
                if meta_report is None:
                    meta_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                else:
                    algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                    log.debug('algorithm_report: %s' % algorithm_report)
                    meta_report = meta_report.append(algorithm_report)
                # Write predictor to disk.
                predictor = SupervisedLearningPipeline.predictor(self)
                predictor_path = self._build_model_dump_path(algorithm)
                joblib.dump(predictor, predictor_path)

        # After building per-algorithm reports, write to meta report.
        # Note that if there were insufficient samples to build any of the
        # algorithms, then meta_report will still be None.
        if meta_report is not None:
            header = [
                'LabChangePredictionPipeline("%s", %d)' %
                (self._var, self._num_rows)
            ]
            fm_io.write_data_frame_to_file(meta_report, \
                '/'.join([data_dir, '%s-change-prediction-report.tab' % self._var]), header)
Beispiel #15
0
 def _build_composite_raw_feature_matrix(self):
     """Build the composite raw matrix via the parent pipeline, using
     AntiBiogramMatrix as the matrix class.
     """
     target_path = self._build_composite_raw_matrix_path()
     SupervisedLearningPipeline._build_raw_feature_matrix(
         self, AntiBiogramMatrix, target_path)
Beispiel #16
0
 def _build_composite_raw_matrix_path(self):
     """Return the composite raw matrix path, as resolved by the parent
     class from the file-name template and this source file's location.
     """
     this_file = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_matrix_path(
         self, '%s-normality-matrix-%d-episodes-raw.tab', this_file)
Beispiel #17
0
    def _build_processed_feature_matrix(self):
        """Assemble the processing parameters for the antibiogram matrix and
        defer the actual processing to SupervisedLearningPipeline.

        The outcome label is ``'<drug>_susc'`` for ``self.drug``; the
        susceptibility columns of every other listed antibiotic and all
        ``'<drug>_tested'`` columns are removed.
        """
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()

        antibiotics = [
            'ceftriaxone', 'meropenem', 'piperacillin_tazobactam', 'cefazolin',
            'imipenem', 'aztreonam'
        ]
        other_drug_susceptibilities = [
            '%s_susc' % name for name in antibiotics if name != self.drug
        ]
        tested_indicators = ['%s_tested' % name for name in antibiotics]

        features_to_remove = [
            'pat_id', 'shifted_order_time', 'proc_code', 'abnormal_panel',
            'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays',
            'organism_name'
        ] + other_drug_susceptibilities + tested_indicators

        outcome_label = '%s_susc' % self.drug

        # Human-readable description, one entry per line.
        data_overview = [
            'Overview',
            'The outcome label is %s.' % outcome_label,
            '%s is a boolean indicator which summarizes whether all components '
            % outcome_label,
            'in the lab panel order represented by a given row are normal.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params = {
            'raw_matrix_path': raw_matrix_path,
            'processed_matrix_path': processed_matrix_path,
            'features_to_add': {},
            # Kept regardless of what feature selection decides.
            'features_to_keep': ['LABBLC-LABBLC2-LABURNC.pre'],
            'imputation_strategies': {},
            'features_to_remove': features_to_remove,
            'outcome_label': outcome_label,
            'selection_problem': FeatureSelector.CLASSIFICATION,
            'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
            'percent_features_to_select': 0.05,
            'matrix_class': AntiBiogramMatrix,
            'pipeline_file_path': inspect.getfile(inspect.currentframe()),
            'data_overview': data_overview,
            'random_state': self._random_state,
        }

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(
            self, params)
Beispiel #18
0
    def _build_processed_feature_matrix(self):
        """Assemble the processing parameters for the lab culture matrix
        (outcome label ``'no_bacteria'``) and defer the actual processing to
        SupervisedLearningPipeline.
        """
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()

        identifier_and_demographic_columns = [
            'pat_anon_id', 'shifted_order_time',
            'proc_code', 'abnormal_panel', 'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
            'Death.post',
            'Death.postTimeDays',
        ]
        organism_columns = [
            'escherichia_coli',
            "organism_name",
            "staphylococcus_aureus",
            "enterococcus_species",
            "klebsiella_pneumoniae",
            "pseudomonas_aeruginosa",
            "coag_negative_staphylococcus",
            "enterococcus_faecalis",
            "proteus_mirabilis",
            "candida_albicans",
        ]
        features_to_remove = identifier_and_demographic_columns + organism_columns

        outcome_label = 'no_bacteria'

        # Human-readable description, one entry per line.
        data_overview = [
            'Overview',
            'The outcome label is %s.' % outcome_label,
            '%s is a boolean indicator which summarizes whether all components ' % outcome_label,
            'in the lab panel order represented by a given row are normal.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params = {
            'raw_matrix_path': raw_matrix_path,
            'processed_matrix_path': processed_matrix_path,
            'features_to_add': {},
            # Keep the # of times it's been ordered in past, even if low info.
            'features_to_keep': ['%s.pre' % self._var],
            'imputation_strategies': {},
            'features_to_remove': features_to_remove,
            'outcome_label': outcome_label,
            'selection_problem': FeatureSelector.CLASSIFICATION,
            'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
            'percent_features_to_select': 0.05,
            'matrix_class': LabCultureMatrix,
            'pipeline_file_path': inspect.getfile(inspect.currentframe()),
            'data_overview': data_overview,
            'random_state': self._random_state,
        }

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
Beispiel #19
0
    def _build_processed_feature_matrix(self):
        """Assemble the processing parameters for the lab normality matrix
        and defer the actual processing to SupervisedLearningPipeline.

        The removed columns and the outcome label depend on
        LocalEnv.DATASET_SOURCE_NAME ('STRIDE' or 'UMich').

        Raises:
            ValueError: if LocalEnv.DATASET_SOURCE_NAME is neither 'STRIDE'
                nor 'UMich'. Previously this case fell through and failed
                later with an unbound-local error on ``outcome_label``.
        """
        # Define parameters for processing steps.
        params = {}
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()
        features_to_add = {}
        imputation_strategies = {
        }

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id',
                'proc_code', 'abnormal_panel',
                'num_normal_components', 'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
                'RaceWhiteHispanicLatino.preTimeDays',
                'RaceWhiteNonHispanicLatino.preTimeDays',
                'RaceHispanicLatino.preTimeDays',
                'RaceAsian.preTimeDays',
                'RaceBlack.preTimeDays',
                'RacePacificIslander.preTimeDays',
                'RaceNativeAmerican.preTimeDays',
                'RaceOther.preTimeDays',
                'RaceUnknown.preTimeDays',
                'Death.post',
                'Death.postTimeDays',
                'num_components'
            ]
            outcome_label = 'all_components_normal' # TODO: for component...

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id',
                'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
            ]
            # Race columns are not hard-coded for this source; they are
            # obtained from the factory.
            RACE_FEATURES = self._factory.queryAllRaces()
            features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES]
            if self._isLabPanel:
                features_to_remove += ['proc_code', 'num_normal_components', 'num_components']
            else:
                features_to_remove += ['base_name']

            outcome_label = 'abnormal_lab'

        else:
            # Fail with a clear message instead of an unbound local later on.
            raise ValueError(
                'Unsupported LocalEnv.DATASET_SOURCE_NAME: %r '
                "(expected 'STRIDE' or 'UMich')" % (LocalEnv.DATASET_SOURCE_NAME,))

        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]

        selection_problem = FeatureSelector.CLASSIFICATION
        selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
        percent_features_to_select = 0.05
        matrix_class = LabNormalityMatrix
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        random_state = self._random_state
        # Human-readable description, one entry per line.
        data_overview = [
            'Overview',
            'The outcome label is %s.' % outcome_label,
            '%s is a boolean indicator which summarizes whether all components ' % outcome_label,
            'in the lab panel order represented by a given row are normal.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params['raw_matrix_path'] = raw_matrix_path
        params['processed_matrix_path'] = processed_matrix_path
        params['features_to_add'] = features_to_add
        params['features_to_keep'] = features_to_keep
        params['imputation_strategies'] = imputation_strategies
        params['features_to_remove'] = features_to_remove
        params['outcome_label'] = outcome_label
        params['selection_problem'] = selection_problem
        params['selection_algorithm'] = selection_algorithm
        params['percent_features_to_select'] = percent_features_to_select
        params['matrix_class'] = matrix_class
        params['pipeline_file_path'] = pipeline_file_path
        params['data_overview'] = data_overview
        params['random_state'] = random_state

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
 def _build_processed_matrix_path(self, raw_matrix_path):
     """Return the processed change matrix path, as resolved by the parent
     class from the file-name template and this source file's location.

     ``raw_matrix_path`` is accepted but not used by this implementation.
     """
     this_file = inspect.getfile(inspect.currentframe())
     return SupervisedLearningPipeline._build_matrix_path(
         self, '%s-change-matrix-%d-episodes-processed.tab', this_file)
Beispiel #21
0
    def _build_processed_feature_matrix(self):
        """Collect the MRSA processing settings and delegate to the base class.

        Gathers the matrix paths, the columns to remove and to keep, the
        outcome label, the feature-selection settings and a textual data
        overview into a single dict, then passes that dict to
        SupervisedLearningPipeline._build_processed_feature_matrix.
        """
        raw_path = self._build_raw_matrix_path()
        processed_path = self._build_processed_matrix_path()
        label = 'mrsa_present'

        # The three groups below are concatenated (order preserved) and
        # passed on under the 'features_to_remove' key.
        order_columns = [
            'pat_id', 'order_time', 'pat_enc_csn_id',
            'proc_code', 'abnormal_panel',
        ]
        demographic_columns = [
            'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
        ]
        death_columns = ['Death.post', 'Death.postTimeDays']
        columns_to_drop = order_columns + demographic_columns + death_columns

        # Placeholder entry: per the original author's note, the pipeline
        # needs at least one element in this list to function.
        columns_to_keep = ["LABBLC.pre"]

        # NOTE(review): these sentences describe lab-panel normality although
        # the outcome label is 'mrsa_present' -- looks carried over from
        # another pipeline; confirm before relying on the generated overview.
        overview_lines = [
            'Overview',
            'The outcome label is %s.' % label,
            '%s is a boolean indicator which summarizes whether all components ' % label,
            'in the lab panel order represented by a given row are normal.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Path of the source file in which this method is defined.
        this_file = inspect.getfile(inspect.currentframe())

        # Single bundle that SupervisedLearningPipeline unpacks.
        params = {
            'raw_matrix_path': raw_path,
            'processed_matrix_path': processed_path,
            'features_to_add': {},
            'features_to_keep': columns_to_keep,
            'imputation_strategies': {},
            'features_to_remove': columns_to_drop,
            'outcome_label': label,
            'selection_problem': FeatureSelector.CLASSIFICATION,
            'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
            'percent_features_to_select': 0.05,
            'matrix_class': MRSAMatrix,
            'pipeline_file_path': this_file,
            'data_overview': overview_lines,
            'random_state': self._random_state,
        }

        # The base class owns the actual processing logic.
        SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
 def _build_raw_feature_matrix(self):
     """Build the raw feature matrix by delegating to the base pipeline.

     Passes ComponentNormalityMatrix as the matrix class, together with the
     path obtained from self._build_raw_matrix_path(), to
     SupervisedLearningPipeline._build_raw_feature_matrix.
     """
     raw_path = self._build_raw_matrix_path()
     SupervisedLearningPipeline._build_raw_feature_matrix(
         self, ComponentNormalityMatrix, raw_path)
    def _build_processed_feature_matrix(self):
        """Collect the lab-change processing settings and delegate to SLP.

        Bundles the matrix paths, the 'change' feature definition taken from
        self._change_params, the filter on the previous-measurement feature,
        the columns to remove and to keep, the outcome label, the
        feature-selection settings and a textual data overview into a single
        dict, logs that dict at debug level, and passes it to
        SupervisedLearningPipeline._build_processed_feature_matrix.
        """
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path(
            raw_matrix_path)

        prev_measurement_feature = self._change_params['feature_old']
        features_to_add = {'change': [self._change_params]}
        # Filter spec on the previous-measurement feature with value NaN.
        # NOTE(review): presumably rows lacking a previous measurement are
        # dropped by SLP -- confirm against its handling of this key.
        features_to_filter_on = [{
            'feature': prev_measurement_feature,
            'value': np.nan
        }]

        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
            'proc_code', 'abnormal_panel', 'all_components_normal',
            'num_normal_components', 'Birth.pre', 'Male.preTimeDays',
            'Female.preTimeDays', 'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays',
            'num_components'
        ]
        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]
        outcome_label = 'unchanged_yn'
        data_overview = [
            'Overview',
            'The outcome label is %s.' % outcome_label,
            '%s is a boolean indicator which summarizes whether the lab test '
            % outcome_label,
            'result is unchanged compared to the previous measurement.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
            "Lab panel orders were only included if a previous measurement of",
            "the same lab panel has been recorded."
        ]

        # Bundle parameters into a single object to be unpacked in SLP.
        # NOTE(review): no 'random_state' entry is set here; confirm whether
        # that omission is intentional.
        params = {
            'raw_matrix_path': raw_matrix_path,
            'processed_matrix_path': processed_matrix_path,
            'features_to_add': features_to_add,
            'features_to_keep': features_to_keep,
            'features_to_filter_on': features_to_filter_on,
            'imputation_strategies': {},
            'features_to_remove': features_to_remove,
            'outcome_label': outcome_label,
            'selection_problem': FeatureSelector.CLASSIFICATION,
            'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
            'percent_features_to_select': 0.05,
            'matrix_class': LabChangeMatrix,
            # Path of the source file in which this method is defined.
            'pipeline_file_path': inspect.getfile(inspect.currentframe()),
            'data_overview': data_overview,
        }

        # Log only once the bundle is complete; logging right after creating
        # the empty dict would always print '{}'.
        log.debug('params: %s' % params)

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(
            self, params)
    def _build_processed_feature_matrix(self):
        # Define parameters for processing steps.
        params = {}
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()
        features_to_add = {}
        imputation_strategies = {  #'sxu_new_imputation'
        }

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id', 'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
                'RaceWhiteHispanicLatino.preTimeDays',
                'RaceWhiteNonHispanicLatino.preTimeDays',
                'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
                'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
                'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
                'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays'
            ]
            if self._isLabPanel:
                features_to_remove += [
                    'proc_code', 'num_components', 'num_normal_components',
                    'abnormal_panel'
                ]
                outcome_label = 'all_components_normal'  #
            else:
                features_to_remove += ['base_name']
                outcome_label = 'component_normal'  # TODO: danger, previous version might not consistent!

        else:
            features_to_remove = [
                'pat_id',
                'order_time',
                'order_proc_id',
                'Birth.pre',
                'Male.preTimeDays',
                'Female.preTimeDays',
                # 'Caucasian.preTimeDays',
                # 'Hispanic.preTimeDays',
                # 'Native Hawaiian and Other Pacific Islander.preTimeDays'
            ]
            RACE_FEATURES = self._factory.queryAllRaces()
            features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES]
            if self._isLabPanel:
                features_to_remove += [
                    'proc_code', 'num_normal_components', 'num_components'
                ]
                outcome_label = 'all_components_normal'
            else:
                features_to_remove += ['base_name']

                outcome_label = 'component_normal'  #

        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]
        if self.includeLastNormality:
            features_to_keep.append('last_normality')

        selection_problem = FeatureSelector.CLASSIFICATION
        selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
        percent_features_to_select = 0.05
        matrix_class = LabNormalityMatrix
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        random_state = self._random_state
        data_overview = [
            # Overview:
            'Overview',
            # The outcome label is ___.
            'The outcome label is %s.' % outcome_label,
            # %s is a boolean indicator which summarizes whether all components
            '%s is a boolean indicator which summarizes whether all components '
            % outcome_label,
            # in the lab panel order represented by a given row are normal.
            'in the lab panel order represented by a given row are normal.',
            # Each row represents a unique lab panel order.
            'Each row represents a unique lab panel order.',
            # Each row contains fields summarizing the patient's demographics,
            "Each row contains fields summarizing the patient's demographics",
            # inpatient admit date, prior vitals, and prior lab results.
            'inpatient admit date, prior vitals, and prior lab results.',
            # Most cells in matrix represent a count statistic for an event's
            "Most cells in matrix represent a count statistic for an event's",
            # occurrence or a difference between an event's time and index_time.
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params['raw_matrix_path'] = raw_matrix_path
        params['processed_matrix_path'] = processed_matrix_path
        params['features_to_add'] = features_to_add
        params['features_to_keep'] = features_to_keep
        params['imputation_strategies'] = imputation_strategies
        params['features_to_remove'] = features_to_remove
        params['outcome_label'] = outcome_label
        params['selection_problem'] = selection_problem
        params['selection_algorithm'] = selection_algorithm
        params['percent_features_to_select'] = percent_features_to_select
        params['matrix_class'] = matrix_class
        params['pipeline_file_path'] = pipeline_file_path
        params['data_overview'] = data_overview
        params['random_state'] = random_state

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(
            self, params)
        '''
        For testing the model on the holdout set, should remember features 
        to select from the raw matrix of the holdout data. 
        '''
        final_features = self._X_train.columns.values
        if not self.feat2imputed_dict:
            '''
            The dict was not created during imputation. 
            Probably because the processed matrix was loaded from previous session. 
            Take the 'best guess' for the imputed value as the most common one in
            any column. 
            '''
            for feat in final_features:
                most_freq_val = self._X_train[feat].value_counts().idxmax()
                self.feat2imputed_dict[feat] = most_freq_val
        '''