def test_dbCache(self):
        """Test database result caching."""
        factory = FeatureMatrixFactory(cacheDBResults=False)
        self.assertEqual(factory.dbCache, None)

        factory = FeatureMatrixFactory()
        self.assertEqual(type(factory.dbCache), type(dict()))
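
A minimal usage sketch of the caching flag exercised above (the constructor argument and the dict-like dbCache attribute come from the test; the cache key is hypothetical):

factory = FeatureMatrixFactory()             # DB result caching on by default
factory.dbCache['hypothetical_query'] = []   # dbCache behaves like a plain dict
uncached = FeatureMatrixFactory(cacheDBResults=False)
assert uncached.dbCache is None              # no cache when caching is disabled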
Example 2
 def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None, isLabPanel=True,
              notUsePatIds=None, pat_batch_ind=None):
      # Avoid a mutable default argument; fall back to a fresh list.
      self.notUsePatIds = notUsePatIds if notUsePatIds is not None else []
     self.pat_batch_ind = pat_batch_ind
     self.usedPatIds = []
     SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state, isLabPanel)
     self._factory = FeatureMatrixFactory()
     self._build_raw_feature_matrix()
Example 3
    def __init__(self,
                 lab_panel,
                 num_episodes,
                 use_cache=None,
                 random_state=None,
                 isLabPanel=True,
                 timeLimit=None,
                 notUsePatIds=None,
                 holdOut=False,
                 pat_batch_ind=None):
        self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(
            self,
            lab_panel,
            num_episodes,
            use_cache,
            random_state,
            isLabPanel,
            timeLimit,
            holdOut,
            isLabNormalityPredictionPipeline=True)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        data_lab_folder = self._fetch_data_dir_path(
            inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            '''
            For holdOut evaluation data, produce the raw matrix and pick
            features according to the saved feat2imputed_dict.
            '''
            self.feat2imputed_dict = pickle.load(
                open(feat2imputed_dict_path, 'rb'))
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            '''
            For training/validation data, record the pat_ids and the selected
            features with their corresponding imputed values.
            '''
            pickle.dump(self.usedPatIds,
                        open('data/used_patient_set_%s.pkl' % self._var, 'wb'),
                        pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP
            # return

            # TODO: find better place to put the dict.pkl
            pickle.dump(self.feat2imputed_dict,
                        open(feat2imputed_dict_path, 'wb'),
                        pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()
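
A sketch of the two-phase workflow implied by the holdOut branch above (the panel code is a placeholder; the 10000/2000 episode counts mirror values hard-coded elsewhere in this pipeline):

# Phase 1: train/validate; records used patient IDs and feat2imputed_dict.pkl.
LabNormalityPredictionPipeline('LABMETB', 10000, holdOut=False)
# Phase 2: evaluate on held-out data, reusing the saved feature/imputation dict.
LabNormalityPredictionPipeline('LABMETB', 2000, holdOut=True)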
    def setUp(self):
        """Prepare state for test cases."""
        DBTestCase.setUp(self)
        StrideLoader.build_stride_psql_schemata()
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self._deleteTestRecords()
        self._insertTestRecords()

        self.factory = FeatureMatrixFactory()
        self.connection = DBUtil.connection()
Example 5
    def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None,
                 timeLimit=None, notUsePatIds=None, holdOut=False, pat_batch_ind=None, includeLastNormality=True):
        # self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self._holdOut = holdOut
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state,
                                            timeLimit, notUsePatIds)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        if LAB_TYPE == 'panel':
            self.ylabel = 'all_components_normal'
        else:
            self.ylabel = 'component_normal'

        self.includeLastNormality = includeLastNormality

        if self.includeLastNormality:
            fm_io = FeatureMatrixIO()
            df = fm_io.read_file_to_data_frame('data/'+lab_panel+'/%s-normality-matrix-raw.tab'%lab_panel)
            df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
        df['last_normality'] = float('nan')
        for i in range(1, df.shape[0]):
            if df.loc[i, 'pat_id'] == df.loc[i - 1, 'pat_id']:
                df.loc[i, 'last_normality'] = df.loc[i - 1, self.ylabel]
            df.to_csv('data/'+lab_panel+'/%s-normality-matrix-raw.tab'%lab_panel, index=False, sep='\t')

        data_lab_folder = self._fetch_data_dir_path(inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            '''
            For holdOut evaluation data, produce the raw matrix and pick
            features according to the saved feat2imputed_dict.
            '''
            self.feat2imputed_dict = pickle.load(open(feat2imputed_dict_path, 'rb'))
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            '''
            For training/validation data, record the pat_ids and the selected
            features with their corresponding imputed values.
            '''
            pickle.dump(self.usedPatIds, open('data/used_patient_set_%s.pkl' % self._var, 'wb'), pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP
            # return

            # TODO: find better place to put the dict.pkl
            pickle.dump(self.feat2imputed_dict, open(feat2imputed_dict_path, 'wb'), pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()
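
The per-row last_normality loop above can be written as a grouped shift; a behavior-equivalent sketch, assuming rows are already sorted by ['pat_id', 'order_time'] as in the snippet:

# Previous order's normality within each patient; a patient's first order stays NaN.
df['last_normality'] = df.groupby('pat_id')[self.ylabel].shift(1)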
Example 6
    def __init__(self, variable, num_data_points, params=None):
        # Process arguments.
        self._var = variable
        self._num_rows = num_data_points
        if params is None:
            self._params = {}
        else:
            self._params = params

        # Initialize FeatureMatrixFactory.
        self._factory = FeatureMatrixFactory()

        # Initialize DB connection.
        self._connection = DBUtil.connection()
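
A minimal instantiation sketch for the constructor above (the class name FeatureMatrix comes from the fuller listing further down; the variable name and row count are placeholders):

fm = FeatureMatrix('WBC', 1000)   # uses an empty params dict by default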
    def setUp(self):
        """Prepare state for test cases."""
        DBTestCase.setUp(self)

        # StrideLoader.build_stride_psql_schemata()
        # ClinicalItemDataLoader.build_clinical_item_psql_schemata();

        # self._deleteTestRecords()
        # self._insertTestRecords()
        # TODO: self.RACE_FEATURES = self.queryAllRaces() raises
        # OperationalError: no such table: demographics
        self.factory = FeatureMatrixFactory()

        # Set up a common connection for test cases; tearDown closes it.
        self.connection = DBUtil.connection()
        self.cursor = self.connection.cursor()
        self._insertUMichTestRecords()
    def test_loadMapData(self):
        self.factory = FeatureMatrixFactory()

        # Depends on external data file
        reader = self.factory.loadMapData("CharlsonComorbidity-ICD9CM")
        charlsonByICD9 = dict()

        for row in reader:
            charlsonByICD9[row["icd9cm"]] = row["charlson"]

        self.assertEqual("Dementia", charlsonByICD9["294.1"])
        self.assertEqual("Dementia", charlsonByICD9["331.2"])
        self.assertEqual("COPD", charlsonByICD9["490"])
        self.assertEqual("COPD", charlsonByICD9["416.8"])
        self.assertEqual("Malignancy Metastatic", charlsonByICD9["199"])
        self.assertEqual("AIDS/HIV", charlsonByICD9["042"])
class LabNormalityPredictionPipeline(SupervisedLearningPipeline):
    def __init__(self,
                 lab_panel,
                 num_episodes,
                 use_cache=None,
                 random_state=None,
                 isLabPanel=True,
                 timeLimit=None,
                 notUsePatIds=None,
                 holdOut=False,
                 pat_batch_ind=None,
                 includeLastNormality=True):
        self.notUsePatIds = notUsePatIds
        self.pat_batch_ind = pat_batch_ind
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(
            self,
            lab_panel,
            num_episodes,
            use_cache,
            random_state,
            isLabPanel,
            timeLimit,
            holdOut,
            isLabNormalityPredictionPipeline=True)
        # TODO: naming of lab_panel
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()

        if self._isLabPanel:
            self.ylabel = 'all_components_normal'
        else:
            self.ylabel = 'component_normal'

        self.includeLastNormality = includeLastNormality

        if self.includeLastNormality:
            fm_io = FeatureMatrixIO()
            df = fm_io.read_file_to_data_frame('data/' + lab_panel +
                                               '/%s-normality-matrix-raw.tab' %
                                               lab_panel)
            df = df.sort_values(['pat_id',
                                 'order_time']).reset_index(drop=True)
            df['last_normality'] = float('nan')
            for i in range(1, df.shape[0]):
                if df.loc[i, 'pat_id'] == df.loc[i - 1, 'pat_id']:
                    df.loc[i, 'last_normality'] = df.loc[i - 1, self.ylabel]
            df.to_csv('data/' + lab_panel +
                      '/%s-normality-matrix-raw.tab' % lab_panel,
                      index=False,
                      sep='\t')

        data_lab_folder = self._fetch_data_dir_path(
            inspect.getfile(inspect.currentframe()))
        feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

        if holdOut:
            '''
            For holdOut evaluation data, produce the raw matrix and pick
            features according to the saved feat2imputed_dict.
            '''
            self.feat2imputed_dict = pickle.load(
                open(feat2imputed_dict_path, 'rb'))
            self._build_processed_feature_matrix_holdout()
            self._analyze_predictors_on_holdout()
        else:
            '''
            For training/validation data, record the pat_ids and the selected
            features with their corresponding imputed values.
            '''
            pickle.dump(self.usedPatIds,
                        open('data/used_patient_set_%s.pkl' % self._var, 'wb'),
                        pickle.HIGHEST_PROTOCOL)
            self._build_processed_feature_matrix()
            self._build_baseline_results()  # TODO: prototype in SLPP
            # return

            # TODO: find better place to put the dict.pkl
            pickle.dump(self.feat2imputed_dict,
                        open(feat2imputed_dict_path, 'wb'),
                        pickle.HIGHEST_PROTOCOL)
            self._train_and_analyze_predictors()

    def _build_model_dump_path(self, algorithm):
        template = '%s' + '-normality-%s-model.pkl' % algorithm
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        return SupervisedLearningPipeline._build_model_dump_path(self, template, \
            pipeline_file_name)

    def _build_raw_matrix_path(self):
        raw_matrix_filename = '%s-normality-matrix-raw.tab' % self._var
        raw_matrix_filepath = os.path.join('data', self._var,
                                           raw_matrix_filename)  # TODO
        if not os.path.exists('data'):
            os.mkdir('data')
        if not os.path.exists(os.path.join('data', self._var)):
            os.mkdir(os.path.join('data', self._var))
        return raw_matrix_filepath
        # Unreachable legacy logic, retained for reference:
        # if not self._holdOut:
        #     template = '%s-normality-matrix-raw.tab'
        # else:
        #     template = '%s-normality-matrix-%d-episodes-raw-holdout.tab'
        # pipeline_file_name = inspect.getfile(inspect.currentframe())
        # return SupervisedLearningPipeline._build_matrix_path(self, template,
        #     pipeline_file_name)

    def _build_raw_feature_matrix(self):
        raw_matrix_path = self._build_raw_matrix_path()
        matrix_class = LabNormalityMatrix
        SupervisedLearningPipeline._build_raw_feature_matrix(self, matrix_class, \
            raw_matrix_path)

        if not self._holdOut:
            fm_io = FeatureMatrixIO()
            matrix = fm_io.read_file_to_data_frame(raw_matrix_path)
            self.usedPatIds = set(matrix['pat_id'].values)

    def _build_baseline_results(self):
        if not self._holdOut:
            template = '%s-normality-matrix-raw.tab'
        else:
            template = '%s-normality-matrix-%d-episodes-raw-holdout.tab'
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        # raw_matrix_path = SupervisedLearningPipeline._build_matrix_path(self, template, \
        #                                                      pipeline_file_name)
        raw_matrix_path = self._build_raw_matrix_path()
        # Another direct call to the _factory instance
        self._factory.obtain_baseline_results(
            raw_matrix_path=raw_matrix_path,
            random_state=self._random_state,
            isLabPanel=self._isLabPanel,
            isHoldOut=self._holdOut)  #TODO: file name

    def _build_processed_matrix_path(self):
        processed_matrix_filename = '%s-normality-matrix-processed.tab' % self._var
        processed_matrix_path = os.path.join('data', self._var,
                                             processed_matrix_filename)  # TODO
        return processed_matrix_path
        # Unreachable legacy logic, retained for reference:
        # if not self._holdOut:
        #     template = '%s-normality-matrix-processed.tab'
        # else:
        #     template = '%s-normality-matrix-%d-episodes-processed-holdout.tab'
        # pipeline_file_path = inspect.getfile(inspect.currentframe())
        # return SupervisedLearningPipeline._build_matrix_path(self, template,
        #     pipeline_file_path)

    def _build_processed_feature_matrix_holdout(self):
        fm_io = FeatureMatrixIO()
        raw_matrix = fm_io.read_file_to_data_frame(
            self._build_raw_matrix_path())

        # if outcome_label in self.feat2imputed_dict:
        #     self.feat2imputed_dict.pop(outcome_label)
        #
        # processed_matrix = raw_matrix[self.feat2imputed_dict.keys()+[outcome_label]].copy()
        '''
        TODO: feat2imputed_dict includes the outcome label
        '''
        processed_matrix = raw_matrix[list(self.feat2imputed_dict.keys())].copy()

        # TODO: tmp solution!
        tmp_path = self._build_processed_matrix_path().replace(
            "2000", "10000").replace("-holdout", "")
        processed_matrix_previous = fm_io.read_file_to_data_frame(tmp_path)
        processed_matrix = processed_matrix[processed_matrix_previous.columns]
        # TODO: tmp solution!

        for feat in self.feat2imputed_dict.keys():
            processed_matrix[feat] = processed_matrix[feat].fillna(
                self.feat2imputed_dict[feat])
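        # A sketch of an equivalent single call (pandas fillna also accepts a
        # dict mapping column name -> fill value):
        #   processed_matrix = processed_matrix.fillna(self.feat2imputed_dict)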

        fm_io.write_data_frame_to_file(processed_matrix, \
                                       self._build_processed_matrix_path(), None)

    def _build_processed_feature_matrix(self):
        # Define parameters for processing steps.
        params = {}
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()
        features_to_add = {}
        imputation_strategies = {}  # 'sxu_new_imputation'

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id', 'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
                'RaceWhiteHispanicLatino.preTimeDays',
                'RaceWhiteNonHispanicLatino.preTimeDays',
                'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays',
                'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays',
                'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays',
                'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays'
            ]
            if self._isLabPanel:
                features_to_remove += [
                    'proc_code', 'num_components', 'num_normal_components',
                    'abnormal_panel'
                ]
                outcome_label = 'all_components_normal'
            else:
                features_to_remove += ['base_name']
                outcome_label = 'component_normal'  # TODO: danger, previous version might not be consistent!

        else:
            features_to_remove = [
                'pat_id',
                'order_time',
                'order_proc_id',
                'Birth.pre',
                'Male.preTimeDays',
                'Female.preTimeDays',
                # 'Caucasian.preTimeDays',
                # 'Hispanic.preTimeDays',
                # 'Native Hawaiian and Other Pacific Islander.preTimeDays'
            ]
            RACE_FEATURES = self._factory.queryAllRaces()
            features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES]
            if self._isLabPanel:
                features_to_remove += [
                    'proc_code', 'num_normal_components', 'num_components'
                ]
                outcome_label = 'all_components_normal'
            else:
                features_to_remove += ['base_name']

                outcome_label = 'component_normal'

        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]
        if self.includeLastNormality:
            features_to_keep.append('last_normality')

        selection_problem = FeatureSelector.CLASSIFICATION
        selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
        percent_features_to_select = 0.05
        matrix_class = LabNormalityMatrix
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        random_state = self._random_state
        data_overview = [
            # Overview:
            'Overview',
            # The outcome label is ___.
            'The outcome label is %s.' % outcome_label,
            # %s is a boolean indicator which summarizes whether all components
            '%s is a boolean indicator which summarizes whether all components '
            % outcome_label,
            # in the lab panel order represented by a given row are normal.
            'in the lab panel order represented by a given row are normal.',
            # Each row represents a unique lab panel order.
            'Each row represents a unique lab panel order.',
            # Each row contains fields summarizing the patient's demographics,
            "Each row contains fields summarizing the patient's demographics",
            # inpatient admit date, prior vitals, and prior lab results.
            'inpatient admit date, prior vitals, and prior lab results.',
            # Most cells in matrix represent a count statistic for an event's
            "Most cells in matrix represent a count statistic for an event's",
            # occurrence or a difference between an event's time and index_time.
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params['raw_matrix_path'] = raw_matrix_path
        params['processed_matrix_path'] = processed_matrix_path
        params['features_to_add'] = features_to_add
        params['features_to_keep'] = features_to_keep
        params['imputation_strategies'] = imputation_strategies
        params['features_to_remove'] = features_to_remove
        params['outcome_label'] = outcome_label
        params['selection_problem'] = selection_problem
        params['selection_algorithm'] = selection_algorithm
        params['percent_features_to_select'] = percent_features_to_select
        params['matrix_class'] = matrix_class
        params['pipeline_file_path'] = pipeline_file_path
        params['data_overview'] = data_overview
        params['random_state'] = random_state

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(
            self, params)
        '''
        For testing the model on the holdout set, should remember features 
        to select from the raw matrix of the holdout data. 
        '''
        final_features = self._X_train.columns.values
        if not self.feat2imputed_dict:
            '''
            The dict was not created during imputation, probably because the
            processed matrix was loaded from a previous session. As a best
            guess, use each column's most common value as its imputed value.
            '''
            for feat in final_features:
                most_freq_val = self._X_train[feat].value_counts().idxmax()
                self.feat2imputed_dict[feat] = most_freq_val
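            # Equivalent bulk form (a sketch): all per-column modes in one call.
            #   self.feat2imputed_dict = \
            #       self._X_train[list(final_features)].mode().iloc[0].to_dict()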
        '''
        TODO: useless?!
        '''
        # curr_keys = self.feat2imputed_dict.keys()
        #
        # '''
        # Only need to impute the selected features for the holdOut set.
        # '''
        # for one_key in curr_keys:
        #     if one_key not in final_features:
        #         self.feat2imputed_dict.pop(one_key)

    def _analyze_predictors_on_holdout(self):
        fm_io = FeatureMatrixIO()

        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)

        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)
        for algorithm in algorithms_to_test:
            log.info('analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                              algorithm)

            predictor_path = self._build_model_dump_path(algorithm)

            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                # self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED

            SupervisedLearningPipeline._analyze_predictor_holdoutset(
                self, report_dir, pipeline_prefix)

    def _train_and_analyze_predictors(self):
        log.info('Training and analyzing predictors...')
        problem = SupervisedLearningPipeline.CLASSIFICATION
        meta_report = None
        fm_io = FeatureMatrixIO()

        # Build paths for output.
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)

        # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        # Train and analyse algorithms.
        for algorithm in algorithms_to_test:
            log.info('Training and analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            # Define hyperparams.
            hyperparams = {}
            hyperparams['algorithm'] = algorithm
            hyperparams[
                'hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            hyperparams['max_iter'] = 1024
            hyperparams['random_state'] = self._random_state

            # If bifurcated algorithm, define bifurcator.
            if 'bifurcated' in algorithm:
                # bifurcator = LAB.pre == 0
                hyperparams['bifurcator'] = '%s.pre' % self._var
                hyperparams[
                    'bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
                hyperparams['bifurcation_value'] = 0
                hyperparams['bifurcated'] = True

            # Train classifier.
            predictor_path = self._build_model_dump_path(algorithm)
            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED
            else:
                status = SupervisedLearningPipeline._train_predictor(
                    self, problem, [0, 1], hyperparams)

            # If failed to train, write an error report.
            y_train_counts = self._y_train[
                self._y_train.columns[0]].value_counts()
            y_test_counts = self._y_test[
                self._y_test.columns[0]].value_counts()
            if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
                # Skip all analysis and reporting.
                # This will be true for all algorithms, so just return.
                # Build error report.
                algorithm_report = DataFrame(
                    {
                        'lab_panel': [self._var],
                        'algorithm': [algorithm],
                        'error': [status],
                        'y_train.value_counts()': [y_train_counts.to_dict()],
                        'y_test.value_counts()': [y_test_counts.to_dict()]
                    },
                    columns=[
                        'lab_panel', 'algorithm', 'error',
                        'y_train.value_counts()', 'y_test.value_counts()'
                    ])
                header = [
                    'LabNormalityPredictionPipeline("%s", 10000)' % self._var
                ]
                # Write error report.
                fm_io.write_data_frame_to_file(algorithm_report, \
                    '/'.join([report_dir, '%s-normality-prediction-report.tab' % (self._var)]), \
                    header)
            # If successfully trained, append to a meta report.
            elif status == SupervisedClassifier.TRAINED:
                pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                                  algorithm)

                SupervisedLearningPipeline._analyze_predictor(
                    self, report_dir, pipeline_prefix)
                SupervisedLearningPipeline._analyze_predictor_traindata(
                    self, report_dir, pipeline_prefix)

                # continue # Do not generate stats results here...

                if meta_report is None:
                    meta_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                else:
                    algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                    log.debug('algorithm_report: %s' % algorithm_report)
                    meta_report = meta_report.append(algorithm_report)
                # Write predictor to disk.
                predictor = SupervisedLearningPipeline.predictor(self)
                predictor_path = self._build_model_dump_path(algorithm)
                joblib.dump(predictor, predictor_path)

        # After building per-algorithm reports, write to meta report.
        # Note that if there were insufficient samples to build any of the
        # algorithms, then meta_report will still be None.
        if meta_report is not None:
            header = [
                'LabNormalityPredictionPipeline("%s", 10000)' % self._var
            ]
            fm_io.write_data_frame_to_file(meta_report, \
                '/'.join([data_dir, '%s-normality-prediction-report.tab' % self._var]), header)
Example 10
class FeatureMatrix:
    def __init__(self, variable, num_data_points, params=None):
        # Process arguments.
        self._var = variable
        self._num_rows = num_data_points
        if params is None:
            self._params = {}
        else:
            self._params = params

        # Initialize FeatureMatrixFactory.
        self._factory = FeatureMatrixFactory()

        # Initialize DB connection.
        self._connection = DBUtil.connection()

    def _query_patient_episodes(self,
                                query,
                                pat_id_col=None,
                                index_time_col=None):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Fetch and return results.
        log.info('query: %s' % str(query))

        if isinstance(query, basestring):
            cursor.execute(query)
        else:
            log.info('query.params: %s' % str(query.params))
            cursor.execute(str(query), query.params)

        # Parse arguments.
        if pat_id_col is None:
            pat_id_col = 'pat_id'
        if index_time_col is None:
            index_time_col = 'index_time'

        self._factory.setPatientEpisodeInput(cursor, pat_id_col,
                                             index_time_col)
        num_episodes = self._factory.processPatientEpisodeInput()

        return num_episodes

    def _querystr_patient_episodes(self,
                                   querystr,
                                   pat_id_col=None,
                                   index_time_col=None):
        # Initialize DB cursor.
        cursor = self._connection.cursor()

        # Fetch and return results.
        log.info('query: %s' % querystr)
        cursor.execute(querystr)

        # Parse arguments.
        if pat_id_col is None:
            pat_id_col = 'pat_id'
        if index_time_col is None:
            index_time_col = 'index_time'

        self._factory.setPatientEpisodeInput(cursor, pat_id_col,
                                             index_time_col)
        num_episodes = self._factory.processPatientEpisodeInput()

        return num_episodes
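
    # Hypothetical usage of the helper above (table and column names assumed):
    #   num_episodes = self._querystr_patient_episodes(
    #       "SELECT pat_id, order_time AS index_time FROM labs")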

    def _add_features(self, index_time_col=None):

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            self._add_time_features(index_time_col)
            self._add_demographic_features()
            self._add_treatment_team_features()
            self._add_comorbidity_features()
            self._add_flowsheet_features()
            self._add_lab_component_features()
        else:
            # Non-STRIDE sources (UMich; also UCSF for the extra features below).
            self._add_time_features(index_time_col)
            self._add_demographic_features()
            self._add_comorbidity_features()
            self._add_lab_component_features()

            if LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
                self._add_treatment_team_features()
                self._add_flowsheet_features()

    def _add_time_features(self, index_time_col=None):
        log.info('Adding admit date features...')
        # Add admission date.
        ADMIT_DX_CATEGORY_ID = 2

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            self._factory.addClinicalItemFeaturesByCategory([ADMIT_DX_CATEGORY_ID], \
                                                            dayBins=[], label='AdmitDxDate', features='pre')
        else:
            #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            self._factory.addClinicalItemFeaturesByCategory_UMich(
                [ADMIT_DX_CATEGORY_ID], dayBins=[], label='AdmitDxDate',
                features='pre', tableName='encounters')

        # Add time cycle features.
        log.info('Adding time cycle features...')
        if index_time_col is None:
            index_time_col = 'index_time'
        self._factory.addTimeCycleFeatures(index_time_col, 'month')
        self._factory.addTimeCycleFeatures(index_time_col, 'hour')

    def _add_demographic_features(self):
        log.info('Adding demographic features...')
        # Add birth and death.
        self._add_lifespan_features()
        # Add sex features.
        self._add_sex_features()
        # Add race features.
        self._add_race_features()

    def _add_lifespan_features(self):
        log.info('Adding lifespan features...')

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
            self._factory.addClinicalItemFeatures(['Birth'],
                                                  dayBins=[],
                                                  features="pre")
            self._factory.addClinicalItemFeatures(['Death'],
                                                  dayBins=[],
                                                  features="post")
        else:
            #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            self._factory.addClinicalItemFeatures_UMich(
                ['Birth'],
                dayBins=[],
                features="pre",
                clinicalItemType=None,
                clinicalItemTime='Birth',
                tableName='pt_info')

    def _add_sex_features(self):
        log.info('Adding sex features...')
        SEX_FEATURES = ["Male", "Female"]
        for feature in SEX_FEATURES:
            if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
                self._factory.addClinicalItemFeatures([feature],
                                                      dayBins=[],
                                                      features="pre")
            else:
                #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                self._factory.addClinicalItemFeatures_UMich(
                    [feature],
                    dayBins=[],
                    features="pre",
                    clinicalItemType='GenderName',
                    clinicalItemTime=None,
                    tableName="demographics")

    def _add_race_features(self):
        log.info('Adding race features...')
        for feature in self._factory.queryAllRaces():
            if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':  # TODO
                self._factory.addClinicalItemFeatures([feature],
                                                      dayBins=[],
                                                      features="pre")
            else:
                #elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
                self._factory.addClinicalItemFeatures_UMich(
                    [feature],
                    dayBins=[],
                    features="pre",
                    clinicalItemType='RaceName',
                    clinicalItemTime=None,
                    tableName='demographics')

    def _add_treatment_team_features(self):
        log.info('Adding treatment team features...')
        self._factory.addTreatmentTeamFeatures(features="pre")

    def _add_comorbidity_features(self):
        log.info('Adding comorbidity features...')
        self._factory.addCharlsonComorbidityFeatures(features='pre')

    def _add_flowsheet_features(self):
        log.info('Adding flowsheet features...')
        # Look at flowsheet results from the previous 3 days.
        FLOW_PRE_TIME_DELTAS = [datetime.timedelta(-3)]
        # Don't look into the future; that would leak information into the prediction.
        FLOW_POST_TIME_DELTA = datetime.timedelta(0)
        # Add flowsheet features for a variety of generally useful vitals.
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            BASIC_FLOWSHEET_FEATURES = [
                "BP_High_Systolic", "BP_Low_Diastolic", "FiO2",
                "Glasgow Coma Scale Score", "Pulse", "Resp", "Temp", "Urine"
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
            BASIC_FLOWSHEET_FEATURES = [
                'SBP', 'DBP', 'FiO2', 'Pulse', 'Resp', 'Temp', 'o2flow'
            ]
        for pre_time_delta in FLOW_PRE_TIME_DELTAS:
            log.info('\t\tpreTimeDelta: %s' % pre_time_delta)
            self._factory.addFlowsheetFeatures(BASIC_FLOWSHEET_FEATURES, \
                pre_time_delta, FLOW_POST_TIME_DELTA)

    def _add_lab_component_features(self):
        # Look at lab results from the previous 14 days, but never after the
        # episode itself.
        LAB_PRE_TIME_DELTAS = [datetime.timedelta(-14)]
        # Don't look into the future; that would leak information into the
        # prediction.
        LAB_POST_TIME_DELTA = datetime.timedelta(0)

        # Add result features for a variety of generally useful components.
        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'NA',  # Sodium, Whole Blood
                'K',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'BUN',  # Blood Urea Nitrogen
                'CR',  # Creatinine
                'TBIL',  # Total Bilirubin
                'ALB',  # Albumin
                'CA',  # Calcium
                'LAC',  # Lactic Acid
                'ESR',  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TNI',  # Troponin I
                'PHA',  # Arterial pH
                'PO2A',  # Arterial pO2
                'PCO2A',  # Arterial pCO2
                'PHV',  # Venous pH
                'PO2V',  # Venous pO2
                'PCO2V'  # Venous pCO2
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'SOD',  # Sodium, Whole Blood
                'POT',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'UN',  # Blood Urea Nitrogen
                'CREAT',  # Creatinine
                'TBIL',  # Total Bilirubin
                'ALB',  # Albumin
                'CAL',  # Calcium
                'LACTA',  # Lactic Acid; LACTA & LACTV are more frequent
                "WEST",  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TROP',  # Troponin I
                'pHA',  # Arterial pH
                'PO2AA',  # Arterial pO2
                'PCOAA2',  # Arterial pCO2
                'pHV',  # Venous pH
                'pO2V',  # Venous pO2
                'pCO2V',  # Venous pCO2
            ]
        elif LocalEnv.DATASET_SOURCE_NAME == 'UCSF':
            BASIC_LAB_COMPONENTS = [
                'WBC',  # White Blood Cell
                'HCT',  # Hematocrit
                'PLT',  # Platelet Count
                'NAWB',  # Sodium, Whole Blood
                'K',  # Potassium, Whole Blood
                'CO2',  # CO2, Serum/Plasma
                'BUN',  # Blood Urea Nitrogen
                'CREAT',  # Creatinine
                'TBILI',  # Total Bilirubin
                'ALB',  # Albumin
                'CA',  # Calcium
                'LACTWB',  # Lactic Acid, Whole Blood
                "ESR",  # Erythrocyte Sedimentation Rate
                'CRP',  # C-Reactive Protein
                'TRPI',  # Troponin I
                'PH37',  # Arterial pH
                'PO2',  # Arterial pO2
                'PCO2'  # Arterial pCO2
                # 'pHV',  # Venous pH
                # 'pO2V',  # Venous pO2
                # 'pCO2V',  # Venous pCO2
            ]
        log.info('Adding lab component features...')
        for component in BASIC_LAB_COMPONENTS:
            log.info('\t%s' % component)
            for preTimeDelta in LAB_PRE_TIME_DELTAS:
                log.info('\t\t%s' % preTimeDelta)
                self._factory.addLabResultFeatures([component], False,
                                                   preTimeDelta,
                                                   LAB_POST_TIME_DELTA)

    def _build_matrix(self, header=None, dest_path=None):
        log.info('Building matrix...')
        self._factory.buildFeatureMatrix(header, dest_path)

    def write_matrix(self, dest_path, header=None):
        log.info('Writing matrix file...')
        # Get old matrix file.
        source_path = self._factory.getMatrixFileName()
        # Write to new matrix file, dropping comment lines.
        matrix_file = open(dest_path, 'w')
        # for line in header:
        #     matrix_file.write('# %s\n' % line)
        for line in open(source_path, 'r'):
            if line[0] != '#':
                matrix_file.write(line)
        matrix_file.close()
        # Delete old matrix file.
        os.remove(source_path)

    def _build_matrix_header(self, params=None):
        # params['include_lab_suffix_summary'] = True/False
        # params['include_clinical_item_suffix_summary'] = True/False
        # params['data_overview'] = str description.
        # params['field_summary'] = str description
        header = list()

        file_summary = self._build_file_summary(params['matrix_path'], \
            params['matrix_module'])
        header.extend(file_summary)
        header.extend([''])

        if params.get('data_overview'):
            header.extend(params['data_overview'])
            header.extend([''])
        if params.get('field_summary'):
            header.extend(params['field_summary'])
            header.extend([''])
        if params.get('include_clinical_item_suffix_summary'):
            ci_suffix_summary = self._build_clinical_item_suffix_summary()
            header.extend(ci_suffix_summary)
            header.extend([''])
        if params.get('include_lab_suffix_summary'):
            lab_suffix_summary = self._build_flowsheet_and_lab_result_suffix_summary()
            header.extend(lab_suffix_summary)
            header.extend([''])

        return header
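
    # Hypothetical call (a sketch; 'matrix_path' and 'matrix_module' are the
    # two keys read unconditionally above):
    #   header = self._build_matrix_header({
    #       'matrix_path': 'data/WBC/WBC-matrix.tab',
    #       'matrix_module': inspect.getfile(inspect.currentframe()),
    #       'include_lab_suffix_summary': True,
    #   })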

    def _build_file_summary(self, matrix_path, matrix_module):
        summary = list()

        # <file_name.tab>
        matrix_name = matrix_path.split('/')[-1]
        summary.append(matrix_name)
        # Created: <timestamp>
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        summary.append('Created: %s' % timestamp)
        # Source: __name__
        module_name = matrix_module.split('/')[-1]
        summary.append('Source: %s' % module_name)
        # Command: Pipeline()
        class_name = module_name.split('.')[0]
        args = [self._var, str(self._num_rows)]
        for key, value in self._params.items():
            args.append('%s=%s' % (key, value))
        command = '%s(%s)' % (class_name, ', '.join(args))
        summary.append('Command: %s' % command)

        return summary

    def _build_clinical_item_suffix_summary(self):
        summary = list()
        #   [clinical_item] fields may have the following suffixes:
        summary.append(
            '  [clinical_item] fields may have the following suffixes:')
        #       ___.pre - how many times has this occurred before order_time?
        summary.append(
            '    ___.pre - how many times has this occurred before order_time?'
        )
        #       ___.pre.Xd - how many times has this occurred within X days before index_time?
        summary.append(
            '    ___.pre.Xd - how many times has this occurred within X days before index_time?'
        )
        #       ___.preTimeDays - how many days before order_time was last occurrence?
        summary.append(
            '    ___.preTimeDays - how many days before order_time was last occurrence?'
        )

        return summary

    def _build_flowsheet_and_lab_result_suffix_summary(self):
        summary = list()
        #   [flowsheet] and [lab_result] fields may have the following suffixes:
        summary.append(
            '  [flowsheet] and [lab_result] fields may have the following suffixes:'
        )
        #       ___.X_Y.count - # of result values between X and Y days of index_time.
        summary.append(
            '    ___.X_Y.count - # of result values between X and Y days of index_time.'
        )
        #       ___.X_Y.countInRange - # of result values in normal range.
        summary.append(
            '    ___.X_Y.countInRange - # of result values in normal range.')
        #       ___.X_Y.min - minimum result value.
        summary.append('    ___.X_Y.min - minimum result value.')
        #       ___.X_Y.max - maximum result value.
        summary.append('    ___.X_Y.max - maximum result value.')
        #       ___.X_Y.median - median result value.
        summary.append('    ___.X_Y.median - median result value.')
        #       ___.X_Y.std - standard deviation of result values.
        summary.append(
            '    ___.X_Y.std - standard deviation of result values.')
        #       ___.X_Y.first - first result value.
        summary.append('    ___.X_Y.first - first result value.')
        #       ___.X_Y.last - last result value.
        summary.append('    ___.X_Y.last - last result value.')
        #       ___.X_Y.diff - difference between penultimate and proximate values.
        summary.append(
            '    ___.X_Y.diff - difference between penultimate and proximate values.'
        )
        #       ___.X_Y.slope - slope between penultimate and proximate values.
        summary.append(
            '    ___.X_Y.slope - slope between penultimate and proximate values.'
        )
        #       ___.X_Y.proximate - closest result value to order_time.
        summary.append(
            '    ___.X_Y.proximate - closest result value to order_time.')
        #       ___.X_Y.firstTimeDays - time between first and order_time.
        summary.append(
            '    ___.X_Y.firstTimeDays - time between first and order_time.')
        #       ___.X_Y.lastTimeDays - time between last and order_time.
        summary.append(
            '    ___.X_Y.lastTimeDays - time between last and order_time.')
        #       ___.X_Y.proximateTimeDays - time between proximate and order_time.
        summary.append(
            '    ___.X_Y.proximateTimeDays - time between proximate and order_time.'
        )

        return summary
Example 11
class TestFeatureMatrixFactory(DBTestCase):
    def setUp(self):
        """Prepare state for test cases."""
        DBTestCase.setUp(self)

        # StrideLoader.build_stride_psql_schemata()
        # ClinicalItemDataLoader.build_clinical_item_psql_schemata();

        # self._deleteTestRecords()
        # self._insertTestRecords()
        # TODO: self.RACE_FEATURES = self.queryAllRaces() raises
        # OperationalError: no such table: demographics
        self.factory = FeatureMatrixFactory()

        self.connection = DBUtil.connection()
        # Set up a common connection for test cases; tearDown closes it.
        self.cursor = self.connection.cursor()
        self._insertUMichTestRecords()

    def _insertUMichTestRecords(self):
        db_name = medinfo.db.Env.DB_PARAM['DSN']
        db_path = medinfo.db.Env.DB_PARAM['DATAPATH']
        conn = sqlite3.connect(db_path + '/' + db_name)

        table_names = [
            'labs', 'pt_info', 'demographics', 'encounters', 'diagnoses'
        ]

        for table_name in table_names:
            columns = FMTU.FM_TEST_INPUT_TABLES["%s_columns" % table_name]
            column_types = FMTU.FM_TEST_INPUT_TABLES["%s_column_types" %
                                                     table_name]

            df = pd.DataFrame()
            for one_line in FMTU.FM_TEST_INPUT_TABLES['%s_data' % table_name]:
                df = df.append(dict(zip(columns, one_line)), ignore_index=True)
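            # Equivalent one-shot construction (a sketch; avoids row-by-row appends):
            #   df = pd.DataFrame(FMTU.FM_TEST_INPUT_TABLES['%s_data' % table_name],
            #                     columns=columns)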

            df.to_sql(table_name, conn, if_exists="append", index=False)

        # First, write basic (pat_id, order_time) episode information to a
        # temp file. All subsequent [pat_id, event_time] operations are based
        # on these episodes, i.e., every pat_id comes from this episode set.

        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS INTEGER) AS pat_id")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addFrom("labs")
        self.cursor.execute(str(patientEpisodeQuery),
                            patientEpisodeQuery.params)

        self.factory.setPatientEpisodeInput(self.cursor, "pat_id",
                                            "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # print 'len(resultPatientEpisodes):', len(resultPatientEpisodes)

    def tearDown(self):
        """Restore state from any setUp or test steps."""
        # self._deleteTestRecords()

        # Clean up files that might have lingered from failed tests.
        try:
            os.remove("patient_list.tsv")
        except Exception:
            pass
        try:
            self.factory.cleanTempFiles()
        except Exception:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except Exception:
            pass
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except Exception:
            pass

        self.connection.close()

        DBTestCase.tearDown(self)

    def test__queryMichiganItemsByName_UMich(self):
        ## Input parameters:
        # clinicalItemNames: (e.g. ['Male'], ['Caucasian'], ['Birth']),
        # clinicalItemType: (e.g. 'GenderName', 'RaceName', None),
        # tableName: (e.g. demographics, demographics, pt_info),
        # clinicalItemTime: (e.g. None, None, 'Birth', where the None values are imputed (1900,1,1) in FMF)

        ## Outputs:
        # clinicalItemEvents: [pat_id, event_time] pairs like
        # [
        # [-12411450059993L, datetime.datetime(2011, 8, 4, 23, 14)],
        # [-12411450059993L, datetime.datetime(2011, 8, 5, 0, 38)],
        # [-12392267210986L, datetime.datetime(2015, 11, 25, 22, 57)]
        # ...]

        # Test SEX feature
        MALE_events_expected = [[1, datetime.datetime(1900, 1, 1, 0, 0)],
                                [3, datetime.datetime(1900, 1, 1, 0, 0)]]
        MALE_events_queried = self.factory._queryMichiganItemsByName(
            clinicalItemNames=['Male'],
            clinicalItemType='GenderName',
            tableName='demographics',
            clinicalItemTime=None)

        self.assertEqualSet(MALE_events_expected, MALE_events_queried)

        HCT_events_expected = [[1, u'2050-01-08 23:44:00']]

        HCT_events_queried = self.factory._queryMichiganItemsByName(
            clinicalItemNames=['HCT'],
            clinicalItemType='base_name',
            tableName='labs',
            clinicalItemTime='order_time')
        self.assertEqualSet(HCT_events_expected, HCT_events_queried)

    def lists_to_pd(self, alists):
        # The first list should be the column list
        columns = alists[0]
        df = pd.DataFrame()
        for i in range(1, len(alists)):
            df = df.append(dict(zip(columns, alists[i])), ignore_index=True)
        return df

    def test__BuildMatrix_UMich(self):
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['CBCP'],
            clinicalItemType='proc_code',
            tableName='labs',
            clinicalItemTime='order_time')
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['WCB'],
            clinicalItemType='base_name',
            tableName='labs',
            clinicalItemTime='order_time')
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['HCT'],
            clinicalItemType='base_name',
            tableName='labs',
            clinicalItemTime='order_time')
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['Male'],
            clinicalItemType='GenderName',
            tableName='demographics',
            clinicalItemTime=None)
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['Caucasian'],
            clinicalItemType='RaceName',
            tableName='demographics',
            clinicalItemTime=None)
        self.factory.addClinicalItemFeatures_UMich(
            clinicalItemNames=['Hispanic'],
            clinicalItemType='RaceName',
            tableName='demographics',
            clinicalItemTime=None)
        self.factory.addClinicalItemFeatures_UMich(clinicalItemNames=['Birth'],
                                                   clinicalItemType=None,
                                                   tableName='pt_info',
                                                   clinicalItemTime='Birth')

        self.factory.addClinicalItemFeaturesByCategory_UMich(
            [2],  #ADMIT_DX_CATEGORY_ID = 2
            dayBins=[],
            label='AdmitDxDate',
            features='pre',
            tableName='encounters')

        self.factory.addCharlsonComorbidityFeatures(features='pre')

        self.factory.addLabResultFeatures(['WCB', 'CO2'],
                                          labIsPanel=False,
                                          preTimeDelta=datetime.timedelta(-3),
                                          postTimeDelta=datetime.timedelta(0))
        self.factory.buildFeatureMatrix()

        resultMatrix = self.factory.readFeatureMatrixFile()
        # for resultrow in resultMatrix:
        #     print resultrow, ','
        # quit()
        df = self.lists_to_pd(resultMatrix[2:])
        # df.to_csv('tmp.csv', index=False)

        expected_df = self.lists_to_pd(FMTU.FM_TEST_OUTPUT['OUTPUT_RAW_TABLE'])

        pd.testing.assert_frame_equal(df, expected_df)
Example 12
    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never after the episode itself.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[
                patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare results.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
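
The DataExtractor half of this benchmark leans on a streaming idiom: episodes arrive sorted by patient, so a change in pat_id marks a completed group, and the trailing "Last Iteration" block flushes the final patient. A minimal, self-contained sketch of that pattern (names are illustrative):

def for_each_patient(sorted_rows, flush):
    """Call flush(patient_id, rows) once per patient in a patient-sorted stream."""
    last_patient_id = None
    buffered = []
    for row in sorted_rows:
        patient_id = row["patient_id"]
        if last_patient_id is not None and last_patient_id != patient_id:
            flush(last_patient_id, buffered)  # previous patient is complete
            buffered = []
        buffered.append(row)
        last_patient_id = patient_id
    if last_patient_id is not None:
        flush(last_patient_id, buffered)  # the final group needs an explicit flush
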
Example n. 13
    def test_build_FeatureMatrix_multiLabTest(self):
        """
        Test buildFeatureMatrix() and addLabFeatures().
        """
        # Initialize FeatureMatrixFactory.
        self.factory = FeatureMatrixFactory()

        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Verify results (note sort order).
        expectedPatientEpisodes = FM_TEST_OUTPUT[
            "test_processPatientEpisodeInput"]
        self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

        # Add TNI and CR lab result data. LAC doesn't exist in data.
        labBaseNames = ["TNI", "CR", "LAC"]
        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)
        self.factory.addLabResultFeatures(labBaseNames, False, preTimeDelta,
                                          postTimeDelta)
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()

        # Verify results.
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_multiLabTest"]["expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
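
Throughout these tests, preTimeDelta=datetime.timedelta(-90) and postTimeDelta=datetime.timedelta(0) mean "look back 90 days from each episode's index time, and never forward." A minimal sketch of that window check (not factory code):

def in_feature_window(event_time, index_time, pre_time_delta, post_time_delta):
    """True if event_time falls within [index_time + pre, index_time + post]."""
    return (index_time + pre_time_delta) <= event_time <= (index_time + post_time_delta)

With pre=timedelta(-90) and post=timedelta(0), a lab drawn 30 days before the order qualifies; one drawn an hour after it does not.
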
Example n. 14
class TestFeatureMatrixFactory(DBTestCase):
    def setUp(self):
        """Prepare state for test cases."""
        DBTestCase.setUp(self)
        StrideLoader.build_stride_psql_schemata()
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        self._deleteTestRecords()
        self._insertTestRecords()

        self.factory = FeatureMatrixFactory()
        self.connection = DBUtil.connection()
        # Set up a common connection for test cases to share; tearDown() closes and cleans it up.

    def _insertTestRecords(self):
        """Populate database for with patient data."""
        # Populate clinical_item_category.
        testRecords = FM_TEST_INPUT_TABLES.get("clinical_item_category")
        DBUtil.insertFile(StringIO(testRecords), "clinical_item_category", \
                            delim="\t")

        # Populate clinical_item.
        testRecords = FM_TEST_INPUT_TABLES.get("clinical_item")
        DBUtil.insertFile(StringIO(testRecords), "clinical_item", delim="\t")

        # Populate patient_item.
        testRecords = FM_TEST_INPUT_TABLES.get("patient_item")
        DBUtil.insertFile(StringIO(testRecords), "patient_item", delim="\t", \
                            dateColFormats={"item_date": None})

        # Populate stride_order_proc.
        testRecords = FM_TEST_INPUT_TABLES.get("stride_order_proc")
        DBUtil.insertFile(StringIO(testRecords), "stride_order_proc", \
                            delim="\t", \
                            dateColFormats={"item_date": None})

        # Populate stride_order_results.
        testRecords = FM_TEST_INPUT_TABLES.get("stride_order_results")
        DBUtil.insertFile(StringIO(testRecords), "stride_order_results", \
                            delim="\t", dateColFormats={"result_time": None})

        # Populate stride_flowsheet.
        testRecords = FM_TEST_INPUT_TABLES.get("stride_flowsheet")
        DBUtil.insertFile(StringIO(testRecords), "stride_flowsheet", \
                            delim="\t", \
                            dateColFormats={"shifted_record_dt_tm": None})

        # Populate stride_order_med.
        testRecords = FM_TEST_INPUT_TABLES.get("stride_order_med")
        DBUtil.insertFile(StringIO(testRecords), "stride_order_med", \
            delim="\t", dateColFormats = {"start_taking_time": None, \
                "end_taking_time": None})

    def _deleteTestRecords(self):
        """Delete test records from database."""
        DBUtil.execute("delete from stride_order_med where order_med_id < 0")
        DBUtil.execute("delete from stride_flowsheet where flo_meas_id < 0")
        DBUtil.execute(
            "delete from stride_order_results where order_proc_id < 0")
        DBUtil.execute("delete from stride_order_proc where order_proc_id < 0")
        DBUtil.execute("delete from patient_item where clinical_item_id < 0")
        # Must delete from clinical_item_association in order to make the CDSS
        # test suite pass. Other suites may update this table.
        DBUtil.execute(
            "delete from clinical_item_association where clinical_item_id < 0")
        DBUtil.execute("delete from clinical_item where clinical_item_id < 0")
        DBUtil.execute(
            "delete from clinical_item_category where clinical_item_category_id < 0"
        )

    def tearDown(self):
        """Restore state from any setUp or test steps."""
        self._deleteTestRecords()

        # Clean up files that might have lingered from failed tests.
        try:
            os.remove("patient_list.tsv")
        except:
            pass
        try:
            self.factory.cleanTempFiles()
        except:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except:
            pass
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except:
            pass

        self.connection.close()

        DBTestCase.tearDown(self)

    def test_dbCache(self):
        """Test database result caching."""
        factory = FeatureMatrixFactory(cacheDBResults=False)
        self.assertEqual(factory.dbCache, None)

        factory = FeatureMatrixFactory()
        self.assertEqual(type(factory.dbCache), type(dict()))
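
test_dbCache only asserts that the cache is a dict (or None when disabled). As a rough illustration of why a plain dict suffices, here is a memoization sketch keyed by query text; the factory's real keying scheme may differ:

def cached_query(db_cache, cursor, sql):
    """Return rows for sql, hitting the database only on a cache miss."""
    if db_cache is not None and sql in db_cache:
        return db_cache[sql]
    cursor.execute(sql)
    rows = cursor.fetchall()
    if db_cache is not None:
        db_cache[sql] = rows
    return rows
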

    def test_processPatientListInput(self):
        """Test processPatientListInput()."""
        # Verify FeatureMatrixFactory throws Error if patientListInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientListInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patients.
        patientListQuery = SQLQuery()
        patientListQuery.addSelect("CAST(pat_id AS bigint)")
        patientListQuery.addFrom("stride_order_proc")
        patientListQuery.addWhere("proc_code = 'LABMETB'")
        patientListQuery.addGroupBy("pat_id")
        patientListQuery.addOrderBy("1 ASC")
        cursor.execute(str(patientListQuery), patientListQuery.params)

        # Set and process patientListInput.
        self.factory.setPatientListInput(cursor, "pat_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-789", "-456", "-123"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['pat_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        resultPatientIterator.close()

        # Build TSV file for list of patients.
        patientList = \
            "patient_item_id\tpatient_id\tclinical_item_id\titem_date\n\
            -1000\t-123\t-100\t10/6/2113 10:20\n\
            -2000\t-123\t-200\t10/6/2113 11:20\n\
            -2500\t-123\t-100\t10/7/2113 11:20\n\
            -3000\t-456\t-100\t11/6/2113 10:20\n\
            -6000\t-789\t-200\t12/6/2113 11:20\n"

        patientListTsv = open("patient_list.tsv", "w")
        patientListTsv.write(patientList)
        patientListTsv.close()

        # Initialize new FeatureMatrixFactory.
        self.factory = FeatureMatrixFactory()

        # Set and process patientListInput.
        patientListTsv = open("patient_list.tsv", "r")
        self.factory.setPatientListInput(patientListTsv, "patient_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-123", "-123", "-123", "-456", "-789"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['patient_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        patientListTsv.close()
        resultPatientIterator.close()

        # Clean up patient_list.
        try:
            os.remove("patient_list.tsv")
            os.remove("fmf.patient_list.tsv")
        except OSError:
            pass

    def test_buildFeatureMatrix_multiClinicalItem(self):
        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Verify results (note sort order).
        expectedPatientEpisodes = FM_TEST_OUTPUT[
            "test_processPatientEpisodeInput"]
        self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

        # Add TestItem100 and TestItem200 clinical item data.
        self.factory.addClinicalItemFeatures(["TestItem100"])
        self.factory.addClinicalItemFeatures(["TestItem200"])
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_multiClinicalItem"]

        self.assertEqualList(resultMatrix[2:], expectedMatrix)

    def test_buildFeatureMatrix_prePostFeatures(self):
        """
        Test features parameter in addClinicalItemFeatures which allows
        client to specify they only want .pre* or .post* columns in feature
        matrix.
        """
        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Add TestItem100 and TestItem200 clinical item data.
        self.factory.addClinicalItemFeatures(["TestItem100"], features="pre")
        self.factory.addClinicalItemFeatures(["TestItem200"], features="post")
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_prePostFeatures"]

        self.assertEqualList(resultMatrix[2:], expectedMatrix)
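
The features="pre"/"post" switch controls which derived columns survive in the output. Assuming the factory counts each item's events on either side of the episode's index time (the column naming below is illustrative, not the factory's exact scheme), the underlying split looks like:

def pre_post_counts(event_times, index_time):
    """Count item events strictly before vs. at-or-after an index time."""
    pre = sum(1 for t in event_times if t < index_time)
    post = sum(1 for t in event_times if t >= index_time)
    return {"item.pre": pre, "item.post": post}
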

    def test_build_FeatureMatrix_multiLabTest(self):
        """
        Test buildFeatureMatrix() and addLabFeatures().
        """
        # Initialize FeatureMatrixFactory.
        self.factory = FeatureMatrixFactory()

        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Verify results (note sort order).
        expectedPatientEpisodes = FM_TEST_OUTPUT[
            "test_processPatientEpisodeInput"]
        self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

        # Add TNI and CR lab result data. LAC doesn't exist in data.
        labBaseNames = ["TNI", "CR", "LAC"]
        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)
        self.factory.addLabResultFeatures(labBaseNames, False, preTimeDelta,
                                          postTimeDelta)
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()

        # Verify results.
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_multiLabTest"]["expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass

    def test_buildFeatureMatrix_multiFlowsheet(self):
        """
        Test buildFeatureMatrix and addFlowsheet.
        """
        # Verify FeatureMatrixFactory throws Error if patientEpisodeInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientEpisodeInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()
        resultEpisodeIterator = self.factory.getPatientEpisodeIterator()
        resultPatientEpisodes = list()
        for episode in resultEpisodeIterator:
            episode["pat_id"] = int(episode["pat_id"])
            episode["order_time"] = DBUtil.parseDateValue(
                episode["order_time"])
            resultPatientEpisodes.append(episode)

        # Verify results (note sort order).
        expectedPatientEpisodes = FM_TEST_OUTPUT[
            "test_processPatientEpisodeInput"]
        self.assertEqualList(resultPatientEpisodes, expectedPatientEpisodes)

        # Add flowsheet features.
        flowsheetNames = ["Resp", "FiO2", "Glasgow Coma Scale Score"]
        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)
        self.factory.addFlowsheetFeatures(flowsheetNames, preTimeDelta,
                                          postTimeDelta)
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()

        # Verify results.
        expectedMatrix = FM_TEST_OUTPUT[
            "test_buildFeatureMatrix_multiFlowsheet"]["expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass

    def test_addTimeCycleFeatures(self):
        """
        Test .addTimeCycleFeatures()
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereEqual("proc_code", "LABMETB")
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Add time cycle features.
        self.factory.addTimeCycleFeatures("order_time", "month")
        self.factory.addTimeCycleFeatures("order_time", "hour")

        # Verify output.
        self.factory.buildFeatureMatrix()
        resultMatrix = self.factory.readFeatureMatrixFile()
        expectedMatrix = FM_TEST_OUTPUT["test_addTimeCycleFeatures"][
            "expectedMatrix"]
        self.assertEqualTable(expectedMatrix, resultMatrix[2:], precision=5)

        # Clean up feature matrix.
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
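
addTimeCycleFeatures("order_time", "month") derives cyclical features from the index time. One common encoding (an assumption here, not necessarily the factory's exact output columns) pairs the raw component with sine/cosine terms so that December sits next to January and hour 23 next to hour 0:

import math

def time_cycle_features(timestamp, component):
    """Sketch: raw value plus sin/cos encoding for 'month' or 'hour'."""
    if component == "month":
        value, period = timestamp.month, 12
    elif component == "hour":
        value, period = timestamp.hour, 24
    else:
        raise ValueError("unsupported component: %s" % component)
    angle = 2 * math.pi * value / period
    return {component: value,
            "%s.sin" % component: math.sin(angle),
            "%s.cos" % component: math.cos(angle)}
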

    def test_loadMapData(self):
        self.factory = FeatureMatrixFactory()

        # Depends on external data file
        reader = self.factory.loadMapData("CharlsonComorbidity-ICD9CM")
        charlsonByICD9 = dict()

        for row in reader:
            charlsonByICD9[row["icd9cm"]] = row["charlson"]

        self.assertEqual("Dementia", charlsonByICD9["294.1"])
        self.assertEqual("Dementia", charlsonByICD9["331.2"])
        self.assertEqual("COPD", charlsonByICD9["490"])
        self.assertEqual("COPD", charlsonByICD9["416.8"])
        self.assertEqual("Malignancy Metastatic", charlsonByICD9["199"])
        self.assertEqual("AIDS/HIV", charlsonByICD9["042"])

    def test_performance(self):
        """
        Test performance against DataExtractor.
        """
        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Initialize FeatureMatrixFactory.
        factoryStart = time.time()
        self.factory = FeatureMatrixFactory()

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Set and process patientEpisodeInput.
        self.factory.setPatientEpisodeInput(cursor, "pat_id", "order_time")
        self.factory.processPatientEpisodeInput()

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Add clinical item features.
        self.factory.addClinicalItemFeatures(["PerfItem300"])
        self.factory.addClinicalItemFeatures(["PerfItem400"])
        self.factory.addClinicalItemFeatures(["PerfItem500"])

        # Add lab result features.
        self.factory.addLabResultFeatures(["Foo"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Bar"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Baz"], False, preTimeDelta,
                                          postTimeDelta)
        self.factory.addLabResultFeatures(["Qux"], False, preTimeDelta,
                                          postTimeDelta)

        # Add flowsheet features.
        self.factory.addFlowsheetFeatures(["Perflow"], preTimeDelta,
                                          postTimeDelta)

        # Build matrix.
        self.factory.buildFeatureMatrix()

        # Stop timer.
        factoryStop = time.time()

        # Initialize DataExtractor.
        extractorStart = time.time()
        extractor = DataExtractor()
        extractor.dataCache = dict()

        # Initialize output file.
        outFile = open("extractor.feature_matrix.tab.gz", "w")
        formatter = TextResultsFormatter(outFile)

        # Build SQL query for list of patient episodes.
        patientEpisodeQuery = SQLQuery()
        patientEpisodeQuery.addSelect("CAST(pat_id AS bigint)")
        patientEpisodeQuery.addSelect("sop.order_proc_id AS order_proc_id")
        patientEpisodeQuery.addSelect("proc_code")
        patientEpisodeQuery.addSelect("order_time")
        patientEpisodeQuery.addSelect(
            "COUNT(CASE result_in_range_yn WHEN 'Y' THEN 1 ELSE null END) AS normal_results"
        )
        patientEpisodeQuery.addFrom("stride_order_proc AS sop")
        patientEpisodeQuery.addFrom("stride_order_results AS sor")
        patientEpisodeQuery.addWhere("sop.order_proc_id = sor.order_proc_id")
        patientEpisodeQuery.addWhereIn("proc_code",
                                       ["Foo", "Bar", "Baz", "Qux"])
        patientEpisodeQuery.addGroupBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        patientEpisodeQuery.addOrderBy(
            "pat_id, sop.order_proc_id, proc_code, order_time")
        cursor.execute(str(patientEpisodeQuery), patientEpisodeQuery.params)

        # Process patient episodes.
        patientEpisodes = list()
        row = cursor.fetchone()

        while row is not None:
            (pat_id, order_proc_id, proc_code, order_time,
             normal_results) = row
            patientEpisode = \
                RowItemModel \
                (
                    {
                        "patient_id": pat_id,
                        "order_proc_id": order_proc_id,
                        "proc_code": proc_code,
                        "order_time": order_time,
                        "result_normal_count": normal_results
                    }
                )
            patientEpisodes.append(patientEpisode)
            row = cursor.fetchone()

        # Initialize patient data.
        lastPatientId = None
        colNames = None
        patientEpisodeByIndexTime = None

        # Look for lab data 90 days before each episode, but never after.
        preTimeDelta = datetime.timedelta(-90)
        postTimeDelta = datetime.timedelta(0)

        # Populate patient data.
        tempColNames = \
            ["patient_id", "order_proc_id", "proc_code", "order_time",
                "result_normal_count"]
        for patientEpisode in patientEpisodes:
            patientId = patientEpisode["patient_id"]

            if lastPatientId is not None and lastPatientId != patientId:
                # New patient ID so start querying for patient specific data and
                # populating patient episode data.

                # Clinical Item (PerfItem300)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem300",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem400)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem400",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Clinical Item (PerfItem500)
                eventTimes = extractor.parseClinicalItemData_singlePatient(\
                    modelListFromTable(extractor.queryClinicalItemsByName(\
                        ("PerfItem500",), [patientId])))
                tempColNames.extend(\
                    extractor.addClinicalItemFeatures_singlePatient(\
                    eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
                    daysBins=[]))

                # Lab Result (Foo)
                labResultTable = extractor.queryLabResults(["Foo"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Bar)
                labResultTable = extractor.queryLabResults(["Bar"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Baz)
                labResultTable = extractor.queryLabResults(["Baz"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
                    preTimeDelta, postTimeDelta))

                # Lab Result (Qux)
                labResultTable = extractor.queryLabResults(["Qux"],
                                                           [patientId])
                labsByBaseName = extractor.parseLabResultsData_singlePatient(\
                    modelListFromTable(labResultTable))
                tempColNames.extend(extractor.addLabFeatures_singlePatient(\
                    patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
                    preTimeDelta, postTimeDelta))

                # Flowsheet (Perflow)
                # tempFile = StringIO()
                # labResultTable = extractor.queryFlowsheet(["Perflow"], [patientId], tempFile)
                # flowsheetByNameByPatientId = extractor.parseFlowsheetFile(\
                #     StringIO(tempFile.getvalue()))
                # tempColNames.extend(extractor.addFlowsheetFeatures_singlePatient(\
                #     patientEpisodeByIndexTime, flowsheetByNameByPatientId[patientId], \
                #     ["Perflow"], preTimeDelta, postTimeDelta, tempColNames))

                if colNames is None:
                    # First row, print header row
                    colNames = tempColNames
                    formatter.formatTuple(colNames)

                # Print out patient (episode) data (one row per episode)
                formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                            colNames)

            if lastPatientId is None or lastPatientId != patientId:
                # Prepare to aggregate patient episode record per patient
                patientEpisodeByIndexTime = dict()

            patientEpisodeByIndexTime[
                patientEpisode["order_time"]] = patientEpisode
            lastPatientId = patientId
            outFile.flush()

        # Last Iteration
        patientId = lastPatientId
        # Clinical Item (PerfItem300)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem300",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem400)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem400",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Clinical Item (PerfItem500)
        eventTimes = extractor.parseClinicalItemData_singlePatient(\
            modelListFromTable(extractor.queryClinicalItemsByName(\
                ("PerfItem500",), [patientId])))
        tempColNames.extend(\
            extractor.addClinicalItemFeatures_singlePatient(\
            eventTimes, patientEpisodeByIndexTime, "PerfItem300", \
            daysBins=[]))

        # Lab Result (Foo)
        labResultTable = extractor.queryLabResults(["Foo"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Foo"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Bar)
        labResultTable = extractor.queryLabResults(["Bar"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Bar"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Baz)
        labResultTable = extractor.queryLabResults(["Baz"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Baz"], \
            preTimeDelta, postTimeDelta))

        # Lab Result (Qux)
        labResultTable = extractor.queryLabResults(["Qux"], [patientId])
        labsByBaseName = extractor.parseLabResultsData_singlePatient(\
            modelListFromTable(labResultTable))
        tempColNames.extend(extractor.addLabFeatures_singlePatient(\
            patientEpisodeByIndexTime, labsByBaseName, ["Qux"], \
            preTimeDelta, postTimeDelta))

        formatter.formatResultDicts(patientEpisodeByIndexTime.values(),
                                    colNames)

        # Close file.
        outFile.close()

        # Stop timer.
        extractorStop = time.time()

        # Compare results.
        factoryTime = factoryStop - factoryStart
        extractorTime = extractorStop - extractorStart
        self.assertTrue(extractorTime > factoryTime)

        # Clean up feature matrix files.
        try:
            os.remove("extractor.feature_matrix.tab.gz")
        except OSError:
            pass
        try:
            os.remove(self.factory.getMatrixFileName())
        except OSError:
            pass
Example n. 15
    def test_processPatientListInput(self):
        """Test processPatientListInput()."""
        # Verify FeatureMatrixFactory throws Error if patientListInput
        # has not been set.
        with self.assertRaises(ValueError):
            self.factory.processPatientListInput()

        # Initialize DB cursor.
        cursor = self.connection.cursor()

        # Build SQL query for list of patients.
        patientListQuery = SQLQuery()
        patientListQuery.addSelect("CAST(pat_id AS bigint)")
        patientListQuery.addFrom("stride_order_proc")
        patientListQuery.addWhere("proc_code = 'LABMETB'")
        patientListQuery.addGroupBy("pat_id")
        patientListQuery.addOrderBy("1 ASC")
        cursor.execute(str(patientListQuery), patientListQuery.params)

        # Set and process patientListInput.
        self.factory.setPatientListInput(cursor, "pat_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-789", "-456", "-123"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['pat_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        resultPatientIterator.close()

        # Build TSV file for list of patients.
        patientList = \
            "patient_item_id\tpatient_id\tclinical_item_id\titem_date\n\
            -1000\t-123\t-100\t10/6/2113 10:20\n\
            -2000\t-123\t-200\t10/6/2113 11:20\n\
            -2500\t-123\t-100\t10/7/2113 11:20\n\
            -3000\t-456\t-100\t11/6/2113 10:20\n\
            -6000\t-789\t-200\t12/6/2113 11:20\n"

        patientListTsv = open("patient_list.tsv", "w")
        patientListTsv.write(patientList)
        patientListTsv.close()

        # Initialize new FeatureMatrixFactory.
        self.factory = FeatureMatrixFactory()

        # Set and process patientListInput.
        patientListTsv = open("patient_list.tsv", "r")
        self.factory.setPatientListInput(patientListTsv, "patient_id")
        self.factory.processPatientListInput()
        resultPatientIterator = self.factory.getPatientListIterator()

        # Verify results.
        expectedPatientList = ["-123", "-123", "-123", "-456", "-789"]
        for expectedPatientId in expectedPatientList:
            resultPatientId = resultPatientIterator.next()['patient_id']
            self.assertEqual(resultPatientId, expectedPatientId)
        patientListTsv.close()
        resultPatientIterator.close()

        # Clean up patient_list.
        try:
            os.remove("patient_list.tsv")
            os.remove("fmf.patient_list.tsv")
        except OSError:
            pass
Example n. 16
class LabNormalityPredictionPipeline(SupervisedLearningPipeline):
    def __init__(self, lab_panel, num_episodes, use_cache=None, random_state=None, isLabPanel=True,
                 notUsePatIds=None, pat_batch_ind=None):
        # Avoid a mutable default argument; treat None as "exclude no patients".
        self.notUsePatIds = notUsePatIds if notUsePatIds is not None else []
        self.pat_batch_ind = pat_batch_ind
        self.usedPatIds = []
        SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes, use_cache, random_state, isLabPanel)
        self._factory = FeatureMatrixFactory()
        self._build_raw_feature_matrix()
        #self._build_processed_feature_matrix()
        #self._train_and_analyze_predictors()

    def _build_model_dump_path(self, algorithm):
        template = '%s' + '-normality-%s-model.pkl' % algorithm
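        # Note: '%' binds tighter than '+', so only the second literal is
        # formatted here; the leading '%s' is left for the caller to fill in.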
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        return SupervisedLearningPipeline._build_model_dump_path(self, template, \
            pipeline_file_name)

    def _build_raw_matrix_path(self):
        if self.pat_batch_ind is None:  # 'is None' so a batch index of 0 still gets its own file
            template = '%s-normality-matrix-%d-episodes-raw.tab'
        else:
            template = '%s-normality-matrix-%d-episodes-raw-'+str(self.pat_batch_ind)+'.tab'
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        return SupervisedLearningPipeline._build_matrix_path(self, template, \
            pipeline_file_name)

    def _build_raw_feature_matrix(self):
        raw_matrix_path = self._build_raw_matrix_path()
        matrix_class = LabNormalityMatrix
        SupervisedLearningPipeline._build_raw_feature_matrix(self, matrix_class, \
            raw_matrix_path)

    def _build_processed_matrix_path(self):
        template = '%s-normality-matrix-%d-episodes-processed.tab'
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        return SupervisedLearningPipeline._build_matrix_path(self, template, \
            pipeline_file_path)

    def _build_processed_feature_matrix(self):
        # Define parameters for processing steps.
        params = {}
        raw_matrix_path = self._build_raw_matrix_path()
        processed_matrix_path = self._build_processed_matrix_path()
        features_to_add = {}
        imputation_strategies = {
        }

        if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id',
                'proc_code', 'abnormal_panel',
                'num_normal_components', 'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
                'RaceWhiteHispanicLatino.preTimeDays',
                'RaceWhiteNonHispanicLatino.preTimeDays',
                'RaceHispanicLatino.preTimeDays',
                'RaceAsian.preTimeDays',
                'RaceBlack.preTimeDays',
                'RacePacificIslander.preTimeDays',
                'RaceNativeAmerican.preTimeDays',
                'RaceOther.preTimeDays',
                'RaceUnknown.preTimeDays',
                'Death.post',
                'Death.postTimeDays',
                'num_components'
            ]
            outcome_label = 'all_components_normal' # TODO: for component...

        elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
            features_to_remove = [
                'pat_id', 'order_time', 'order_proc_id',
                'Birth.pre',
                'Male.preTimeDays', 'Female.preTimeDays',
                # 'Caucasian.preTimeDays',
                # 'Hispanic.preTimeDays',
                # 'Native Hawaiian and Other Pacific Islander.preTimeDays'
            ]
            RACE_FEATURES = self._factory.queryAllRaces()
            features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES]
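            # Illustrative sketch (hypothetical values): if queryAllRaces()
            # returned ['Caucasian', 'Hispanic'], this would append
            # 'Caucasian.preTimeDays' and 'Hispanic.preTimeDays' to the
            # removal list.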
            if self._isLabPanel:
                features_to_remove += ['proc_code', 'num_normal_components', 'num_components']
            else:
                features_to_remove += ['base_name']

            outcome_label = 'abnormal_lab'

        features_to_keep = [
            # Keep the # of times it's been ordered in past, even if low info.
            '%s.pre' % self._var
        ]

        # Frame feature selection as classification and keep roughly the top
        # 5% of candidate features via recursive feature elimination.
        selection_problem = FeatureSelector.CLASSIFICATION
        selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
        percent_features_to_select = 0.05
        matrix_class = LabNormalityMatrix
        pipeline_file_path = inspect.getfile(inspect.currentframe())
        random_state = self._random_state
        data_overview = [
            'Overview',
            'The outcome label is %s.' % outcome_label,
            '%s is a boolean indicator which summarizes whether all components ' % outcome_label,
            'in the lab panel order represented by a given row are normal.',
            'Each row represents a unique lab panel order.',
            "Each row contains fields summarizing the patient's demographics,",
            'inpatient admit date, prior vitals, and prior lab results.',
            "Most cells in the matrix represent a count statistic for an event's",
            "occurrence or a difference between an event's time and index_time.",
        ]

        # Bundle parameters into single object to be unpacked in SLP.
        params['raw_matrix_path'] = raw_matrix_path
        params['processed_matrix_path'] = processed_matrix_path
        params['features_to_add'] = features_to_add
        params['features_to_keep'] = features_to_keep
        params['imputation_strategies'] = imputation_strategies
        params['features_to_remove'] = features_to_remove
        params['outcome_label'] = outcome_label
        params['selection_problem'] = selection_problem
        params['selection_algorithm'] = selection_algorithm
        params['percent_features_to_select'] = percent_features_to_select
        params['matrix_class'] = matrix_class
        params['pipeline_file_path'] = pipeline_file_path
        params['data_overview'] = data_overview
        params['random_state'] = random_state

        # Defer processing logic to SupervisedLearningPipeline.
        SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
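
        # The bundle above carries I/O paths, the outcome label, imputation
        # and feature-selection settings, and the matrix class, so the shared
        # processing logic in the parent class needs no subclass-specific
        # arguments.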

    def _train_and_analyze_predictors(self):
        log.info('Training and analyzing predictors...')
        problem = SupervisedLearningPipeline.CLASSIFICATION
        meta_report = None
        fm_io = FeatureMatrixIO()

        # Build paths for output.
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(self, pipeline_file_name)

        # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        # Train and analyze algorithms.
        for algorithm in algorithms_to_test:
            log.info('Training and analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            # Define hyperparams.
            hyperparams = {}
            hyperparams['algorithm'] = algorithm
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            hyperparams['max_iter'] = 1024
            hyperparams['random_state'] = self._random_state

            # If bifurcated algorithm, define bifurcator.
            if 'bifurcated' in algorithm:
                # bifurcator: LAB.pre == 0, i.e., split on whether this lab
                # has ever been ordered for the patient before.
                hyperparams['bifurcator'] = '%s.pre' % self._var
                hyperparams['bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
                hyperparams['bifurcation_value'] = 0
                hyperparams['bifurcated'] = True
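
            # Note (assumption, not confirmed by this file): a bifurcated
            # classifier presumably trains separate sub-models on each side
            # of the split, i.e. orders with no prior instances of this lab
            # vs. orders with at least one.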

            # Train classifier.
            predictor_path = self._build_model_dump_path(algorithm)
            if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED
            else:
                status = SupervisedLearningPipeline._train_predictor(self, problem, [0, 1], hyperparams)

            # If training failed, write an error report.
            if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
                y_train_counts = self._y_train[self._y_train.columns[0]].value_counts()
                y_test_counts = self._y_test[self._y_test.columns[0]].value_counts()
                # Skip all analysis and reporting.
                # This will be true for all algorithms, so just return.
                # Build error report.
                algorithm_report = DataFrame(
                    {
                    'lab_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                    },
                    columns=[
                        'lab_panel', 'algorithm', 'error',
                        'y_train.value_counts()', 'y_test.value_counts()'
                    ]
                )
                # Note: the episode count in this header is hard-coded to
                # 10000 rather than echoing the num_episodes argument.
                header = ['LabNormalityPredictionPipeline("%s", 10000)' % self._var]
                # Write error report.
                fm_io.write_data_frame_to_file(algorithm_report, \
                    '/'.join([report_dir, '%s-normality-prediction-report.tab' % (self._var)]), \
                    header)
            # If successfully trained, append to a meta report.
            elif status == SupervisedClassifier.TRAINED:
                pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)
                SupervisedLearningPipeline._analyze_predictor(self, report_dir, pipeline_prefix)
                SupervisedLearningPipeline._analyze_predictor_traindata(self, report_dir, pipeline_prefix)
                if meta_report is None:
                    meta_report = fm_io.read_file_to_data_frame('/'.join([report_dir, '%s-report.tab' % pipeline_prefix]))
                else:
                    algorithm_report = fm_io.read_file_to_data_frame('/'.join([report_dir, '%s-report.tab' % pipeline_prefix]))
                    log.debug('algorithm_report: %s' % algorithm_report)
                    # DataFrame.append was removed in pandas 2.0; with newer
                    # pandas this would be pd.concat([meta_report, algorithm_report]).
                    meta_report = meta_report.append(algorithm_report)
                # Write predictor to disk.
                predictor = SupervisedLearningPipeline.predictor(self)
                predictor_path = self._build_model_dump_path(algorithm)
                joblib.dump(predictor, predictor_path)

        # After building per-algorithm reports, write to meta report.
        # Note that if there were insufficient samples to build any of the
        # algorithms, then meta_report will still be None.
        if meta_report is not None:
            header = ['LabNormalityPredictionPipeline("%s", 10000)' % self._var]
            fm_io.write_data_frame_to_file(meta_report, \
                '/'.join([data_dir, '%s-normality-prediction-report.tab' % self._var]), header)
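
# A minimal usage sketch (illustrative only, not in the original source;
# 'LABMETB' is a hypothetical lab panel code, and the episode count mirrors
# the hard-coded report header above):
if __name__ == '__main__':
    pipeline = LabNormalityPredictionPipeline('LABMETB', 10000,
                                              use_cache=True,
                                              random_state=123456789)
    # Construction builds the raw matrix; run the remaining steps explicitly.
    pipeline._build_processed_feature_matrix()
    pipeline._train_and_analyze_predictors()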