def _analyze_predictors_on_holdout(self):
    """Evaluate every previously dumped predictor on the holdout set.

    For each algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS the
    dumped model is loaded from the path given by
    ``self._build_model_dump_path`` and handed to
    ``SupervisedLearningPipeline._analyze_predictor_holdoutset``.

    Algorithms without a loadable model dump are skipped with a warning.
    Previously the analysis still ran in that case, using whatever
    predictor happened to be on ``self`` (typically the model of the
    preceding algorithm), which mislabels the resulting report.
    """
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)

    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    # Bifurcated variants are currently disabled:
    # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    #     algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    for algorithm in algorithms_to_test:
        log.info('analyzing %s...' % algorithm)
        report_dir = '/'.join([data_dir, algorithm])
        pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                          algorithm)

        predictor_path = self._build_model_dump_path(algorithm)
        # TODO(sbala): Fix joblib.load so that it works for bifurcated
        # supervised classifiers.
        if 'bifurcated' in algorithm or not os.path.exists(predictor_path):
            log.warning('No loadable model for %s at %s; skipping holdout '
                        'analysis.' % (algorithm, predictor_path))
            continue

        log.debug('Loading model from disk...')
        self._predictor = joblib.load(predictor_path)
        SupervisedLearningPipeline._analyze_predictor_holdoutset(
            self, report_dir, pipeline_prefix)
def __init__(
        self,
        change_params,
        lab_panel,
        num_episodes,
        use_cache=None,
        random_state=None,
        build_raw_only=False,
):
    """Initialise the lab-change pipeline and run its stages.

    Args:
        change_params: dict describing the change definition. It is stored
            by reference and gains a 'feature_old' entry.
        lab_panel, num_episodes, use_cache, random_state: forwarded
            unchanged to SupervisedLearningPipeline.__init__.
        build_raw_only: when true, stop after the raw feature matrix has
            been built.
    """
    SupervisedLearningPipeline.__init__(
        self, lab_panel, num_episodes, use_cache, random_state)

    self._change_params = change_params
    self._change_params['feature_old'] = \
        self._lookup_previous_measurement_feature(self._var)
    log.debug('change_params: %s' % self._change_params)

    # The raw matrix is needed in every mode.
    self._build_raw_feature_matrix()
    if build_raw_only:
        return

    self._build_processed_feature_matrix()
    self._train_and_analyze_predictors()
def __init__(self, lab_panel, microcultures, num_episodes,
             use_cache=None, random_state=None):
    """Initialise the pipeline, then build matrices and train predictors.

    ``microcultures`` is kept on the instance as ``self.panel``; the other
    arguments are forwarded to SupervisedLearningPipeline.__init__.
    """
    SupervisedLearningPipeline.__init__(
        self, lab_panel, num_episodes, use_cache, random_state)
    self.panel = microcultures

    # Run the three pipeline stages in order.
    for run_stage in (self._build_raw_feature_matrix,
                      self._build_processed_feature_matrix,
                      self._train_and_analyze_predictors):
        run_stage()
def __init__(self, lab_panel, num_episodes, use_cache=None,
             random_state=None, isLabPanel=True, notUsePatIds=None,
             pat_batch_ind=None):
    """Initialise the pipeline and build only the raw feature matrix.

    Args:
        lab_panel, num_episodes, use_cache, random_state, isLabPanel:
            forwarded unchanged to SupervisedLearningPipeline.__init__.
        notUsePatIds: patient ids stored on the instance as
            ``self.notUsePatIds``. Defaults to a fresh empty list per
            instance.
        pat_batch_ind: stored on the instance as ``self.pat_batch_ind``.
    """
    # A literal [] default would be one list object shared by every
    # instance created without this argument, so create it per call.
    self.notUsePatIds = [] if notUsePatIds is None else notUsePatIds
    self.pat_batch_ind = pat_batch_ind
    self.usedPatIds = []
    SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                        use_cache, random_state,
                                        isLabPanel)
    self._factory = FeatureMatrixFactory()
    self._build_raw_feature_matrix()
def __init__(self, lab_panel, num_episodes, use_cache=None,
             random_state=None, isLabPanel=True, timeLimit=None,
             notUsePatIds=None, holdOut=False, pat_batch_ind=None):
    """Initialise the normality pipeline for training or holdout use.

    Args:
        lab_panel, num_episodes, use_cache, random_state, isLabPanel,
            timeLimit, holdOut: forwarded to
            SupervisedLearningPipeline.__init__.
        notUsePatIds: stored as ``self.notUsePatIds``.
        holdOut: when true, load the saved feature->imputed-value dict and
            evaluate on holdout data; otherwise record the used patient
            ids and build the processed matrix and baseline results.
        pat_batch_ind: stored as ``self.pat_batch_ind``.

    Pickle files are opened in binary mode inside ``with`` blocks so that
    the handles are closed and HIGHEST_PROTOCOL data is not corrupted by
    text-mode translation.
    """
    self.notUsePatIds = notUsePatIds
    self.pat_batch_ind = pat_batch_ind
    self.usedPatIds = []
    SupervisedLearningPipeline.__init__(
        self, lab_panel, num_episodes, use_cache, random_state,
        isLabPanel, timeLimit, holdOut,
        isLabNormalityPredictionPipeline=True)
    # TODO: naming of lab_panel
    self._factory = FeatureMatrixFactory()
    self._build_raw_feature_matrix()

    data_lab_folder = self._fetch_data_dir_path(
        inspect.getfile(inspect.currentframe()))
    feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

    if holdOut:
        # For holdOut evaluation data, produce the raw matrix and pick
        # features according to the saved feat2imputed_dict.
        with open(feat2imputed_dict_path, 'rb') as dict_file:
            self.feat2imputed_dict = pickle.load(dict_file)
        self._build_processed_feature_matrix_holdout()
        self._analyze_predictors_on_holdout()
    else:
        # For training/validation data, record the pat_ids, selected
        # features and their imputed values.
        with open('data/used_patient_set_%s.pkl' % self._var,
                  'wb') as ids_file:
            pickle.dump(self.usedPatIds, ids_file, pickle.HIGHEST_PROTOCOL)
        self._build_processed_feature_matrix()
        self._build_baseline_results()  # TODO: prototype in SLPP
        return

        # NOTE(review): everything below is unreachable because of the
        # early return above, which is kept as in the original; confirm
        # whether training is meant to be disabled here.
        # TODO: find better place to put the dict.pkl
        with open(feat2imputed_dict_path, 'wb') as dict_file:
            pickle.dump(self.feat2imputed_dict, dict_file,
                        pickle.HIGHEST_PROTOCOL)
        self._train_and_analyze_predictors()
def _build_raw_feature_matrix(self):
    """Build the raw LabNormalityMatrix and record the patients it holds.

    Outside holdout mode the freshly written matrix is read back and the
    distinct ``pat_id`` values are stored in ``self.usedPatIds`` as a set.
    """
    matrix_path = self._build_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, LabNormalityMatrix, matrix_path)

    if self._holdOut:
        return

    raw_frame = FeatureMatrixIO().read_file_to_data_frame(matrix_path)
    self.usedPatIds = set(raw_frame['pat_id'].values)
def __init__(self, lab_panel, num_episodes, use_cache=None,
             random_state=None, timeLimit=None, notUsePatIds=None,
             holdOut=False, pat_batch_ind=None, includeLastNormality=True):
    """Initialise the normality pipeline, optionally adding last_normality.

    Args:
        lab_panel, num_episodes, use_cache, random_state, timeLimit,
            notUsePatIds: forwarded to SupervisedLearningPipeline.__init__
            (notUsePatIds is not stored on this instance here).
        holdOut: stored as ``self._holdOut``. When true, the saved
            feature->imputed-value dict is loaded and the predictors are
            analysed on holdout data; otherwise the pipeline is trained.
        pat_batch_ind: stored as ``self.pat_batch_ind``.
        includeLastNormality: when true, the raw matrix file is rewritten
            with an extra ``last_normality`` column holding the same
            patient's previous outcome label (NaN for a patient's first
            row).
    """
    self.pat_batch_ind = pat_batch_ind
    self._holdOut = holdOut
    self.usedPatIds = []
    SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                        use_cache, random_state, timeLimit,
                                        notUsePatIds)
    # TODO: naming of lab_panel
    self._factory = FeatureMatrixFactory()
    self._build_raw_feature_matrix()

    if LAB_TYPE == 'panel':
        self.ylabel = 'all_components_normal'
    else:
        self.ylabel = 'component_normal'

    self.includeLastNormality = includeLastNormality
    if self.includeLastNormality:
        raw_tab_path = ('data/' + lab_panel +
                        '/%s-normality-matrix-raw.tab' % lab_panel)
        fm_io = FeatureMatrixIO()
        df = fm_io.read_file_to_data_frame(raw_tab_path)
        df = df.sort_values(['pat_id', 'order_time']).reset_index(drop=True)
        # Previous row's label within each patient. This replaces a
        # per-row loop over DataFrame.ix, which pandas 1.0 removed.
        # The float cast keeps the column numeric with NaN for "no
        # previous order", as the loop produced.
        df['last_normality'] = (
            df.groupby('pat_id')[self.ylabel].shift(1).astype(float))
        df.to_csv(raw_tab_path, index=False, sep='\t')

    data_lab_folder = self._fetch_data_dir_path(
        inspect.getfile(inspect.currentframe()))
    feat2imputed_dict_path = data_lab_folder + '/feat2imputed_dict.pkl'

    if holdOut:
        # For holdOut evaluation data, produce the raw matrix and pick
        # features according to the saved feat2imputed_dict.
        with open(feat2imputed_dict_path, 'rb') as dict_file:
            self.feat2imputed_dict = pickle.load(dict_file)
        self._build_processed_feature_matrix_holdout()
        self._analyze_predictors_on_holdout()
    else:
        # For training/validation data, record the pat_ids, selected
        # features and their imputed values.
        with open('data/used_patient_set_%s.pkl' % self._var,
                  'wb') as ids_file:
            pickle.dump(self.usedPatIds, ids_file, pickle.HIGHEST_PROTOCOL)
        self._build_processed_feature_matrix()
        self._build_baseline_results()  # TODO: prototype in SLPP

        # TODO: find better place to put the dict.pkl
        with open(feat2imputed_dict_path, 'wb') as dict_file:
            pickle.dump(self.feat2imputed_dict, dict_file,
                        pickle.HIGHEST_PROTOCOL)
        self._train_and_analyze_predictors()
def _build_raw_matrix_path(self):
    """Return the raw matrix path, with a batch suffix when batching.

    When ``self.pat_batch_ind`` is truthy its string form is appended to
    the file name; otherwise the plain per-episode name is used.
    """
    # NOTE(review): a batch index of 0 is falsy and therefore maps to the
    # un-suffixed name; confirm whether batches are numbered from 1.
    if self.pat_batch_ind:
        template = ''.join(['%s-normality-matrix-%d-episodes-raw-',
                            str(self.pat_batch_ind),
                            '.tab'])
    else:
        template = '%s-normality-matrix-%d-episodes-raw.tab'

    this_file = inspect.getfile(inspect.currentframe())
    return SupervisedLearningPipeline._build_matrix_path(
        self, template, this_file)
def _build_processed_matrix_path(self): processed_matrix_filename = '%s-normality-matrix-processed.tab' % self._var # processed_matrix_path = os.path.join('data', self._var, processed_matrix_filename) # TODO return processed_matrix_path if not self._holdOut: template = '%s-normality-matrix-processed.tab' else: template = '%s-normality-matrix-%d-episodes-processed-holdout.tab' pipeline_file_path = inspect.getfile(inspect.currentframe()) return SupervisedLearningPipeline._build_matrix_path(self, template, \ pipeline_file_path)
def _build_raw_matrix_path(self): raw_matrix_filename = '%s-normality-matrix-raw.tab' % self._var # raw_matrix_filepath = os.path.join('data', self._var, raw_matrix_filename) # TODO if not os.path.exists('data'): os.mkdir('data') if not os.path.exists(os.path.join('data', self._var)): os.mkdir(os.path.join('data', self._var)) return raw_matrix_filepath if not self._holdOut: template = '%s-normality-matrix-raw.tab' else: template = '%s-normality-matrix-%d-episodes-raw-holdout.tab' pipeline_file_name = inspect.getfile(inspect.currentframe()) return SupervisedLearningPipeline._build_matrix_path(self, template, \ pipeline_file_name)
def _build_raw_matrix_path(self):
    """Return the raw change-matrix path inside the lab test directory.

    The directory comes from the parent class's _fetch_data_dir_path, so
    the raw matrix sits in the lab test's directory rather than in a
    sub-directory for one change definition and can be shared by
    pipelines using different change definitions.
    """
    this_file = inspect.getfile(inspect.currentframe())

    # File name: whitespace in the variable name is collapsed to hyphens.
    var_slug = '-'.join(self._var.split())
    file_name = '%s-change-matrix-%d-episodes-raw.tab' % (var_slug,
                                                          self._num_rows)

    lab_dir = SupervisedLearningPipeline._fetch_data_dir_path(self,
                                                              this_file)
    return '/'.join([lab_dir, file_name])
def _build_raw_feature_matrix(self):
    """Build the raw feature matrix using LabChangeMatrix.

    Delegates to SupervisedLearningPipeline._build_raw_feature_matrix with
    the path from ``self._build_raw_matrix_path()``.
    """
    destination = self._build_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, LabChangeMatrix, destination)
def _build_model_dump_path(self, algorithm):
    """Return the dump path for the change model of ``algorithm``.

    The algorithm name is baked into the template here; the leading
    ``%s`` placeholder is left for the parent class to fill in.
    """
    # '%%s' survives this formatting step as a literal '%s'.
    dump_template = '%%s-change-%s-model.pkl' % algorithm
    this_file = inspect.getfile(inspect.currentframe())
    return SupervisedLearningPipeline._build_model_dump_path(
        self, dump_template, this_file)
def _train_and_analyze_predictors(self):
    """Train (or load) one predictor per algorithm and write reports.

    For every algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    a dumped model is loaded when one exists, otherwise a predictor is
    trained; on INSUFFICIENT_SAMPLES an error report is written to the
    algorithm's report directory; on TRAINED the predictor is analysed,
    its report is appended to a meta report and the predictor is dumped.
    The meta report, if any, is finally written to the data directory.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; in particular confirm that the "write predictor to disk" step
    belongs inside the TRAINED branch.
    """
    log.info('Training and analyzing predictors...')
    problem = SupervisedLearningPipeline.CLASSIFICATION
    meta_report = None
    fm_io = FeatureMatrixIO()

    # Build paths for output.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = self._fetch_data_dir_path(pipeline_file_name)

    # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
    algorithms_to_test = list()
    algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    # The loop below is currently a no-op: the bifurcated variants are
    # disabled until the crash mentioned in the TODO is resolved.
    for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        pass # TODO:(raikens) something in the BifurcatedSupervisedClassifier pipeline is crashing
        #algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    # Train and analyse algorithms.
    for algorithm in algorithms_to_test:
        log.info('Training and analyzing %s...' % algorithm)
        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        log.debug('report_dir: %s' % report_dir)

        # Define hyperparams.
        hyperparams = {}
        hyperparams['algorithm'] = algorithm
        hyperparams[
            'hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        hyperparams['max_iter'] = 1024

        # If bifurcated algorithm, define bifurcator.
        if 'bifurcated' in algorithm:
            # bifurcator = LAB.pre == 0
            hyperparams['bifurcator'] = '%s.pre' % self._var
            hyperparams[
                'bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
            hyperparams['bifurcation_value'] = 0
            hyperparams['bifurcated'] = True

        # Train classifier, or reuse a model dumped by an earlier run.
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(
                predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED
        else:
            status = SupervisedLearningPipeline._train_predictor(
                self, problem, [0, 1], hyperparams)

        # If failed to train, write an error report.
        # Class counts of the first label column, used in that report.
        y_train_counts = self._y_train[
            self._y_train.columns[0]].value_counts()
        y_test_counts = self._y_test[
            self._y_test.columns[0]].value_counts()
        if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            # Skip all analysis and reporting.
            # NOTE(review): an earlier comment here said "just return",
            # but the loop continues with the next algorithm.
            # Build error report.
            algorithm_report = DataFrame(
                {
                    'lab_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                },
                columns=[
                    'lab_panel', 'algorithm', 'error',
                    'y_train.value_counts()', 'y_test.value_counts()'
                ])
            header = [
                'LabChangePredictionPipeline("%s", %d)' %
                (self._var, self._num_rows)
            ]
            # Write error report.
            fm_io.write_data_frame_to_file(algorithm_report, \
                '/'.join([report_dir, '%s-change-prediction-report.tab' % (self._var)]), \
                header)
        # If successfully trained, append to a meta report.
        elif status == SupervisedClassifier.TRAINED:
            pipeline_prefix = '%s-change-prediction-%s' % (self._var,
                                                           algorithm)
            SupervisedLearningPipeline._analyze_predictor(
                self, report_dir, pipeline_prefix)
            # The per-algorithm report is read back from the file named
            # '<pipeline_prefix>-report.tab' in report_dir.
            if meta_report is None:
                meta_report = fm_io.read_file_to_data_frame('/'.join(
                    [report_dir, '%s-report.tab' % pipeline_prefix]))
            else:
                algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                    [report_dir, '%s-report.tab' % pipeline_prefix]))
                log.debug('algorithm_report: %s' % algorithm_report)
                meta_report = meta_report.append(algorithm_report)
            # Write predictor to disk.
            predictor = SupervisedLearningPipeline.predictor(self)
            predictor_path = self._build_model_dump_path(algorithm)
            joblib.dump(predictor, predictor_path)

    # After building per-algorithm reports, write to meta report.
    # Note that if there were insufficient samples to build any of the
    # algorithms, then meta_report will still be None.
    if meta_report is not None:
        header = [
            'LabChangePredictionPipeline("%s", %d)' % (self._var,
                                                       self._num_rows)
        ]
        fm_io.write_data_frame_to_file(meta_report, \
            '/'.join([data_dir, '%s-change-prediction-report.tab' % self._var]), header)
def _build_composite_raw_feature_matrix(self):
    """Build the composite raw feature matrix using AntiBiogramMatrix.

    Delegates to SupervisedLearningPipeline._build_raw_feature_matrix with
    the path from ``self._build_composite_raw_matrix_path()``.
    """
    destination = self._build_composite_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, AntiBiogramMatrix, destination)
def _build_composite_raw_matrix_path(self):
    """Return the composite raw matrix path.

    The file-name template and this module's file name are handed to
    SupervisedLearningPipeline._build_matrix_path, which builds the path.
    """
    this_file = inspect.getfile(inspect.currentframe())
    return SupervisedLearningPipeline._build_matrix_path(
        self, '%s-normality-matrix-%d-episodes-raw.tab', this_file)
def _build_processed_feature_matrix(self):
    """Build the processed matrix for the ``<drug>_susc`` outcome.

    Collects the processing parameters (paths, columns to drop or keep,
    outcome label, feature-selection settings, overview text) into one
    dict and defers the processing itself to
    SupervisedLearningPipeline._build_processed_feature_matrix.
    """
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()

    antibiotic_flags = [
        'ceftriaxone', 'meropenem', 'piperacillin_tazobactam', 'cefazolin',
        'imipenem', 'aztreonam'
    ]

    features_to_remove = [
        'pat_id', 'shifted_order_time', 'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post', 'Death.postTimeDays',
        'organism_name'
    ]
    # Drop the susceptibility columns of every antibiotic except the one
    # being predicted, and all of the "tested" columns.
    features_to_remove.extend(
        '%s_susc' % flag for flag in antibiotic_flags if flag != self.drug)
    features_to_remove.extend(
        '%s_tested' % flag for flag in antibiotic_flags)

    outcome_label = '%s_susc' % self.drug

    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components '
        % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle, unpacked by SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_matrix_path,
        'processed_matrix_path': processed_matrix_path,
        'features_to_add': {},
        # Kept even if low-information.
        'features_to_keep': ['LABBLC-LABBLC2-LABURNC.pre'],
        'imputation_strategies': {},
        'features_to_remove': features_to_remove,
        'outcome_label': outcome_label,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': AntiBiogramMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': data_overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_feature_matrix(self):
    """Build the processed matrix for the ``no_bacteria`` outcome.

    Collects the processing parameters (paths, columns to drop or keep,
    outcome label, feature-selection settings, overview text) into one
    dict and defers the processing itself to
    SupervisedLearningPipeline._build_processed_feature_matrix.
    """
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()

    features_to_remove = [
        'pat_anon_id', 'shifted_order_time', 'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post', 'Death.postTimeDays',
        'escherichia_coli',
        "organism_name",
        "staphylococcus_aureus",
        "enterococcus_species",
        "klebsiella_pneumoniae",
        "pseudomonas_aeruginosa",
        "coag_negative_staphylococcus",
        "enterococcus_faecalis",
        "proteus_mirabilis",
        "candida_albicans"
    ]

    outcome_label = 'no_bacteria'

    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components '
        % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle, unpacked by SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_matrix_path,
        'processed_matrix_path': processed_matrix_path,
        'features_to_add': {},
        # Keep the # of times it's been ordered in past, even if low info.
        'features_to_keep': ['%s.pre' % self._var],
        'imputation_strategies': {},
        'features_to_remove': features_to_remove,
        'outcome_label': outcome_label,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': LabCultureMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': data_overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_feature_matrix(self):
    """Build the processed normality matrix for the configured data source.

    The columns to drop and the outcome label depend on
    ``LocalEnv.DATASET_SOURCE_NAME`` ('STRIDE' or 'UMich'). All processing
    parameters are bundled into one dict and handed to
    SupervisedLearningPipeline._build_processed_feature_matrix.

    Raises:
        ValueError: if ``LocalEnv.DATASET_SOURCE_NAME`` is neither
            'STRIDE' nor 'UMich'. Previously this surfaced later as an
            UnboundLocalError on ``outcome_label``.
    """
    # Define parameters for processing steps.
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()
    features_to_add = {}
    imputation_strategies = {}

    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id',
            'proc_code', 'abnormal_panel',
            'num_normal_components', 'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
            'Death.post',
            'Death.postTimeDays',
            'num_components'
        ]
        outcome_label = 'all_components_normal'  # TODO: for component...
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id',
            'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
        ]
        # Race columns are not hard-coded for this source; they are
        # whatever the factory reports.
        RACE_FEATURES = self._factory.queryAllRaces()
        features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES]
        if self._isLabPanel:
            features_to_remove += ['proc_code', 'num_normal_components',
                                   'num_components']
        else:
            features_to_remove += ['base_name']
        outcome_label = 'abnormal_lab'
    else:
        # Fail with a clear message instead of an UnboundLocalError later.
        raise ValueError('Unsupported LocalEnv.DATASET_SOURCE_NAME: %r'
                         % (LocalEnv.DATASET_SOURCE_NAME,))

    features_to_keep = [
        # Keep the # of times it's been ordered in past, even if low info.
        '%s.pre' % self._var
    ]

    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabNormalityMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())
    random_state = self._random_state
    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components '
        % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Bundle parameters into single object to be unpacked in SLP.
    params['raw_matrix_path'] = raw_matrix_path
    params['processed_matrix_path'] = processed_matrix_path
    params['features_to_add'] = features_to_add
    params['features_to_keep'] = features_to_keep
    params['imputation_strategies'] = imputation_strategies
    params['features_to_remove'] = features_to_remove
    params['outcome_label'] = outcome_label
    params['selection_problem'] = selection_problem
    params['selection_algorithm'] = selection_algorithm
    params['percent_features_to_select'] = percent_features_to_select
    params['matrix_class'] = matrix_class
    params['pipeline_file_path'] = pipeline_file_path
    params['data_overview'] = data_overview
    params['random_state'] = random_state

    # Defer processing logic to SupervisedLearningPipeline.
    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_matrix_path(self, raw_matrix_path):
    """Return the processed change-matrix path.

    ``raw_matrix_path`` is accepted but not used; the path is produced by
    SupervisedLearningPipeline._build_matrix_path from the template and
    this module's file name.
    """
    this_file = inspect.getfile(inspect.currentframe())
    return SupervisedLearningPipeline._build_matrix_path(
        self, '%s-change-matrix-%d-episodes-processed.tab', this_file)
def _build_processed_feature_matrix(self):
    """Build the processed matrix for the ``mrsa_present`` outcome.

    Collects the processing parameters (paths, columns to drop or keep,
    outcome label, feature-selection settings, overview text) into one
    dict and defers the processing itself to
    SupervisedLearningPipeline._build_processed_feature_matrix.
    """
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()

    features_to_remove = [
        'pat_id', 'order_time', 'pat_enc_csn_id',
        'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post', 'Death.postTimeDays'
    ]

    outcome_label = 'mrsa_present'

    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components '
        % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle, unpacked by SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_matrix_path,
        'processed_matrix_path': processed_matrix_path,
        'features_to_add': {},
        # Placeholder: the pipeline needs at least one element here.
        'features_to_keep': ["LABBLC.pre"],
        'imputation_strategies': {},
        'features_to_remove': features_to_remove,
        'outcome_label': outcome_label,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': MRSAMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': data_overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_raw_feature_matrix(self):
    """Build the raw feature matrix using ComponentNormalityMatrix.

    Delegates to SupervisedLearningPipeline._build_raw_feature_matrix with
    the path from ``self._build_raw_matrix_path()``.
    """
    destination = self._build_raw_matrix_path()
    SupervisedLearningPipeline._build_raw_feature_matrix(
        self, ComponentNormalityMatrix, destination)
def _build_processed_feature_matrix(self):
    """Build the processed matrix for the ``unchanged_yn`` outcome.

    Adds a 'change' feature described by ``self._change_params``, filters
    on the previous-measurement feature with a NaN filter value, and
    defers the processing itself to
    SupervisedLearningPipeline._build_processed_feature_matrix.
    """
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path(
        raw_matrix_path)
    log.debug('params: %s' % params)

    previous_feature = self._change_params['feature_old']

    features_to_remove = [
        'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
        'proc_code', 'abnormal_panel', 'all_components_normal',
        'num_normal_components', 'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post', 'Death.postTimeDays',
        'num_components'
    ]

    outcome_label = 'unchanged_yn'

    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether the lab test '
        % outcome_label,
        'result is unchanged compared to the previous measurement.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
        "Lab panel orders were only included if a previous measurement of",
        "the same lab panel has been recorded."
    ]

    # Single bundle, unpacked by SupervisedLearningPipeline.
    params.update({
        'raw_matrix_path': raw_matrix_path,
        'processed_matrix_path': processed_matrix_path,
        'features_to_add': {'change': [self._change_params]},
        # Keep the # of times it's been ordered in past, even if low info.
        'features_to_keep': ['%s.pre' % self._var],
        'features_to_filter_on': [{
            'feature': previous_feature,
            'value': np.nan
        }],
        'imputation_strategies': {},
        'features_to_remove': features_to_remove,
        'outcome_label': outcome_label,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': LabChangeMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': data_overview,
    })

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_feature_matrix(self): # Define parameters for processing steps. params = {} raw_matrix_path = self._build_raw_matrix_path() processed_matrix_path = self._build_processed_matrix_path() features_to_add = {} imputation_strategies = { #'sxu_new_imputation' } if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE': features_to_remove = [ 'pat_id', 'order_time', 'order_proc_id', 'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays', 'RaceWhiteHispanicLatino.preTimeDays', 'RaceWhiteNonHispanicLatino.preTimeDays', 'RaceHispanicLatino.preTimeDays', 'RaceAsian.preTimeDays', 'RaceBlack.preTimeDays', 'RacePacificIslander.preTimeDays', 'RaceNativeAmerican.preTimeDays', 'RaceOther.preTimeDays', 'RaceUnknown.preTimeDays', 'Death.post', 'Death.postTimeDays' ] if self._isLabPanel: features_to_remove += [ 'proc_code', 'num_components', 'num_normal_components', 'abnormal_panel' ] outcome_label = 'all_components_normal' # else: features_to_remove += ['base_name'] outcome_label = 'component_normal' # TODO: danger, previous version might not consistent! else: features_to_remove = [ 'pat_id', 'order_time', 'order_proc_id', 'Birth.pre', 'Male.preTimeDays', 'Female.preTimeDays', # 'Caucasian.preTimeDays', # 'Hispanic.preTimeDays', # 'Native Hawaiian and Other Pacific Islander.preTimeDays' ] RACE_FEATURES = self._factory.queryAllRaces() features_to_remove += [x + '.preTimeDays' for x in RACE_FEATURES] if self._isLabPanel: features_to_remove += [ 'proc_code', 'num_normal_components', 'num_components' ] outcome_label = 'all_components_normal' else: features_to_remove += ['base_name'] outcome_label = 'component_normal' # features_to_keep = [ # Keep the # of times it's been ordered in past, even if low info. 
'%s.pre' % self._var ] if self.includeLastNormality: features_to_keep.append('last_normality') selection_problem = FeatureSelector.CLASSIFICATION selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION percent_features_to_select = 0.05 matrix_class = LabNormalityMatrix pipeline_file_path = inspect.getfile(inspect.currentframe()) random_state = self._random_state data_overview = [ # Overview: 'Overview', # The outcome label is ___. 'The outcome label is %s.' % outcome_label, # %s is a boolean indicator which summarizes whether all components '%s is a boolean indicator which summarizes whether all components ' % outcome_label, # in the lab panel order represented by a given row are normal. 'in the lab panel order represented by a given row are normal.', # Each row represents a unique lab panel order. 'Each row represents a unique lab panel order.', # Each row contains fields summarizing the patient's demographics, "Each row contains fields summarizing the patient's demographics", # inpatient admit date, prior vitals, and prior lab results. 'inpatient admit date, prior vitals, and prior lab results.', # Most cells in matrix represent a count statistic for an event's "Most cells in matrix represent a count statistic for an event's", # occurrence or a difference between an event's time and index_time. "occurrence or a difference between an event's time and index_time.", ] # Bundle parameters into single object to be unpacked in SLP. 
params['raw_matrix_path'] = raw_matrix_path params['processed_matrix_path'] = processed_matrix_path params['features_to_add'] = features_to_add params['features_to_keep'] = features_to_keep params['imputation_strategies'] = imputation_strategies params['features_to_remove'] = features_to_remove params['outcome_label'] = outcome_label params['selection_problem'] = selection_problem params['selection_algorithm'] = selection_algorithm params['percent_features_to_select'] = percent_features_to_select params['matrix_class'] = matrix_class params['pipeline_file_path'] = pipeline_file_path params['data_overview'] = data_overview params['random_state'] = random_state # Defer processing logic to SupervisedLearningPipeline. SupervisedLearningPipeline._build_processed_feature_matrix( self, params) ''' For testing the model on the holdout set, should remember features to select from the raw matrix of the holdout data. ''' final_features = self._X_train.columns.values if not self.feat2imputed_dict: ''' The dict was not created during imputation. Probably because the processed matrix was loaded from previous session. Take the 'best guess' for the imputed value as the most common one in any column. ''' for feat in final_features: most_freq_val = self._X_train[feat].value_counts().idxmax() self.feat2imputed_dict[feat] = most_freq_val '''