def _build_processed_feature_matrix(self):
    """Bundle the lab-change processing parameters and delegate the work.

    Resolves the raw and processed matrix paths through the instance's
    path builders, reads ``self._change_params`` and ``self._var``, packs
    every setting into one ``params`` dict and passes it to
    ``SupervisedLearningPipeline._build_processed_feature_matrix``.

    Returns None.
    """
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path(
        raw_matrix_path)

    # Name of the feature holding the previous measurement; it is used as
    # the filter criterion below (paired with a NaN value).
    prev_measurement_feature = self._change_params['feature_old']
    features_to_add = {'change': [self._change_params]}
    features_to_filter_on = [{
        'feature': prev_measurement_feature,
        'value': np.nan
    }]
    imputation_strategies = {}

    features_to_remove = [
        'pat_id', 'order_time', 'order_proc_id', 'ord_num_value',
        'proc_code', 'abnormal_panel', 'all_components_normal',
        'num_normal_components',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post',
        'Death.postTimeDays',
        'num_components'
    ]
    features_to_keep = [
        # Keep the # of times it's been ordered in past, even if low info.
        '%s.pre' % self._var
    ]

    outcome_label = 'unchanged_yn'
    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabChangeMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())

    # Human-readable description of the matrix, one fragment per entry.
    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether the lab test ' % outcome_label,
        'result is unchanged compared to the previous measurement.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
        "Lab panel orders were only included if a previous measurement of",
        "the same lab panel has been recorded."
    ]

    # Bundle parameters into single object.
    params['raw_matrix_path'] = raw_matrix_path
    params['processed_matrix_path'] = processed_matrix_path
    params['features_to_add'] = features_to_add
    params['features_to_keep'] = features_to_keep
    params['features_to_filter_on'] = features_to_filter_on
    params['imputation_strategies'] = imputation_strategies
    params['features_to_remove'] = features_to_remove
    params['outcome_label'] = outcome_label
    params['selection_problem'] = selection_problem
    params['selection_algorithm'] = selection_algorithm
    params['percent_features_to_select'] = percent_features_to_select
    params['matrix_class'] = matrix_class
    params['pipeline_file_path'] = pipeline_file_path
    params['data_overview'] = data_overview

    # Log only once the dict is populated; logging it right after
    # ``params = {}`` could never show anything but an empty dict.
    log.debug('params: %s' % params)

    # Defer processing logic to SupervisedLearningPipeline.
    SupervisedLearningPipeline._build_processed_feature_matrix(
        self, params)
def _build_processed_feature_matrix(self):
    """Assemble the antibiogram processing parameters and delegate.

    The outcome label is ``'<self.drug>_susc'``. The ``_susc`` columns of
    every other listed antibiotic and all ``_tested`` columns are added to
    the removal list. The finished ``params`` dict is passed to
    ``SupervisedLearningPipeline._build_processed_feature_matrix``.

    Returns None.
    """
    raw_path = self._build_raw_matrix_path()
    processed_path = self._build_processed_matrix_path()

    drugs = (
        'ceftriaxone',
        'meropenem',
        'piperacillin_tazobactam',
        'cefazolin',
        'imipenem',
        'aztreonam',
    )

    dropped = [
        'pat_id', 'shifted_order_time',
        'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post',
        'Death.postTimeDays',
        'organism_name',
    ]
    # Susceptibility columns of the other antibiotics come first, then
    # every "tested" flag (including the one for self.drug).
    dropped.extend('%s_susc' % name for name in drugs if name != self.drug)
    dropped.extend('%s_tested' % name for name in drugs)

    target = '%s_susc' % self.drug

    overview = [
        'Overview',
        'The outcome label is %s.' % target,
        '%s is a boolean indicator which summarizes whether all components ' % target,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle handed over to SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_path,
        'processed_matrix_path': processed_path,
        'features_to_add': {},
        'features_to_keep': ['LABBLC-LABBLC2-LABURNC.pre'],
        'imputation_strategies': {},
        'features_to_remove': dropped,
        'outcome_label': target,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': AntiBiogramMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(
        self, params)
def _build_processed_feature_matrix(self):
    """Bundle the lab-normality processing parameters, delegate the work,
    then make sure ``self.feat2imputed_dict`` is populated.

    The removal list depends on ``LocalEnv.DATASET_SOURCE_NAME`` ('STRIDE'
    versus anything else) and on ``self._isLabPanel``. The outcome label is
    'all_components_normal' for panels and 'component_normal' otherwise.
    After ``SupervisedLearningPipeline._build_processed_feature_matrix``
    returns, an empty ``self.feat2imputed_dict`` is filled with the most
    frequent value of each ``self._X_train`` column.

    Returns None.
    """
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()
    features_to_add = {}
    imputation_strategies = {}

    # Columns removed for every dataset source.
    features_to_remove = [
        'pat_id', 'order_time', 'order_proc_id',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
    ]

    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        features_to_remove += [
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
            'Death.post',
            'Death.postTimeDays'
        ]
        if self._isLabPanel:
            features_to_remove += [
                'proc_code', 'num_components', 'num_normal_components',
                'abnormal_panel'
            ]
        else:
            features_to_remove += ['base_name']
    else:
        # Race categories are not hard-coded for other sources; they are
        # obtained from the factory at run time.
        race_features = self._factory.queryAllRaces()
        features_to_remove += [x + '.preTimeDays' for x in race_features]
        if self._isLabPanel:
            features_to_remove += [
                'proc_code', 'num_normal_components', 'num_components'
            ]
        else:
            features_to_remove += ['base_name']

    # Both dataset branches use the same pair of labels.
    # TODO: danger, previous version might not be consistent!
    if self._isLabPanel:
        outcome_label = 'all_components_normal'
    else:
        outcome_label = 'component_normal'

    features_to_keep = [
        # Keep the # of times it's been ordered in past, even if low info.
        '%s.pre' % self._var
    ]
    if self.includeLastNormality:
        features_to_keep.append('last_normality')

    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabNormalityMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())
    random_state = self._random_state

    # Human-readable description of the matrix, one fragment per entry.
    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components ' % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Bundle parameters into single object to be unpacked in SLP.
    params['raw_matrix_path'] = raw_matrix_path
    params['processed_matrix_path'] = processed_matrix_path
    params['features_to_add'] = features_to_add
    params['features_to_keep'] = features_to_keep
    params['imputation_strategies'] = imputation_strategies
    params['features_to_remove'] = features_to_remove
    params['outcome_label'] = outcome_label
    params['selection_problem'] = selection_problem
    params['selection_algorithm'] = selection_algorithm
    params['percent_features_to_select'] = percent_features_to_select
    params['matrix_class'] = matrix_class
    params['pipeline_file_path'] = pipeline_file_path
    params['data_overview'] = data_overview
    params['random_state'] = random_state

    # Defer processing logic to SupervisedLearningPipeline.
    SupervisedLearningPipeline._build_processed_feature_matrix(
        self, params)

    # For testing the model on the holdout set, should remember features
    # to select from the raw matrix of the holdout data.
    final_features = self._X_train.columns.values
    if not self.feat2imputed_dict:
        # The dict was not created during imputation, probably because the
        # processed matrix was loaded from a previous session. Take the
        # 'best guess' for the imputed value as the most common one in
        # each column.
        for feat in final_features:
            most_freq_val = self._X_train[feat].value_counts().idxmax()
            self.feat2imputed_dict[feat] = most_freq_val
def _build_processed_feature_matrix(self):
    """Assemble the MRSA processing parameters and delegate.

    Builds one ``params`` dict (paths, feature lists, outcome label
    'mrsa_present', feature-selection settings, overview text, random
    state) and passes it to
    ``SupervisedLearningPipeline._build_processed_feature_matrix``.

    Returns None.
    """
    raw_path = self._build_raw_matrix_path()
    processed_path = self._build_processed_matrix_path()

    dropped = [
        'pat_id', 'order_time', 'pat_enc_csn_id',
        'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post',
        'Death.postTimeDays',
    ]

    # Placeholder entry: per the original note, the pipeline needs at
    # least one element in this list to function.
    retained = ["LABBLC.pre"]

    target = 'mrsa_present'

    overview = [
        'Overview',
        'The outcome label is %s.' % target,
        '%s is a boolean indicator which summarizes whether all components ' % target,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle handed over to SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_path,
        'processed_matrix_path': processed_path,
        'features_to_add': {},
        'features_to_keep': retained,
        'imputation_strategies': {},
        'features_to_remove': dropped,
        'outcome_label': target,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': MRSAMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_feature_matrix(self):
    """Bundle the lab-normality processing parameters and delegate.

    The removal list and outcome label depend on
    ``LocalEnv.DATASET_SOURCE_NAME``: 'STRIDE' uses a fixed list and the
    label 'all_components_normal'; 'UMich' queries the race categories
    from ``self._factory`` and uses the label 'abnormal_lab'. The bundle is
    passed to ``SupervisedLearningPipeline._build_processed_feature_matrix``.

    Returns None.

    Raises:
        ValueError: if ``LocalEnv.DATASET_SOURCE_NAME`` is neither 'STRIDE'
            nor 'UMich'. Previously this case fell through and failed
            later with an UnboundLocalError on ``outcome_label``.
    """
    params = {}
    raw_matrix_path = self._build_raw_matrix_path()
    processed_matrix_path = self._build_processed_matrix_path()
    features_to_add = {}
    imputation_strategies = {}

    dataset_source = LocalEnv.DATASET_SOURCE_NAME
    if dataset_source == 'STRIDE':
        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id',
            'proc_code', 'abnormal_panel',
            'num_normal_components',
            'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
            'RaceWhiteHispanicLatino.preTimeDays',
            'RaceWhiteNonHispanicLatino.preTimeDays',
            'RaceHispanicLatino.preTimeDays',
            'RaceAsian.preTimeDays',
            'RaceBlack.preTimeDays',
            'RacePacificIslander.preTimeDays',
            'RaceNativeAmerican.preTimeDays',
            'RaceOther.preTimeDays',
            'RaceUnknown.preTimeDays',
            'Death.post',
            'Death.postTimeDays',
            'num_components'
        ]
        outcome_label = 'all_components_normal'  # TODO: for component...
    elif dataset_source == 'UMich':
        features_to_remove = [
            'pat_id', 'order_time', 'order_proc_id',
            'Birth.pre',
            'Male.preTimeDays', 'Female.preTimeDays',
        ]
        # Race categories are obtained from the factory at run time
        # instead of being hard-coded.
        race_features = self._factory.queryAllRaces()
        features_to_remove += [x + '.preTimeDays' for x in race_features]
        if self._isLabPanel:
            features_to_remove += ['proc_code', 'num_normal_components', 'num_components']
        else:
            features_to_remove += ['base_name']
        # NOTE(review): set at branch level so that both panel and
        # component runs get a label -- confirm against the upstream file.
        outcome_label = 'abnormal_lab'
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # further down when the label is first used.
        raise ValueError(
            'Unsupported LocalEnv.DATASET_SOURCE_NAME: %r'
            % (dataset_source,))

    features_to_keep = [
        # Keep the # of times it's been ordered in past, even if low info.
        '%s.pre' % self._var
    ]

    selection_problem = FeatureSelector.CLASSIFICATION
    selection_algorithm = FeatureSelector.RECURSIVE_ELIMINATION
    percent_features_to_select = 0.05
    matrix_class = LabNormalityMatrix
    pipeline_file_path = inspect.getfile(inspect.currentframe())
    random_state = self._random_state

    # Human-readable description of the matrix, one fragment per entry.
    data_overview = [
        'Overview',
        'The outcome label is %s.' % outcome_label,
        '%s is a boolean indicator which summarizes whether all components ' % outcome_label,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Bundle parameters into single object to be unpacked in SLP.
    params['raw_matrix_path'] = raw_matrix_path
    params['processed_matrix_path'] = processed_matrix_path
    params['features_to_add'] = features_to_add
    params['features_to_keep'] = features_to_keep
    params['imputation_strategies'] = imputation_strategies
    params['features_to_remove'] = features_to_remove
    params['outcome_label'] = outcome_label
    params['selection_problem'] = selection_problem
    params['selection_algorithm'] = selection_algorithm
    params['percent_features_to_select'] = percent_features_to_select
    params['matrix_class'] = matrix_class
    params['pipeline_file_path'] = pipeline_file_path
    params['data_overview'] = data_overview
    params['random_state'] = random_state

    # Defer processing logic to SupervisedLearningPipeline.
    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)
def _build_processed_feature_matrix(self):
    """Assemble the lab-culture processing parameters and delegate.

    Builds one ``params`` dict (paths, feature lists, outcome label
    'no_bacteria', feature-selection settings, overview text, random
    state) and passes it to
    ``SupervisedLearningPipeline._build_processed_feature_matrix``.

    Returns None.
    """
    raw_path = self._build_raw_matrix_path()
    processed_path = self._build_processed_matrix_path()

    dropped = [
        'pat_anon_id', 'shifted_order_time',
        'proc_code', 'abnormal_panel',
        'Birth.pre',
        'Male.preTimeDays', 'Female.preTimeDays',
        'RaceWhiteHispanicLatino.preTimeDays',
        'RaceWhiteNonHispanicLatino.preTimeDays',
        'RaceHispanicLatino.preTimeDays',
        'RaceAsian.preTimeDays',
        'RaceBlack.preTimeDays',
        'RacePacificIslander.preTimeDays',
        'RaceNativeAmerican.preTimeDays',
        'RaceOther.preTimeDays',
        'RaceUnknown.preTimeDays',
        'Death.post',
        'Death.postTimeDays',
        # Organism columns.
        'escherichia_coli',
        "organism_name",
        "staphylococcus_aureus",
        "enterococcus_species",
        "klebsiella_pneumoniae",
        "pseudomonas_aeruginosa",
        "coag_negative_staphylococcus",
        "enterococcus_faecalis",
        "proteus_mirabilis",
        "candida_albicans",
    ]

    # The count of prior orders is kept even if it carries little info.
    retained = ['%s.pre' % self._var]

    target = 'no_bacteria'

    overview = [
        'Overview',
        'The outcome label is %s.' % target,
        '%s is a boolean indicator which summarizes whether all components ' % target,
        'in the lab panel order represented by a given row are normal.',
        'Each row represents a unique lab panel order.',
        "Each row contains fields summarizing the patient's demographics",
        'inpatient admit date, prior vitals, and prior lab results.',
        "Most cells in matrix represent a count statistic for an event's",
        "occurrence or a difference between an event's time and index_time.",
    ]

    # Single bundle handed over to SupervisedLearningPipeline.
    params = {
        'raw_matrix_path': raw_path,
        'processed_matrix_path': processed_path,
        'features_to_add': {},
        'features_to_keep': retained,
        'imputation_strategies': {},
        'features_to_remove': dropped,
        'outcome_label': target,
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.05,
        'matrix_class': LabCultureMatrix,
        'pipeline_file_path': inspect.getfile(inspect.currentframe()),
        'data_overview': overview,
        'random_state': self._random_state,
    }

    SupervisedLearningPipeline._build_processed_feature_matrix(self, params)