def run(self):
    file_organizer = Syst.FileOrganizerLocal(
        working_folderpath=self.working_folderpath)

    raw_matrix_train, raw_matrix_test = Utils.split_rows(self.input_matrix)
    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=self.ylabel)

    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[('impute_features', Clas.FeatureImputer()),
               ('remove_features', Clas.FeatureRemover()),
               ('select_features', Clas.Select_Features())])
    X_train_processed = feature_processing_pipeline.fit_transform(
        X_train_raw, y_train)

    predictor = SupervisedClassifier(
        classes=[0, 1],
        hyperparams={
            'algorithm': 'random-forest',
            'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
            'max_iter': 1024
        })
    status = predictor.train(X_train_processed, column_or_1d(y_train))

    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=self.ylabel)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    # Keep only the positive-class column of the (n, 2) probability array.
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath())

    '''TODO'''
    from scripts.LabTestAnalysis.lab_statistics.stats_utils import get_confusion_metrics
    from sklearn.metrics import roc_auc_score
    AUC = roc_auc_score(y_test, y_test_pred_proba)
    sensitivity, specificity, LR_p, LR_n, PPV, NPV = get_confusion_metrics(
        actual_labels=y_test.values,
        predict_probas=y_test_pred_proba,
        threshold=0.5)
    print("AUC: %s, sensitivity: %s, specificity: %s, LR_p: %s, LR_n: %s, "
          "PPV: %s, NPV: %s."
          % (AUC, sensitivity, specificity, LR_p, LR_n, PPV, NPV))
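# For reference, the thresholded confusion metrics printed above follow the
# standard definitions sketched below (a sketch of the math, not necessarily
# the exact implementation inside get_confusion_metrics):
#
#     from sklearn.metrics import confusion_matrix
#     y_pred = (y_test_pred_proba >= 0.5).astype(int)
#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
#     sensitivity = tp / float(tp + fn)        # true positive rate
#     specificity = tn / float(tn + fp)        # true negative rate
#     LR_p = sensitivity / (1 - specificity)   # positive likelihood ratio
#     LR_n = (1 - sensitivity) / specificity   # negative likelihood ratio
#     PPV = tp / float(tp + fp)                # positive predictive value
#     NPV = tn / float(tn + fn)                # negative predictive value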
class BifurcatedSupervisedClassifier:
    BIFURCATION = 'bifurcation'
    EQUAL = '=='
    LTE = '<='
    GTE = '>='
    SUPPORTED_BIFURCATION_STRATEGIES = [EQUAL, GTE, LTE]

    def __init__(self, classes, hyperparams):
        if hyperparams['bifurcation_strategy'] not in \
                BifurcatedSupervisedClassifier.SUPPORTED_BIFURCATION_STRATEGIES:
            raise ValueError('Bifurcation strategy %s not supported.' %
                             hyperparams['bifurcation_strategy'])

        self._classes = classes
        self._hyperparams = hyperparams
        # Note that if we don't pass copies of hyperparams, then we won't
        # be able to change hyperparams independently in the two classifiers.
        self._sc_true = SupervisedClassifier(classes, hyperparams.copy())
        self._sc_false = SupervisedClassifier(classes, hyperparams.copy())

    def __repr__(self):
        classes_str = str(self._classes)
        hyperparams_str = ("hyperparams={'algorithm': %s, 'bifurcator': %s, "
                           "'bifurcation_strategy': %s, "
                           "'bifurcation_value': %s, 'random_state': %s}") % (
                               self._hyperparams['algorithm'],
                               self._hyperparams['bifurcator'],
                               self._hyperparams['bifurcation_strategy'],
                               self._hyperparams['bifurcation_value'],
                               self._hyperparams['random_state'])
        return "BifurcatedSupervisedClassifier(%s, %s)" % (classes_str,
                                                           hyperparams_str)

    __str__ = __repr__

    def _build_bifurcation_str(self):
        args = (self._hyperparams['bifurcator'],
                self._hyperparams['bifurcation_strategy'],
                self._hyperparams['bifurcation_value'])
        return '%s %s %s' % args

    def fetch_bifurcation_masks(self, X):
        log.debug('bifurcator: %s' % self._hyperparams['bifurcator'])
        log.debug('bifurcation_strategy: %s' %
                  self._hyperparams['bifurcation_strategy'])
        log.debug('bifurcation_value: %s' %
                  self._hyperparams['bifurcation_value'])

        bifurcator_values = X[self._hyperparams['bifurcator']].astype(float)
        bifurcation_value = self._hyperparams['bifurcation_value']
        # Compare strategies with == rather than `is`: `is` tests object
        # identity, which is not guaranteed to hold for equal strings.
        if self._hyperparams['bifurcation_strategy'] == \
                BifurcatedSupervisedClassifier.EQUAL:
            true_mask = bifurcator_values == bifurcation_value
            false_mask = bifurcator_values != bifurcation_value
        elif self._hyperparams['bifurcation_strategy'] == \
                BifurcatedSupervisedClassifier.LTE:
            true_mask = bifurcator_values <= bifurcation_value
            false_mask = bifurcator_values > bifurcation_value
        elif self._hyperparams['bifurcation_strategy'] == \
                BifurcatedSupervisedClassifier.GTE:
            true_mask = bifurcator_values >= bifurcation_value
            false_mask = bifurcator_values < bifurcation_value

        log.debug('X[%s].value_counts(): %s' %
                  (self._hyperparams['bifurcator'],
                   X[self._hyperparams['bifurcator']].value_counts()))
        log.debug('true_mask.value_counts(): %s' % true_mask.value_counts())
        log.debug('false_mask.value_counts(): %s' % false_mask.value_counts())

        return true_mask, false_mask

    def description(self):
        args = (self._hyperparams['algorithm'].upper().replace('-', '_'),
                self._build_bifurcation_str(),
                self._sc_true.description(),
                self._sc_false.description())
        return 'BIFURCATED_%s(%s, true=%s, false=%s)' % args

    def hyperparams(self):
        hyperparams = {
            'model_true': self._sc_true.hyperparams(),
            'model_false': self._sc_false.hyperparams()
        }
        return hyperparams

    def params(self):
        params = {
            'bifurcator': self._hyperparams['bifurcator'],
            'bifurcation_strategy': self._hyperparams['bifurcation_strategy'],
            'bifurcation_value': self._hyperparams['bifurcation_value'],
            'model_true': self._sc_true.description(),
            'model_false': self._sc_false.description()
        }
        return params
    def train(self, X_train, y_train):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_train)

        # Train sc_true.
        X_train_true = X_train[true_mask]
        y_train_true = y_train[true_mask]
        status_true = self._sc_true.train(X_train_true, y_train_true)
        if status_true == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_true

        # Train sc_false.
        X_train_false = X_train[false_mask]
        y_train_false = y_train[false_mask]
        status_false = self._sc_false.train(X_train_false, y_train_false)
        if status_false == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            return status_false

        return SupervisedClassifier.TRAINED

    def _stitch_disjoint_row(self, row):
        # The two sub-models score disjoint row subsets, so exactly one of
        # the merged prediction columns is non-null for any given row.
        if pd.isnull(row['y_pred_true']):
            val = row['y_pred_false']
        else:
            val = row['y_pred_true']
        return val

    def _stitch_prob_0(self, row):
        if pd.isnull(row['y_pred_prob_true_0']):
            val = row['y_pred_prob_false_0']
        else:
            val = row['y_pred_prob_true_0']
        return val

    def _stitch_prob_1(self, row):
        if pd.isnull(row['y_pred_prob_true_1']):
            val = row['y_pred_prob_false_1']
        else:
            val = row['y_pred_prob_true_1']
        return val

    def _predict_label_or_probability(self, X_test, probability=None):
        # NOTE: the stitching below only handles the single-column label case;
        # predict_probability implements its own two-column stitching.
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        if probability:
            y_pred_true = self._sc_true.predict_probability(X_test_true)
        else:
            y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        if probability:
            y_pred_false = self._sc_false.predict_probability(X_test_false)
        else:
            y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        if probability:
            column_names = ['y_pred_true_0', 'y_pred_true_1']
        else:
            column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                                   columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)

        if probability:
            column_names = ['y_pred_false_0', 'y_pred_false_1']
        else:
            column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(
            y_pred_false_df, how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)

        y_pred = mask_plus_true_plus_false['y_pred'].values
        return y_pred

    def predict(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_true = self._sc_true.predict(X_test_true)
        log.debug('y_pred_true: %s' % y_pred_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_false = self._sc_false.predict(X_test_false)
        log.debug('y_pred_false: %s' % y_pred_false)

        # Stitch results.
        column_names = ['y_pred_true']
        y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                                   columns=column_names)
        log.debug('y_pred_true_df: %s' % y_pred_true_df)

        column_names = ['y_pred_false']
        y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                    columns=column_names)
        log.debug('y_pred_false_df: %s' % y_pred_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                            left_index=True, right_index=True)
        mask_plus_true_plus_false = mask_plus_true.merge(
            y_pred_false_df, how='left', left_index=True, right_index=True)
        mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
            self._stitch_disjoint_row, axis=1)
        log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)

        y_pred = mask_plus_true_plus_false['y_pred'].values
        return y_pred

    def predict_probability(self, X_test):
        true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

        # Predict X_test_true.
        X_test_true = X_test[true_mask]
        y_pred_prob_true = self._sc_true.predict_probability(X_test_true)
        log.debug('y_pred_prob_true: %s' % y_pred_prob_true)

        # Predict X_test_false.
        X_test_false = X_test[false_mask]
        y_pred_prob_false = self._sc_false.predict_probability(X_test_false)
        log.debug('y_pred_prob_false: %s' % y_pred_prob_false)

        # Stitch results.
        column_names = ['y_pred_prob_true_0', 'y_pred_prob_true_1']
        y_pred_prob_true_df = DataFrame(y_pred_prob_true,
                                        index=X_test_true.index,
                                        columns=column_names)
        log.debug('y_pred_prob_true_df: %s' % y_pred_prob_true_df)

        column_names = ['y_pred_prob_false_0', 'y_pred_prob_false_1']
        y_pred_prob_false_df = DataFrame(y_pred_prob_false,
                                         index=X_test_false.index,
                                         columns=column_names)
        log.debug('y_pred_prob_false_df: %s' % y_pred_prob_false_df)

        true_mask_df = DataFrame(true_mask)
        mask_plus_true = true_mask_df.merge(y_pred_prob_true_df, how='left',
                                            left_index=True, right_index=True)
        composite = mask_plus_true.merge(y_pred_prob_false_df, how='left',
                                         left_index=True, right_index=True)
        composite['y_pred_prob_0'] = composite.apply(self._stitch_prob_0,
                                                     axis=1)
        composite['y_pred_prob_1'] = composite.apply(self._stitch_prob_1,
                                                     axis=1)
        log.debug('composite: %s' % composite)

        y_pred_prob = composite[['y_pred_prob_0', 'y_pred_prob_1']].values
        log.debug(y_pred_prob)
        return y_pred_prob
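# Example usage (a sketch: the bifurcator column name and threshold below are
# hypothetical, and SupervisedClassifier is assumed to accept the remaining
# hyperparams unchanged):
#
#     hyperparams = {
#         'algorithm': 'random-forest',
#         'bifurcator': 'some_feature_column',  # hypothetical column to split on
#         'bifurcation_strategy': BifurcatedSupervisedClassifier.LTE,
#         'bifurcation_value': 1.0,
#         'random_state': 0
#     }
#     bsc = BifurcatedSupervisedClassifier(classes=[0, 1],
#                                          hyperparams=hyperparams)
#     status = bsc.train(X_train, y_train)           # fits one model per branch
#     y_pred_prob = bsc.predict_probability(X_test)  # stitched (n, 2) array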
def run_one_lab_local(lab, lab_type, data_source, version, random_state=0):
    '''
    Train and evaluate one lab's model from a locally cached raw matrix,
    writing predicted probabilities alongside actual labels to disk.
    '''
    # X_train_raw, y_train = [[1], [2]], [1, 2]
    # X_test_raw, y_test = [[3], [4]], [3, 4]
    file_organizer = syst.FileOrganizerLocal(lab=lab,
                                             lab_type=lab_type,
                                             data_source=data_source,
                                             version=version)
    raw_matrix = file_organizer.get_raw_matrix()

    y_label = 'all_components_normal'

    ''' TODO: later, split on pat_ids '''
    raw_matrix_train, raw_matrix_test = Utils.split_rows(raw_matrix)
    patIds_train = raw_matrix_train['pat_id'].values.tolist()

    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=y_label)

    redundant_features = ['proc_code', 'num_components',
                          'num_normal_components', 'abnormal_panel']
    id_features = ['pat_id', 'order_proc_id', 'order_time']
    numeric_features = X_train_raw.columns[~X_train_raw.columns.isin(
        [y_label] + redundant_features + id_features)]

    ''' Check that the remaining features are all numeric. '''
    assert X_train_raw[numeric_features].select_dtypes(
        exclude=['object']).shape == X_train_raw[numeric_features].shape

    features_by_type = {'redundant_features': redundant_features,
                        'id_features': id_features,
                        'numeric_features': numeric_features,
                        'y_label': y_label}

    '''
    (1) Feature Impute: Imputation of some numerical values depends on prior
        stats of the same patient, so certain auxiliary columns are still
        useful.
    (2) Feature Remove: Remove auxiliary columns.
    (3) Feature Selection: Only select from numerical columns.
    '''
    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[
            ('impute_features', Cls.FeatureImputer()),
            ('remove_features',
             Cls.FeatureRemover(features_to_remove=Config.features_to_remove)),
            ('select_features',
             Cls.Select_Features(random_state=random_state,
                                 features_by_type=features_by_type))
        ])
    # feature_processing_pipeline.set_params()

    X_train_processed = feature_processing_pipeline.fit_transform(X_train_raw,
                                                                  y_train)

    hyperparams = {'algorithm': 'random-forest'}
    predictor = SupervisedClassifier(classes=[0, 1], hyperparams=hyperparams)

    ''' Automatically takes care of tuning hyperparameters via stochastic search. '''
    status = predictor.train(X_train_processed, column_or_1d(y_train),
                             groups=patIds_train)
    # logging.INFO is a level constant, not a callable; use logging.info().
    logging.info('status: %s' % status)

    ''' Test set '''
    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=y_label)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    # Keep only the positive-class column, consistent with run() above;
    # the raw (n, 2) array would not fit a single DataFrame column.
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath(alg=hyperparams['algorithm']))
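# Example invocation (a sketch; the lab, data_source, and version strings are
# hypothetical placeholders for whatever FileOrganizerLocal expects to find
# on disk):
#
# if __name__ == '__main__':
#     run_one_lab_local(lab='LABMETB',
#                       lab_type='panel',
#                       data_source='Stanford',
#                       version='10000-episodes',
#                       random_state=0)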