def _train_test_split(self, processed_matrix, outcome_label):
    log.debug('outcome_label: %s' % outcome_label)
    y = pd.DataFrame(processed_matrix.pop(outcome_label))
    X = processed_matrix
    log.debug('X.columns: %s' % X.columns)
    self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
        X, y, random_state=self._random_state)
def _analyze_predictors_on_holdout(self):
    fm_io = FeatureMatrixIO()

    algorithms_to_test = list()
    algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)

    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)

    # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    #     algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    for algorithm in algorithms_to_test:
        log.info('analyzing %s...' % algorithm)

        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])

        pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)

        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            # self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED

        SupervisedLearningPipeline._analyze_predictor_holdoutset(
            self, report_dir, pipeline_prefix)
def _filter_on_features(self, fmt, features_to_filter_on):
    # Filter out rows with unwanted value for given feature.
    for filter_feature in features_to_filter_on:
        feature = filter_feature.get('feature')
        value = filter_feature.get('value')
        self._num_rows = fmt.filter_on_feature(feature, value)
        log.debug('Removed rows where %s equals \'%s\'; %d rows remain.' %
                  (feature, str(value), self._num_rows))
def _add_susc_features(self):
    for susc_name in self._susceptibility_names:
        log.debug('Adding %s feature...' % susc_name)
        self._factory.addClinicalItemFeatures([susc_name],
                                              column='name',
                                              label=susc_name,
                                              features="pre")
def _add_features(self, fmt, features_to_add):
    # Expected format for features_to_add:
    # {
    #   'indicator': [{arg1, arg2, etc.}, ...],
    #   'threshold': [{arg1, arg2, etc.}, ...],
    #   'logarithm': [{arg1, arg2, etc.}, ...],
    #   'change': [{arg1, arg2, etc.}, ...]
    # }
    # (See the example dict after this function for a concrete illustration.)
    indicator_features = features_to_add.get('indicator')
    threshold_features = features_to_add.get('threshold')
    logarithm_features = features_to_add.get('logarithm')
    change_features = features_to_add.get('change')

    if indicator_features:
        for feature in indicator_features:
            base_feature = feature.get('base_feature')
            boolean_indicator = feature.get('boolean_indicator')
            added_feature = fmt.add_indicator_feature(base_feature,
                                                      boolean_indicator)
            self._added_features.append(added_feature)

    if threshold_features:
        for feature in threshold_features:
            base_feature = feature.get('base_feature')
            lower_bound = feature.get('lower_bound')
            upper_bound = feature.get('upper_bound')
            added_feature = fmt.add_threshold_feature(base_feature,
                                                      lower_bound, upper_bound)
            self._added_features.append(added_feature)

    if logarithm_features:
        for feature in logarithm_features:
            base_feature = feature.get('base_feature')
            logarithm = feature.get('logarithm')
            added_feature = fmt.add_threshold_feature(base_feature, logarithm)
            self._added_features.append(added_feature)

    # TODO (raikens): right now, unchanged_yn is the only allowable name for a
    # change feature, which means at most one change feature can be added.
    if change_features:
        if len(change_features) > 1:
            raise ValueError(
                "Adding multiple 'change' type features is not yet supported")

        for feature in change_features:
            feature_old = feature.get('feature_old')
            feature_new = feature.get('feature_new')
            method = feature.get('method')
            param = feature.get('param')
            added_feature = fmt.add_change_feature(method, param, feature_old,
                                                   feature_new)
            self._added_features.append(added_feature)

            # The 'sd' method discards 300 rows for measuring sd.
            if method == 'sd':
                self._num_rows = self._num_rows - 300

    log.debug('self._added_features: %s' % self._added_features)
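# A minimal sketch of the dict that _add_features() expects. The keys match the
# lookups in the method above; the feature names, bounds, and values are
# hypothetical and only illustrate the structure, one entry per supported key.
EXAMPLE_FEATURES_TO_ADD = {
    'indicator': [{'base_feature': 'TBIL.pre', 'boolean_indicator': True}],
    'threshold': [{'base_feature': 'WBC.pre', 'lower_bound': 4.0,
                   'upper_bound': 11.0}],
    'logarithm': [{'base_feature': 'CR.pre', 'logarithm': True}],
    'change': [{'feature_old': 'CR.prev', 'feature_new': 'CR.pre',
                'method': 'sd', 'param': 1.0}],
}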
def _analyze_predictor(self, dest_dir, pipeline_prefix):
    analyzer = ClassifierAnalyzer(self._predictor, self._X_test, self._y_test)

    # Build names for output plots and report.
    direct_comparisons_name = 'direct_comparisons.csv'  # '%s-direct-compare-results.csv' % pipeline_prefix
    precision_at_k_plot_name = '%s-precision-at-k-plot.png' % pipeline_prefix
    precision_recall_plot_name = '%s-precision-recall-plot.png' % pipeline_prefix
    roc_plot_name = '%s-roc-plot.png' % pipeline_prefix
    report_name = '%s-report.tab' % pipeline_prefix

    # Build paths.
    direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
    log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
    precision_at_k_plot_path = '/'.join([dest_dir, precision_at_k_plot_name])
    log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
    precision_recall_plot_path = '/'.join([dest_dir, precision_recall_plot_name])
    log.debug('precision_recall_plot_path: %s' % precision_recall_plot_path)
    roc_plot_path = '/'.join([dest_dir, roc_plot_name])
    log.debug('roc_plot_path: %s' % roc_plot_path)
    report_path = '/'.join([dest_dir, report_name])
    log.debug('report_path: %s' % report_path)

    # Build plot titles.
    roc_plot_title = 'ROC (%s)' % pipeline_prefix
    precision_recall_plot_title = 'Precision-Recall (%s)' % pipeline_prefix
    precision_at_k_plot_title = 'Precision @K (%s)' % pipeline_prefix

    # Write output.
    analyzer.output_direct_comparisons(direct_comparisons_path)
    analyzer.plot_roc_curve(roc_plot_title, roc_plot_path)
    analyzer.plot_precision_recall_curve(precision_recall_plot_title,
                                         precision_recall_plot_path)
    analyzer.plot_precision_at_k_curve(precision_at_k_plot_title,
                                       precision_at_k_plot_path)
    analyzer.write_report(report_path, ci=0.95)
def predict(self, X_test):
    true_mask, false_mask = self.fetch_bifurcation_masks(X_test)

    # Predict X_test_true.
    X_test_true = X_test[true_mask]
    y_pred_true = self._sc_true.predict(X_test_true)
    log.debug('y_pred_true: %s' % y_pred_true)

    # Predict X_test_false.
    X_test_false = X_test[false_mask]
    y_pred_false = self._sc_false.predict(X_test_false)
    log.debug('y_pred_false: %s' % y_pred_false)

    # Stitch results.
    column_names = ['y_pred_true']
    y_pred_true_df = DataFrame(y_pred_true, index=X_test_true.index,
                               columns=column_names)
    log.debug('y_pred_true_df: %s' % y_pred_true_df)
    column_names = ['y_pred_false']
    y_pred_false_df = DataFrame(y_pred_false, index=X_test_false.index,
                                columns=column_names)
    log.debug('y_pred_false_df: %s' % y_pred_false_df)

    true_mask_df = DataFrame(true_mask)
    mask_plus_true = true_mask_df.merge(y_pred_true_df, how='left',
                                        left_index=True, right_index=True)
    mask_plus_true_plus_false = mask_plus_true.merge(y_pred_false_df,
                                                     how='left',
                                                     left_index=True,
                                                     right_index=True)
    mask_plus_true_plus_false['y_pred'] = mask_plus_true_plus_false.apply(
        self._stitch_disjoint_row, axis=1)
    log.debug('mask_plus_true_plus_false: %s' % mask_plus_true_plus_false)
    y_pred = mask_plus_true_plus_false['y_pred'].values

    return y_pred
def _analyze_predictor_traindata(self, dest_dir):
    analyzer = ClassifierAnalyzer(self._predictor, self._X_train, self._y_train)
    direct_comparisons_name = 'direct_comparisons_train.csv'
    direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
    log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
    analyzer.output_direct_comparisons(direct_comparisons_path)
def _get_average_orders_per_patient(self):
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Get the median number of orders for this lab test per patient.
    query = SQLQuery()
    if LocalEnv.DATASET_SOURCE_NAME == 'STRIDE':
        # TODO: add STRIDE component routine
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(sop.order_proc_id) AS num_orders')
        query.addFrom('stride_order_proc AS sop')
        query.addFrom('stride_order_results AS sor')
        query.addWhere('sop.order_proc_id = sor.order_proc_id')
        query.addWhereIn("proc_code", [self._lab_panel])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')
    elif LocalEnv.DATASET_SOURCE_NAME == 'UMich':
        query.addSelect('CAST(pat_id AS BIGINT) AS pat_id')
        query.addSelect('COUNT(order_proc_id) AS num_orders')
        query.addFrom('labs')
        query.addWhereIn(self._varTypeInTable, [self._lab_var])
        components = self._get_components_in_lab_panel()
        query.addWhereIn("base_name", components)
        query.addGroupBy('pat_id')

    log.debug('Querying median orders per patient...')
    results = DBUtil.execute(query)

    order_counts = [row[1] for row in results]
    if len(order_counts) == 0:
        error_msg = '0 orders for lab "%s."' % self._lab_var
        log.critical(error_msg)
        # sys.exit() cannot be caught as an Exception, so raise instead.
        raise Exception(error_msg)
    else:
        return numpy.median(order_counts)
def __init__(self, classifier, X_test, y_test, random_state=None):
    # TODO(sbala): Make this API more flexible, so that it can work
    # with multi-label classifiers or binary classifiers whose
    # positive label != 1.
    PredictorAnalyzer.__init__(self, classifier, X_test, y_test, random_state)

    # If there is only one class in y_test, abort.
    classes = y_test[y_test.columns.values[0]].value_counts().index.values
    if len(classes) <= 1:
        sole_class = classes[0]
        log.debug('y_test only has samples of 1 class: %s' % sole_class)
        sys.exit('[ERROR] ClassifierAnalyzer: y_test only has samples of 1 class: %s'
                 % sole_class)

    self._y_pred_prob = DataFrame(classifier.predict_probability(X_test)[:, 1])
    log.debug('y_pred_prob[0].value_counts(): %s' %
              self._y_pred_prob[0].value_counts())

    if random_state is None:
        self._random_state = np.random.RandomState(123456789)
    elif isinstance(random_state, int):
        self._random_state = np.random.RandomState(random_state)
    elif isinstance(random_state, np.random.RandomState):
        self._random_state = random_state
def _get_random_patient_list(self):
    # Initialize DB cursor.
    cursor = self._connection.cursor()

    # Get average number of results for this lab test per patient.
    avg_orders_per_patient = self._get_average_orders_per_patient()
    log.info('avg_orders_per_patient: %s' % avg_orders_per_patient)

    # Based on the average # of results, figure out how many patients we'd
    # need to get a feature matrix of the requested size.
    self._num_patients = int(numpy.max([
        self._num_requested_episodes / avg_orders_per_patient, 1]))

    # Get self._num_patients random patients who have gotten the test.
    # TODO(sbala): Have option to feed in a seed for the randomness.
    query = SQLQuery()
    query.addSelect('pat_id')
    query.addFrom('stride_order_proc AS sop')
    query.addWhereIn('proc_code', [self._lab_panel])
    query.addOrderBy('RANDOM()')
    query.setLimit(self._num_patients)
    log.debug('Querying random patient list...')
    results = DBUtil.execute(query)

    # Get patient list.
    random_patient_list = [row[0] for row in results]

    return random_patient_list
def _get_components_in_lab_panel(self):
    # Initialize DB connection.
    cursor = self._connection.cursor()

    # Doing a single query results in a sequential scan through
    # stride_order_results. To avoid this, break up the query in two.

    # First, get all the order_proc_ids for proc_code.
    query = SQLQuery()
    query.addSelect('order_proc_id')
    query.addFrom('stride_order_proc')
    query.addWhereIn('proc_code', [self._lab_panel])
    query.addGroupBy('order_proc_id')
    log.debug('Querying order_proc_ids for %s...' % self._lab_panel)
    results = DBUtil.execute(query)
    lab_order_ids = [row[0] for row in results]

    # Second, get all base_names from those orders.
    query = SQLQuery()
    query.addSelect('base_name')
    query.addFrom('stride_order_results')
    query.addWhereIn('order_proc_id', lab_order_ids)
    query.addGroupBy('base_name')
    log.debug('Querying base_names for order_proc_ids...')
    results = DBUtil.execute(query)
    components = [row[0] for row in results]

    return components
def __init__(self, change_params, lab_panel, num_episodes, use_cache=None,
             random_state=None, build_raw_only=False):
    SupervisedLearningPipeline.__init__(self, lab_panel, num_episodes,
                                        use_cache, random_state)
    self._change_params = change_params
    self._change_params['feature_old'] = \
        self._lookup_previous_measurement_feature(self._var)
    log.debug('change_params: %s' % self._change_params)

    self._build_raw_feature_matrix()
    if build_raw_only:
        return

    self._build_processed_feature_matrix()
    self._train_and_analyze_predictors()
def test_train_and_predict(self):
    # Load data set.
    X = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['X'],
                  columns=['x1', 'x2', 'x3', 'x4', 'x5',
                           'x6', 'x7', 'x8', 'x9', 'x10'])
    y = DataFrame(RANDOM_CLASSIFICATION_TEST_CASE['y'])
    random_state = RANDOM_CLASSIFICATION_TEST_CASE['random_state']
    expected_y_pred_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['y_predicted']
    expected_str_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['str']
    expected_hyperparams_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['hyperparams']
    expected_params_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['params']
    expected_descriptions_by_algorithm = RANDOM_CLASSIFICATION_TEST_CASE['description']

    # Generate train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=random_state)

    # Iterate through SUPPORTED_ALGORITHMS.
    for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        log.info('Testing %s classifier...' % algorithm)

        # Train model.
        hyperparams = {'algorithm': algorithm, 'random_state': random_state}
        # Default to stochastic search for expensive algorithms.
        if algorithm in [SupervisedClassifier.RANDOM_FOREST]:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.STOCHASTIC_SEARCH
            # Test ability to force hyperparam values.
            hyperparams['max_depth'] = 2
            hyperparams['n_estimators'] = 5
            hyperparams['min_samples_leaf'] = 1
            hyperparams['min_samples_split'] = 0.2
        else:
            hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        classifier = SupervisedClassifier([0, 1], hyperparams)
        classifier.train(X_train, y_train)

        # Test str().
        expected_str = expected_str_by_algorithm[algorithm]
        actual_str = str(classifier)
        self.assertEqual(expected_str, actual_str)

        # Test hyperparameters.
        expected_hyperparams = expected_hyperparams_by_algorithm[algorithm]
        actual_hyperparams = classifier.hyperparams()
        self._assert_equal_hyperparams(expected_hyperparams, actual_hyperparams)

        # Test model parameters.
        expected_params = expected_params_by_algorithm[algorithm]
        actual_params = classifier.params()
        self.assertEqualDict(expected_params, actual_params)

        # Test model description.
        expected_description = expected_descriptions_by_algorithm[algorithm]
        actual_description = classifier.description()
        self.assertEqual(expected_description, actual_description)

        # Test prediction values.
        expected_y_pred = expected_y_pred_by_algorithm[algorithm]
        log.debug('expected_y_pred: %s' % expected_y_pred)
        actual_y_pred = classifier.predict(X_test)
        log.debug('actual_y_pred: %s' % actual_y_pred)
        self.assertEqualList(expected_y_pred, actual_y_pred)
def clear_stride_psql_tables():
    log.info('Clearing stride psql tables...')
    for params in list(STRIDE_LOADER_PARAMS.values()):
        psql_table = params['psql_table'] % TABLE_PREFIX
        log.debug('dropping table %s...' % psql_table)
        # load_stride_to_psql is not idempotent, so in case the schema already
        # existed, clear the table (avoid duplicate data).
        DBUtil.execute("DROP TABLE IF EXISTS %s CASCADE;" % psql_table)
def _add_med_features(self):
    # Add all prior antibiotic use as features.
    for med_set in self._med_panel:
        med_label = med_set[0].split()[0]  # Take the name of the antibiotic.
        log.debug('Adding %s medication features...' % med_label)
        self._factory.addClinicalItemFeatures(med_set,
                                              column="description",
                                              label="Med." + med_label,
                                              features="pre")
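# For example, a hypothetical med_set like ['Vancomycin IV', 'Vancomycin PO']
# yields med_label 'Vancomycin', so its columns are labeled 'Med.Vancomycin'.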
def _remove_features(self, fmt, features_to_remove):
    # Prune manually identified features (meant for obviously unhelpful ones).
    # In theory, FeatureSelector should be able to prune these, but there's no
    # reason not to help it out a little bit.
    for feature in features_to_remove:
        fmt.remove_feature(feature)
        self._removed_features.append(feature)

    log.debug('self._removed_features: %s' % self._removed_features)
def predict(self, X):
    y_predicted = list()
    for row in X.iterrows():
        prediction = self._predictions[self._index]
        y_predicted.append(prediction)
        self._index = (self._index + 1) % self._num_predictions

    log.debug('y_predicted: %s' % y_predicted)
    return DataFrame({'y_predicted': y_predicted})
def _select_features(self, problem, percent_features_to_select, algorithm,
                     features_to_keep=None):
    # Initialize FeatureSelector.
    fs = FeatureSelector(problem=problem, algorithm=algorithm,
                         random_state=self._random_state)
    fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
    num_features_to_select = int(percent_features_to_select *
                                 len(self._X_train.columns.values))

    # Parse features_to_keep.
    if features_to_keep is None:
        features_to_keep = []

    # Select features.
    fs.select(k=num_features_to_select)

    # Enumerate eliminated features pre-transformation.
    feature_ranks = fs.compute_ranks()
    for i in range(len(feature_ranks)):
        if feature_ranks[i] > num_features_to_select:
            # If in features_to_keep, pretend it wasn't eliminated.
            if self._X_train.columns[i] not in features_to_keep:
                self._eliminated_features.append(self._X_train.columns[i])

    # Hack: rather than making FeatureSelector handle the concept of
    # kept features, just copy the data here and add it back to the
    # transformed matrices.
    # Rather than looping, do this individually so that we can skip it if
    # the transformed X already has the feature.
    for feature in features_to_keep:
        kept_X_train_feature = self._X_train[[feature]].copy()
        log.debug('kept_X_train_feature.shape: %s' % str(kept_X_train_feature.shape))
        self._X_train = fs.transform_matrix(self._X_train)
        if feature not in self._X_train:
            self._X_train = self._X_train.merge(kept_X_train_feature,
                                                left_index=True,
                                                right_index=True)

        kept_X_test_feature = self._X_test[[feature]].copy()
        log.debug('kept_X_test_feature.shape: %s' % str(kept_X_test_feature.shape))
        self._X_test = fs.transform_matrix(self._X_test)
        if feature not in self._X_test:
            self._X_test = self._X_test.merge(kept_X_test_feature,
                                              left_index=True,
                                              right_index=True)

    if not features_to_keep:
        # Even if there are no features to keep, still need to perform
        # transform_matrix to drop most low-rank features.
        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)
def _maybe_reshape_y(self, y):
    # If necessary, reshape y from (n_samples, 1) to (n_samples, ).
    try:
        num_cols = y.shape[1]
        y = column_or_1d(y)
        log.debug('Reshaped y to 1d.')
    except IndexError:
        log.debug('Did not need to reshape y to 1d.')

    return y
def _bootstrap_score_ci(self, score_fn, ci, y_test, y_pred=None,
                        y_pred_prob=None, n_bootstrap_iter=None, k=None,
                        desired_precision=None):
    # Note that y_pred may either represent the predicted labels or the
    # predicted label probabilities. It's up to the caller to make the
    # right choice based on score_fn's expected input.
    if score_fn == self._score_precision_at_k:
        sample_score = score_fn(y_test, y_pred, y_pred_prob, k)
    elif score_fn == self._score_percent_predictably_positive:
        sample_score = score_fn(y_test, y_pred, y_pred_prob, desired_precision)
    elif score_fn in [average_precision_score, roc_auc_score]:
        sample_score = score_fn(y_test, y_pred_prob)
    else:
        sample_score = score_fn(y_test, y_pred)

    if n_bootstrap_iter is None:
        n_bootstrap_iter = 100

    # For consistency of results, the random number generator was seeded with
    # a fixed number when this analyzer was constructed.
    rng = self._random_state

    # Use the bootstrap to compute CIs.
    bootstrap_scores = list()
    for i in range(0, n_bootstrap_iter):
        # Sample y_test and y_pred with replacement.
        # randint's upper bound is exclusive, so len(y_test) lets every index
        # be drawn.
        indices = rng.randint(0, len(y_test), len(y_test))
        sample_y_test = np.array(y_test)[indices]
        if y_pred is not None:
            sample_y_pred = np.array(y_pred)[indices]
        if y_pred_prob is not None:
            sample_y_pred_prob = np.array(y_pred_prob)[indices]

        if len(np.unique(sample_y_test)) < 2:
            # We need at least one positive and one negative sample for
            # ROC AUC to be defined: reject the sample.
            continue

        if score_fn == self._score_precision_at_k:
            score = score_fn(DataFrame(sample_y_test), DataFrame(sample_y_pred),
                             DataFrame(sample_y_pred_prob), k)
        elif score_fn == self._score_percent_predictably_positive:
            score = score_fn(DataFrame(sample_y_test), DataFrame(sample_y_pred),
                             DataFrame(sample_y_pred_prob), desired_precision)
        elif score_fn in [average_precision_score, roc_auc_score]:
            score = score_fn(sample_y_test, sample_y_pred_prob)
        else:
            score = score_fn(sample_y_test, sample_y_pred)

        bootstrap_scores.append(score)

    # Sort bootstrap scores to get CIs.
    bootstrap_scores.sort()
    log.debug('bootstrap_scores: %s' % bootstrap_scores)
    sorted_scores = np.array(bootstrap_scores)
    # May not equal n_bootstrap_iter if some samples were rejected.
    num_bootstraps = len(sorted_scores)
    log.debug('sorted_scores: %s' % sorted_scores)

    ci_lower_bound_float = (1.0 - ci) / 2
    ci_lower_bound = sorted_scores[int(ci_lower_bound_float * num_bootstraps)]
    ci_upper_bound_float = ci + ci_lower_bound_float
    ci_upper_bound = sorted_scores[int(ci_upper_bound_float * num_bootstraps)]

    return sample_score, ci_lower_bound, ci_upper_bound
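# A standalone sketch of the same percentile-bootstrap idea, reduced to a CI
# on ROC AUC only; it assumes 1d numpy arrays and is illustrative, not part of
# ClassifierAnalyzer.
import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_roc_auc_ci(y_test, y_pred_prob, n_bootstrap_iter=100, ci=0.95,
                         seed=123456789):
    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap_iter):
        # Sample indices with replacement.
        indices = rng.randint(0, len(y_test), len(y_test))
        if len(np.unique(y_test[indices])) < 2:
            continue  # ROC AUC is undefined without both classes; reject.
        scores.append(roc_auc_score(y_test[indices], y_pred_prob[indices]))
    scores.sort()
    lower = scores[int(((1.0 - ci) / 2) * len(scores))]
    upper = scores[int((ci + (1.0 - ci) / 2) * len(scores))]
    return lower, upper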
def _tune_hyperparams_regress_and_round(self, X, y):
    self._hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
    log.info('Tuning hyperparams via %s...' %
             self._hyperparams['hyperparam_strategy'])
    # If not provided, search for best coef_max.
    if self._hyperparams.get('coef_max') is None:
        self._hyperparams['coef_max'] = self._tune_coef_max(X, y)
    # Round linear coefficients.
    self._round_coefs(self._hyperparams['coef_max'])
    log.debug('hyperparams: %s' % self.hyperparams())
    log.debug('params: %s' % self.params())
def _analyze_predictor_holdoutset(self, dest_dir, pipeline_prefix):
    slugified_var = '-'.join(self._var.split())
    holdout_path = dest_dir + '/../' + \
        '%s-normality-matrix-%d-episodes-processed-holdout.tab' % (
            slugified_var, self._num_rows)
    fm_io = FeatureMatrixIO()
    processed_matrix = fm_io.read_file_to_data_frame(holdout_path)
    if self._isLabPanel:
        y_holdout = pd.DataFrame(processed_matrix.pop('all_components_normal'))
    else:
        y_holdout = pd.DataFrame(processed_matrix.pop('component_normal'))
    X_holdout = processed_matrix

    analyzer = ClassifierAnalyzer(self._predictor, X_holdout, y_holdout)
    train_label = 'holdoutset'

    # Build names for output plots and report.
    direct_comparisons_name = '%s-direct-compare-results-%s.csv' % (
        pipeline_prefix, train_label)
    precision_at_k_plot_name = '%s-precision-at-k-plot-%s.png' % (
        pipeline_prefix, train_label)
    precision_recall_plot_name = '%s-precision-recall-plot-%s.png' % (
        pipeline_prefix, train_label)
    roc_plot_name = '%s-roc-plot-%s.png' % (pipeline_prefix, train_label)
    report_name = '%s-report-%s.tab' % (pipeline_prefix, train_label)

    # Build paths.
    direct_comparisons_path = '/'.join([dest_dir, direct_comparisons_name])
    log.debug('direct_comparisons_path: %s' % direct_comparisons_path)
    precision_at_k_plot_path = '/'.join([dest_dir, precision_at_k_plot_name])
    log.debug('precision_at_k_plot_path: %s' % precision_at_k_plot_path)
    precision_recall_plot_path = '/'.join([dest_dir, precision_recall_plot_name])
    log.debug('precision_recall_plot_path: %s' % precision_recall_plot_path)
    roc_plot_path = '/'.join([dest_dir, roc_plot_name])
    log.debug('roc_plot_path: %s' % roc_plot_path)
    report_path = '/'.join([dest_dir, report_name])
    log.debug('report_path: %s' % report_path)

    # Build plot titles.
    roc_plot_title = 'ROC (%s)' % pipeline_prefix
    precision_recall_plot_title = 'Precision-Recall (%s)' % pipeline_prefix
    precision_at_k_plot_title = 'Precision @K (%s)' % pipeline_prefix

    # Write output.
    analyzer.output_direct_comparisons(direct_comparisons_path)
    analyzer.plot_roc_curve(roc_plot_title, roc_plot_path)
    analyzer.plot_precision_recall_curve(precision_recall_plot_title,
                                         precision_recall_plot_path)
    analyzer.plot_precision_at_k_curve(precision_at_k_plot_title,
                                       precision_at_k_plot_path)
    analyzer.write_report(report_path, ci=0.95)
def _train_test_split(self, processed_matrix, outcome_label, columnToSplitOn='pat_id'):
    log.debug('outcome_label: %s' % outcome_label)
    all_possible_ids = sorted(set(processed_matrix[columnToSplitOn].values.tolist()))

    train_ids, test_ids = train_test_split(all_possible_ids,
                                           random_state=self._random_state)

    train_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin(train_ids)].copy()
    self._y_train = pd.DataFrame(train_matrix.pop(outcome_label))
    self._X_train = train_matrix

    test_matrix = processed_matrix[processed_matrix[columnToSplitOn].isin(test_ids)].copy()
    self._y_test = pd.DataFrame(test_matrix.pop(outcome_label))
    self._X_test = test_matrix
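# For comparison, a minimal sketch of the same patient-level split using
# scikit-learn's GroupShuffleSplit instead of splitting the sorted unique ids;
# processed_matrix, outcome_label, and the 'pat_id' column are assumed to be
# as in _train_test_split() above.
from sklearn.model_selection import GroupShuffleSplit

def split_by_patient(processed_matrix, outcome_label, random_state=None):
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.25,
                                 random_state=random_state)
    groups = processed_matrix['pat_id']
    train_idx, test_idx = next(splitter.split(processed_matrix, groups=groups))
    train_matrix = processed_matrix.iloc[train_idx].copy()
    test_matrix = processed_matrix.iloc[test_idx].copy()
    y_train = train_matrix.pop(outcome_label).to_frame()
    y_test = test_matrix.pop(outcome_label).to_frame()
    return train_matrix, test_matrix, y_train, y_test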
def build_stride_psql_schemata():
    schemata_dir = StrideLoader.fetch_psql_schemata_dir()
    for params in list(STRIDE_LOADER_PARAMS.values()):
        psql_table = params['psql_table'] % TABLE_PREFIX

        log.debug('loading %s schema...' % psql_table)

        # Open file, feed to DBUtil, and close file.
        schema_file_name = '.'.join([psql_table, 'schema.sql'])
        schema_file_path = os.path.join(schemata_dir, schema_file_name)
        schema_file = open(schema_file_path, 'r')
        DBUtil.runDBScript(schema_file)
        schema_file.close()
def build_stride_psql_indices():
    indices_dir = StrideLoader.fetch_psql_indices_dir()
    for params in list(STRIDE_LOADER_PARAMS.values()):
        psql_table = params['psql_table'] % TABLE_PREFIX

        # Open file, feed to DBUtil, and close file.
        indices_file_name = '.'.join([psql_table, 'indices.sql'])
        indices_file_path = os.path.join(indices_dir, indices_file_name)
        if os.path.exists(indices_file_path):
            log.debug('loading %s indices...' % psql_table)
            indices_file = open(indices_file_path, 'r')
            DBUtil.runDBScript(indices_file)
            indices_file.close()
def train(self, X, y, groups=None):
    self._groups = groups
    assert 'pat_id' not in X.columns

    self._features = X.columns
    y = self._maybe_reshape_y(y)

    # Verify that there are enough samples of each class (at least 10).
    value_counts = Series(y).value_counts()
    log.debug('y.value_counts(): %s' % value_counts)
    for class_label in self._classes:
        # If there aren't enough samples of a class, exit gracefully.
        try:
            num_samples = value_counts[class_label]
            if num_samples < 10:
                log.error('Insufficient samples (%s) of label %s.' %
                          (num_samples, class_label))
                return SupervisedClassifier.INSUFFICIENT_SAMPLES
        except KeyError:
            log.error('Insufficient samples (0) of label %s.' % class_label)
            return SupervisedClassifier.INSUFFICIENT_SAMPLES

    log.info('Training %s classifier...' % self._hyperparams['algorithm'])
    if self._hyperparams['algorithm'] == SupervisedClassifier.DECISION_TREE:
        self._train_decision_tree(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
        self._train_logistic_regression(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.RANDOM_FOREST:
        self._train_random_forest(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
        self._train_regress_and_round(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
        self._train_adaboost(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.GAUSSIAN_NAIVE_BAYES:
        self._train_gaussian_naive_bayes(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.SVM:
        self._train_svm(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.XGB:
        self._train_xgb(X, y)
    elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
        self._train_nn(X, y)

    return SupervisedClassifier.TRAINED
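# A minimal usage sketch, assuming the constructor shown in
# test_train_and_predict() above (classes [0, 1]) and a pandas X/y pair;
# the hyperparam values are illustrative only.
#
#   clf = SupervisedClassifier([0, 1], {'algorithm': SupervisedClassifier.RANDOM_FOREST,
#                                       'random_state': 123456789})
#   status = clf.train(X_train, y_train)
#   if status == SupervisedClassifier.TRAINED:
#       y_pred = clf.predict(X_test)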
def build_virtual_clinical_item(item_sequence, virtual_id):
    """
    Simple wrapper around medinfo/cpoe/TripleAssociationAnalysis.
    """
    sequence_str = ' -> '.join([str(id) for id in item_sequence])
    log.debug('(%s) = (%s)' % (sequence_str, virtual_id))

    build_virtual_item_command = [
        'python', '-m', 'medinfo/cpoe/TripleAssociationAnalysis.py',
        '-s', ','.join([str(id) for id in item_sequence]),
        '-v', str(virtual_id)
    ]
    subprocess.call(build_virtual_item_command)
def build_composite_clinical_item(components, name, description, category_id):
    """
    Simple wrapper around medinfo/cpoe/DataManager.py.
    """
    component_str = ','.join([str(id) for id in components])
    log.debug('(%s, %s, %s) = (%s)' % (name, description, category_id,
                                       component_str))

    composite_arg = '%s|%s|%s|%s' % (component_str, name, description,
                                     category_id)
    dm = DataManager()
    dm.main([
        'medinfo/cpoe/DataManager.py', '--compositeRelated', composite_arg
    ])
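# For illustration, hypothetical components [101, 102] named 'CBC' with
# description 'Complete blood count' and category_id 13 produce the argument
# '101,102|CBC|Complete blood count|13' passed to --compositeRelated.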
def load_stride_to_psql():
    # Build clean data files.
    StrideLoader.build_clean_csv_files()

    # Build psql schemata.
    StrideLoader.build_stride_psql_schemata()

    # Build paths to clean data files.
    clean_data_dir = StrideLoader.fetch_clean_data_dir()
    for raw_file in sorted(STRIDE_LOADER_PARAMS.keys()):
        params = STRIDE_LOADER_PARAMS[raw_file]

        # Build clean data file.
        clean_file = params['clean_file'] % TABLE_PREFIX
        log.info('loading %s...' % clean_file)
        clean_path = os.path.join(clean_data_dir, clean_file)

        # Uncompress data file.
        unzipped_clean_path = clean_path[:-3]
        with gzip.open(clean_path, 'rb') as f_in, \
                open(unzipped_clean_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # psql COPY data from clean files into DB.
        psql_table = params['psql_table'] % TABLE_PREFIX
        log.debug('stride/data/clean/%s ==> %s' % (clean_file, psql_table))

        # In some cases, two files going to the same table will have
        # non-identical column names. Pass these explicitly so that
        # psql knows which columns to try to fill from the file.
        # Strip the newline character from the header row.
        with open(unzipped_clean_path, 'r') as f_in:
            columns = f_in.readline()[:-1]
        command = "COPY %s (%s) FROM '%s' WITH (FORMAT csv, HEADER);" % (
            psql_table, columns, unzipped_clean_path)
        DBUtil.execute(command)

        # Delete unzipped_clean_path.
        os.remove(unzipped_clean_path)

    # Run any one-off postprocessing transformations which all users
    # of the STRIDE database should receive. Defer any application-specific
    # transformations to other modules.
    StrideLoader.process_stride_psql_db()

    # Build indices.
    StrideLoader.build_stride_psql_indices()