def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Partition a dataframe into two dataframes by assigning whole axis labels to one or the other.

    The labels on the chosen axis are put in a shuffled order (from _make_shuffled_index) and
    the first int((1 - split_ratio) * label_count) of them are kept in the first returned frame;
    the remaining labels make up the second returned frame.

    :param priors: pd.DataFrame [M x N]
        Data to partition
    :param split_ratio: float
        Checked against the range 0 to 1
    :param axis: [0, 1]
        0 partitions the index labels, 1 partitions the column labels
    :param seed:
        Passed through to _make_shuffled_index
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
        The frame holding the first group of labels and the frame holding the remaining labels
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    n_labels = priors.shape[axis]
    n_kept = int((1 - split_ratio) * n_labels)
    shuffled = _make_shuffled_index(n_labels, seed=seed)

    # Pick the label object that matches the requested axis
    if axis not in (0, 1):
        raise ValueError("Axis can only be 0 or 1")
    labels = priors.index if axis == 0 else priors.columns

    kept_labels = labels[shuffled[:n_kept]]
    held_out_labels = labels[shuffled[n_kept:]]

    # Each output is built by dropping the labels that belong to the other output
    return priors.drop(held_out_labels, axis=axis), priors.drop(kept_labels, axis=axis)
def split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split one dataframe into a new prior and a new gold standard for crossvalidation.

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param seed:
        Passed through to the splitting helper
    :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
        The two dataframes produced by the splitting helper
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # With no axis, the split is delegated to the flattened splitter
    if split_axis is None:
        prior_part, gs_part = _split_flattened(all_data, split_ratio, seed=seed)
        return prior_part, gs_part

    # Otherwise whole labels on the chosen axis are split
    prior_part, gs_part = _split_axis(all_data, split_ratio, axis=split_axis, seed=seed)
    return prior_part, gs_part
def test_enum(self):
    """argument_enum is truthy for allowed values and raises ValueError for a disallowed one."""
    allowed = ("A", "B")

    # A single allowed value and a list made up only of allowed values both pass
    for candidate in ("A", ["A", "B", "A"]):
        self.assertTrue(check.argument_enum(candidate, allowed))

    # A list containing a value outside the allowed set is rejected
    with self.assertRaises(ValueError):
        check.argument_enum(["A", "B", "C"], allowed)
def shuffle_priors(priors_data, shuffle_prior_axis, random_seed):
    """
    Scramble the association between labels and data on one axis of the priors.

    The rows (or columns) are resampled in a random order and the original label order
    is then written back onto the result, so the labels stay put while the data moves.

    :param priors_data: pd.DataFrame [G x K]
        Prior data
    :param shuffle_prior_axis: int
        Axis to shuffle. 0 is the index (genes), 1 is the columns (TFs), None returns the input untouched.
    :param random_seed: int
        Passed to DataFrame.sample as random_state
    :return priors_data: pd.DataFrame [G x K]
        The shuffled priors (or the input itself when shuffle_prior_axis is None)
    """

    assert check.argument_enum(shuffle_prior_axis, [0, 1], allow_none=True)

    # Nothing to do when shuffling is switched off
    if shuffle_prior_axis is None:
        return priors_data

    if shuffle_prior_axis == 0:
        utils.Debug.vprint("Randomly shuffling prior [{sh}] gene data".format(sh=priors_data.shape))

        # Remember the row labels, reorder the rows, then restore the labels in their old order
        row_labels = priors_data.index.tolist()
        priors_data = priors_data.sample(frac=1, axis=0, random_state=random_seed)
        priors_data.index = row_labels

    elif shuffle_prior_axis == 1:
        utils.Debug.vprint("Randomly shuffling prior [{sh}] TF data".format(sh=priors_data.shape))

        # Same procedure for the column labels
        column_labels = priors_data.columns.tolist()
        priors_data = priors_data.sample(frac=1, axis=1, random_state=random_seed)
        priors_data.columns = column_labels

    return priors_data
def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
    """
    Assert that the constructor arguments have the expected types and values.

    :param betas: list(pd.DataFrame)
        Must be a list whose first element is a DataFrame and whose members pass check.dataframes_align
    :param rescaled_betas: list(pd.DataFrame)
        Same requirements as betas
    :param threshold: float
        Checked against the range 0 to 1; None is allowed
    :param filter_method: str
        Must be one of FILTER_METHODS; None is allowed
    :param metric:
        Accepted but not examined by this function
    """

    # Both lists go through the same three checks, betas first
    for beta_list in (betas, rescaled_betas):
        assert check.argument_type(beta_list, list)
        assert check.argument_type(beta_list[0], pd.DataFrame)
        assert check.dataframes_align(beta_list)

    assert check.argument_enum(filter_method, FILTER_METHODS, allow_none=True)
    assert check.argument_numeric(threshold, 0, 1, allow_none=True)
def remove_prior_circularity(priors, gold_standard, split_axis=default.DEFAULT_CV_AXIS):
    """
    Drop from the priors every label that is present on the gold standard's split axis.

    :param priors: pd.DataFrame [M x N]
    :param gold_standard: pd.DataFrame [m x n]
    :param split_axis: int (0,1)
        Axis whose gold standard labels are removed from the priors
    :return new_priors: pd.DataFrame
        The priors without the gold standard labels on split_axis
    :return gold_standard: pd.DataFrame [m x n]
        The gold standard, passed back unchanged
    """

    check.argument_enum(split_axis, [0, 1])

    # Gold standard labels that are absent from the priors are skipped rather than raising
    gs_labels = gold_standard.axes[split_axis]
    pruned_priors = priors.drop(gs_labels, axis=split_axis, errors='ignore')

    return pruned_priors, gold_standard
def cross_validate_gold_standard(priors_data, gold_standard, cv_split_axis, cv_split_ratio, random_seed):
    """
    Resample the gold standard for crossvalidation and make the priors consistent with the result.

    When splitting on an axis, the labels of the new gold standard are removed from the priors.
    When cv_split_axis is None, the priors are replaced by the downsampled gold standard.

    :param priors_data: pd.DataFrame [G x K]
        Prior data
    :param gold_standard: pd.DataFrame [G x K]
        Gold standard data
    :param cv_split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param cv_split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param random_seed: int
        Random seed
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
    """

    assert check.argument_enum(cv_split_axis, (0, 1), allow_none=True)
    assert check.argument_numeric(cv_split_ratio, low=0, high=1)

    if cv_split_axis == 1:
        utils.Debug.vprint("Selecting cv_split_axis of 1 is possible but a very bad idea", level=1)

    resample_msg = "Resampling GS ({gs}) for crossvalidation".format(gs=gold_standard.shape)
    utils.Debug.vprint(resample_msg, level=0)

    gs_to_prior, gold_standard = ManagePriors._split_for_cv(gold_standard,
                                                            cv_split_ratio,
                                                            split_axis=cv_split_axis,
                                                            seed=random_seed)

    if cv_split_axis is None:
        # Flattened split: the downsampled gold standard takes the place of any prior
        if priors_data is not None:
            utils.Debug.vprint("Existing prior is being replaced with a downsampled gold standard")
        priors_data = gs_to_prior
    else:
        # Axis split: remove circularity between the priors and the new gold standard
        priors_data, gold_standard = ManagePriors._remove_prior_circularity(priors_data,
                                                                            gold_standard,
                                                                            split_axis=cv_split_axis)

    # Report shapes and the number of non-zero entries of both outputs
    prior_shape = priors_data.shape
    gs_shape = gold_standard.shape
    prior_nonzero = (priors_data != 0).sum().sum()
    gs_nonzero = (gold_standard != 0).sum().sum()

    summary = "CV prior {pr} [{pr_x}] and gold standard {gs} [{gs_x}]".format(pr=prior_shape,
                                                                              gs=gs_shape,
                                                                              pr_x=prior_nonzero,
                                                                              gs_x=gs_nonzero)
    utils.Debug.vprint(summary, level=0)

    return priors_data, gold_standard
def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
    """
    Assert that the constructor arguments have the expected nested types and values.

    :param betas: list(list(pd.DataFrame))
        Must be a list of lists; the first inner list must contain DataFrames
    :param rescaled_betas: list(list(pd.DataFrame))
        Same requirements as betas
    :param threshold: float
        Checked against the range 0 to 1; None is allowed
    :param filter_method: str
        Must be one of results_processor.FILTER_METHODS; None is allowed
    :param metric:
        Accepted but not examined by this function
    """

    # Both nested lists go through the same three checks, betas first
    for nested in (betas, rescaled_betas):
        assert check.argument_type(nested, list)
        assert check.argument_list_type(nested, list)
        assert check.argument_list_type(nested[0], pd.DataFrame)

    # Pairwise: the concatenated inner lists of betas and rescaled betas must align
    assert all([check.dataframes_align(beta_group + rescaled_group)
                for beta_group, rescaled_group in zip(betas, rescaled_betas)])

    assert check.argument_enum(filter_method, results_processor.FILTER_METHODS, allow_none=True)
    assert check.argument_numeric(threshold, 0, 1, allow_none=True)
def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):
    """
    Take rankable data and process it into confidence scores which are stored in this object

    :param rankable_data: list(pd.DataFrame) [B x [G x K]]
        A list of numeric dataframes (with identical axes)
    :param gold_standard: pd.DataFrame [G x K]
        A dataframe which corresponds to known, gold-standard data
    :param filter_method: str
        Key into self.filter_method_lookup; the looked-up name is resolved to a method on this object
        and used to filter the long-format confidence data
    """

    # Get the filtering method
    assert check.argument_enum(filter_method, self.filter_method_lookup.keys())
    self.filter_method = getattr(self, self.filter_method_lookup[filter_method])

    # Explicitly cast the gold standard data to a 0/1 integer array (non-zero becomes 1)
    gold_standard = (gold_standard != 0).astype(int)
    self.gold_standard = gold_standard

    # Calculate confidences based on the ranked data
    self.all_confidences = self.compute_combined_confidences(rankable_data)

    # Convert the confidence data to long format
    confidence_data = utils.melt_and_reindex_dataframe(self.all_confidences,
                                                       CONFIDENCE_COLUMN,
                                                       idx_name=TARGET_COLUMN,
                                                       col_name=REGULATOR_COLUMN)

    # Attach the gold standard
    confidence_data = self.attach_gs_to_confidences(confidence_data, gold_standard)

    # Sort by confidence (descending, NaN last) and reset the index
    self.confidence_data = confidence_data.sort_values(by=CONFIDENCE_COLUMN, ascending=False, na_position='last')
    self.confidence_data.reset_index(inplace=True)

    # Filter the gold standard and confidences down to a format that can be directly compared
    utils.Debug.vprint("GS: {gs} edges, Confidences: {conf} edges".format(gs=gold_standard.shape[0],
                                                                          conf=self.confidence_data.shape[0]),
                       level=0)

    self.filtered_data = self.filter_method(GOLD_STANDARD_COLUMN, CONFIDENCE_COLUMN, self.confidence_data)

    # level belongs to vprint, not to str.format (which silently ignores unused keyword arguments)
    utils.Debug.vprint("Filtered data to {e} edges".format(e=self.filtered_data.shape[0]), level=0)
def compute_combined_confidences(rankable_data, **kwargs):
    """
    Combine a list of dataframes into one confidence dataframe using the selected ranking method.

    :param rankable_data: list(pd.DataFrame) R x [M x N]
        List of dataframes which have the same axes and need to be combined
    :param rank_method: str (keyword only, default "sum")
        One of "sum", "threshold_sum", "max", "geo_mean"; selects the RankSumming function to call
    :param data_threshold: (keyword only, default 0.9)
        Only read when rank_method is "threshold_sum"; forwarded to RankSumming.rank_sum_threshold
    :return combine_conf: pd.DataFrame [M x N]
    """

    rank_method = kwargs.pop("rank_method", "sum")

    assert check.argument_enum(rank_method, ("sum", "threshold_sum", "max", "geo_mean"))
    assert check.argument_type(rankable_data, list, allow_none=False)

    if rank_method == "sum":
        return RankSumming.rank_sum(rankable_data)

    if rank_method == "threshold_sum":
        cutoff = kwargs.pop("data_threshold", 0.9)
        return RankSumming.rank_sum_threshold(rankable_data, data_threshold=cutoff)

    if rank_method == "max":
        return RankSumming.rank_max_value(rankable_data)

    if rank_method == "geo_mean":
        return RankSumming.rank_geo_mean(rankable_data)
def __init__(self, betas, rescaled_betas, threshold=0.5, filter_method='overlap'):
    """
    Validate the constructor arguments and store each one on the instance.

    :param betas: list(pd.DataFrame[G x K])
        Must pass check.dataframes_align
    :param rescaled_betas: list(pd.DataFrame[G x K])
        Must pass check.dataframes_align
    :param threshold: float
        Checked against the range 0 to 1
    :param filter_method: str
        How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter).
        Must be one of FILTER_METHODS
    """

    # Each dataframe list is validated and then stored before the next argument is looked at
    for attr_name, frames in (("betas", betas), ("rescaled_betas", rescaled_betas)):
        assert check.dataframes_align(frames)
        setattr(self, attr_name, frames)

    assert check.argument_enum(filter_method, FILTER_METHODS)
    self.filter_method = filter_method

    assert check.argument_numeric(threshold, 0, 1)
    self.threshold = threshold
def _split_for_cv(all_data, split_ratio, split_axis=DEFAULT_CV_AXIS, seed=DEFAULT_SEED):
    """
    Split one dataframe into a new prior and a new gold standard for crossvalidation.

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None).
        When this is None the returned gold standard is all_data itself, and only the prior
        comes from the flattened split.
    :param seed: int
        Seed for the random generator
    :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
        The new prior and the new gold standard
    """

    assert check.argument_numeric(split_ratio, 0, 1)
    assert check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Axis split: both outputs come straight from the axis splitter
    if split_axis is not None:
        prior_part, gs_part = ManagePriors._split_axis(all_data, split_ratio, axis=split_axis, seed=seed)
        return prior_part, gs_part

    # Flattened split: keep only the prior half and use the full input as the gold standard
    downsampled_prior, _ = ManagePriors._split_flattened(all_data, split_ratio, seed=seed)
    return downsampled_prior, all_data
def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard', rank_method="sum"):
    """
    Combine rankable data into confidences and filter them against the gold standard.

    :param rankable_data: list(pd.DataFrame)
        Dataframes handed to compute_combined_confidences
    :param gold_standard: pd.DataFrame
        Gold standard data handed to the selected filter method
    :param filter_method: str
        Key into self.filter_method_lookup; the looked-up name is resolved to a method on this object
    :param rank_method: str
        Forwarded to compute_combined_confidences
    """

    # Resolve the filtering method from its lookup key
    assert check.argument_enum(filter_method, self.filter_method_lookup.keys())
    self.filter_method = getattr(self, self.filter_method_lookup[filter_method])

    # Calculate confidences based on the ranked data
    self.all_confidences = self.compute_combined_confidences(rankable_data, rank_method=rank_method)

    # Filter the gold standard and confidences down to a format that can be directly compared
    utils.Debug.vprint("GS: {gs}, Confidences: {conf}".format(gs=gold_standard.shape,
                                                              conf=self.all_confidences.shape),
                       level=0)

    self.gold_standard, self.filtered_confidences = self.filter_method(gold_standard, self.all_confidences)

    # Report the shapes of the filtered outputs, not of the unfiltered inputs logged above
    utils.Debug.vprint("Filtered to GS: {gs}, Confidences: {conf}".format(gs=self.gold_standard.shape,
                                                                          conf=self.filtered_confidences.shape),
                       level=0)