def make_PR_data(gs, confidences):
    """
    Combine confidences and gold standard into one long-format frame.

    Melts both wide [G x K] frames to long format and outer-joins them on
    (target, regulator), keeping rows present in either input.

    :param gs: pd.DataFrame [G x K]
        Gold-standard data
    :param confidences: pd.DataFrame [G x K]
        Confidence scores
    :return: pd.DataFrame
        Long-format frame with confidence and gold-standard columns
    """
    # Melt confidences first and reset so target/regulator become columns
    melted_conf = utils.melt_and_reindex_dataframe(
        confidences,
        value_name=CONFIDENCE_COLUMN
    ).reset_index()

    # Melt the gold standard and outer-join it onto the confidence rows
    melted_gs = utils.melt_and_reindex_dataframe(
        gs,
        value_name=GOLD_STANDARD_COLUMN
    )

    return melted_conf.join(
        melted_gs,
        on=[TARGET_COLUMN, REGULATOR_COLUMN],
        how='outer'
    )
def process_network(metric, priors, confidence_threshold=0, beta_threshold=None, extra_columns=None):
    """
    Process rank-summed results into a network data frame

    :param metric: RankSummingMetric
        The rank-sum object with the math in it
    :param priors: pd.DataFrame [G x K]
        Prior data
    :param confidence_threshold: numeric
        The minimum confidence score needed to write a network edge
    :param beta_threshold: pd.DataFrame [G x K]
        The thresholded betas to include in the network. If None, include everything.
    :param extra_columns: dict(col_name: pd.DataFrame [G x K])
        Any additional data to include, keyed by column name and indexable with row and column names
    :return network_data: pd.DataFrame [(G*K) x 7+]
        Network edge dataframe
    """
    assert check.argument_type(metric, RankSummingMetric)
    assert check.argument_type(priors, pd.DataFrame, allow_none=True)
    assert check.argument_type(beta_threshold, pd.DataFrame, allow_none=True)
    assert check.argument_numeric(confidence_threshold, 0, 1)

    # Get the combined confidences and subset for confidence threshold
    network_data = metric.confidence_dataframe()
    network_data = network_data.loc[network_data[CONFIDENCE_COLUMN] > confidence_threshold, :]

    # If beta_threshold has been provided, melt and join it to the network data
    # Then discard anything which isn't meeting the threshold
    # BUG FIX: the original condition was `beta_threshold is not None and False`,
    # which made this branch unreachable and silently ignored beta_threshold.
    if beta_threshold is not None:
        beta_data = utils.melt_and_reindex_dataframe(beta_threshold, BETA_THRESHOLD_COLUMN)
        network_data = network_data.join(beta_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
        # Keep only edges whose thresholded beta flag is exactly 1
        network_data = network_data.loc[network_data[BETA_THRESHOLD_COLUMN] == 1, :]
        del network_data[BETA_THRESHOLD_COLUMN]

    # Attach prior values if a prior matrix was given
    if priors is not None:
        prior_data = utils.melt_and_reindex_dataframe(priors, PRIOR_COLUMN)
        network_data = network_data.join(prior_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

    # Add any extra columns as needed (sorted for deterministic column order)
    if extra_columns is not None:
        for k in sorted(extra_columns.keys()):
            extra_data = utils.melt_and_reindex_dataframe(extra_columns[k], k)
            network_data = network_data.join(extra_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

    # Make sure all missing values are NaN
    network_data[pd.isnull(network_data)] = np.nan

    return network_data
def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard'):
    """
    Take rankable data and process it into confidence scores which are stored in this object

    :param rankable_data: list(pd.DataFrame) [B x [G x K]]
        A list of numeric dataframes (with identical axes)
    :param gold_standard: pd.DataFrame [G x K]
        A dataframe which corresponds to known, gold-standard data
    :param filter_method: str
        The method of aligning the gold standard and confidence edges for comparison;
        must be a key of self.filter_method_lookup
    """
    # Get the filtering method (resolve the lookup key to a bound method)
    assert check.argument_enum(filter_method, self.filter_method_lookup.keys())
    self.filter_method = getattr(self, self.filter_method_lookup[filter_method])

    # Explicitly cast the gold standard data to a boolean array [0,1]
    gold_standard = (gold_standard != 0).astype(int)
    self.gold_standard = gold_standard

    # Calculate confidences based on the ranked data
    self.all_confidences = self.compute_combined_confidences(rankable_data)

    # Convert the confidence data to long format
    confidence_data = utils.melt_and_reindex_dataframe(
        self.all_confidences, CONFIDENCE_COLUMN, idx_name=TARGET_COLUMN, col_name=REGULATOR_COLUMN)

    # Attach the gold standard
    confidence_data = self.attach_gs_to_confidences(
        confidence_data, gold_standard)

    # Sort by confidence (descending) and reset the index
    self.confidence_data = confidence_data.sort_values(
        by=CONFIDENCE_COLUMN, ascending=False, na_position='last')
    self.confidence_data.reset_index(inplace=True)

    # Filter the gold standard and confidences down to a format that can be directly compared
    # NOTE(review): gold_standard.shape[0] is the row (gene) count of the wide
    # [G x K] frame, not an edge count — confirm whether this message is intended
    utils.Debug.vprint("GS: {gs} edges, Confidences: {conf} edges".format(
        gs=gold_standard.shape[0], conf=self.confidence_data.shape[0]), level=0)
    self.filtered_data = self.filter_method(GOLD_STANDARD_COLUMN, CONFIDENCE_COLUMN, self.confidence_data)

    # BUG FIX: level=0 was previously passed inside str.format(...) (where it
    # was silently ignored) instead of to vprint itself
    utils.Debug.vprint("Filtered data to {e} edges".format(
        e=self.filtered_data.shape[0]), level=0)
def attach_gs_to_confidences(confidence_data, gold_standard):
    """
    Outer join the gold standard into the confidence data

    :param confidence_data: pd.DataFrame [G*K x n]
        Long-format confidence scores keyed by target/regulator
    :param gold_standard: pd.DataFrame [G x K]
        Wide-format gold standard to melt and attach
    :return: pd.DataFrame
        Confidence data with a gold-standard column joined on
    """
    # Melt the wide gold standard to long format, indexed by (target, regulator)
    melted_gs = utils.melt_and_reindex_dataframe(
        gold_standard,
        GOLD_STANDARD_COLUMN,
        idx_name=TARGET_COLUMN,
        col_name=REGULATOR_COLUMN
    )

    # Outer join keeps edges present in either the confidences or the gold standard
    return confidence_data.join(
        melted_gs,
        how='outer',
        on=[TARGET_COLUMN, REGULATOR_COLUMN]
    )