Example #1
def _split_axis(priors,
                split_ratio,
                axis=default.DEFAULT_CV_AXIS,
                seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split by axis labels on the chosen axis
    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
    :param axis: [0, 1]
    :param seed:
    :return:
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    pc = priors.shape[axis]
    pr_count = int((1 - split_ratio) * pc)  # Number of labels kept in the prior
    idx = _make_shuffled_index(pc, seed=seed)

    if axis == 0:
        axis_idx = priors.index
    elif axis == 1:
        axis_idx = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    pr_idx = axis_idx[idx[0:pr_count]]
    gs_idx = axis_idx[idx[pr_count:]]

    # Each output drops the labels assigned to the other
    priors_data = priors.drop(gs_idx, axis=axis)
    gold_standard = priors.drop(pr_idx, axis=axis)

    return priors_data, gold_standard
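For reference, a self-contained sketch of the same axis split on a toy prior matrix; numpy's permutation stands in for the module's _make_shuffled_index helper and the check calls are omitted, so this is an illustration rather than the library's code:

import numpy as np
import pandas as pd

priors = pd.DataFrame(np.ones((6, 3), dtype=int),
                      index=["g%d" % i for i in range(6)],
                      columns=["tf%d" % j for j in range(3)])
split_ratio, axis, seed = 0.5, 0, 42

pc = priors.shape[axis]
pr_count = int((1 - split_ratio) * pc)             # labels kept in the prior
idx = np.random.default_rng(seed).permutation(pc)  # stands in for _make_shuffled_index

axis_idx = priors.index if axis == 0 else priors.columns
priors_data = priors.drop(axis_idx[idx[pr_count:]], axis=axis)   # keeps 3 rows
gold_standard = priors.drop(axis_idx[idx[:pr_count]], axis=axis) # keeps the other 3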
Example #2
def split_for_cv(all_data,
                 split_ratio,
                 split_axis=default.DEFAULT_CV_AXIS,
                 seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
    crossvalidation splits of a gold standard

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param seed: int
        Seed for the random generator
    :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
        Returns a new prior and gold standard split from the original data according to split_ratio
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Split the priors into gold standard based on axis (flatten if axis=None)
    if split_axis is None:
        priors_data, gold_standard = _split_flattened(all_data,
                                                      split_ratio,
                                                      seed=seed)
    else:
        priors_data, gold_standard = _split_axis(all_data,
                                                 split_ratio,
                                                 axis=split_axis,
                                                 seed=seed)

    return priors_data, gold_standard
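The split_axis=None branch delegates to _split_flattened, which is not shown on this page. A minimal sketch of what flattened splitting could look like, assuming it partitions the nonzero entries of the matrix between the two outputs (a hypothetical reimplementation, not the library's code):

import numpy as np
import pandas as pd

def split_flattened_sketch(data, split_ratio, seed=42):
    # Partition the nonzero entries: a (1 - split_ratio) fraction stays in
    # the prior and the remainder is zeroed; the gold standard gets the
    # complementary entries
    arr = data.to_numpy().copy()
    flat_idx = np.flatnonzero(arr)
    np.random.default_rng(seed).shuffle(flat_idx)
    keep = int((1 - split_ratio) * flat_idx.size)

    prior_arr, gold_arr = arr.copy(), arr.copy()
    prior_arr.flat[flat_idx[keep:]] = 0  # entries handed to the gold standard
    gold_arr.flat[flat_idx[:keep]] = 0   # entries kept in the prior
    prior = pd.DataFrame(prior_arr, index=data.index, columns=data.columns)
    gold = pd.DataFrame(gold_arr, index=data.index, columns=data.columns)
    return prior, gold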
Example #3
    def test_enum(self):

        self.assertTrue(check.argument_enum("A", ("A", "B")))
        self.assertTrue(check.argument_enum(["A", "B", "A"], ("A", "B")))

        with self.assertRaises(ValueError):
            check.argument_enum(["A", "B", "C"], ("A", "B"))
Example #4
    def shuffle_priors(priors_data, shuffle_prior_axis, random_seed):
        """
        Shuffle the labels on the priors on a specific axis
        :param priors_data: pd.DataFrame [G x K]
            Prior data
        :param shuffle_prior_axis: int
            Axis to shuffle: 0 shuffles gene labels, 1 shuffles regulator labels, None skips shuffling
        :param random_seed: int
            Random seed
        :return priors_data: pd.DataFrame
            Returns priors_data with its labels shuffled on the chosen axis
        """

        assert check.argument_enum(shuffle_prior_axis, [0, 1], allow_none=True)

        if shuffle_prior_axis is None:
            return priors_data
        elif shuffle_prior_axis == 0:
            # Shuffle index (genes) in the priors_data
            utils.Debug.vprint("Randomly shuffling prior [{sh}] gene data".format(sh=priors_data.shape))
            prior_index = priors_data.index.tolist()
            priors_data = priors_data.sample(frac=1, axis=0, random_state=random_seed)
            priors_data.index = prior_index
        elif shuffle_prior_axis == 1:
            # Shuffle columns (TFs) in the priors_data
            utils.Debug.vprint("Randomly shuffling prior [{sh}] TF data".format(sh=priors_data.shape))
            prior_index = priors_data.columns.tolist()
            priors_data = priors_data.sample(frac=1, axis=1, random_state=random_seed)
            priors_data.columns = prior_index

        return priors_data
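The shuffle works by permuting the data with DataFrame.sample(frac=1) and then reattaching the original labels, which deliberately breaks the label-to-data mapping. A toy demonstration of that idiom:

import pandas as pd

df = pd.DataFrame({"tf1": [1, 0, 1]}, index=["g1", "g2", "g3"])

# Permute the rows, then restore the original index so the labels no
# longer match the data they started with (a null model for the prior)
shuffled = df.sample(frac=1, axis=0, random_state=7)
shuffled.index = df.index.tolist()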
Example #5
def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
    assert check.argument_type(betas, list)
    assert check.argument_type(betas[0], pd.DataFrame)
    assert check.dataframes_align(betas)
    assert check.argument_type(rescaled_betas, list)
    assert check.argument_type(rescaled_betas[0], pd.DataFrame)
    assert check.dataframes_align(rescaled_betas)
    assert check.argument_enum(filter_method, FILTER_METHODS, allow_none=True)
    assert check.argument_numeric(threshold, 0, 1, allow_none=True)
Example #6
def remove_prior_circularity(priors,
                             gold_standard,
                             split_axis=default.DEFAULT_CV_AXIS):
    """
    Remove all labels along the split axis that occur in the gold standard from the prior
    :param priors: pd.DataFrame [M x N]
    :param gold_standard: pd.DataFrame [m x n]
    :param split_axis: int (0,1)
    :return new_priors: pd.DataFrame
        The prior with any labels it shares with the gold standard dropped from the split axis
    :return gold_standard: pd.DataFrame [m x n]
    """

    check.argument_enum(split_axis, [0, 1])
    new_priors = priors.drop(gold_standard.axes[split_axis],
                             axis=split_axis,
                             errors='ignore')

    return new_priors, gold_standard
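A toy example of the drop; errors='ignore' means gold standard labels that are absent from the prior are silently skipped rather than raising a KeyError:

import pandas as pd

priors = pd.DataFrame(1, index=["g1", "g2", "g3", "g4"], columns=["tf1", "tf2"])
gold_standard = pd.DataFrame(1, index=["g2", "g4", "g9"], columns=["tf1", "tf2"])

# Drop the gold standard's row labels from the prior; "g9" is not in
# the prior and is ignored
new_priors = priors.drop(gold_standard.axes[0], axis=0, errors="ignore")
print(new_priors.index.tolist())  # ['g1', 'g3']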
Example #7
    def cross_validate_gold_standard(priors_data, gold_standard, cv_split_axis,
                                     cv_split_ratio, random_seed):
        """
        Sample the gold standard for crossvalidation, and then remove the new gold standard from the priors (if split
        on an axis)

        :param priors_data: pd.DataFrame [G x K]
            Prior data
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard data
        :param cv_split_ratio: float
            The proportion of the priors that should go into the gold standard
        :param cv_split_axis: int
            Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
            Note that if this is None, the returned gold standard is the input gold standard unchanged, and the
            priors are replaced with a downsampled copy of the gold standard
        :param random_seed: int
            Random seed
        :return priors_data, gold_standard: pd.DataFrame [G x K], pd.DataFrame [G x K]
        """

        assert check.argument_enum(cv_split_axis, (0, 1), allow_none=True)
        assert check.argument_numeric(cv_split_ratio, low=0, high=1)

        if cv_split_axis == 1:
            utils.Debug.vprint(
                "Selecting cv_split_axis of 1 is possible but a very bad idea",
                level=1)

        utils.Debug.vprint("Resampling GS ({gs}) for crossvalidation".format(
            gs=gold_standard.shape),
                           level=0)
        gs_to_prior, gold_standard = ManagePriors._split_for_cv(
            gold_standard,
            cv_split_ratio,
            split_axis=cv_split_axis,
            seed=random_seed)

        # If the priors are split on an axis, remove circularity
        if cv_split_axis is not None:
            priors_data, gold_standard = ManagePriors._remove_prior_circularity(
                priors_data, gold_standard, split_axis=cv_split_axis)
        else:
            if priors_data is not None:
                utils.Debug.vprint(
                    "Existing prior is being replaced with a downsampled gold standard"
                )
            priors_data = gs_to_prior

        _msg = "CV prior {pr} [{pr_x}] and gold standard {gs} [{gs_x}]"
        utils.Debug.vprint(_msg.format(pr=priors_data.shape,
                                       gs=gold_standard.shape,
                                       pr_x=(priors_data != 0).sum().sum(),
                                       gs_x=(gold_standard != 0).sum().sum()),
                           level=0)

        return priors_data, gold_standard
Example #8
def validate_init_args(betas, rescaled_betas, threshold=None, filter_method=None, metric=None):
    assert check.argument_type(betas, list)
    assert check.argument_list_type(betas, list)
    assert check.argument_list_type(betas[0], pd.DataFrame)
    assert check.argument_type(rescaled_betas, list)
    assert check.argument_list_type(rescaled_betas, list)
    assert check.argument_list_type(rescaled_betas[0], pd.DataFrame)
    assert all([check.dataframes_align(b_task + bresc_task) for b_task, bresc_task in zip(betas, rescaled_betas)])
    assert check.argument_enum(filter_method, results_processor.FILTER_METHODS, allow_none=True)
    assert check.argument_numeric(threshold, 0, 1, allow_none=True)
Example #9
    def __init__(self,
                 rankable_data,
                 gold_standard,
                 filter_method='keep_all_gold_standard'):
        """
        Take rankable data and process it into confidence scores which are stored in this object
        :param rankable_data: list(pd.DataFrame) [B x [G x K]]
            A list of numeric dataframes (with identical axes)
        :param gold_standard: pd.DataFrame [G x K]
            A dataframe which corresponds to known, gold-standard data
        :param filter_method: str
            The method of aligning the confidence scores with the gold standard before scoring
        """

        # Get the filtering method
        assert check.argument_enum(filter_method,
                                   self.filter_method_lookup.keys())
        self.filter_method = getattr(self,
                                     self.filter_method_lookup[filter_method])

        # Explicitly cast the gold standard data to a boolean array [0,1]
        gold_standard = (gold_standard != 0).astype(int)
        self.gold_standard = gold_standard

        # Calculate confidences based on the ranked data
        self.all_confidences = self.compute_combined_confidences(rankable_data)

        # Convert the confidence data to long format
        confidence_data = utils.melt_and_reindex_dataframe(
            self.all_confidences,
            CONFIDENCE_COLUMN,
            idx_name=TARGET_COLUMN,
            col_name=REGULATOR_COLUMN)

        # Attach the gold standard
        confidence_data = self.attach_gs_to_confidences(
            confidence_data, gold_standard)

        # Sort by confidence (descending) and reset the index
        self.confidence_data = confidence_data.sort_values(
            by=CONFIDENCE_COLUMN, ascending=False, na_position='last')
        self.confidence_data.reset_index(inplace=True)

        # Filter the gold standard and confidences down to a format that can be directly compared
        utils.Debug.vprint("GS: {gs} edges, Confidences: {conf} edges".format(
            gs=gold_standard.shape[0], conf=self.confidence_data.shape[0]),
                           level=0)

        self.filtered_data = self.filter_method(GOLD_STANDARD_COLUMN,
                                                CONFIDENCE_COLUMN,
                                                self.confidence_data)
        utils.Debug.vprint("Filtered data to {e} edges".format(
            e=self.filtered_data.shape[0], level=0))
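utils.melt_and_reindex_dataframe, used above to convert the wide confidence matrix to long format, is not shown on this page. A sketch of equivalent pandas, with idx_name and col_name playing the roles of TARGET_COLUMN and REGULATOR_COLUMN (a hypothetical stand-in, not the library's implementation):

import pandas as pd

def melt_and_reindex_sketch(wide, value_name, idx_name="target", col_name="regulator"):
    # Wide [G x K] matrix -> long frame with one row per (target, regulator) pair
    long_frame = wide.stack().reset_index()
    long_frame.columns = [idx_name, col_name, value_name]
    return long_frame.set_index([idx_name, col_name])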
Example #10
    def compute_combined_confidences(rankable_data, **kwargs):
        """
        Calculate combined confidences from rank sum
        :param rankable_data: list(pd.DataFrame) R x [M x N]
            List of dataframes which have the same axes and need to be rank summed
        :return combine_conf: pd.DataFrame [M x N]
        """

        rank_method = kwargs.pop("rank_method", "sum")
        assert check.argument_enum(rank_method,
                                   ("sum", "threshold_sum", "max", "geo_mean"))
        assert check.argument_type(rankable_data, list, allow_none=False)

        if rank_method == "sum":
            return RankSumming.rank_sum(rankable_data)
        elif rank_method == "threshold_sum":
            return RankSumming.rank_sum_threshold(rankable_data,
                                                  data_threshold=kwargs.pop(
                                                      "data_threshold", 0.9))
        elif rank_method == "max":
            return RankSumming.rank_max_value(rankable_data)
        elif rank_method == "geo_mean":
            return RankSumming.rank_geo_mean(rankable_data)
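RankSumming.rank_sum itself is not shown on this page. A minimal sketch of one way rank summing can work, assuming the dataframes share axes and that summed ranks are min-max rescaled into confidences (a hypothetical stand-in, not the library's implementation):

import pandas as pd

def rank_sum_sketch(rankable_data):
    # Rank the flattened values of each matrix, sum the ranks across
    # matrices, then rescale the sums into [0, 1] confidences
    combined = sum(df.stack().rank() for df in rankable_data)
    rescaled = (combined - combined.min()) / (combined.max() - combined.min())
    return rescaled.unstack()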
Example #11
    def __init__(self,
                 betas,
                 rescaled_betas,
                 threshold=0.5,
                 filter_method='overlap'):
        """
        :param betas: list(pd.DataFrame[G x K])
        :param rescaled_betas: list(pd.DataFrame[G x K])
        :param threshold: float
        :param filter_method: str
            How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter)
        """

        assert check.dataframes_align(betas)
        self.betas = betas

        assert check.dataframes_align(rescaled_betas)
        self.rescaled_betas = rescaled_betas

        assert check.argument_enum(filter_method, FILTER_METHODS)
        self.filter_method = filter_method

        assert check.argument_numeric(threshold, 0, 1)
        self.threshold = threshold
Example #12
    def _split_for_cv(all_data,
                      split_ratio,
                      split_axis=DEFAULT_CV_AXIS,
                      seed=DEFAULT_SEED):
        """
        Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
        crossvalidation splits of a gold standard.

        :param all_data: pd.DataFrame [G x K]
            Existing prior or gold standard data
        :param split_ratio: float
            The proportion of the priors that should go into the gold standard
        :param split_axis: int
            Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
            Note that if this is None, the returned gold standard will be the same as all_data, and the priors will
            be a downsampled copy of all_data
        :param seed: int
            Seed for the random generator
        :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
            Returns a new prior and gold standard split from the original data according to split_ratio
        """

        assert check.argument_numeric(split_ratio, 0, 1)
        assert check.argument_enum(split_axis, [0, 1], allow_none=True)

        # Split the priors into gold standard based on axis (flatten if axis=None)
        if split_axis is None:
            priors_data, _ = ManagePriors._split_flattened(all_data,
                                                           split_ratio,
                                                           seed=seed)
            gold_standard = all_data
        else:
            priors_data, gold_standard = ManagePriors._split_axis(
                all_data, split_ratio, axis=split_axis, seed=seed)

        return priors_data, gold_standard
Example #13
    def __init__(self,
                 rankable_data,
                 gold_standard,
                 filter_method='keep_all_gold_standard',
                 rank_method="sum"):

        assert check.argument_enum(filter_method,
                                   self.filter_method_lookup.keys())
        self.filter_method = getattr(self,
                                     self.filter_method_lookup[filter_method])

        # Calculate confidences based on the ranked data
        self.all_confidences = self.compute_combined_confidences(
            rankable_data, rank_method=rank_method)

        # Filter the gold standard and confidences down to a format that can be directly compared
        utils.Debug.vprint("GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)
        self.gold_standard, self.filtered_confidences = self.filter_method(
            gold_standard, self.all_confidences)
        utils.Debug.vprint("Filtered to GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)