Example 1
    def test_enum(self):

        self.assertTrue(check.argument_enum("A", ("A", "B")))
        self.assertTrue(check.argument_enum(["A", "B", "A"], ("A", "B")))

        with self.assertRaises(ValueError):
            check.argument_enum(["A", "B", "C"], ("A", "B"))
Example 2
def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split by axis labels on the chosen axis
    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
    :param axis: [0, 1]
    :param seed:
    :return:
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    pc = priors.shape[axis]
    gs_count = int((1 - split_ratio) * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    if axis == 0:
        axis_idx = priors.index
    elif axis == 1:
        axis_idx = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    pr_idx = axis_idx[idx[0:gs_count]]
    gs_idx = axis_idx[idx[gs_count:]]

    priors_data = priors.drop(gs_idx, axis=axis)
    gold_standard = priors.drop(pr_idx, axis=axis)

    return priors_data, gold_standard
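A self-contained toy run of the same row split, with numpy's permutation standing in for the module's _make_shuffled_index helper (which is not shown here) and the check calls omitted:

import numpy as np
import pandas as pd

priors = pd.DataFrame(np.eye(4, dtype=int),
                      index=["g1", "g2", "g3", "g4"],
                      columns=["t1", "t2", "t3", "t4"])

split_ratio, axis, seed = 0.5, 0, 42
pc = priors.shape[axis]
gs_count = int((1 - split_ratio) * pc)              # 2 labels stay in the prior
idx = np.random.default_rng(seed).permutation(pc)   # stand-in for _make_shuffled_index

axis_idx = priors.index if axis == 0 else priors.columns
pr_idx = axis_idx[idx[0:gs_count]]
gs_idx = axis_idx[idx[gs_count:]]

priors_data = priors.drop(gs_idx, axis=axis)        # 2 x 4 prior
gold_standard = priors.drop(pr_idx, axis=axis)      # 2 x 4 gold standard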
Example 3
def split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
    cross-validation splits of a gold standard.

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param seed: int
        Random seed for the split
    :return priors_data, gold_standard: pd.DataFrame [G/2 x K], pd.DataFrame [G/2 x K]
        Returns a new prior and gold standard by splitting the old one in half
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Split the priors into gold standard based on axis (flatten if axis=None)
    if split_axis is None:
        priors_data, gold_standard = _split_flattened(all_data, split_ratio, seed=seed)
    else:
        priors_data, gold_standard = _split_axis(all_data, split_ratio, axis=split_axis, seed=seed)

    return priors_data, gold_standard
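A usage sketch, assuming the functions above live in an importable module (the module name cv_split is an assumption for illustration):

import numpy as np
import pandas as pd
from cv_split import split_for_cv   # assumed module name

gs = pd.DataFrame((np.random.rand(10, 5) > 0.5).astype(int),
                  index=["g{}".format(i) for i in range(10)],
                  columns=["tf{}".format(j) for j in range(5)])

# Split whole rows: half of the gene labels become the new gold standard
priors_data, gold_standard = split_for_cv(gs, 0.5, split_axis=0, seed=42)

# Split individual entries instead of whole rows or columns
priors_flat, gold_flat = split_for_cv(gs, 0.5, split_axis=None, seed=42)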
Example 4
def remove_prior_circularity(priors, gold_standard, split_axis=default.DEFAULT_CV_AXIS):
    """
    Remove all labels on the chosen axis that occur in the gold standard from the prior
    :param priors: pd.DataFrame [M x N]
    :param gold_standard: pd.DataFrame [m x n]
    :param split_axis: int (0,1)
    :return new_priors: pd.DataFrame [M-m x N]
    :return gold_standard: pd.DataFrame [m x n]
    """

    check.argument_enum(split_axis, [0, 1])
    new_priors = priors.drop(gold_standard.axes[split_axis], axis=split_axis, errors='ignore')

    return new_priors, gold_standard
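The working part of remove_prior_circularity is pandas' label-based drop with errors='ignore', which silently skips gold-standard labels that never appear in the prior. A toy demonstration of that behavior:

import pandas as pd

priors = pd.DataFrame(1, index=["g1", "g2", "g3", "g4"], columns=["t1"])
gold = pd.DataFrame(1, index=["g3", "g4", "g5"], columns=["t1"])

# g3 and g4 are removed because they occur in the gold standard; g5 is
# ignored because it was never in the prior to begin with
new_priors = priors.drop(gold.axes[0], axis=0, errors="ignore")
print(new_priors.index.tolist())   # ['g1', 'g2']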
    def __init__(self,
                 rankable_data,
                 gold_standard,
                 filter_method='keep_all_gold_standard',
                 rank_method="sum"):

        assert check.argument_enum(filter_method,
                                   self.filter_method_lookup.keys())
        self.filter_method = getattr(self,
                                     self.filter_method_lookup[filter_method])

        # Calculate confidences based on the ranked data
        self.all_confidences = self.compute_combined_confidences(
            rankable_data, rank_method=rank_method)

        # Filter the gold standard and confidences down to a format that can be directly compared
        utils.Debug.vprint("GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)
        self.gold_standard, self.filtered_confidences = self.filter_method(
            gold_standard, self.all_confidences)
        utils.Debug.vprint("Filtered to GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)

        # Calculate the precision and recall and save the index that sorts the ranked confidences (filtered)
        self.recall, self.precision, self.ranked_idx = self.calculate_precision_recall(
            self.filtered_confidences, self.gold_standard)
        self.aupr = self.calculate_aupr(self.recall, self.precision)
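The calculate_precision_recall and calculate_aupr methods are not part of this snippet. For orientation, the quantities they name are conventionally computed as in the numpy sketch below; this is a generic illustration under that assumption, not the class's own implementation.

import numpy as np

def precision_recall_sketch(confidences, gold_standard):
    # Walk down the predictions in order of decreasing confidence and
    # accumulate true positives against the boolean gold standard
    order = np.argsort(confidences.ravel())[::-1]
    truth = gold_standard.ravel()[order].astype(bool)
    tp = np.cumsum(truth)
    precision = tp / np.arange(1, truth.size + 1)
    recall = tp / truth.sum()
    return recall, precision, order

def aupr_sketch(recall, precision):
    # Step-wise integration: precision at each point weighted by the recall gained there
    recall_gain = np.diff(recall, prepend=0.0)
    return float(np.sum(recall_gain * precision))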
    @staticmethod
    def compute_combined_confidences(rankable_data, **kwargs):
        """
        Calculate combined confidences from rank sum
        :param rankable_data: list(pd.DataFrame) R x [M x N]
            List of dataframes which have the same axes and need to be rank summed
        :param rank_method: str
            Combination method passed as a keyword; one of "sum", "threshold_sum", "max", or "geo_mean" (default "sum")
        :return combine_conf: pd.DataFrame [M x N]
        """

        rank_method = kwargs.pop("rank_method", "sum")
        assert check.argument_enum(rank_method,
                                   ("sum", "threshold_sum", "max", "geo_mean"))

        if rank_method == "sum":
            return RankSummaryPR.rank_sum(rankable_data)
        elif rank_method == "threshold_sum":
            return RankSummaryPR.rank_sum_threshold(rankable_data,
                                                    data_threshold=kwargs.pop(
                                                        "data_threshold", 0.9))
        elif rank_method == "max":
            return RankSummaryPR.rank_max_value(rankable_data)
        elif rank_method == "geo_mean":
            return RankSummaryPR.rank_geo_mean(rankable_data)
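For the default "sum" branch, a rank-sum combination typically ranks every entry of each replicate frame jointly, sums those ranks across replicates, and rescales the result to [0, 1]. The sketch below illustrates that idea; it is not RankSummaryPR.rank_sum itself.

import numpy as np
import pandas as pd
from scipy.stats import rankdata

def rank_sum_sketch(rankable_data):
    first = rankable_data[0]
    summed = np.zeros(first.shape)
    for df in rankable_data:
        # rankdata flattens the frame, so every entry is ranked against every other
        summed += rankdata(df.values).reshape(df.shape)
    # Rescale so the largest combined rank reads as the highest confidence
    combined = (summed - summed.min()) / (summed.max() - summed.min())
    return pd.DataFrame(combined, index=first.index, columns=first.columns)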
    def __init__(self,
                 betas,
                 rescaled_betas,
                 threshold=0.5,
                 filter_method='overlap'):
        """
        :param betas: list(pd.DataFrame[G x K])
        :param rescaled_betas: list(pd.DataFrame[G x K])
        :param threshold: float
        :param filter_method: str
            How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter)
        """

        assert check.dataframes_align(betas)
        self.betas = betas

        assert check.dataframes_align(rescaled_betas)
        self.rescaled_betas = rescaled_betas

        assert check.argument_enum(filter_method, FILTER_METHODS)
        self.filter_method = filter_method

        assert check.argument_numeric(threshold, 0, 1)
        self.threshold = threshold
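check.dataframes_align is asserted here on the lists of frames; a sketch of the kind of contract that name implies (an assumption about the helper, not its actual code) is that every frame shares the same index and columns.

# Assumed contract only: all frames align on both axes
def dataframes_align_sketch(frame_list):
    first = frame_list[0]
    for df in frame_list[1:]:
        if not (df.index.equals(first.index) and df.columns.equals(first.columns)):
            raise ValueError("DataFrames are not aligned on both axes")
    return True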