Example 1
def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split by axis labels on the chosen axis
    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
    :param axis: [0, 1]
    :param seed:
    :return:
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    # gs_count is the number of labels retained in the new prior ((1 - split_ratio) of the total);
    # the remaining labels go to the gold standard
    pc = priors.shape[axis]
    gs_count = int((1 - split_ratio) * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    if axis == 0:
        axis_idx = priors.index
    elif axis == 1:
        axis_idx = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    pr_idx = axis_idx[idx[0:gs_count]]
    gs_idx = axis_idx[idx[gs_count:]]

    priors_data = priors.drop(gs_idx, axis=axis)
    gold_standard = priors.drop(pr_idx, axis=axis)

    return priors_data, gold_standard
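A short usage sketch for this helper; the 4 x 3 prior matrix below is invented, and _split_axis is assumed to be importable along with its check, default, and _make_shuffled_index dependencies:

import numpy as np
import pandas as pd

toy_priors = pd.DataFrame(np.eye(4, 3),
                          index=["gene1", "gene2", "gene3", "gene4"],
                          columns=["tf1", "tf2", "tf3"])

# Split the rows 50/50; the two outputs partition the row labels of toy_priors
prior_half, gold_half = _split_axis(toy_priors, 0.5, axis=0)
assert len(prior_half.index.intersection(gold_half.index)) == 0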
Example 2
def _split_flattened(data, split_ratio, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Instead of splitting by axis labels, split edges and ignore axes
    :param data: pd.DataFrame [M x N]
    :param split_ratio: float
    :param seed: int
        Random seed for shuffling the edge order
    :return priors_data: pd.DataFrame [M x N]
    :return gold_standard: pd.DataFrame [M x N]
    """

    check.argument_numeric(split_ratio, 0, 1)

    # Count the nonzero edges and build a shuffled index over them
    pc = np.sum(data.values != 0)
    gs_count = int(split_ratio * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    # Take two copies of the nonzero edge values and zero out complementary subsets:
    # the first gs_count shuffled edges are removed from the prior copy,
    # the remaining edges are removed from the gold standard copy
    pr_idx = data.values[data.values != 0].copy()
    gs_idx = data.values[data.values != 0].copy()

    pr_idx[idx[0:gs_count]] = 0
    gs_idx[idx[gs_count:]] = 0

    # Write the masked edge values back into full [M x N] copies of the original data
    gs = data.values.copy()
    pr = data.values.copy()

    gs[gs != 0] = gs_idx
    pr[pr != 0] = pr_idx

    priors_data = pd.DataFrame(pr, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gs, index=data.index, columns=data.columns)

    return priors_data, gold_standard
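The flattened split keeps the full [M x N] shape in both outputs and instead divides the nonzero edges between them; a sketch under the same import assumptions as above, with another invented matrix:

import numpy as np
import pandas as pd

toy_edges = pd.DataFrame([[1, 0, 1], [0, 1, 1]],
                         index=["gene1", "gene2"],
                         columns=["tf1", "tf2", "tf3"])

prior_half, gold_half = _split_flattened(toy_edges, 0.5)

# Both outputs keep the 2 x 3 shape, and no nonzero edge appears in both of them
assert prior_half.shape == gold_half.shape == toy_edges.shape
assert not ((prior_half.values != 0) & (gold_half.values != 0)).any()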
Example 3
def split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
    crossvalidation splits of a gold standard

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :return prior_data, gold_standard: pd.DataFrame [G/2 x K], pd.DataFrame [G/2 x K]
        Returns a new prior and gold standard by splitting the old one in half
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Split the priors into gold standard based on axis (flatten if axis=None)
    if split_axis is None:
        priors_data, gold_standard = _split_flattened(all_data, split_ratio, seed=seed)
    else:
        priors_data, gold_standard = _split_axis(all_data, split_ratio, axis=split_axis, seed=seed)

    return priors_data, gold_standard
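split_for_cv only dispatches between the two helpers above; a short sketch, reusing the invented toy_priors frame from the first example:

# Label split on rows vs. edge-wise split of the same invented prior matrix
row_prior, row_gs = split_for_cv(toy_priors, 0.5, split_axis=0)
edge_prior, edge_gs = split_for_cv(toy_priors, 0.5, split_axis=None)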
Example 4
    def test_numeric(self):

        self.assertTrue(check.argument_numeric(0))
        self.assertTrue(check.argument_numeric(0.0))

        with self.assertRaises(ValueError):
            check.argument_numeric("0")

        self.assertTrue(check.argument_numeric(1, 0, 2))

        with self.assertRaises(ValueError):
            self.assertTrue(check.argument_numeric(2, 0, 1))

        self.assertTrue(check.argument_numeric(None, allow_none=True))
Example 5
    def __init__(self,
                 betas,
                 rescaled_betas,
                 threshold=0.5,
                 filter_method='overlap'):
        """
        :param betas: list(pd.DataFrame[G x K])
        :param rescaled_betas: list(pd.DataFrame[G x K])
        :param threshold: float
        :param filter_method: str
            How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter)
        """

        assert check.dataframes_align(betas)
        self.betas = betas

        assert check.dataframes_align(rescaled_betas)
        self.rescaled_betas = rescaled_betas

        assert check.argument_enum(filter_method, FILTER_METHODS)
        self.filter_method = filter_method

        assert check.argument_numeric(threshold, 0, 1)
        self.threshold = threshold
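A call sketch for this constructor; ResultsProcessor is a placeholder name for the class that owns this __init__, and the beta DataFrames are invented bootstrap results:

import numpy as np
import pandas as pd

genes = ["gene%d" % i for i in range(5)]
tfs = ["tf%d" % i for i in range(3)]
betas = [pd.DataFrame(np.random.rand(5, 3), index=genes, columns=tfs) for _ in range(10)]
rescaled = [b / b.values.max() for b in betas]

# Placeholder class name; threshold and filter_method shown with the documented defaults
processor = ResultsProcessor(betas, rescaled, threshold=0.5, filter_method='overlap')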
Example 6
    def save_network_to_tsv(pr_calc,
                            priors,
                            output_dir,
                            confidence_threshold=0,
                            output_file_name="network.tsv",
                            beta_threshold=None,
                            extra_columns=None):
        """
        Create a network file and save it
        :param pr_calc: RankSummaryPR
            The rank-sum object with the math in it
        :param priors: pd.DataFrame [G x K]
            Prior data
        :param output_dir: str
            The path to the output file. If None, don't save anything
        :param confidence_threshold: numeric
            The minimum confidence score needed to write a network edge
        :param output_file_name: str
            The output file name. If None, don't save anything
        :param beta_threshold: pd.DataFrame [G x K]
            The thresholded betas to include in the network. If None, include everything.
        :param extra_columns: dict(col_name: pd.DataFrame [G x K])
            Any additional data to include, keyed by column name and indexable with row and column names
        """

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_type(priors, pd.DataFrame)
        assert check.argument_type(beta_threshold,
                                   pd.DataFrame,
                                   allow_none=True)
        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(output_file_name, str, allow_none=True)
        assert check.argument_numeric(confidence_threshold, 0, 1)

        if output_dir is None or output_file_name is None:
            return False

        header = [
            'regulator', 'target', 'combined_confidences', 'prior',
            'gold.standard', 'precision', 'recall'
        ]
        if extra_columns is not None:
            header += [k for k in sorted(extra_columns.keys())]

        output_list = [header]

        recall_data, precision_data = pr_calc.dataframe_recall_precision()

        for row_name, column_name, conf in pr_calc.confidence_ordered_generator():
            if conf < confidence_threshold:
                continue

            if beta_threshold is not None and not beta_threshold.loc[row_name, column_name]:
                continue

            row_data = [column_name, row_name, conf]

            # Add prior value (or nan if the priors does not cover this interaction)
            if row_name in priors.index and column_name in priors.columns:
                row_data += [priors.loc[row_name, column_name]]
            else:
                row_data += [np.nan]

            # Add gold standard, precision, and recall (or nan if the gold standard does not cover this interaction)
            if row_name in pr_calc.gold_standard.index and column_name in pr_calc.gold_standard.columns:
                row_data += [
                    pr_calc.gold_standard.loc[row_name, column_name],
                    precision_data.loc[row_name, column_name],
                    recall_data.loc[row_name, column_name]
                ]
            else:
                row_data += [np.nan, np.nan, np.nan]

            if extra_columns is not None:
                for k in sorted(extra_columns.keys()):
                    if row_name in extra_columns[k].index and column_name in extra_columns[k].columns:
                        row_data += [extra_columns[k].loc[row_name, column_name]]
                    else:
                        row_data += [np.nan]

            output_list.append(row_data)

        with open(os.path.join(output_dir, output_file_name), 'w') as myfile:
            wr = csv.writer(myfile, delimiter='\t')
            for row in output_list:
                wr.writerow(row)
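A hedged call sketch; rank_summary stands in for an already-computed RankSummaryPR, prior_matrix for the [G x K] prior used upstream, and variance_explained for an extra [G x K] DataFrame, with the call written as a plain function call on the assumption that this is exposed as a static method:

# Keep only edges whose combined confidence is at least 0.9 and write them to ./output/network.tsv
save_network_to_tsv(rank_summary,
                    prior_matrix,
                    output_dir="./output",
                    confidence_threshold=0.9,
                    extra_columns={"var_exp": variance_explained})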
Example 7
    def get_sample_index(self,
                         meta_data=None,
                         sample_ratio=None,
                         sample_size=None,
                         min_size=default.DEFAULT_MINIMUM_SAMPLE_SIZE,
                         stratified_sampling=None):
        """
        Produce an integer index to sample data using .iloc. If the self.stratified_sampling flag is True, sample
        separately from each group, as defined by the self.stratified_batch_lookup column.
        :param meta_data: pd.DataFrame [N x ?]
            Data frame to sample from. Use self.meta_data if this is not set.
        :param sample_ratio: float
            Sample expression_matrix to this proportion of data points
        :param sample_size: int
            Sample expression matrix to this absolute number of data points. If sampling from each stratified group,
            this is the absolute number of data points PER GROUP (not total)
        :return new_size, new_idx: int, np.ndarray
            Return the total number of
        """

        # Sanity check inputs
        assert check.arguments_not_none((sample_ratio, sample_size),
                                        num_none=1)
        assert check.argument_numeric(sample_ratio, low=0, allow_none=True)
        assert check.argument_numeric(sample_size, low=0, allow_none=True)

        stratified_sampling = stratified_sampling if stratified_sampling is not None else self.stratified_sampling

        if stratified_sampling:
            # Use the main meta_data if there's nothing given
            meta_data = meta_data if meta_data is not None else self.meta_data

            # Copy and reindex the meta_data so that the index can be used with iloc
            meta_data = meta_data.copy()
            meta_data.index = pd.Index(range(meta_data.shape[0]))
            new_idx = np.ndarray(0, dtype=int)

            # For each factor in the batch column
            for batch in meta_data[self.stratified_batch_lookup].unique().tolist():
                # Get the integer index of the data points in this batch
                in_batch = meta_data[self.stratified_batch_lookup] == batch
                batch_idx = meta_data.loc[in_batch, :].index.tolist()

                # Decide how many to collect from this batch
                size = sample_size if sample_ratio is None else max(
                    int(len(batch_idx) * sample_ratio), min_size)

                # Resample and append the new sample index to the index array
                new_idx = np.append(
                    new_idx,
                    np.random.choice(batch_idx,
                                     size=size,
                                     replace=self.sample_with_replacement))
            return new_idx
        else:
            # Decide how many to collect from the total expression matrix or the meta_data
            num_samples = self.expression_matrix.shape[1] if meta_data is None else meta_data.shape[0]
            size = sample_size if sample_ratio is None else max(
                int(sample_ratio * num_samples), min_size)
            return np.random.choice(num_samples,
                                    size=size,
                                    replace=self.sample_with_replacement)
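The returned index is positional, so it pairs with .iloc; a sketch where workflow is a placeholder for an instance of the class that defines this method and whose expression matrix stores samples in columns:

# Stratified bootstrap drawing half of the data points from each batch
sample_idx = workflow.get_sample_index(sample_ratio=0.5)

resampled_expression = workflow.expression_matrix.iloc[:, sample_idx]
resampled_meta = workflow.meta_data.iloc[sample_idx, :]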