Beispiel #1
0
    def test_none(self):
        """Exercise check.arguments_not_none with acceptable and unacceptable numbers of None values."""

        # Expected to pass: no None at all, and a single None when num_none=1
        no_nones = check.arguments_not_none(("A", "B"))
        self.assertTrue(no_nones)
        single_none = check.arguments_not_none(("A", None), num_none=1)
        self.assertTrue(single_none)

        # Expected to raise ValueError: two None values, first with the
        # default num_none and then with an explicit num_none=0
        two_nones = (None, None, "A")
        for extra_kwargs in ({}, {"num_none": 0}):
            with self.assertRaises(ValueError):
                self.assertTrue(
                    check.arguments_not_none(two_nones, **extra_kwargs))
    def get_sample_index(self,
                         meta_data=None,
                         sample_ratio=None,
                         sample_size=None,
                         min_size=default.DEFAULT_MINIMUM_SAMPLE_SIZE,
                         stratified_sampling=None):
        """
        Produce an integer (positional) index that can be used with .iloc to resample data. If stratified
        sampling is enabled, sample separately from each group, as defined by the self.stratified_batch_lookup
        column of the meta data, and concatenate the per-group samples.

        Sampling is done with np.random.choice, with or without replacement according to
        self.sample_with_replacement.

        :param meta_data: pd.DataFrame [N x ?]
            Data frame to sample from. If this is not set, stratified sampling uses self.meta_data and
            non-stratified sampling draws from the self.expression_matrix.shape[1] data points instead.
        :param sample_ratio: float
            Sample to this proportion of the data points (of each group, when stratified). Takes precedence
            over sample_size when it is not None.
        :param sample_size: int
            Sample to this absolute number of data points. If sampling from each stratified group,
            this is the absolute number of data points PER GROUP (not total). Only used when sample_ratio is None.
        :param min_size: int
            Lower bound on the number of data points drawn when sample_ratio is used. It is not applied to
            sample_size.
        :param stratified_sampling: bool
            Override for self.stratified_sampling. The instance flag is used when this is None.
        :return new_idx: np.ndarray
            Integer positions of the sampled data points. For stratified sampling the positions are grouped
            by batch, in order of first appearance of each batch label.

        NOTE: np.random.choice raises a ValueError if more data points are requested than are available while
        sampling without replacement (this can happen when min_size or sample_size exceeds a group's size).
        """

        # Sanity check inputs
        assert check.arguments_not_none((sample_ratio, sample_size),
                                        num_none=1)
        assert check.argument_numeric(sample_ratio, low=0, allow_none=True)
        assert check.argument_numeric(sample_size, low=0, allow_none=True)

        # An explicitly passed flag overrides the instance-level setting
        if stratified_sampling is None:
            stratified_sampling = self.stratified_sampling

        if stratified_sampling:
            # Use the main meta_data if there's nothing given
            if meta_data is None:
                meta_data = self.meta_data

            # Only the batch column is needed. Renumber its rows 0..N-1 so that the index labels of the
            # selected rows are also their positions for .iloc (this avoids copying the whole data frame)
            batch_labels = meta_data[self.stratified_batch_lookup].reset_index(drop=True)

            # Start with an empty integer array so the result is a well-typed array even with no batches
            sampled = [np.empty(0, dtype=int)]

            # For each factor in the batch column
            for batch in batch_labels.unique().tolist():
                # Get the integer index of the data points in this batch
                batch_idx = batch_labels.loc[batch_labels == batch].index.tolist()

                # Decide how many to collect from this batch
                if sample_ratio is None:
                    size = sample_size
                else:
                    size = max(int(len(batch_idx) * sample_ratio), min_size)

                # Resample this batch; the pieces are joined once after the loop
                sampled.append(
                    np.random.choice(batch_idx,
                                     size=size,
                                     replace=self.sample_with_replacement))

            return np.concatenate(sampled)

        # Decide how many to collect from the total expression matrix or the meta_data
        if meta_data is None:
            num_samples = self.expression_matrix.shape[1]
        else:
            num_samples = meta_data.shape[0]

        if sample_ratio is None:
            size = sample_size
        else:
            size = max(int(sample_ratio * num_samples), min_size)

        return np.random.choice(num_samples,
                                size=size,
                                replace=self.sample_with_replacement)