def test_subsample_counts_without_replacement(self):
        # Selecting 2 counts from the vector 1000 times yields each of the two
        # possible results at least once each.
        a = np.array([2, 0, 1])
        actual = set()
        for i in range(1000):
            obs = subsample_counts(a, 2)
            actual.add(tuple(obs))
        self.assertEqual(actual, {(1, 0, 1), (2, 0, 0)})

        obs = subsample_counts(a, 2)
        self.assertTrue(np.array_equal(obs, np.array([1, 0, 1])) or
                        np.array_equal(obs, np.array([2, 0, 0])))
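A minimal, self-contained sketch of the behavior this test relies on (it
assumes skbio is installed; the variable names are illustrative):

import numpy as np
from skbio.stats import subsample_counts

counts = np.array([2, 0, 1])
obs = subsample_counts(counts, 2)  # without replacement by default
assert obs.sum() == 2              # exactly n items are drawn
assert np.all(obs <= counts)       # no bin can exceed its original count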
Example #3
def rarefy_and_recode(filenames, rawCounts, samplingDepth):
    """
    Summary: subsamples all samples to the median (average of 100 times),  

    Args:
        filenames ()
        rawCounts ()
        samplingDepth ()

    Returns:
    """
    for i in range(len(rawCounts)):
        subsampleList = []
        if int(rawCounts[i].sum()) < samplingDepth:
            meanSubsample = rawCounts[i]
        else:
            for j in range(100):
                sample = subsample_counts(rawCounts[i].transpose().values[0],
                                          samplingDepth)
                subsampleList.append(sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            #recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        sampleName = filenames[i].split('.')[0]
        rawCounts[i][sampleName] = meanSubsample
        newFileName = sampleName + "_norm.csv"
        create_path('normalised_counts')
        rawCounts[i].to_csv(os.path.join('normalised_counts', newFileName))
        print("written " + newFileName + " to file.")
    return
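The core of the function above, reduced to a self-contained sketch on a toy
vector (the array and variable names here are illustrative, not from the
original project):

import numpy as np
from skbio.stats import subsample_counts

counts = np.array([50, 30, 0, 20])
depth = 60
reps = [subsample_counts(counts, depth) for _ in range(100)]
mean_subsample = np.mean(reps, axis=0)
# recodification as in the function: means below 1.01 are set to zero
mean_subsample[mean_subsample < 1.01] = 0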
Example #4
def replicatize(sample, reps=10):
    """
    Basically does subsampling without replacement.
    Calculates which sample has the highest abundance :math:`n`
    and obtains multiple samples of size :math:`n+1`

    Parameters
    ----------
    sample: np.array, int
        A count vector of abundances

    Returns
    -------
    mat: np.array, int
        A count matrix where
        rows = replicate samples
        columns = features
    """
    sample = np.array(sample)
    n = sample.max()

    mat = np.zeros((reps, len(sample)))
    for rep in range(reps):
        mat[rep, :] = subsample_counts(sample, n + 1)
    return mat
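A quick usage sketch. Note that replicatize needs sample.sum() to be at least
sample.max() + 1, otherwise subsample_counts raises ValueError:

import numpy as np

mat = replicatize(np.array([2, 0, 1]), reps=3)
# n = sample.max() = 2, so each replicate draws n + 1 = 3 counts without
# replacement; since 3 == sample.sum() here, every row equals the input.
print(mat.shape)  # (3, 3)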
Example #5
def rarefy_counts(counts, depth=10000):
    """Normalize a count matrix by rarefaction (subsampling).

    Parameters
    ----------
    counts : pandas.DataFrame
        The count matrix to be normalized. Contains variables as columns and
        samples as rows.
    depth : int, optional
        The depth to which each sample is subsampled (default 10000).
        Samples with fewer total counts are removed.

    Returns
    -------
    pandas.DataFrame
        A new data frame with normalized samples such that each sample has
        a depth of `depth` (sum of variables equals depth).

    """
    log.info(
        "Subsampling %dx%d count matrix to a depth of %d."
        % (counts.shape[0], counts.shape[1], depth)
    )
    bad = counts.astype("int").sum(1) < depth
    log.info("Removing %d samples due to low depth." % bad.sum())
    rare = counts[~bad].apply(
        lambda x: pd.Series(
            subsample_counts(x.astype("int"), depth), index=counts.columns
        ),
        axis=1,
    )
    return rare
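A hedged usage sketch (the snippet assumes a module-level `log` logger is
already configured; the toy table below is illustrative):

import numpy as np
import pandas as pd

# 4 samples (rows) x 10 variables (columns); every row sums to >= 1000
table = pd.DataFrame(np.random.randint(100, 500, size=(4, 10)))
rare = rarefy_counts(table, depth=1000)
assert (rare.sum(axis=1) == 1000).all()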
Example #7
def subsample_count(exp: Experiment, total, replace=False, inplace=False, random_seed=None):
    """Randomly subsample each sample to the same number of counts.

    .. warning:: This function will change the :attr:`Experiment.data`
       object from sparse to dense. The input ``Experiment`` object
       should not have been normalized by total sum and its data
       should be discrete counts. Samples with fewer total counts
       than ``total`` will be dropped.

    .. note:: This function may not work on Windows. It relies on
       :func:`skbio.stats.subsample_counts`, which can raise
       ``ValueError: Buffer dtype mismatch, expected 'int64_t' but got
       'long'`` in the ``_subsample_counts_without_replacement`` function
       of ``skbio/stats/__subsample.pyx``.

    Parameters
    ----------
    total : int
        The number of counts to subsample each sample to.
    replace : bool, optional
        If True, subsample with replacement. If False (the default),
        subsample without replacement.
    inplace : bool, optional
        False (default) to create a new experiment, True to do it in place.
    random_seed : int or None, optional, default=None
        Passed to :func:`numpy.random.seed`.

    Returns
    -------
    Experiment
        The subsampled experiment.

    See Also
    --------
    :func:`skbio.stats.subsample_counts`

    """
    # import here to make skbio optional dependency
    from skbio.stats import subsample_counts

    if not inplace:
        exp = deepcopy(exp)
    if exp.sparse:
        exp.sparse = False
    # subsample_counts() require int as input; if not, raise error
    if exp.data.dtype.kind not in {'u', 'i'}:
        raise ValueError('Your `Experiment` object is normalized: subsample operates on integer raw data, not on normalized data.')

    drops = []
    np.random.seed(random_seed)
    for row in range(exp.data.shape[0]):
        counts = exp.data[row, :]
        if total > counts.sum() and not replace:
            drops.append(row)
        else:
            exp.data[row, :] = subsample_counts(counts, n=total, replace=replace)

    exp.reorder([i not in drops for i in range(exp.data.shape[0])], inplace=True)
    exp.normalized = total
    return exp
Example #8
    def test_subsample_counts_nonrandom(self):
        a = np.array([0, 5, 0])

        # Subsample same number of items that are in input (without
        # replacement).
        npt.assert_equal(subsample_counts(a, 5), a)

        # Can only choose from one bin.
        exp = np.array([0, 2, 0])
        npt.assert_equal(subsample_counts(a, 2), exp)
        npt.assert_equal(subsample_counts(a, 2, replace=True), exp)

        # Subsample zero items.
        a = [3, 0, 1]
        exp = np.array([0, 0, 0])
        npt.assert_equal(subsample_counts(a, 0), exp)
        npt.assert_equal(subsample_counts(a, 0, replace=True), exp)
Example #9
def find_subsystems_of_interest(studyName, groupsList, geneCounts, level,
                                percentage):
    """
    Summary: uses Boruta machine learning method to roughly determine potential genes of interest. requires tab-separated  matrix from MG-RAST analysis page

    Args:
        studyName (str): directory (study name)
        groupsList (list): list of group names
        level (str): subsystems level at which to run Boruta
        percentage (int): threshold for Boruta feature selection


    Returns: None, outputs files with tentative genes/gene families of interest

    """

    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()
    os.chdir(studyName)
    for i in range(len(numGeneCounts.columns)):
        subsampleList = []
        if int(numGeneCounts[numGeneCounts.columns[i]].sum()) < samplingDepth:
            meanSubsample = numGeneCounts[numGeneCounts.columns[i]]
        else:
            for j in range(100):
                sample = subsample_counts(
                    numGeneCounts[numGeneCounts.columns[i]].transpose().values,
                    int(samplingDepth))
                subsampleList.append(sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            #recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        meanSubsample = 100 * meanSubsample / meanSubsample.sum()
        numGeneCounts[numGeneCounts.columns[i]] = meanSubsample
    numGeneCounts['level1'] = geneCounts['level1']
    numGeneCounts['level2'] = geneCounts['level2']
    numGeneCounts['level3'] = geneCounts['level3']
    numGeneCounts['function'] = geneCounts['function']
    countsLvl = numGeneCounts.groupby(level).sum()
    groupsDict = dict(enumerate(pd.Series(groupsList).unique()))
    dictGroups = {y: x for x, y in groupsDict.items()}
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=3)
    X = countsLvl.transpose().values
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=2,
                             perc=int(percentage))
    feat_selector.fit(X, Y)
    if len(countsLvl[feat_selector.support_]) > 0:
        countsLvl[feat_selector.support_].to_csv(str(level) + '_tentative.csv')
    countsLvl[feat_selector.support_weak_].to_csv(
        str(level) + '_tentative_weak.csv')
    os.chdir('..')
Example #11
 def test_subsample_counts_with_replacement_equal_n(self):
     # test when n == counts.sum()
     a = np.array([0, 0, 3, 4, 2, 1])
     actual = set()
     for i in range(1000):
         obs = subsample_counts(a, 10, replace=True)
         self.assertEqual(obs.sum(), 10)
         actual.add(tuple(obs))
     self.assertTrue(len(actual) > 1)
Example #13
 def _subsample(self, X):
     X = X.astype(int)
     X_out = list()
     iter_var = X.values if isinstance(X, pd.DataFrame) else X
     for row in iter_var:
         new_X = subsample_counts(row, n=self.depth, replace=self.replace)
         X_out.append(new_X)
     X = np.vstack(X_out)
     return X
Example #14
    def test_subsample_counts_invalid_input(self):
        # Negative n.
        with self.assertRaises(ValueError):
            subsample_counts([1, 2, 3], -1)

        # Floats.
        with self.assertRaises(TypeError):
            subsample_counts([1, 2.3, 3], 2)

        # Wrong number of dimensions.
        with self.assertRaises(ValueError):
            subsample_counts([[1, 2, 3], [4, 5, 6]], 2)

        # Input has too few counts.
        with self.assertRaises(ValueError):
            subsample_counts([0, 5, 0], 6, replace=False)

        # Input has too few counts, but this should work when subsampling
        # with replacement (bootstrap).
        subsample_counts([0, 5, 0], 6, replace=True)
Example #16
def rarify(biom, even_sampling_depth):
    data = []
    sample_ids = []
    for e in biom.columns:
        count_vector = biom[e]
        if count_vector.sum() < even_sampling_depth:
            continue
        else:
            sample_ids.append(e)
            data.append(subsample_counts(count_vector.astype(int), even_sampling_depth))
    return pd.DataFrame(np.asarray(data).T, index=biom.index, columns=sample_ids)
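A usage sketch on a toy table (features as rows, samples as columns, which is
the orientation this function expects; the names are illustrative):

import numpy as np
import pandas as pd

biom = pd.DataFrame(
    {'s1': [120, 30, 50], 's2': [10, 5, 5], 's3': [200, 100, 100]},
    index=['otu1', 'otu2', 'otu3'])
rarefied = rarify(biom, even_sampling_depth=100)
# 's2' (total 20 < 100) is dropped; each remaining column sums to 100
assert list(rarefied.columns) == ['s1', 's3']
assert (rarefied.sum(axis=0) == 100).all()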
Example #17
def subsample_count(exp: Experiment, total, replace=False, inplace=False):
    """Randomly subsample each sample to the same number of counts.

    .. warning:: This function will change the :attr:`Experiment.data`
       object from sparse to dense. The input ``Experiment`` object
       should not have been normalized by total sum and its data
       should be discrete counts. Samples with fewer total counts
       than ``total`` will be dropped.

    Parameters
    ----------
    total : int
        The number of counts to subsample each sample to.
    replace : bool, optional
        If True, subsample with replacement. If False (the default),
        subsample without replacement.
    inplace : bool, optional
        False (default) to create a new experiment, True to do it in place.

    Returns
    -------
    Experiment
        The subsampled experiment.

    See Also
    --------
    :func:`skbio.stats.subsample_counts`

    """
    if inplace:
        newexp = exp
    else:
        newexp = deepcopy(exp)
    if newexp.sparse:
        newexp.sparse = False
    # subsample_counts() require int as input;
    # check if it is normalized: if so, raise error
    if exp.exp_metadata.get('normalized'):
        raise ValueError(
            'Your `Experiment` object is normalized: subsample operates on integer raw data, not on normalized data.'
        )
    newexp.data = newexp.data.astype(int)
    drops = []
    for row in range(newexp.data.shape[0]):
        try:
            newexp.data[row, :] = subsample_counts(newexp.data[row, :],
                                                   n=total,
                                                   replace=replace)
        except ValueError:
            # subsample_counts raises ValueError when the row sum is smaller
            # than total and replace is False; drop such rows
            drops.append(row)
    newexp.reorder([i not in drops for i in range(newexp.data.shape[0])],
                   inplace=True)
    return newexp
Example #18
    def test_subsample_counts_with_replacement(self):
        # Can choose from all in first bin, all in last bin (since we're
        # sampling with replacement), or split across bins.
        a = np.array([2, 0, 1])
        actual = set()
        for i in range(1000):
            obs = subsample_counts(a, 2, replace=True)
            actual.add(tuple(obs))
        self.assertEqual(actual, {(1, 0, 1), (2, 0, 0), (0, 0, 2)})

        # Test that selecting 35 counts from a 36-count vector 1000 times
        # yields more than 10 different subsamples. If we were subsampling
        # *without* replacement, there would be only 10 possible subsamples
        # because there are 10 nonzero bins in array a. However, there are more
        # than 10 possibilities when sampling *with* replacement.
        a = np.array([2, 0, 1, 2, 1, 8, 6, 0, 3, 3, 5, 0, 0, 0, 5])
        actual = set()
        for i in range(1000):
            obs = subsample_counts(a, 35, replace=True)
            self.assertEqual(obs.sum(), 35)
            actual.add(tuple(obs))
        self.assertTrue(len(actual) > 10)
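The distinction the comments describe, as a self-contained sketch: with
replacement, a draw can take more from a bin than the bin holds.

import numpy as np
from skbio.stats import subsample_counts

a = np.array([2, 0, 1])
obs = subsample_counts(a, 2, replace=True)
# (0, 0, 2) is now a possible outcome even though a[2] == 1;
# without replacement it is impossible.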
Example #20
    def subsample(self, level):
        dropped = []

        for (i, row) in enumerate(self.data.to_numpy()):
            try:
                row_subsampled = subsample_counts(row, level)
            except ValueError:
                dropped.append(i)
                continue

            self.data.iloc[i] = row_subsampled

        self.data.drop(self.data.index[dropped], inplace=True)
Example #21
def create_fake_observation():
    """Create a subsample with defined property"""

    # Create a subsample of a larger sample such that we can compute
    # the expected probability of the unseen portion.
    # This is used in the tests of lladser_pe and lladser_ci
    counts = np.ones(1001, dtype='int64')
    counts[0] = 9000
    total = counts.sum()

    fake_obs = subsample_counts(counts, 1000)
    exp_p = 1 - sum([x / total for (x, y) in zip(counts, fake_obs) if y > 0])

    return fake_obs, exp_p
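Why exp_p is the expected unseen probability: the seen fractions x / total
sum to 1 over all bins, so subtracting the seen mass from 1 leaves exactly
the probability mass of the bins the subsample missed. Inside the function
one could add this illustrative check (using the same counts, fake_obs,
total, and exp_p as above):

unseen_mass = sum(x for x, y in zip(counts, fake_obs) if y == 0) / total
assert abs(exp_p - unseen_mass) < 1e-12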
Example #23
    def test_subsample_counts_invalid_input(self):
        # Negative n.
        with self.assertRaises(ValueError):
            subsample_counts([1, 2, 3], -1)

        # Floats.
        with self.assertRaises(TypeError):
            subsample_counts([1, 2.3, 3], 2)

        # Wrong number of dimensions.
        with self.assertRaises(ValueError):
            subsample_counts([[1, 2, 3], [4, 5, 6]], 2)

        # Input has too few counts.
        with self.assertRaises(ValueError):
            subsample_counts([0, 5, 0], 6)
Example #26
def get_rarefied(otu_table, seqs_per_sample):
    """
    Args:
        otu_table:(dataframe) load biom file and change to dataframe
        seqs_per_sample:...
    Rerutn:
        a rarefied OTU table
    """
    new_counts = []
    for sample in otu_table.columns:
        arr = []
        seqs = sum(otu_table[sample])
        if seqs <= seqs_per_sample:
            arr = np.array(otu_table[sample].values).astype(int)
        else:
            values = np.array(otu_table[sample].values).astype(int)
            arr = subsample_counts(values, seqs_per_sample)
        new_counts.append(arr)
    rarefied = pd.DataFrame(new_counts,
                            columns=otu_table.index,
                            index=otu_table.columns)
    return rarefied.T
Example #27
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None):
    r"""Calculate Michaelis-Menten fit to rarefaction curve of observed OTUs.

    The Michaelis-Menten equation is defined as:

    .. math::

       S=\frac{nS_{max}}{n+B}

    where :math:`n` is the number of individuals and :math:`S` is the number of
    OTUs. This function estimates the :math:`S_{max}` parameter.

    The fit is made to datapoints for :math:`n=1,2,...,N`, where :math:`N` is
    the total number of individuals (sum of abundances for all OTUs).
    :math:`S` is the number of OTUs represented in a random sample of :math:`n`
    individuals.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    num_repeats : int, optional
        The number of times to perform rarefaction (subsampling without
        replacement) at each value of :math:`n`.
    params_guess : tuple, optional
        Initial guess of :math:`S_{max}` and :math:`B`. If ``None``, default
        guess for :math:`S_{max}` is :math:`S` (as :math:`S_{max}` should
        be >= :math:`S`) and default guess for :math:`B` is ``round(N / 2)``.

    Returns
    -------
    S_max : double
        Estimate of the :math:`S_{max}` parameter in the Michaelis-Menten
        equation.

    See Also
    --------
    skbio.stats.subsample_counts

    Notes
    -----
    There is some controversy about how to do the fitting. The ML model given
    in [1]_ is based on the assumption that error is roughly proportional to
    magnitude of observation, reasonable for enzyme kinetics but not reasonable
    for rarefaction data. Here we just do a nonlinear curve fit for the
    parameters using least-squares.

    References
    ----------
    .. [1] Raaijmakers, J. G. W. 1987 Statistical analysis of the
       Michaelis-Menten equation. Biometrics 43, 793-803.

    """
    counts = _validate_counts_vector(counts)

    n_indiv = counts.sum()
    if params_guess is None:
        S_max_guess = observed_otus(counts)
        B_guess = int(round(n_indiv / 2))
        params_guess = (S_max_guess, B_guess)

    # observed # of OTUs vs # of individuals sampled, S vs n
    xvals = np.arange(1, n_indiv + 1)
    ymtx = np.empty((num_repeats, len(xvals)), dtype=int)
    for i in range(num_repeats):
        ymtx[i] = np.asarray([observed_otus(subsample_counts(counts, n))
                              for n in xvals], dtype=int)
    yvals = ymtx.mean(0)

    # Vectors of actual vals y and number of individuals n.
    def errfn(p, n, y):
        return (((p[0] * n / (p[1] + n)) - y) ** 2).sum()

    # Return S_max.
    return fmin_powell(errfn, params_guess, ftol=1e-5, args=(xvals, yvals),
                       disp=False)[0]
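A usage sketch, assuming skbio's packaged version of this function is used
(skbio.diversity.alpha.michaelis_menten_fit has the same signature):

from skbio.diversity.alpha import michaelis_menten_fit

counts = [10, 6, 4, 2, 1, 1]
s_max = michaelis_menten_fit(counts, num_repeats=3)
# s_max estimates the asymptotic OTU richness; expect it to be at least
# roughly the 6 OTUs observed in this toy vector.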
Example #28
def subsample(si, i):
    ssi = skstats.subsample_counts(si, i)
    return np.count_nonzero(ssi)
Example #29
 def subsample(x):
     return pd.Series(subsample_counts(x.values, n=depth, replace=replace),
                      index=x.index)
Example #30
def subsample_sources_sinks(sources_data, sinks, feature_table, sources_depth,
                            sinks_depth):
    '''Rarify data for sources and sinks.

    Notes
    -----
    This function rarifies `sources_data` to `sources_depth`, and `sinks` in
    `feature_table` to `sinks_depth`. This function is necessary because of
    ipyparallel and the partial functions.

    Parameters
    ----------
    sources_data : np.array
        Two dimensional array with collapsed source data.
    sinks : np.array
        One dimensional array of strings, with each string being the sample ID
        of a sink in `feature_table`.
    feature_table : biom.table.Table
        Biom table containing data for `sinks` to be rarified.
    sources_depth : int
        Depth at which to subsample each source. If 0, no rarefaction will be
        performed.
    sinks_depth : int
        Depth at which to subsample each sink. If 0, no rarefaction will be
        performed.

    Returns
    -------
    rsd : np.array
        Rarified `sources_data`.
    rft : biom.table.Table
        `feature_table` with samples identified in `sinks` rarified.
    '''
    # Check that supplied depths do not exceed available sequences. Cryptic
    # errors will be raised otherwise.
    if sources_depth > 0 and (sources_data.sum(1) < sources_depth).any():
        raise ValueError('Invalid rarefaction depth for source data. There '
                         'are not enough sequences in at least one collapsed '
                         'source.')
    if sinks_depth > 0:
        for sample in sinks:
            if feature_table.data(sample, axis='sample').sum() < sinks_depth:
                raise ValueError('Invalid rarefaction depth for sink data. '
                                 'There are not enough sequences in at least '
                                 'one sink.')

    # Rarify source data.
    if sources_depth == 0:
        rsd = sources_data
    else:
        rsd = np.empty(sources_data.shape, dtype=np.float64)
        for row in range(sources_data.shape[0]):
            rsd[row] = subsample_counts(sources_data[row], sources_depth,
                                        replace=False)
    # Rarify sinks data in the biom table.
    if sinks_depth == 0:
        rft = feature_table
    else:
        # We'd like to use Table.subsample, but it removes features that have
        # 0 count across every sample, which changes the size of the matrix.
        # rft = feature_table.filter(sinks, axis='sample', inplace=False)
        # rft = rft.subsample(sinks_depth)
        def _rfx(data, sid, md):
            if sid in sinks:
                return subsample_counts(data.astype(np.int64), sinks_depth,
                                        replace=False)
            else:
                return data
        rft = feature_table.transform(_rfx, axis='sample', inplace=False)
    return rsd, rft
Example #32
def find_genes_of_interest(studyName,
                           groupsList,
                           geneCounts,
                           lvl1pct=70,
                           lvl2pct=70,
                           lvl3pct=60,
                           fxnpct=40):
    """
    Summary: uses Boruta machine learning method to roughly determine potential genes of interest. requires tab-separated  matrix from MG-RAST analysis page

    Args:
        studyName (str): directory (study name)
        geneCountsName (str): filename for tab separated matrix
        lvl1pct (int): threshold for Boruta on level 1
        lvl2pct (int): threshold for Boruta on level 2
        lvl3pct (int): threshold for Boruta on level 3
        fxnpct (int): threshold for Boruta on gene name


    Returns: None, outputs files with tentative genes/gene families of interest

    """

    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()
    os.chdir(studyName)
    for i in range(len(numGeneCounts.columns)):
        subsampleList = []
        if int(numGeneCounts[numGeneCounts.columns[i]].sum()) < samplingDepth:
            meanSubsample = numGeneCounts[numGeneCounts.columns[i]]
        else:
            for j in range(100):
                sample = subsample_counts(
                    numGeneCounts[numGeneCounts.columns[i]].transpose().values,
                    int(samplingDepth))
                subsampleList.append(sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            #recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        meanSubsample = 100 * meanSubsample / meanSubsample.sum()
        numGeneCounts[numGeneCounts.columns[i]] = meanSubsample
    numGeneCounts['level1'] = geneCounts['level1']
    numGeneCounts['level2'] = geneCounts['level2']
    numGeneCounts['level3'] = geneCounts['level3']
    numGeneCounts['function'] = geneCounts['function']
    countsLvl1 = numGeneCounts.groupby('level1').sum()
    countsLvl2 = numGeneCounts.groupby('level2').sum()
    countsLvl3 = numGeneCounts.groupby('level3').sum()
    countsLvl4 = numGeneCounts.groupby('function').sum()
    levelList = [countsLvl1, countsLvl2, countsLvl3, countsLvl4]
    countsLvl1.to_csv(studyName + 'genes_lvl1.csv')
    countsLvl2.to_csv(studyName + 'genes_lvl2.csv')
    countsLvl3.to_csv(studyName + 'genes_lvl3.csv')
    countsLvl4.to_csv(studyName + 'genes_function.csv')
    groupsDict = dict(enumerate(pd.Series(groupsList).unique()))
    dictGroups = {y: x for x, y in groupsDict.items()}
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=3)

    X = countsLvl1.transpose().values
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=2,
                             perc=int(lvl1pct))
    feat_selector.fit(X, Y)
    if len(countsLvl1[feat_selector.support_]) > 0:
        countsLvl1[feat_selector.support_].to_csv('level1_tentative.csv')
    countsLvl1[feat_selector.support_weak_].to_csv('level1_tentative_weak.csv')

    X = countsLvl2.transpose().values
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=2,
                             perc=int(lvl2pct),
                             max_iter=300)
    feat_selector.fit(X, Y)
    if len(countsLvl2[feat_selector.support_]) > 0:
        countsLvl2[feat_selector.support_].to_csv('level2_tentative.csv')
    countsLvl2[feat_selector.support_weak_].to_csv('level2_tentative_weak.csv')

    X = countsLvl3.transpose().values
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=2,
                             perc=int(lvl3pct),
                             max_iter=500)
    feat_selector.fit(X, Y)
    if len(countsLvl3[feat_selector.support_]) > 0:
        countsLvl3[feat_selector.support_].to_csv('level3_tentative.csv')
    countsLvl3[feat_selector.support_weak_].to_csv('level3_tentative_weak.csv')

    X = countsLvl4.transpose().values
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             verbose=2,
                             perc=int(fxnpct),
                             max_iter=700)
    feat_selector.fit(X, Y)
    if len(countsLvl4[feat_selector.support_]) > 0:
        countsLvl4[feat_selector.support_].to_csv('level4_tentative_.csv')
    countsLvl4[feat_selector.support_weak_].to_csv('level4_tentative_weak.csv')
    os.chdir('..')