def test_subsample_counts_without_replacement(self):
    # Selecting 2 counts from the vector 1000 times yields each of the two
    # possible results at least once each.
    a = np.array([2, 0, 1])
    actual = set()
    for i in range(1000):
        obs = subsample_counts(a, 2)
        actual.add(tuple(obs))
    self.assertEqual(actual, {(1, 0, 1), (2, 0, 0)})

    obs = subsample_counts(a, 2)
    self.assertTrue(np.array_equal(obs, np.array([1, 0, 1])) or
                    np.array_equal(obs, np.array([2, 0, 0])))
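# A minimal standalone sketch (not part of the test suite above) of the
# behavior the test asserts: drawing 2 of the 3 total counts in [2, 0, 1]
# without replacement can only yield (1, 0, 1) or (2, 0, 0).
import numpy as np
from skbio.stats import subsample_counts

outcomes = {tuple(subsample_counts(np.array([2, 0, 1]), 2))
            for _ in range(1000)}
print(outcomes)  # expected over many draws: {(1, 0, 1), (2, 0, 0)}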
def rarefy_and_recode(filenames, rawCounts, samplingDepth):
    """
    Summary: subsamples each sample to samplingDepth (typically the median
    library size), taking the mean of 100 subsamples; samples with fewer
    total counts than samplingDepth are kept as-is. Mean values below 1.01
    are recoded to zero.

    Args:
        filenames (list of str): input file names, one per sample
        rawCounts (list of pandas.DataFrame): single-column count tables
        samplingDepth (int): depth to subsample each sample to

    Returns:
        None, writes one "<sample>_norm.csv" per sample to the
        'normalised_counts' directory
    """
    for i in range(len(rawCounts)):
        subsampleList = []
        if int(rawCounts[i].sum()) < samplingDepth:
            meanSubsample = rawCounts[i]
        else:
            for j in range(100):
                sample = subsample_counts(rawCounts[i].transpose().values[0],
                                          samplingDepth)
                subsampleList.insert(j, sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            # recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        sampleName = filenames[i].split('.')[0]
        rawCounts[i][sampleName] = meanSubsample
        newFileName = sampleName + "_norm.csv"
        create_path('normalised_counts')
        rawCounts[i].to_csv(os.path.join('normalised_counts', newFileName))
        print("written " + newFileName + " to file.")
    return
def replicatize(sample, reps=10):
    """
    Basically does subsampling without replacement.

    Finds the highest single-feature abundance :math:`n` in the sample and
    obtains multiple subsamples of size :math:`n+1`.

    Parameters
    ----------
    sample : np.array, int
        A count vector of abundances
    reps : int, optional
        Number of replicate subsamples to draw (default 10)

    Returns
    -------
    mat : np.array, int
        A count matrix where rows = replicate samples and
        columns = features
    """
    sample = np.array(sample)
    n = sample.max()
    mat = np.zeros((reps, len(sample)))
    for rep in range(reps):
        mat[rep, :] = subsample_counts(sample, n + 1)
    return mat
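# A minimal usage sketch for replicatize, assuming the numpy import and
# skbio's subsample_counts are in scope as in the snippets above.
sample = [5, 2, 0, 1]    # max abundance n = 5, so each replicate draws 6
mat = replicatize(sample, reps=3)
print(mat.shape)         # (3, 4): 3 replicates x 4 features
print(mat.sum(axis=1))   # each row sums to n + 1 = 6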
def rarefy_counts(counts, depth=10000):
    """Normalize a count matrix by rarefaction (subsampling).

    Parameters
    ----------
    counts : pandas.DataFrame
        The count matrix to be normalized. Contains variables as columns
        and samples as rows.
    depth : int, optional
        The depth to subsample each sample to. Samples with fewer total
        counts than `depth` are removed.

    Returns
    -------
    pandas.DataFrame
        A new data frame with normalized samples such that each sample has
        a depth of `depth` (sum of variables equals depth).
    """
    log.info(
        "Subsampling %dx%d count matrix to a depth of %d."
        % (counts.shape[0], counts.shape[1], depth)
    )
    bad = counts.astype("int").sum(1) < depth
    log.info("Removing %d samples due to low depth." % bad.sum())
    rare = counts[~bad].apply(
        lambda x: pd.Series(
            subsample_counts(x.astype("int"), depth), index=counts.columns
        ),
        axis=1,
    )
    return rare
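# A minimal usage sketch for rarefy_counts on a toy table; the `log`
# object the function relies on is assumed to be a configured stdlib
# logger.
import logging
import pandas as pd
from skbio.stats import subsample_counts

log = logging.getLogger(__name__)
counts = pd.DataFrame([[8, 4, 3], [1, 1, 0], [10, 5, 5]],
                      index=["s1", "s2", "s3"],
                      columns=["otu1", "otu2", "otu3"])
rare = rarefy_counts(counts, depth=10)
print(rare.sum(1))  # s1 and s3 kept, each now sums to 10; s2 dropped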
def subsample_count(exp: Experiment, total, replace=False, inplace=False, random_seed=None):
    """Randomly subsample each sample to the same number of counts.

    .. warning::
       This function will change the :attr:`Experiment.data` object from
       sparse to dense. The input ``Experiment`` object should not have
       been normalized by total sum and its data should be discrete
       counts. Samples with fewer total counts than ``total`` will be
       dropped.

    .. note::
       This function may not work on Windows OS. It relies on
       :func:`skbio.stats.subsample_counts`, which can raise
       `ValueError: Buffer dtype mismatch, expected 'int64_t' but got 'long'`
       in the `_subsample_counts_without_replacement` function of
       `skbio/stats/__subsample.pyx`.

    Parameters
    ----------
    total : int
        Number of counts to subsample each sample to.
    replace : bool, optional
        If True, subsample with replacement. If False (the default),
        subsample without replacement.
    inplace : bool, optional
        False (default) to create a new experiment, True to do it in place.
    random_seed : int or None, optional, default=None
        Passed to :func:`numpy.random.seed`.

    Returns
    -------
    Experiment
        The subsampled experiment.

    See Also
    --------
    :func:`skbio.stats.subsample_counts`
    """
    # import here to make skbio an optional dependency
    from skbio.stats import subsample_counts

    if not inplace:
        exp = deepcopy(exp)
    if exp.sparse:
        exp.sparse = False
    # subsample_counts() requires int input; if not, raise an error
    if exp.data.dtype.kind not in {'u', 'i'}:
        raise ValueError('Your `Experiment` object is normalized: subsample '
                         'operates on integer raw data, not on normalized data.')
    drops = []
    np.random.seed(random_seed)
    for row in range(exp.data.shape[0]):
        counts = exp.data[row, :]
        if total > counts.sum() and not replace:
            drops.append(row)
        else:
            exp.data[row, :] = subsample_counts(counts, n=total, replace=replace)
    exp.reorder([i not in drops for i in range(exp.data.shape[0])], inplace=True)
    exp.normalized = total
    return exp
def test_subsample_counts_nonrandom(self):
    a = np.array([0, 5, 0])

    # Subsample same number of items that are in input (without
    # replacement).
    npt.assert_equal(subsample_counts(a, 5), a)

    # Can only choose from one bin.
    exp = np.array([0, 2, 0])
    npt.assert_equal(subsample_counts(a, 2), exp)
    npt.assert_equal(subsample_counts(a, 2, replace=True), exp)

    # Subsample zero items.
    a = [3, 0, 1]
    exp = np.array([0, 0, 0])
    npt.assert_equal(subsample_counts(a, 0), exp)
    npt.assert_equal(subsample_counts(a, 0, replace=True), exp)
def find_subsystems_of_interest(studyName, groupsList, geneCounts, level, percentage):
    """
    Summary: uses the Boruta machine learning method to roughly determine
    potential genes of interest. Requires a tab-separated matrix from the
    MG-RAST analysis page.

    Args:
        studyName (str): directory (study name)
        groupsList (list): list of group names
        geneCounts (pandas.DataFrame): gene count matrix with level annotations
        level (str): subsystems level at which to run Boruta
        percentage (int): threshold for Boruta feature selection

    Returns:
        None, outputs files with tentative genes/gene families of interest
    """
    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()
    os.chdir(studyName)
    for i in range(len(numGeneCounts.columns)):
        subsampleList = []
        if int(numGeneCounts[numGeneCounts.columns[i]].sum()) < samplingDepth:
            meanSubsample = numGeneCounts[numGeneCounts.columns[i]]
        else:
            for j in range(100):
                sample = subsample_counts(
                    numGeneCounts[numGeneCounts.columns[i]].transpose().values,
                    int(samplingDepth))
                subsampleList.insert(j, sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            # recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        meanSubsample = 100 * meanSubsample / meanSubsample.sum()
        numGeneCounts[numGeneCounts.columns[i]] = meanSubsample
    numGeneCounts['level1'] = geneCounts['level1']
    numGeneCounts['level2'] = geneCounts['level2']
    numGeneCounts['level3'] = geneCounts['level3']
    numGeneCounts['function'] = geneCounts['function']
    countsLvl = numGeneCounts.groupby(level).sum()
    groupsDict = dict(enumerate(pd.Series(groupsList).unique()))
    dictGroups = {y: x for x, y in groupsDict.items()}
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=3)
    X = countsLvl.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(percentage))
    feat_selector.fit(X, Y)
    if len(countsLvl[feat_selector.support_]) > 0:
        countsLvl[feat_selector.support_].to_csv(str(level) + '_tentative.csv')
        countsLvl[feat_selector.support_weak_].to_csv(
            str(level) + '_tentative_weak.csv')
    os.chdir('..')
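# A toy sketch of the Boruta selection step used above, on fabricated
# data; the array shapes and class labels here are illustrative
# assumptions, not part of the original pipeline.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

X = np.random.RandomState(0).poisson(5, size=(20, 8))  # 20 samples x 8 features
Y = np.array(['case'] * 10 + ['control'] * 10)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=3)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, perc=70)
feat_selector.fit(X, Y)
print(feat_selector.support_)       # confirmed features (likely none on random data)
print(feat_selector.support_weak_)  # tentative features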
def test_subsample_counts_with_replacement_equal_n(self):
    # test when n == counts.sum()
    a = np.array([0, 0, 3, 4, 2, 1])
    actual = set()
    for i in range(1000):
        obs = subsample_counts(a, 10, replace=True)
        self.assertEqual(obs.sum(), 10)
        actual.add(tuple(obs))
    self.assertTrue(len(actual) > 1)
def _subsample(self, X):
    X = X.astype(int)
    X_out = list()
    iter_var = X.values if isinstance(X, pd.DataFrame) else X
    for row in iter_var:
        new_X = subsample_counts(row, n=self.depth, replace=self.replace)
        X_out.append(new_X)
    X = np.vstack(X_out)
    return X
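# A standalone sketch of the row-wise pattern _subsample implements,
# without the surrounding class; `depth` and `replace` stand in for the
# instance attributes.
import numpy as np
from skbio.stats import subsample_counts

depth, replace = 5, False
X = np.array([[6, 3, 1], [2, 5, 4]])
X_sub = np.vstack([subsample_counts(row, n=depth, replace=replace)
                   for row in X])
print(X_sub.sum(axis=1))  # every row now sums to depth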
def test_subsample_counts_invalid_input(self):
    # Negative n.
    with self.assertRaises(ValueError):
        subsample_counts([1, 2, 3], -1)

    # Floats.
    with self.assertRaises(TypeError):
        subsample_counts([1, 2.3, 3], 2)

    # Wrong number of dimensions.
    with self.assertRaises(ValueError):
        subsample_counts([[1, 2, 3], [4, 5, 6]], 2)

    # Input has too few counts.
    with self.assertRaises(ValueError):
        subsample_counts([0, 5, 0], 6, replace=False)

    # Input has too few counts, but subsampling with replacement
    # (bootstrap) should work.
    subsample_counts([0, 5, 0], 6, replace=True)
def rarify(biom, even_sampling_depth):
    data = []
    sample_ids = []
    for e in biom.columns:
        count_vector = biom[e]
        if count_vector.sum() < even_sampling_depth:
            continue
        else:
            sample_ids.append(e)
            data.append(subsample_counts(count_vector.astype(int),
                                         even_sampling_depth))
    return pd.DataFrame(np.asarray(data).T,
                        index=biom.index,
                        columns=sample_ids)
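# A minimal usage sketch for rarify on a toy feature-by-sample table
# (features as rows, samples as columns, mirroring a BIOM layout).
import numpy as np
import pandas as pd
from skbio.stats import subsample_counts

biom = pd.DataFrame({"sampleA": [7, 2, 6], "sampleB": [1, 0, 1]},
                    index=["otu1", "otu2", "otu3"])
rare = rarify(biom, even_sampling_depth=10)
print(rare.columns.tolist())  # ['sampleA']; sampleB is too shallow and dropped
print(rare["sampleA"].sum())  # 10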
def subsample_count(exp: Experiment, total, replace=False, inplace=False):
    """Randomly subsample each sample to the same number of counts.

    .. warning::
       This function will change the :attr:`Experiment.data` object from
       sparse to dense. The input ``Experiment`` object should not have
       been normalized by total sum and its data should be discrete
       counts. Samples with fewer total counts than ``total`` will be
       dropped.

    Parameters
    ----------
    total : int
        Number of counts to subsample each sample to.
    replace : bool, optional
        If True, subsample with replacement. If False (the default),
        subsample without replacement.
    inplace : bool, optional
        False (default) to create a new experiment, True to do it in place.

    Returns
    -------
    Experiment
        The subsampled experiment.

    See Also
    --------
    :func:`skbio.stats.subsample_counts`
    """
    if inplace:
        newexp = exp
    else:
        newexp = deepcopy(exp)
    if newexp.sparse:
        newexp.sparse = False
    # subsample_counts() requires int input;
    # check if the experiment is normalized: if so, raise an error
    if exp.exp_metadata.get('normalized'):
        raise ValueError('Your `Experiment` object is normalized: subsample '
                         'operates on integer raw data, not on normalized data.')
    newexp.data = newexp.data.astype(int)
    drops = []
    for row in range(newexp.data.shape[0]):
        try:
            newexp.data[row, :] = subsample_counts(newexp.data[row, :],
                                                   n=total, replace=replace)
        except ValueError:
            # if the row sum is smaller than total (when replace is False),
            # this row should be dropped
            drops.append(row)
    newexp.reorder([i not in drops for i in range(newexp.data.shape[0])],
                   inplace=True)
    return newexp
def test_subsample_counts_with_replacement(self):
    # Can choose from all in first bin, all in last bin (since we're
    # sampling with replacement), or split across bins.
    a = np.array([2, 0, 1])
    actual = set()
    for i in range(1000):
        obs = subsample_counts(a, 2, replace=True)
        actual.add(tuple(obs))
    self.assertEqual(actual, {(1, 0, 1), (2, 0, 0), (0, 0, 2)})

    # Test that selecting 35 counts from a 36-count vector 1000 times
    # yields more than 10 different subsamples. If we were subsampling
    # *without* replacement, there would be only 10 possible subsamples
    # because there are 10 nonzero bins in array a. However, there are more
    # than 10 possibilities when sampling *with* replacement.
    a = np.array([2, 0, 1, 2, 1, 8, 6, 0, 3, 3, 5, 0, 0, 0, 5])
    actual = set()
    for i in range(1000):
        obs = subsample_counts(a, 35, replace=True)
        self.assertEqual(obs.sum(), 35)
        actual.add(tuple(obs))
    self.assertTrue(len(actual) > 10)
def subsample(self, level):
    dropped = []
    for (i, row) in enumerate(self.data.to_numpy()):
        try:
            row_subsampled = subsample_counts(row, level)
        except ValueError:
            dropped.append(i)
            continue
        self.data.iloc[i] = row_subsampled
    self.data.drop(self.data.index[dropped], inplace=True)
def create_fake_observation():
    """Create a subsample with a defined property."""
    # Create a subsample of a larger sample such that we can compute
    # the expected probability of the unseen portion.
    # This is used in the tests of lladser_pe and lladser_ci.
    counts = np.ones(1001, dtype='int64')
    counts[0] = 9000
    total = counts.sum()
    fake_obs = subsample_counts(counts, 1000)
    exp_p = 1 - sum([x / total for (x, y) in zip(counts, fake_obs) if y > 0])
    return fake_obs, exp_p
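# A quick usage sketch: exp_p is the probability mass of features present
# in `counts` but unseen in the 1000-count subsample (assumes the numpy
# and subsample_counts imports used throughout these snippets).
fake_obs, exp_p = create_fake_observation()
print(fake_obs.sum())  # 1000
print(0 <= exp_p < 1)  # True: the unseen mass is a proper probability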
def test_subsample_counts_invalid_input(self):
    # Negative n.
    with self.assertRaises(ValueError):
        subsample_counts([1, 2, 3], -1)

    # Floats.
    with self.assertRaises(TypeError):
        subsample_counts([1, 2.3, 3], 2)

    # Wrong number of dimensions.
    with self.assertRaises(ValueError):
        subsample_counts([[1, 2, 3], [4, 5, 6]], 2)

    # Input has too few counts.
    with self.assertRaises(ValueError):
        subsample_counts([0, 5, 0], 6)
def get_rarefied(otu_table, seqs_per_sample):
    """
    Args:
        otu_table (pandas.DataFrame): a biom file loaded and converted to
            a dataframe, with samples as columns
        seqs_per_sample (int): the depth to rarefy each sample to

    Returns:
        a rarefied OTU table
    """
    new_counts = []
    for sample in otu_table.columns:
        arr = []
        seqs = sum(otu_table[sample])
        if seqs <= seqs_per_sample:
            arr = np.array(otu_table[sample].values).astype(int)
        else:
            values = np.array(otu_table[sample].values).astype(int)
            arr = subsample_counts(values, seqs_per_sample)
        new_counts.append(arr)
    rarefied = pd.DataFrame(new_counts,
                            columns=otu_table.index,
                            index=otu_table.columns)
    return rarefied.T
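# A minimal usage sketch for get_rarefied; unlike rarify above, shallow
# samples are kept as-is rather than dropped.
import numpy as np
import pandas as pd
from skbio.stats import subsample_counts

otu_table = pd.DataFrame({"sampleA": [7, 2, 6], "sampleB": [1, 0, 1]},
                         index=["otu1", "otu2", "otu3"])
rarefied = get_rarefied(otu_table, seqs_per_sample=10)
print(rarefied.sum())  # sampleA rarefied to 10; sampleB kept at 2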
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None):
    r"""Calculate Michaelis-Menten fit to rarefaction curve of observed OTUs.

    The Michaelis-Menten equation is defined as:

    .. math::

       S=\frac{nS_{max}}{n+B}

    where :math:`n` is the number of individuals and :math:`S` is the
    number of OTUs. This function estimates the :math:`S_{max}` parameter.

    The fit is made to datapoints for :math:`n=1,2,...,N`, where :math:`N`
    is the total number of individuals (sum of abundances for all OTUs).
    :math:`S` is the number of OTUs represented in a random sample of
    :math:`n` individuals.

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    num_repeats : int, optional
        The number of times to perform rarefaction (subsampling without
        replacement) at each value of :math:`n`.
    params_guess : tuple, optional
        Initial guess of :math:`S_{max}` and :math:`B`. If ``None``,
        default guess for :math:`S_{max}` is :math:`S` (as :math:`S_{max}`
        should be >= :math:`S`) and default guess for :math:`B` is
        ``round(N / 2)``.

    Returns
    -------
    S_max : double
        Estimate of the :math:`S_{max}` parameter in the Michaelis-Menten
        equation.

    See Also
    --------
    skbio.stats.subsample_counts

    Notes
    -----
    There is some controversy about how to do the fitting. The ML model
    given in [1]_ is based on the assumption that error is roughly
    proportional to magnitude of observation, reasonable for enzyme
    kinetics but not reasonable for rarefaction data. Here we just do a
    nonlinear curve fit for the parameters using least-squares.

    References
    ----------
    .. [1] Raaijmakers, J. G. W. 1987 Statistical analysis of the
       Michaelis-Menten equation. Biometrics 43, 793-803.

    """
    counts = _validate_counts_vector(counts)

    n_indiv = counts.sum()
    if params_guess is None:
        S_max_guess = observed_otus(counts)
        B_guess = int(round(n_indiv / 2))
        params_guess = (S_max_guess, B_guess)

    # observed # of OTUs vs # of individuals sampled, S vs n
    xvals = np.arange(1, n_indiv + 1)
    ymtx = np.empty((num_repeats, len(xvals)), dtype=int)
    for i in range(num_repeats):
        ymtx[i] = np.asarray([observed_otus(subsample_counts(counts, n))
                              for n in xvals], dtype=int)
    yvals = ymtx.mean(0)

    # Vectors of actual vals y and number of individuals n.
    def errfn(p, n, y):
        return (((p[0] * n / (p[1] + n)) - y) ** 2).sum()

    # Return S_max.
    return fmin_powell(errfn, params_guess, ftol=1e-5, args=(xvals, yvals),
                       disp=False)[0]
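# A minimal usage sketch for michaelis_menten_fit; it assumes its helpers
# (_validate_counts_vector, observed_otus, fmin_powell) are importable,
# e.g. from the skbio internals and scipy.optimize of the version this
# was written against.
counts = [4, 3, 4, 0, 1, 0, 2]
S_max = michaelis_menten_fit(counts, num_repeats=3)
print(S_max)  # estimated asymptotic OTU richness for the rarefaction curve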
def subsample(si, i):
    ssi = skstats.subsample_counts(si, i)
    return np.count_nonzero(ssi)
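# A sketch of how a helper like this is typically used: computing observed
# richness at increasing depths for a rarefaction curve (assumes
# `import skbio.stats as skstats` and numpy, matching the alias above).
import numpy as np
import skbio.stats as skstats

si = np.array([9, 4, 2, 1, 0, 5])
curve = [subsample(si, depth) for depth in range(1, int(si.sum()) + 1)]
print(curve[:5], curve[-1])  # richness rises with depth, ending at 5 OTUs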
def subsample(x):
    return pd.Series(subsample_counts(x.values, n=depth, replace=replace),
                     index=x.index)
def subsample_sources_sinks(sources_data, sinks, feature_table, sources_depth,
                            sinks_depth):
    '''Rarify data for sources and sinks.

    Notes
    -----
    This function rarifies `sources_data` to `sources_depth`, and `sinks`
    in `feature_table` to `sinks_depth`. This function is necessary
    because of ipyparallel and the partial functions.

    Parameters
    ----------
    sources_data : np.array
        Two dimensional array with collapsed source data.
    sinks : np.array
        One dimensional array of strings, with each string being the
        sample ID of a sink in `feature_table`.
    feature_table : biom.table.Table
        Biom table containing data for `sinks` to be rarified.
    sources_depth : int
        Depth at which to subsample each source. If 0, no rarefaction will
        be performed.
    sinks_depth : int
        Depth at which to subsample each sink. If 0, no rarefaction will
        be performed.

    Returns
    -------
    rsd : np.array
        Rarified `sources_data`.
    rft : biom.table.Table
        `feature_table` with samples identified in `sinks` rarified.
    '''
    # Check that supplied depths do not exceed available sequences. Cryptic
    # errors will be raised otherwise.
    if sources_depth > 0 and (sources_data.sum(1) < sources_depth).any():
        raise ValueError('Invalid rarefaction depth for source data. There '
                         'are not enough sequences in at least one collapsed '
                         'source.')
    if sinks_depth > 0:
        for sample in sinks:
            if feature_table.data(sample, axis='sample').sum() < sinks_depth:
                raise ValueError('Invalid rarefaction depth for sink data. '
                                 'There are not enough sequences in at least '
                                 'one sink.')
    # Rarify source data.
    if sources_depth == 0:
        rsd = sources_data
    else:
        rsd = np.empty(sources_data.shape, dtype=np.float64)
        for row in range(sources_data.shape[0]):
            rsd[row] = subsample_counts(sources_data[row], sources_depth,
                                        replace=False)
    # Rarify sink data in the biom table.
    if sinks_depth == 0:
        rft = feature_table
    else:
        # We'd like to use Table.subsample, but it removes features that have
        # 0 count across every sample, which changes the size of the matrix.
        # rft = feature_table.filter(sinks, axis='sample', inplace=False)
        # rft = rft.subsample(sinks_depth)
        def _rfx(data, sid, md):
            if sid in sinks:
                return subsample_counts(data.astype(np.int64), sinks_depth,
                                        replace=False)
            else:
                return data
        rft = feature_table.transform(_rfx, axis='sample', inplace=False)
    return rsd, rft
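# A minimal usage sketch for subsample_sources_sinks on toy data; the
# sample/feature IDs are illustrative assumptions. Requires the
# biom-format package.
import numpy as np
from biom.table import Table

feature_table = Table(np.array([[6, 2], [4, 9]]),  # features x samples
                      observation_ids=['f1', 'f2'],
                      sample_ids=['src1', 'sink1'])
sources_data = np.array([[6, 4]])                  # one collapsed source
sinks = np.array(['sink1'])
rsd, rft = subsample_sources_sinks(sources_data, sinks, feature_table,
                                   sources_depth=5, sinks_depth=5)
print(rsd.sum(1))                              # [5.]
print(rft.data('sink1', axis='sample').sum())  # 5.0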
def find_genes_of_interest(studyName, groupsList, geneCounts, lvl1pct=70,
                           lvl2pct=70, lvl3pct=60, fxnpct=40):
    """
    Summary: uses the Boruta machine learning method to roughly determine
    potential genes of interest. Requires a tab-separated matrix from the
    MG-RAST analysis page.

    Args:
        studyName (str): directory (study name)
        groupsList (list): list of group names
        geneCounts (pandas.DataFrame): gene count matrix with level annotations
        lvl1pct (int): threshold for Boruta on level 1
        lvl2pct (int): threshold for Boruta on level 2
        lvl3pct (int): threshold for Boruta on level 3
        fxnpct (int): threshold for Boruta on gene name

    Returns:
        None, outputs files with tentative genes/gene families of interest
    """
    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()
    os.chdir(studyName)
    for i in range(len(numGeneCounts.columns)):
        subsampleList = []
        if int(numGeneCounts[numGeneCounts.columns[i]].sum()) < samplingDepth:
            meanSubsample = numGeneCounts[numGeneCounts.columns[i]]
        else:
            for j in range(100):
                sample = subsample_counts(
                    numGeneCounts[numGeneCounts.columns[i]].transpose().values,
                    int(samplingDepth))
                subsampleList.insert(j, sample)
            print("completed 100 subsamples for sample number " + str(i))
            meanSubsample = pd.Series(subsampleList).mean()
            # recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
        meanSubsample = 100 * meanSubsample / meanSubsample.sum()
        numGeneCounts[numGeneCounts.columns[i]] = meanSubsample
    numGeneCounts['level1'] = geneCounts['level1']
    numGeneCounts['level2'] = geneCounts['level2']
    numGeneCounts['level3'] = geneCounts['level3']
    numGeneCounts['function'] = geneCounts['function']
    countsLvl1 = numGeneCounts.groupby('level1').sum()
    countsLvl2 = numGeneCounts.groupby('level2').sum()
    countsLvl3 = numGeneCounts.groupby('level3').sum()
    countsLvl4 = numGeneCounts.groupby('function').sum()
    levelList = [countsLvl1, countsLvl2, countsLvl3, countsLvl4]
    countsLvl1.to_csv(studyName + 'genes_lvl1.csv')
    countsLvl2.to_csv(studyName + 'genes_lvl2.csv')
    countsLvl3.to_csv(studyName + 'genes_lvl3.csv')
    countsLvl4.to_csv(studyName + 'genes_function.csv')
    groupsDict = dict(enumerate(pd.Series(groupsList).unique()))
    dictGroups = {y: x for x, y in groupsDict.items()}
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=3)

    X = countsLvl1.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(lvl1pct))
    feat_selector.fit(X, Y)
    if len(countsLvl1[feat_selector.support_]) > 0:
        countsLvl1[feat_selector.support_].to_csv('level1_tentative.csv')
        countsLvl1[feat_selector.support_weak_].to_csv('level1_tentative_weak.csv')

    X = countsLvl2.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(lvl2pct), max_iter=300)
    feat_selector.fit(X, Y)
    if len(countsLvl2[feat_selector.support_]) > 0:
        countsLvl2[feat_selector.support_].to_csv('level2_tentative.csv')
        countsLvl2[feat_selector.support_weak_].to_csv('level2_tentative_weak.csv')

    X = countsLvl3.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(lvl3pct), max_iter=500)
    feat_selector.fit(X, Y)
    if len(countsLvl3[feat_selector.support_]) > 0:
        countsLvl3[feat_selector.support_].to_csv('level3_tentative.csv')
        countsLvl3[feat_selector.support_weak_].to_csv('level3_tentative_weak.csv')

    X = countsLvl4.transpose().values
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             perc=int(fxnpct), max_iter=700)
    feat_selector.fit(X, Y)
    if len(countsLvl4[feat_selector.support_]) > 0:
        countsLvl4[feat_selector.support_].to_csv('level4_tentative.csv')
        countsLvl4[feat_selector.support_weak_].to_csv('level4_tentative_weak.csv')
    os.chdir('..')