コード例 #1
    def test_multiplicative_replacement(self):
        # Exercises multiplicative_replacement on closed fixture data.
        # NOTE(review): this excerpt looks truncated -- the expected arrays
        # below are not wrapped in any assertion call, and both assertRaises
        # blocks have no body. Restore the missing lines before running.

        # Expected result for cdata3 is a 3 x 5 array.
        amat = multiplicative_replacement(closure(self.cdata3))
            np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                      [0.092, 0.04, 0.04, 0.368, 0.46],
                      [0.066667, 0.133333, 0.2, 0.266667, 0.333333]]),

        # Expected result for cdata4 is a single 5-element vector.
        amat = multiplicative_replacement(closure(self.cdata4))
            np.array([0.087273, 0.174545, 0.261818, 0.04, 0.436364]),

        # Expected result for cdata6 matches the 3 x 5 array used for cdata3.
        amat = multiplicative_replacement(closure(self.cdata6))
            np.array([[0.087273, 0.174545, 0.261818, 0.04, 0.436364],
                      [0.092, 0.04, 0.04, 0.368, 0.46],
                      [0.066667, 0.133333, 0.2, 0.266667, 0.333333]]),

        # NOTE(review): the calls expected to raise ValueError are missing.
        with self.assertRaises(ValueError):
        with self.assertRaises(ValueError):

        # make sure that inplace modification is not occurring
        npt.assert_allclose(self.cdata4, np.array([1, 2, 3, 0, 5]))
コード例 #2
    def test_multiplicative_replacement(self):
        # Exercises multiplicative_replacement on closed fixture data.
        # NOTE(review): this excerpt looks truncated -- the opening line of
        # each comparison call (the one taking ``amat``, the expected array
        # and rtol/atol) is missing, and both assertRaises blocks have no
        # body. Restore the missing lines before running.

        # Expected result for data3 is a 3 x 5 array, compared with
        # rtol = atol = 1e-5.
        amat = multiplicative_replacement(closure(self.data3))
                            np.array([[0.087273, 0.174545, 0.261818,
                                       0.04, 0.436364],
                                      [0.092, 0.04, 0.04, 0.368, 0.46],
                                      [0.066667, 0.133333, 0.2,
                                       0.266667, 0.333333]]),
                            rtol=1e-5, atol=1e-5)

        # Expected result for data4 is a single 5-element vector.
        amat = multiplicative_replacement(closure(self.data4))
                            np.array([0.087273, 0.174545, 0.261818,
                                      0.04, 0.436364]),
                            rtol=1e-5, atol=1e-5)

        # Expected result for data6 matches the 3 x 5 array used for data3.
        amat = multiplicative_replacement(closure(self.data6))
                            np.array([[0.087273, 0.174545, 0.261818,
                                       0.04, 0.436364],
                                      [0.092, 0.04, 0.04, 0.368, 0.46],
                                      [0.066667, 0.133333, 0.2,
                                       0.266667, 0.333333]]),
                            rtol=1e-5, atol=1e-5)

        # NOTE(review): the calls expected to raise ValueError are missing.
        with self.assertRaises(ValueError):
        with self.assertRaises(ValueError):

        # make sure that inplace modification is not occurring
        npt.assert_allclose(self.data4, np.array([1, 2, 3, 0, 5]))
コード例 #3
def mult_replace(df):
    """
    Wrapper for skbio's multiplicative_replacement.

    Zeros are replaced using ``delta`` equal to half of the smallest
    strictly positive value found anywhere in ``df``. Per the original
    notes, multiplicative replacement is used so that rows sum close to 1.

    Parameters
    ----------
    df : DataFrame
        Input table; must contain at least one strictly positive value.

    Returns
    -------
    df_mr : DataFrame
        Table modified via multiplicative replacement. It is built from the
        returned array without index/columns, so it carries default labels.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame or the result is not strictly positive.
    """
    assert (isinstance(df, pd.DataFrame))
    # Flatten once and reuse for both the mask and the lookup.
    flat = df.values.flatten()
    nzra = np.min(flat[flat > 0])
    half_nzra = nzra / 2
    # multiplicative replacement adds small value to non-zero entries while maintaining row sums equal to 1
    df_mr = pd.DataFrame(multiplicative_replacement(df, delta=half_nzra))
    assert (np.all(df_mr.values > 0))
    return df_mr
コード例 #4
ファイル: batch.py プロジェクト: ramellose/massoc
    def normalize_transform(self, mode='clr'):
        """
        Some operations may require transformed data.
        This function performs normalization and
        a clr (or ilr) transform on all OTU tables in a Batch object.
        It returns a deep copy of the original Batch object,
        so the original object is not modified.

        Any exception raised while transforming (including the ValueError
        for an unsupported mode) is logged and swallowed; the copy is
        returned in whatever state it had reached.

        :param mode: transformation mode; clr (centered log-ratio) or ilr (isometric log-ratio)
        :return: Transformed copy of Batch object.
        """
        batchcopy = copy.deepcopy(self)
        try:
            for x in list(self.otu):
                # normalizes the data by samples
                normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
                mat = csr_matrix.toarray(normbiom.matrix_data)
                # replaces all zeros with a small value
                # multiplicative replacement preserves ratios between values
                mat = multiplicative_replacement(mat)
                # Compare by value: ``is`` on string literals relies on
                # interning and is not a reliable equality test.
                if mode == 'clr':
                    mat = clr(mat)
                elif mode == 'ilr':
                    mat = ilr(mat)
                else:
                    raise ValueError("Only CLR and ILR transformations are currently supported.")
                normbiom._data = csc_matrix(mat)
                batchcopy.otu[x] = normbiom
        except Exception:
            logger.error("Failed to normalize data", exc_info=True)
        return batchcopy
コード例 #5
def globalCLRPermTest(otuDf,
                      labels,
                      statfunc=_sumRhoStat,
                      nperms=999,
                      seed=110820,
                      binary=False):
    """Calculates centered-log-ratios (CLR) for each sample and performs global
    permutation tests to determine if there is a significant correlation
    over all log-median-ratios, with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed for random permutation generation.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    pvalue : float
        Global p-value for a significant association of OTU log-median-ratios
        with label, based on the summary statistic.
    obs : float
        Statistic summarizing the label difference."""

    nSamples = otuDf.shape[0]

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions: DataFrame.sum() defaults to axis=0, so each column
    # is divided by its column total.
    # NOTE(review): with samples on rows, per-sample proportions would need
    # row sums -- confirm this is intended.
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    # Dedicated generator so the documented ``seed`` actually controls the
    # permutations without touching NumPy's global random state.
    rng = np.random.RandomState(seed)

    obs = statfunc(otuCLR.values, labelValues)
    samples = np.array([
        statfunc(otuCLR.values, labelValues[rng.permutation(nSamples)])
        for permi in range(nperms)
    ])
    # Since test is based on the abs statistic it is inherently two-sided
    pvalue = ((np.abs(samples) >= np.abs(obs)).sum() + 1) / (nperms + 1)

    return pvalue, obs
コード例 #6
ファイル: kmers.py プロジェクト: chasemc/Autometa
def normalize(df: pd.DataFrame,
              method: str = "am_clr",
              out: str = None,
              force: bool = False) -> pd.DataFrame:
    """Normalize raw k-mer counts by center or isometric log-ratio transform.

    Parameters
    ----------
    df : pd.DataFrame
        k-mer counts dataframe.
        i.e. for 3-mers; Index='contig', columns=[AAA, AAT, ...]
    method : str, optional
        Normalize k-mer counts using CLR or ILR transformation
        (the default is Autometa's CLR implementation).
        choices = ['ilr', 'clr', 'am_clr']
        Other transformations come from the skbio.stats.composition module
    out : str, optional
        Path to write normalized k-mers.
    force : bool, optional
        Whether to overwrite existing `out` file path, by default False.

    Returns
    -------
    pd.DataFrame
        Normalized counts using provided `method`. If `out` already exists
        and `force` is False, the table is read back from `out` instead.

    Raises
    ------
    ValueError
        Provided `method` is not available.
    """
    method = method.lower()
    out_specified = out is not None
    out_exists = os.path.exists(out) if out else False
    case1 = out_specified and out_exists and not force
    if case1:
        # NOTE(review): the logging call line was missing from this excerpt;
        # ``warning`` level is a reconstruction -- confirm.
        logger.warning(
            f"{out} already exists. Use force to overwrite. retrieving...")
        return pd.read_csv(out, sep="\t", index_col="contig")
    logger.debug(f"Transforming k-mer counts using {method}")
    choices = {"ilr", "clr", "am_clr"}
    if method == "am_clr":
        norm_df = autometa_clr(df)
    elif method in choices:
        transforms = {"ilr": ilr, "clr": clr}
        # Missing counts are treated as zeros, which are then replaced
        # before the log-ratio transform.
        X = df.fillna(0).to_numpy()
        X = multiplicative_replacement(X)
        X_norm = transforms[method](X)
        norm_df = pd.DataFrame(X_norm, index=df.index)
    else:
        # Sort so the error message is deterministic (sets are unordered).
        choices = ", ".join(sorted(choices))
        raise ValueError(
            f"Normalize Method not available! {method}. choices: {choices}")
    case2 = out_specified and out_exists and force
    case3 = out_specified and not out_exists
    if case2 or case3:
        norm_df.to_csv(out, sep="\t", index=True, header=True)
    return norm_df
コード例 #7
ファイル: Cagalog.py プロジェクト: kmayerb/cagalog
    def clr_transform_cags_via_mult_rep_method(self):
        """
        uses multiplicative replacement to replace zeros with half of
        the lowest non-zero relative abundance value. Then performs clr

        Returns
        -------
        DataFrame
            clr-transformed table with the index and columns of the pivoted
            CAG table.

        Side effects
        ------------
        self.cags_dict : dictionary
        dictionary keyed on 'cags' with the following attributes:
            1. cags_wide_df -  relative abundances
            2. cags_wide_mr_clr_df - clr transformed
               abundances (uses multiplicative replacement)
            3. half_nzra - half of the lowest non-zero relative abundance
               (NZRA), used as delta for the Mult Rep step
        """
        cag_wide = self._pivot_cags()
        # one solution is to use the lowest non-zero relative abundance (NZRA), or more typically NZRA/2
        flat = cag_wide.values.flatten()
        nzra = np.min(flat[flat > 0])
        half_nzra = nzra / 2
        # multiplicative replacement adds small value to non-zero entries while maintaining row sums equal to 1
        cag_wide_mr = multiplicative_replacement(cag_wide, delta=half_nzra)
        # clr transform
        cag_wide_mr_clr = clr(cag_wide_mr)
        # clr transform array to data.frame with index and column matching cag_wide
        cag_wide_mr_clr_df = pd.DataFrame(cag_wide_mr_clr)
        cag_wide_mr_clr_df.columns = cag_wide.columns
        cag_wide_mr_clr_df.index = cag_wide.index

        self.cags_dict["cags"] = {
            "cags_wide_df": cag_wide,
            "cags_wide_mr_clr_df": cag_wide_mr_clr_df,
            "half_nzra": half_nzra
        }
        return cag_wide_mr_clr_df

        def fetch_metaphlan_result(self, clr=True, taxonomic_level="phylum"):
            # Looks up a stored table in self.metaphlan_dict[taxonomic_level].
            # NOTE(review): this excerpt is incomplete -- the ``else:`` and
            # ``try:`` lines and the statement that uses the message string
            # below are missing. As indented, the def also sits after a
            # ``return`` in the preceding method -- confirm placement.
            # The ``clr`` parameter shadows any module-level ``clr`` here.
            if clr:
                key = 'mp_wide_taxa_mr_clr_df'
                key = 'mp_wide_taxa_df'
                return (self.metaphlan_dict[taxonomic_level][key])
            except KeyError:
                    "NO METAPHLAN MATRIX CREATED SEE clr_transform_metaphlan_via_mult_rep_method()"
コード例 #8
ファイル: ancom.py プロジェクト: agartland/utils
def globalCLRPermTest(otuDf, labels, statfunc=_sumRhoStat, nperms=999, seed=110820, binary=False):
    """Calculates centered-log-ratios (CLR) for each sample and performs global
    permutation tests to determine if there is a significant correlation
    over all log-median-ratios, with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.ndarray [n x k] and float index [n] as parameters and
        returns a float summarizing over k.
    nperms : int
        Number of iterations for the permutation test.
    seed : int
        Seed for random permutation generation.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    pvalue : float
        Global p-value for a significant association of OTU log-median-ratios
        with label, based on the summary statistic.
    obs : float
        Statistic summarizing the label difference."""

    nSamples = otuDf.shape[0]

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions: DataFrame.sum() defaults to axis=0, so each column
    # is divided by its column total.
    # NOTE(review): with samples on rows, per-sample proportions would need
    # row sums -- confirm this is intended.
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    # Dedicated generator so the documented ``seed`` actually controls the
    # permutations without touching NumPy's global random state.
    rng = np.random.RandomState(seed)

    obs = statfunc(otuCLR.values, labelValues)
    samples = np.array([
        statfunc(otuCLR.values, labelValues[rng.permutation(nSamples)])
        for permi in range(nperms)
    ])
    # Since test is based on the abs statistic it is inherently two-sided
    pvalue = ((np.abs(samples) >= np.abs(obs)).sum() + 1) / (nperms + 1)

    return pvalue, obs
コード例 #9
def normalize_clr(data):
    """Replace zeros and apply clr.

    Parameters
    ----------
    data : pd.DataFrame
        Table with samples on the index; it must have fewer rows than
        columns.

    Returns
    -------
    pd.DataFrame
        Transformed values with the index and columns of ``data``.

    Raises
    ------
    AssertionError
        If ``data`` does not have fewer rows than columns.
    """
    assert data.shape[0]< data.shape[1], "samples should be indexes, I don't think you have"

    # NOTE(review): the line that computed ``normalized`` was missing from
    # this excerpt (the name was used before assignment). Reconstructed from
    # the summary line above using scikit-bio -- confirm against the source.
    from skbio.stats.composition import clr, multiplicative_replacement

    normalized = clr(multiplicative_replacement(data))

    normalized= pd.DataFrame(normalized,
                             index= data.index,columns= data.columns)

    return normalized
コード例 #10
ファイル: Cagalog.py プロジェクト: kmayerb/cagalog
 def _clr_transform_via_mult_rep_method(self, df):
     """Zero-replace ``df`` with delta = NZRA / 2, then clr-transform it.

     NZRA is the smallest strictly positive value anywhere in ``df``.
     Returns a DataFrame carrying the index and columns of ``df``.
     """
     flat_values = df.values.flatten()
     positive_values = flat_values[flat_values > 0]
     half_nzra = np.min(positive_values) / 2
     # Replace zeros first, then take the centred log-ratio.
     replaced = multiplicative_replacement(df, delta=half_nzra)
     transformed = pd.DataFrame(clr(replaced))
     # Labels are attached afterwards, in the same order as the original.
     transformed.columns = df.columns
     transformed.index = df.index
     return transformed
コード例 #11
ファイル: Cagalog.py プロジェクト: kmayerb/cagalog
    def mult_replace(self, df):
        """
        Replace zeros via skbio's multiplicative_replacement, using a delta
        of half the minimum non-zero value in the entire matrix. Per the
        original notes, multiplicative replacement is used so that rows sum
        close to 1.

        Returns a DataFrame built from the replaced array; no index/columns
        are passed, so it carries default labels.
        """
        # Flatten once and reuse for both the mask and the lookup.
        flat = df.values.flatten()
        nzra = np.min(flat[flat > 0])
        half_nzra = nzra / 2
        # multiplicative replacement adds small value to non-zero entries while maintaining row sums equal to 1
        df_mr = pd.DataFrame(multiplicative_replacement(df, delta=half_nzra))

        return (df_mr)
コード例 #12
def aitchison_transform_part(df):
    """
    Aitchison transformation on df with all columns belonging to same batch.
    df should consist of all samples tagged together in one channel (i.e. A549_S_rep1 etc.)

    Zeros are replaced with ``multiplicative_replacement`` and the result is
    returned as a DataFrame with the index and columns of ``df``.
    """
    df_aitchison = multiplicative_replacement(df)
    return pd.DataFrame(df_aitchison, index=df.index, columns=df.columns)
コード例 #13
def preprocess_df(df, rep, state):
    """
    Aitchison-transformed subset of data.

    Selects the columns returned by ``select_rep_state_intensities(rep,
    state)``, passes them through ``drop_zero_rows``, replaces zeros and
    applies clr. Returns a DataFrame with the surviving index and the
    selected columns.
    """
    df_subset = df[select_rep_state_intensities(rep, state)]
    cols = df_subset.columns
    # index should be the same as protein/peptides
    df_subset = drop_zero_rows(df_subset)
    index = df_subset.index
    # Both transforms return plain arrays, hence the re-labelling below.
    df_subset = clr(multiplicative_replacement(df_subset))
    return pd.DataFrame(df_subset, index=index, columns=cols)
コード例 #14
ファイル: coda.py プロジェクト: SilasK/CMGM
def clr(counts_data, log=np.log2):
    """Centered log-ratio transform of a table, with zero replacement.

    Parameters
    ----------
    counts_data : pd.DataFrame
        Input table.
    log : callable, optional
        Logarithm applied element-wise (default ``np.log2``).

    Returns
    -------
    pd.DataFrame
        Log values with each row's mean subtracted.
    """
    #TODO: check if count data

    # remove columns in which every value is <= 1
    data = counts_data.loc[:, ~(counts_data <= 1).all()]

    # dataframe with replaced zeros
    # NOTE(review): the index/columns arguments were missing from this
    # excerpt; restored so the labels survive the array round-trip.
    data = pd.DataFrame(composition.multiplicative_replacement(data),
                        index=data.index,
                        columns=data.columns)

    data = log(data)
    # Subtract each row's mean log value.
    data = (data.T - data.mean(1)).T

    return data
コード例 #15
ファイル: Cagalog.py プロジェクト: kmayerb/cagalog
    def clr_transform_metaphlan_via_mult_rep_method(self,
                                                    taxonomic_level="phylum"):
        """
        uses multiplicative replacement to replace zeros with half of
        the lowest non-zero relative abundance value. Then performs clr

        Parameters
        ----------
        taxonomic_level : string
            "phylum" through "species"
            NOTE(review): the rest of the signature was missing from this
            excerpt; the "phylum" default is a reconstruction -- confirm.

        Returns
        -------
        DataFrame
            clr-transformed table with the index and columns of the pivoted
            taxa table.

        Side effects
        ------------
        self.metaphlan_dict : dictionary
            dictionary keyed on taxa level with the following attributes:
                1. mp_wide_taxa_df - taxa level relative abundances
                2. mp_wide_taxa_mr_clr_df - taxa level clr transformed
                   abundances (uses multiplicative replacement)
                3. half_nzra - half of the lowest non-zero relative
                   abundance (NZRA), used as delta for the Mult Rep step
        """
        mp_wide_taxa = self._pivot_metaphlan(taxonomic_level=taxonomic_level)
        # one solution is to use the lowest non-zero relative abundance (NZRA), or more typically NZRA/2
        flat = mp_wide_taxa.values.flatten()
        nzra = np.min(flat[flat > 0])
        half_nzra = nzra / 2
        # multiplicative replacement adds small value to non-zero entries while maintaining row sums equal to 1
        mp_wide_taxa_mr = multiplicative_replacement(mp_wide_taxa,
                                                     delta=half_nzra)
        # clr transform
        mp_wide_taxa_mr_clr = clr(mp_wide_taxa_mr)
        # clr transform array to data.frame with index and column matching mp_wide_taxa
        mp_wide_taxa_mr_clr_df = pd.DataFrame(mp_wide_taxa_mr_clr)
        mp_wide_taxa_mr_clr_df.columns = mp_wide_taxa.columns
        mp_wide_taxa_mr_clr_df.index = mp_wide_taxa.index

        self.metaphlan_dict[taxonomic_level] = {
            "mp_wide_taxa_df": mp_wide_taxa,
            "mp_wide_taxa_mr_clr_df": mp_wide_taxa_mr_clr_df,
            "half_nzra": half_nzra
        }
        return (mp_wide_taxa_mr_clr_df)
コード例 #16
ファイル: distance.py プロジェクト: fossabot/onecodex
    def aitchison_distance(self, rank=Rank.Auto):
        """Calculate the Aitchison distance between samples.

        Aitchison distance is the Euclidean distance between centre logratio-normalized samples (abundances).
        As this requires log-transforms, we first need to 'estimate' zeros in the data;
        i.e. replace zeros with small, positive values, while maintaining a constant sum to 1.

        Parameters
        ----------
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.
        """
        import numpy as np
        from skbio.stats.composition import multiplicative_replacement, clr
        from sklearn.metrics.pairwise import euclidean_distances
        from skbio.stats.distance import DistanceMatrix

        df = self.to_df(rank=rank, normalize=self._guess_normalized()
                        )  # get a dataframe of abundances
        df_n0 = multiplicative_replacement(
            df)  # replace 0s with positive small numbers
        df_n0_clr = clr(df_n0)  # clr-normalize
        aitchison_array = euclidean_distances(
            df_n0_clr, df_n0_clr)  # get the euclidean distances

        # Due to rounding differences, we must force mirroring on the matrix:
        # keep the upper triangle (diagonal included), add its transpose,
        # then subtract the diagonal that was counted twice.
        aitchison_dm = np.zeros(aitchison_array.shape)
        upper = np.triu_indices(aitchison_array.shape[0], k=0)
        aitchison_dm[upper] = aitchison_array[upper]
        aitchison_dm = aitchison_dm + aitchison_dm.T - np.diag(
            np.diag(aitchison_dm))
        aitchison_dm = DistanceMatrix(aitchison_dm, df.index)

        return aitchison_dm
コード例 #17
    def cluster_heatmap(self, working_samples, samples_list, tax_level):
        """ saves a cluster heatmap based on Aitchison distance and the y-axis labels"""
        # NOTE(review): this excerpt is incomplete -- the argument lists of
        # groupAbsoluteSamples(...), sns.clustermap(...) and
        # save_high_resolution_figure(...) are cut off, and the method
        # appears to continue past the last visible line.
        from skbio.stats.composition import clr
        from skbio.stats.composition import multiplicative_replacement
        import seaborn as sns

        if self.abundance_df.groupAbsoluteSamples() is not None:
            data0 = self.abundance_df.groupAbsoluteSamples(
            ids = list(data0.columns)
            index0 = list(data0.index)
            # NOTE(review): data1 is not used in the visible code.
            data1 = clr(data0.transpose().values.tolist())
            # Zero replacement and clr are applied to the transposed table;
            # the result is transposed back and re-labelled.
            mr_df = multiplicative_replacement(data0.T)
            mr_clr = clr(mr_df)
            mr_clr_df = pd.DataFrame(mr_clr.T, index=index0, columns=ids)

            #g = sns.clustermap(mr_clr_df, metric="correlation", cmap="mako", robust=True, annot_kws={"size": 6})
            g = sns.clustermap(mr_clr_df,
                               annot_kws={"size": 6},

            filename = self.save_high_resolution_figure(
                'Select file to save the cluster heatmap',
            # Drop the final extension from the chosen file name.
            filename = ('.').join(filename.split('.')[:-1])
            #save y-axis labels
            # Row labels in the order given by the clustermap's row dendrogram.
            y_labels = list(data0.iloc[g.dendrogram_row.reordered_ind].index)
            with open(filename + '_yaxis_labels.txt', 'w') as f:
                f.write('\n'.join([x.strip('_') for x in y_labels]))

            import matplotlib.pyplot as plt
コード例 #18
def clr(counts_data,log= np.log2):
    """Convert counts data to centered log ratio with log2.

    Zeros are replaced by multiplicative_replacement from scikit-bio.
    See wikipedia for centered log ratio.

    Parameters
    ----------
    counts_data : pd.DataFrame
        Count table; values are cast to int before processing.
    log : callable, optional
        Logarithm applied element-wise (default ``np.log2``).

    Returns
    -------
    pd.DataFrame
        Log values with each row's mean subtracted.
    """
    from skbio.stats import composition

    #TODO: check if count data
    data= counts_data.astype(int)

    # remove columns in which every value is <= 1
    data= data.loc[:,~(data<=1).all()]

    # dataframe with replaced zeros
    # NOTE(review): the ``columns`` argument and closing parenthesis were
    # missing from this excerpt; restored to match the visible ``index``.
    data= pd.DataFrame( composition.multiplicative_replacement(data),
                       index= data.index,
                       columns= data.columns)

    data= log(data)
    # Subtract each row's mean log value.
    data = (data.T-data.mean(1)).T

    return data
コード例 #19
 def multiplicative_replacement_warning(self):
     """Expect ValueError from multiplicative_replacement([0, 1, 2], delta=1).

     NOTE(review): the name has no ``test`` prefix, so default unittest
     discovery will not collect it -- confirm this is intended.
     """
     self.assertRaises(ValueError, multiplicative_replacement, [0, 1, 2],
                       delta=1)
コード例 #20
ファイル: ancom.py プロジェクト: agartland/utils
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.

    Returns
    -------
    df : pd.DataFrame [index: samples, columns: OTUs]
        Cleaned (and optionally truncated / normalized) abundance table.
    cols : list
        Column labels of ``df`` other than 'sid'."""
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""

        df = df.drop(['tax_id', 'rank'], axis = 1)
        df = df.dropna(subset=['tax_name'], axis = 0)
        # rename_axis() no longer accepts a label mapping; renaming a column
        # label is done with rename(columns=...).
        df = df.rename(columns={'tax_name':'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis = 0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0)/totReads) > thresh
        # Also discard taxa that are not present in 25% of samples
        keepInd2 = (df>0).sum(axis=0)/df.shape[0] > 0.25
        return df.loc[:, keepInd1 & keepInd2]
    df = pd.read_excel(filename)
    df = _cleanCountDf(df)
    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]
    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
コード例 #21
def clr_on_subset(df_subset):
    """Run ``df_subset`` through drop_zero_rows, zero replacement and clr.

    Returns whatever ``clr`` returns for the zero-replaced data.
    """
    filtered = drop_zero_rows(df_subset)
    return clr(multiplicative_replacement(filtered))
コード例 #22
#data_corrected = pycombat(df_norm_prot,batch)
data_corrected = pycombat(df_norm.fillna(0),batch[0])

##################### THiNK ABOUT THIS... maybe aitchison before ComBat?
# Aitchison multiplicative_replacement #
df_aitchison = multiplicative_replacement(df_int)
# Wrap the zero-replaced result; wrapping df_int here would silently discard
# the replacement computed on the previous line.
df_aitchison = pd.DataFrame(df_aitchison, columns = midx)

def aitchison_transform(df):
    """
    Aitchison transformation on df.
    df should consist of all samples tagged together in one channel (i.e. A549_S_rep1 etc.)

    Zeros are replaced with ``multiplicative_replacement`` and the result is
    returned as a DataFrame with the index and columns of ``df``.
    """
    df_aitchison = multiplicative_replacement(df)
    return pd.DataFrame(df_aitchison, index = df.index, columns = df.columns)
コード例 #23
    labs = f.read().split("\t")
# NOTE(review): this excerpt is fragmentary -- the statement that opens ``f``,
# the code for "Remove first element", the definition of ``unscaled_tab`` and
# the tail of the final pd.DataFrame(...) call are all missing.
# Strip everything from the first new-line onward in each label
regex = re.compile(r'\n.*')
labels = [re.sub(regex, "", e) for e in labs]
# Remove first element "x"

# Ensure that this is not the rarefied ASV table
sample_counts = unscaled_tab.sum(axis=1)  # T

# Perform total sum scaling normalization (TSS): divide each row by its row sum
scaled = unscaled_tab.div(unscaled_tab.sum(axis=1), axis=0)
# scaled.sum(axis=1) # check

# Substitute zeros with small positive values (presumably because the
# log-ratio transform below is undefined at zero)
zeros_scaled = comp.multiplicative_replacement(scaled)  # numpy.ndarray

# Isometric log-ratio (ilr) transform
ilr_transformed = comp.ilr(zeros_scaled)

# Convert ndarray back to dataframe
df_ilr_transformed = pd.DataFrame(ilr_transformed,

# Decision tree methods tended to perform well
# HFE OTU feature reduction method brought a substantial performance improvement for nearly all methods
# After feature reduction most methods performed similarly so need to do that
コード例 #24
# Recode the 'group' column of both tables through the catdict mapping.
microbe_iv['group'] = microbe_iv['group'].map(catdict)
metabolite_iv['group'] = metabolite_iv['group'].map(catdict)

# highlight features with p-value <= 0.001
max_pval = 0.001

# Features above the threshold are relabelled 'None'; the rest are counted.
microbe_iv.loc[microbe_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant microbes: %d' %
      microbe_iv[microbe_iv['group'] != 'None'].shape[0])

metabolite_iv.loc[metabolite_iv.pval > max_pval, 'group'] = 'None'
print('Number of significant metabolites: %d' %
      metabolite_iv[metabolite_iv['group'] != 'None'].shape[0])

# NOTE(review): no fit call is visible in this excerpt before the weights
# of this model are read further down -- confirm it is fitted.
plssvd = PLSSVD(n_components=3)

def standardize(A):
    """Centre A on its axis-0 mean and divide by its axis-0 standard deviation."""
    column_means = np.mean(A, axis=0)
    column_stds = np.std(A, axis=0)
    return (A - column_means) / column_stds

# Standardized x/y weights of the PLSSVD model, labelled PCA1..PCA3.
# NOTE(review): both calls are cut off in this excerpt -- the remaining
# argument(s) and closing parentheses are missing.
pls_microbes = pd.DataFrame(standardize(plssvd.x_weights_),
                            columns=['PCA1', 'PCA2', 'PCA3'],
pls_metabolites = pd.DataFrame(standardize(plssvd.y_weights_),
                               columns=['PCA1', 'PCA2', 'PCA3'],
コード例 #25
ファイル: chemifrac.py プロジェクト: afcarl/chemifrac
shortest = dijkstra(dm.values)
shortest = pd.DataFrame(shortest, columns=dm.index, index=dm.columns)
# reindex_axis() was removed from pandas; reindex(labels, axis=...) is the
# direct replacement.
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table > 0).dot(cosine).dot((otu_table > 0).T)
# Pass the separator by keyword rather than by position.
graph_dm.to_csv('../results/simrank.txt', sep='\t')
# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
# Infinite entries are zeroed in place.
dm[dm == np.inf] = 0
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
graph_dm = pd.DataFrame(graph_dm, index=samples, columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
# NOTE(review): the closing arguments of this call were missing from the
# excerpt; restored as sep/index_col to mirror the commented-out read below.
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
# _dm = pw_distances('braycurtis', table.values, table.index.values)
コード例 #26
 def multiplicative_replacement_warning(self):
     """Expect ValueError from multiplicative_replacement([0, 1, 2], delta=1).

     NOTE(review): the name has no ``test`` prefix, so default unittest
     discovery will not collect it -- confirm this is intended.
     """
     self.assertRaises(ValueError, multiplicative_replacement, [0, 1, 2],
                       delta=1)
コード例 #27
ファイル: chemifrac.py プロジェクト: mortonjt/chemifrac
shortest = pd.DataFrame(shortest,
                        columns=dm.index, index=dm.columns)
# reindex_axis() was removed from pandas; reindex(labels, axis=...) is the
# direct replacement.
shortest = shortest.reindex(sorted(otu_table.columns), axis=0)
shortest = shortest.reindex(sorted(otu_table.columns), axis=1)
# otu_table = table.T
otu_table = otu_table.reindex(sorted(otu_table.columns), axis=1)

# Uses an idea similar to simrank
graph_dm = (otu_table>0).dot(cosine).dot((otu_table>0).T)
# Pass the separator by keyword rather than by position.
graph_dm.to_csv('../results/simrank.txt', sep='\t')
# Uses Aitchison distance
# samples = ['CF31_A', u'CF31_B', u'CF141_A', u'CF141_B', u'Tuni', u'Bry']
dm = cosine.values
mat = otu_table.values
mat = multiplicative_replacement(mat)
graph_dm = connected_dm(mat, dm)
graph_dm += graph_dm.T
samples = otu_table.index
# NOTE(review): the closing arguments of this call were missing from the
# excerpt; the sample labels are applied to both axes of the square matrix.
graph_dm = pd.DataFrame(graph_dm,
                        index=samples, columns=samples)
graph_dm.to_csv('../results/aitchison.txt', sep='\t')

# Read in graph_dm
graph_dm = pd.read_csv('../results/unconnected_aitchison.txt',
                       sep='\t', index_col=0)
# table = pd.read_table('../data/skinmap_chemiFrac_test.txt',
#                        sep='\t', index_col=0)
graph_dm.index = table.columns
graph_dm.columns = table.columns
コード例 #28
# which is much faster than the R package ancom.R::ANCOM

from ancomP.stats.ancom import ancom
import pandas as pd
import numpy as np
from skbio.stats.composition import multiplicative_replacement

# Number of OTU columns; also part of the input/output directory name.
p = 20

# Runs ANCOM on 50 numbered OTU tables and appends each result to one CSV.
# NOTE(review): this excerpt is incomplete -- the body of the inner loop and
# the construction of ``tmp`` are missing, the ``dir1`` expression lacks a
# ``+`` before ``str(p)``, and the opened file is never closed.
for j in range(50):
    dir1 = 'H:/Tree/tree_base/p=' str(p) + '/otu_table.' + str(j+1) +'.txt'
    data = open(dir1, 'r')
    lines = data.readlines()
    for line in lines:
        # Each line is split on single spaces.
        line = list(line.strip().split(' '))
        s = []
        for n in line:
    # Zero replacement before ANCOM.
    dat = multiplicative_replacement(tmp)
    # Labels: p OTU columns and 100 samples, both numbered from 1.
    ind = np.arange(1,p+1,1)
    sam = np.arange(1,101,1)
    table = pd.DataFrame(dat, index = sam, columns = ind)
    # Samples 1-50 get label 0 and samples 51-100 get label 1.
    grouping = pd.Series(sorted([0,1]*50),index = sam)

    results = ancom(table, grouping) # default parameters
    resultsT = results.T
    # Append mode without a header, so all runs accumulate in one file.
    resultsT.to_csv('H:/Tree/tree_base/p=' + str(p) + '/ANCOM.csv', mode = 'a',header = False)
コード例 #29
def loadAbundance(filename, compositionNorm=True, truncate=True):
    """Load OTU counts file (phylum, genus or species level)
    with OTUs along the rows and samples along the columns.

    Parameters
    ----------
    filename : str
        Excel file from QIIME pipeline.
        Contains OTUs along the rows and samples along the columns,
        with a few header rows.
    compositionNorm : bool
        Add delta count to zeros and normalize each sample by the
        total number of reads. (uses skbio.stats.composition.multiplicative_replacement)
    truncate : bool
        Discard taxa with less than 0.5% of total reads.
        Discard taxa that are not present in 25% of samples.

    Returns
    -------
    df : pd.DataFrame [index: samples, columns: OTUs]
        Cleaned (and optionally truncated / normalized) abundance table.
    cols : list
        Column labels of ``df`` other than 'sid'."""
    def _cleanCountDf(df):
        """Drop extra columns/headers and transpose so that
        samples are along rows and OTUs along columns.

        Returns
        -------
        outDf : pd.DataFrame [index: samples, columns: OTUs]"""

        df = df.drop(['tax_id', 'rank'], axis=1)
        df = df.dropna(subset=['tax_name'], axis=0)
        # rename_axis() no longer accepts a label mapping; renaming a column
        # label is done with rename(columns=...).
        df = df.rename(columns={'tax_name': 'OTU'})
        df = df.set_index('OTU')
        df = df.drop(['specimen'], axis=0)
        df = df.T
        df = df.dropna(subset=['label'], axis=0)
        df['sid'] = df.label.str.replace('Sample-', 'S')
        df = df.set_index('sid')
        df = df.drop('label', axis=1)
        df = df.astype(float)
        return df

    def _discardLow(df, thresh=0.005):
        """Discard taxa/columns with less than 0.5% of reads"""
        totReads = df.values.sum()
        keepInd1 = (df.sum(axis=0) / totReads) > thresh
        # Also discard taxa that are not present in 25% of samples
        keepInd2 = (df > 0).sum(axis=0) / df.shape[0] > 0.25

        return df.loc[:, keepInd1 & keepInd2]

    df = pd.read_excel(filename)
    df = _cleanCountDf(df)

    if truncate:
        df = _discardLow(df)

    if compositionNorm:
        values = composition.multiplicative_replacement(df.values)
        df = pd.DataFrame(values, columns=df.columns, index=df.index)

    cols = [c for c in df.columns if c not in ['sid']]

    print('Abundance data: %s samples, %s taxa' % (df.shape[0], len(cols)))
    return df, cols
コード例 #30
def CLRPermTest(otuDf,
                labels,
                statfunc=_rhoStat,
                nperms=999,
                adjMethod='fdr_bh',
                seed=110820,
                binary=False):
    """Calculates centered-log-ratio (CLR) for all OTUs and performs
    permutation tests to determine if there is a significant correlation
    in OTU ratios with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to sm.stats.multipletests for p-value multiplicity adjustment.
        If value is None then no adjustment is made.
    seed : int
        Seed for random permutation generation.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Log-ratio statistic summarizing across samples."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions: DataFrame.sum() defaults to axis=0, so each column
    # is divided by its column total.
    # NOTE(review): with samples on rows, per-sample proportions would need
    # row sums -- confirm this is intended.
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    # Dedicated generator so the documented ``seed`` actually controls the
    # permutations without touching NumPy's global random state.
    rng = np.random.RandomState(seed)

    obs = statfunc(otuCLR.values, labelValues)

    samples = np.zeros((nperms, nOTUs))

    for permi in range(nperms):
        samples[permi, :] = statfunc(
            otuCLR.values, labelValues[rng.permutation(nSamples)])

    # Two-sided permutation p-value per OTU, with the +1 correction.
    pvalues = ((np.abs(samples) >= np.abs(obs[None, :])).sum(axis=0) +
               1) / (nperms + 1)

    if adjMethod is None or adjMethod.lower() == 'none':
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    qvalues = pd.Series(qvalues, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)

    return qvalues, observed
コード例 #31
ファイル: ancom.py プロジェクト: agartland/utils
def CLRPermTest(otuDf, labels, statfunc=_rhoStat, nperms=999, adjMethod='fdr_bh', seed=110820, binary=False):
    """Calculates centered-log-ratio (CLR) for all OTUs and performs
    permutation tests to determine if there is a significant correlation
    in OTU ratios with respect to the label variable of interest.

    Parameters
    ----------
    otuDf : pd.DataFrame [samples x OTUs]
        Contains relative abundance [0-1] for all samples (rows) and OTUs (columns)
    labels: pd.Series (float)
        Contains binary variable indicating membership into one of two categories
        (e.g. treatment conditions). Must share index with otuDf.
    statfunc : function
        Takes a np.array [n x k] and float index [n] as parameters and
        returns a 1-D array of the statistic [k].
    nperms : int
        Number of iterations for the permutation test.
    adjMethod : string
        Passed to sm.stats.multipletests for p-value multiplicity adjustment.
        If value is None then no adjustment is made.
    seed : int
        Seed for random permutation generation.
    binary : bool
        If True the label values are cast to bool, otherwise to float.

    Returns
    -------
    qvalues : pd.Series [index: OTU]
        Q/P-values for each OTU computed.
    observed : pd.Series [index: OTU]
        Log-ratio statistic summarizing across samples."""

    nSamples, nOTUs = otuDf.shape

    if binary:
        labelValues = labels.values.astype(bool)
    else:
        labelValues = labels.values.astype(float)

    # Make proportions: DataFrame.sum() defaults to axis=0, so each column
    # is divided by its column total.
    # NOTE(review): with samples on rows, per-sample proportions would need
    # row sums -- confirm this is intended.
    otuDf = otuDf / otuDf.sum()
    # Apply multiplicative replacement for zero values
    otuMR = multiplicative_replacement(otuDf.values)
    # Calculate the CLR
    otuCLR = clr(otuMR)
    # Make into a DataFrame
    otuCLR = pd.DataFrame(otuCLR, index=otuDf.index, columns=otuDf.columns)

    # Dedicated generator so the documented ``seed`` actually controls the
    # permutations without touching NumPy's global random state.
    rng = np.random.RandomState(seed)

    obs = statfunc(otuCLR.values, labelValues)

    samples = np.zeros((nperms, nOTUs))

    for permi in range(nperms):
        samples[permi, :] = statfunc(
            otuCLR.values, labelValues[rng.permutation(nSamples)])

    # Two-sided permutation p-value per OTU, with the +1 correction.
    pvalues = ((np.abs(samples) >= np.abs(obs[None, :])).sum(
        axis=0) + 1) / (nperms + 1)

    if adjMethod is None or adjMethod.lower() == 'none':
        qvalues = pvalues
    else:
        qvalues = _pvalueAdjust(pvalues, method=adjMethod)

    qvalues = pd.Series(qvalues, index=otuDf.columns)
    observed = pd.Series(obs, index=otuDf.columns)

    return qvalues, observed