Exemple #1
0
def pcoa(lines):
    """Run PCoA on the distance matrix present on lines"""
    # Parse the distance matrix
    dist_mtx = DistanceMatrix.read(lines)
    # Create the PCoA object
    pcoa_obj = PCoA(dist_mtx)
    # Get the PCoA results and return them
    return pcoa_obj.scores()
def pcoa(lines):
    """Run PCoA on the distance matrix present on lines"""
    # Parse the distance matrix
    dist_mtx = DistanceMatrix.read(lines)
    # Create the PCoA object
    pcoa_obj = PCoA(dist_mtx)
    # Get the PCoA results and return them
    return pcoa_obj.scores()
def single_file_nj(input_file, output_file):
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # write output
    f = open(output_file, 'w')
    f.write(tree.to_newick(with_distances=True))
    f.close()
def single_file_nj(input_file, output_file):
    dm = DistanceMatrix.read(input_file)

    tree = nj(dm)

    # write output
    f = open(output_file, 'w')
    f.write(tree.to_newick(with_distances=True))
    f.close()
Exemple #5
0
    def setUp(self):
        self.counts = pd.read_csv(get_data_path('analyses/raw_otu_table.csv'),
                                  sep='\t',
                                  dtype={'#SampleID': str})
        self.counts.set_index('#SampleID', inplace=True)

        self.metrics_beta = ["unweighted_unifrac", "bray_curtis"]

        self.beta = dict()
        for metric in self.metrics_beta:
            self.beta[metric] = DistanceMatrix.read(
                get_data_path('analyses/beta_%s.dm.txt' % metric))
Exemple #6
0
    def __call__(self, distance_matrix, output, verbose, *args, **kwargs):
        logger.info("Loading distance matrix...")
        dm = DistanceMatrix.read(distance_matrix)

        logger.info("Building tree...")
        tree = skbio.tree.nj(dm)
        tree = tree.root_at_midpoint()

        if verbose > 0:
            logger.info("Approximate tree using neighbour joining:\n%s",
                        tree.ascii_art())

        tree.write(output, format='newick')
        logger.info("Done.")
def single_file_upgma(input_file, output_file):
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output
    f = open(output_file, 'w')
    try:
        f.write(tree.to_newick(with_distances=True))
    except AttributeError:
        if c is None:
            raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file),))
        raise
    f.close()
def single_file_upgma(input_file, output_file):
    # read in dist matrix
    dist_mat = DistanceMatrix.read(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output
    f = open(output_file, 'w')
    try:
        f.write(tree.to_newick(with_distances=True))
    except AttributeError:
        if c is None:
            raise RuntimeError("""input file %s did not make a UPGMA tree.
 Ensure it has more than one sample present""" % (str(input_file), ))
        raise
    f.close()
Exemple #9
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_fn = anosim
            elif method == 'permanova':
                method_fn = permanova

            results = method_fn(dm,
                                df,
                                column=categories[0],
                                permutations=num_perms)
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)

        results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if not category in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)." %
                                     (category, method))

            # Build the command arguments string.
            command_args = [
                '-d %s -m %s -c %s -o %s' %
                (dm_fp, map_fp, categories[0], out_dir)
            ]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if (md_map.hasUniqueCategoryValues(category)
                            and not (method == 'adonis'
                                     and md_map.isNumericCategory(category))):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                             (method, methods))
 def setUp(self):
     self.dm100 = DistanceMatrix.read(get_data_path('distMatrix_100.txt'))
     self.dm20 = DistanceMatrix.read(get_data_path('distMatrix_20_f5.txt'))
Exemple #11
0
def pwmantel(dms, labels=None, method='pearson', permutations=999,
             alternative='two-sided', strict=True, lookup=None):
    """Run Mantel tests for every pair of given distance matrices.

    Runs a Mantel test for each pair of distance matrices and collates the
    results in a ``DataFrame``. Distance matrices do not need to be in the same
    ID order if they are ``DistanceMatrix`` instances. Distance matrices will
    be re-ordered prior to running each pairwise test, and if ``strict=False``,
    IDs that don't match between a pair of distance matrices will be dropped
    prior to running the test (otherwise a ``ValueError`` will be raised if
    there are nonmatching IDs between any pair of distance matrices).

    Parameters
    ----------
    dms : iterable of DistanceMatrix objects, array_like objects, or filepaths
        to distance matrices. If they are ``array_like``, no reordering or
        matching of IDs will be performed.
    labels : iterable of str or int, optional
        Labels for each distance matrix in `dms`. These are used in the results
        ``DataFrame`` to identify the pair of distance matrices used in a
        pairwise Mantel test. If ``None``, defaults to monotonically-increasing
        integers starting at zero.
    method : {'pearson', 'spearman'}
        Correlation method. See ``mantel`` function for more details.
    permutations : int, optional
        Number of permutations. See ``mantel`` function for more details.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis. See ``mantel`` function for more details.
    strict : bool, optional
        Handling of nonmatching IDs. See ``mantel`` function for more details.
    lookup : dict, optional
        Map existing IDs to new IDs. See ``mantel`` function for more details.

    Returns
    -------
    pandas.DataFrame
        ``DataFrame`` containing the results of each pairwise test (one per
        row). Includes the number of objects considered in each test as column
        ``n`` (after applying `lookup` and filtering nonmatching IDs if
        ``strict=False``). Column ``p-value`` will display p-values as ``NaN``
        if p-values could not be computed (they are stored as ``np.nan`` within
        the ``DataFrame``; see ``mantel`` for more details).

    See Also
    --------
    mantel
    DistanceMatrix.read

    Notes
    --------
    Passing a list of filepaths can be useful as it allows for a smaller amount
    of memory consumption as it only loads two matrices at a time as opposed to
    loading all distance matrices into memory.

    Examples
    --------
    Import the functionality we'll use in the following examples:

    >>> from skbio import DistanceMatrix
    >>> from skbio.stats.distance import pwmantel

    Define three 3x3 distance matrices:

    >>> x = DistanceMatrix([[0, 1, 2],
    ...                     [1, 0, 3],
    ...                     [2, 3, 0]])
    >>> y = DistanceMatrix([[0, 2, 7],
    ...                     [2, 0, 6],
    ...                     [7, 6, 0]])
    >>> z = DistanceMatrix([[0, 5, 6],
    ...                     [5, 0, 1],
    ...                     [6, 1, 0]])

    Run Mantel tests for each pair of distance matrices (there are 3 possible
    pairs):

    >>> pwmantel((x, y, z), labels=('x', 'y', 'z'),
    ...          permutations=0) # doctest: +NORMALIZE_WHITESPACE
                 statistic p-value  n   method  permutations alternative
    dm1 dm2
    x   y     0.755929     NaN  3  pearson             0   two-sided
        z    -0.755929     NaN  3  pearson             0   two-sided
    y   z    -0.142857     NaN  3  pearson             0   two-sided

    Note that we passed ``permutations=0`` to suppress significance tests; the
    p-values in the output are labelled ``NaN``.

    """
    num_dms = len(dms)

    if num_dms < 2:
        raise ValueError("Must provide at least two distance matrices.")

    if labels is None:
        labels = range(num_dms)
    else:
        if num_dms != len(labels):
            raise ValueError("Number of labels must match the number of "
                             "distance matrices.")
        if len(set(labels)) != len(labels):
            raise ValueError("Labels must be unique.")

    num_combs = scipy.special.comb(num_dms, 2, exact=True)
    results_dtype = [('dm1', object), ('dm2', object), ('statistic', float),
                     ('p-value', float), ('n', int), ('method', object),
                     ('permutations', int), ('alternative', object)]
    results = np.empty(num_combs, dtype=results_dtype)

    for i, pair in enumerate(combinations(zip(labels, dms), 2)):
        (xlabel, x), (ylabel, y) = pair
        if isinstance(x, str):
            x = DistanceMatrix.read(x)
        if isinstance(y, str):
            y = DistanceMatrix.read(y)

        stat, p_val, n = mantel(x, y, method=method, permutations=permutations,
                                alternative=alternative, strict=strict,
                                lookup=lookup)

        results[i] = (xlabel, ylabel, stat, p_val, n, method, permutations,
                      alternative)

    return pd.DataFrame.from_records(results, index=('dm1', 'dm2'))
Exemple #12
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_cls = ANOSIM
            elif method == 'permanova':
                method_cls = PERMANOVA

            method_inst = method_cls(dm, df, column=categories[0])
            results = method_inst(num_perms)

            with open(out_fp, 'w') as out_f:
                out_f.write(results.summary())
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)
            results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if not category in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)."
                                     % (category, method))

            # Build the command arguments string.
            command_args = ['-d %s -m %s -c %s -o %s'
                            % (dm_fp, map_fp, categories[0], out_dir)]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if md_map.hasUniqueCategoryValues(category):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method, output_dir=out_dir)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                             % (method, methods))
Exemple #13
0
import skbio
import pandas as pd
from skbio.stats.distance import DistanceMatrix
from skbio.stats.ordination import pcoa
import matplotlib
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import warnings
import sys

Input = sys.argv[1]
Meta = sys.argv[2]
Output = sys.argv[3]
fig = sys.argv[4]
warnings.filterwarnings("ignore")
metadata = pd.read_csv(Meta, sep = '\t', index_col = 0)
my_obj =  DistanceMatrix.read(Input, 'lsmat')
PC = pcoa(my_obj)



def plot_PCoA(matrix, ID_column,Fig):
    figure = matrix.plot(metadata, ID_column, axis_labels=('PC 1', 'PC 2', 'PC 3'), cmap='jet', s=50)
    figure.set_size_inches(12.5, 8.5)
    figure.text(0,0.9, r'Samples colored by {}'.format(ID_column), fontsize=16)
    figure.savefig(Output + 'PCOA_{}'.format(ID_column) + Fig, bbox_inches='tight')
    
for x in metadata.columns:
    plot_PCoA(PC, x, fig)
Exemple #14
0
Usage:
    sample_ab_dists.py k phy.dm seq.dm archaea.txt result.tsv
"""

from sys import argv
from random import seed, sample
import numpy as np
import pandas as pd
from skbio.stats.distance import DistanceMatrix

seed(42)

k = int(argv[1])  # number of taxa to sample from each domain

phydm = DistanceMatrix.read(argv[2])
seqdm = DistanceMatrix.read(argv[3])

ids = sorted(phydm.ids)
if ids != sorted(seqdm.ids):
    raise ValueError('IDs do not match.')

with open(argv[4], 'r') as f:
    archaea = set(f.read().splitlines())

a_sample = sample([x for x in ids if x in archaea], k)
b_sample = sample([x for x in ids if x not in archaea], k)

ids = sorted(a_sample + b_sample)

phydm = phydm.filter(ids)
 def setUp(self):
     self.dm100 = DistanceMatrix.read(get_data_path('distMatrix_100.txt'))
     self.dm20 = DistanceMatrix.read(get_data_path('distMatrix_20_f5.txt'))