Esempio n. 1
0
 def test_validate_counts_matrix_suppress_cast(self):
     # With suppress_cast=True (forwarded to _validate_counts_vector) float
     # rows must come back unchanged and still typed as float.
     expected_rows = ([42.2, 42.1, 0], [42.2, 42.1, 1.0])
     obs = _validate_counts_matrix([list(row) for row in expected_rows],
                                   suppress_cast=True)

     for idx, expected in enumerate(expected_rows):
         npt.assert_array_equal(obs[idx], np.array(expected))
     for idx in range(len(expected_rows)):
         self.assertEqual(obs[idx].dtype, float)

     # Without suppression the same kind of input is rejected.
     self.assertRaises(TypeError, _validate_counts_matrix, [[0.0], [1]],
                       suppress_cast=False)
Esempio n. 2
0
 def test_validate_counts_matrix_suppress_cast(self):
     # suppress_cast is handed on to _validate_counts_vector, so float data
     # survives validation untouched when it is enabled.
     first_row = [42.2, 42.1, 0]
     second_row = [42.2, 42.1, 1.0]
     validated = _validate_counts_matrix([list(first_row), list(second_row)],
                                         suppress_cast=True)

     npt.assert_array_equal(validated[0], np.array(first_row))
     npt.assert_array_equal(validated[1], np.array(second_row))
     for position in (0, 1):
         self.assertEqual(validated[position].dtype, float)

     # Disabling suppression makes this input a TypeError.
     with self.assertRaises(TypeError):
         _validate_counts_matrix([[0.0], [1]], suppress_cast=False)
Esempio n. 3
0
 def test_validate_counts_matrix_pandas(self):
     # A DataFrame is accepted and each validated row matches the row the
     # frame was built from.
     rows = [[0, 1, 1, 0, 2],
             [0, 0, 2, 1, 3],
             [1, 1, 1, 1, 1]]
     obs = _validate_counts_matrix(pd.DataFrame(rows))
     for idx, expected in enumerate(rows):
         npt.assert_array_equal(obs[idx], np.array(expected))
Esempio n. 4
0
    def test_validate_counts_matrix(self):
        # Valid matrices with two and with three samples: every validated
        # row must equal the corresponding input row.
        valid_inputs = (
            [[0, 1, 1, 0, 2], [0, 0, 2, 1, 3]],
            [[0, 1, 1, 0, 2], [0, 0, 2, 1, 3], [1, 1, 1, 1, 1]],
        )
        for rows in valid_inputs:
            obs = _validate_counts_matrix([list(row) for row in rows])
            for idx, expected in enumerate(rows):
                npt.assert_array_equal(obs[idx], np.array(expected))

        # Two empty count vectors are valid and stay empty.
        obs = _validate_counts_matrix(np.array([[], []], dtype=int))
        for idx in range(2):
            npt.assert_array_equal(obs[idx], np.array([]))
Esempio n. 5
0
    def test_validate_counts_matrix(self):
        sample_a = [0, 1, 1, 0, 2]
        sample_b = [0, 0, 2, 1, 3]
        sample_c = [1, 1, 1, 1, 1]

        # valid input with two samples
        validated = _validate_counts_matrix([list(sample_a), list(sample_b)])
        npt.assert_array_equal(validated[0], np.array(sample_a))
        npt.assert_array_equal(validated[1], np.array(sample_b))

        # valid input with three samples
        validated = _validate_counts_matrix(
            [list(sample_a), list(sample_b), list(sample_c)])
        npt.assert_array_equal(validated[0], np.array(sample_a))
        npt.assert_array_equal(validated[1], np.array(sample_b))
        npt.assert_array_equal(validated[2], np.array(sample_c))

        # a matrix made of empty count vectors
        no_counts = np.array([])
        validated = _validate_counts_matrix(np.array([[], []], dtype=int))
        npt.assert_array_equal(validated[0], no_counts)
        npt.assert_array_equal(validated[1], no_counts)
Esempio n. 6
0
 def test_validate_counts_matrix_unequal_lengths(self):
     # Every matrix below contains vectors of differing lengths and must be
     # rejected with a ValueError.
     ragged_inputs = (
         [[0], [0, 0], [9, 8]],
         [[0, 0], [0, 0, 8], [9, 8]],
         [[0, 0, 75], [0, 0, 3], [9, 8, 22, 44]],
     )
     for ragged in ragged_inputs:
         with self.assertRaises(ValueError):
             _validate_counts_matrix(ragged)
Esempio n. 7
0
 def test_validate_counts_matrix_unequal_lengths(self):
     # Rows of unequal length are invalid, wherever the odd row sits.
     short_first = [[0], [0, 0], [9, 8]]
     long_middle = [[0, 0], [0, 0, 8], [9, 8]]
     long_last = [[0, 0, 75], [0, 0, 3], [9, 8, 22, 44]]

     self.assertRaises(ValueError, _validate_counts_matrix, short_first)
     self.assertRaises(ValueError, _validate_counts_matrix, long_middle)
     self.assertRaises(ValueError, _validate_counts_matrix, long_last)
Esempio n. 8
0
def alpha_diversity(metric, counts, ids=None, validate=True, **kwargs):
    """Compute an alpha diversity value for every sample in ``counts``.

    Parameters
    ----------
    metric : str, callable
        Alpha diversity metric to compute. Prefer the string name of a metric
        over the function itself, because the string form often selects an
        optimized implementation.
    counts : 1D or 2D array_like of ints or floats
        Count/abundance data as a vector or a matrix. In a matrix, every row
        holds the OTU counts of one sample.
    ids : iterable of strs, optional
        Sample identifiers, one per sample in ``counts``. When omitted,
        samples receive integer identifiers in input order.
    validate : bool, optional
        Whether to validate the input first. Validation can be slow and may
        be switched off with ``False`` when the data were already checked
        elsewhere. Skipping it on invalid data can yield wrong results or
        confusing error messages, so only do so when the data are known to be
        valid. :mod:`skbio.diversity` describes what validation covers.
    kwargs : kwargs, optional
        Parameters specific to the chosen metric.

    Returns
    -------
    pd.Series
        One ``metric`` value per vector in ``counts``, indexed by ``ids``
        when they are given.

    Raises
    ------
    ValueError, MissingNodeError, DuplicateNodeError
        On validation failure; the exact error depends on what was invalid.
        ``ValueError`` is also raised when ``metric`` is neither callable nor
        a known metric name.
    TypeError
        If invalid metric-specific parameters are provided.

    See Also
    --------
    skbio.diversity
    skbio.diversity.alpha
    skbio.diversity.get_alpha_diversity_metrics
    skbio.diversity.beta_diversity

    """
    known_metrics = _get_alpha_diversity_metric_map()

    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    if metric == 'faith_pd':
        # Faith's PD takes its own route: the phylogenetic arguments are
        # split off from kwargs and the counts are replaced by the
        # per-node representation returned by the setup step.
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        counts, branch_lengths = _setup_faith_pd(
            counts, otu_ids, tree, validate, single_sample=False)
        compute = functools.partial(_faith_pd, branch_lengths=branch_lengths)
    elif callable(metric):
        compute = functools.partial(metric, **kwargs)
    elif metric in known_metrics:
        compute = functools.partial(known_metrics[metric], **kwargs)
    else:
        raise ValueError('Unknown metric provided: %r.' % metric)

    # kwargs is forwarded again on each call so that unexpected keyword
    # arguments surface as an error.
    per_sample = []
    for sample_counts in counts:
        per_sample.append(compute(sample_counts, **kwargs))
    return pd.Series(per_sample, index=ids)
Esempio n. 9
0
def _validate(u_counts, v_counts, otu_ids, tree):
    """Validate a pair of count vectors and their phylogenetic inputs.

    Both vectors are checked together as a two-row matrix with casting
    suppressed. ``otu_ids`` and ``tree`` are then checked against
    ``u_counts`` only.
    """
    paired_counts = [u_counts, v_counts]
    _validate_counts_matrix(paired_counts, suppress_cast=True)
    _validate_otu_ids_and_tree(counts=u_counts, otu_ids=otu_ids, tree=tree)
Esempio n. 10
0
 def test_validate_counts_matrix_negative_counts(self):
     # A negative count must be rejected whether it appears in the second
     # row or in the first one.
     clean_row = [0, 1, 1, 0, 2]
     negative_row = [0, 0, 2, -1, 3]
     for matrix in ([clean_row, negative_row], [negative_row, clean_row]):
         with self.assertRaises(ValueError):
             _validate_counts_matrix(matrix)
Esempio n. 11
0
def beta_diversity(metric, counts, ids=None, validate=True, **kwargs):
    """Compute the distance between every pair of samples.

    Parameters
    ----------
    metric : str, callable
        Pairwise distance function to apply. Available metrics are listed in
        the scipy ``pdist`` documentation and in the scikit-bio functions
        under *See Also*. Prefer passing the metric as a string, because
        this often selects an optimized implementation.
    counts : 2D array_like of ints or floats
        Count/abundance matrix in which every row holds the OTU counts of
        one sample.
    ids : iterable of strs, optional
        Sample identifiers, one per row of ``counts``. When omitted, samples
        receive integer identifiers in input order (the identifiers will be
        of type ``str``).
    validate : bool, optional
        Whether to validate the input first. Validation can be slow and may
        be switched off with ``False`` when the data were already checked
        elsewhere. Skipping it on invalid data can yield wrong results or
        confusing error messages, so only do so when the data are known to be
        valid. The Notes section lists what validation covers.
    kwargs : kwargs, optional
        Parameters specific to the chosen metric.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). It has as many
        rows and columns as ``counts`` has rows.

    Raises
    ------
    ValueError, MissingNodeError, DuplicateNodeError
        On validation failure (see Notes); the exact error depends on what
        was invalid.
    TypeError
        If invalid metric-specific parameters are provided.

    See Also
    --------
    skbio.diversity.beta
    skbio.diversity.alpha_diversity
    scipy.spatial.distance.pdist

    Notes
    -----
    ``metric`` may be a string (e.g., ``"unweighted_unifrac"``) or a function
    (e.g., ``skbio.diversity.beta.unweighted_unifrac``). A string is
    generally the better choice: passing ``"unweighted_unifrac"`` will be
    hundreds of times faster than passing the function. The function is
    faster only when computing one or a few distances, where the difference
    in runtime is negligible anyway.

    Validation of input data confirms the following:
     * ``counts`` data can be safely cast to integers
     * there are no negative values in ``counts``
     * ``counts`` has the correct number of dimensions
     * all vectors in ``counts`` are of equal length
     * the correct number of ``ids`` is provided (if any are provided)

    For phylogenetic diversity metrics, validation additionally confirms
    that:
     * ``otu_ids`` does not contain duplicate values
     * the length of each ``counts`` vector is equal to ``len(otu_ids)``
     * ``tree`` is rooted
     * ``tree`` has more than one node
     * all nodes in ``tree`` except for the root node have branch lengths
     * all tip names in ``tree`` are unique
     * all ``otu_ids`` correspond to tip names in ``tree``

    """
    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    if metric == 'unweighted_unifrac':
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts = _setup_multiple_unweighted_unifrac(
            counts, otu_ids=otu_ids, tree=tree, validate=validate)
    elif metric == 'weighted_unifrac':
        # ``normalized`` is taken out of kwargs here; when the caller did
        # not supply it, the module-level default is used instead.
        normalized = kwargs.pop('normalized',
                                _normalize_weighted_unifrac_by_default)
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts = _setup_multiple_weighted_unifrac(
            counts, otu_ids=otu_ids, tree=tree, normalized=normalized,
            validate=validate)
    elif callable(metric):
        # The partial already carries every keyword argument, so nothing
        # is left to hand to pdist.
        metric = functools.partial(metric, **kwargs)
        kwargs = {}
    # Any other value is a string scikit-bio does not know about (for
    # example one of the SciPy metrics) and is passed to pdist unchanged.

    pairwise = scipy.spatial.distance.pdist(counts, metric, **kwargs)
    return DistanceMatrix(pairwise, ids)
Esempio n. 12
0
 def test_validate_counts_matrix_negative_counts(self):
     # negative value located in the second vector
     self.assertRaises(ValueError, _validate_counts_matrix,
                       [[0, 1, 1, 0, 2], [0, 0, 2, -1, 3]])
     # negative value located in the first vector
     self.assertRaises(ValueError, _validate_counts_matrix,
                       [[0, 0, 2, -1, 3], [0, 1, 1, 0, 2]])
Esempio n. 13
0
def _validate(u_counts, v_counts, otu_ids, tree):
    """Run input validation for a two-sample phylogenetic comparison.

    The two count vectors are validated as one matrix (casting suppressed),
    after which ``otu_ids`` and ``tree`` are validated using ``u_counts``.
    """
    _validate_counts_matrix(
        [u_counts, v_counts],
        suppress_cast=True,
    )
    _validate_otu_ids_and_tree(
        counts=u_counts,
        otu_ids=otu_ids,
        tree=tree,
    )
Esempio n. 14
0
def block_beta_diversity(metric, counts, ids, validate=True, k=64,
                         reduce_f=None, map_f=None, **kwargs):
    """Compute beta diversity through a block decomposition.

    Parameters
    ----------
    metric : str or callable
        Pairwise distance function to apply. A string must be resolvable by
        scikit-bio (e.g., UniFrac methods); otherwise ``metric`` must be
        callable.
    counts : 2D array_like of ints or floats
        Count/abundance matrix in which every row holds the OTU counts of
        one sample.
    ids : iterable of strs
        Sample identifiers, one per row of ``counts``.
    validate : bool, optional
        See ``skbio.diversity.beta_diversity`` for details.
    reduce_f : function, optional
        Combines `PartialDistanceMatrix` objects into one `DistanceMatrix`.
        The expected signature is:

            `f(Iterable of DistanceMatrix) -> DistanceMatrix`

        This is the reduce step of a map/reduce. ``_reduce`` is used when
        nothing is supplied.
    map_f : function, optional
        Called with `_block_compute` and the output of `_block_kwargs`. The
        expected signature of the mapped function is:

            `f(**kwargs) -> DistanceMatrix`

        NOTE: ipyparallel's `map_async` will not work here as we need to be
        able to pass around ``**kwargs``. ``_map`` is used when nothing is
        supplied.
    k : int, optional
        Block size used when computing distances.
    kwargs : kwargs, optional
        Parameters specific to the chosen metric.

    Returns
    -------
    DistanceMatrix
        A distance matrix relating all samples represented by ``counts`` to
        each other.

    Note
    ----
    This method exists to make parallel beta diversity computation easier.
    For a few hundred samples or fewer, `skbio.diversity.beta_diversity` is
    likely to be faster. It was originally motivated by processing the Earth
    Microbiome Project [1]_ dataset, which at the time spanned over 25,000
    samples and 7.5 million open reference OTUs.

    See Also
    --------
    skbio.diversity.beta_diversity
    skbio.diversity.partial_beta_diversity

    References
    ----------
    .. [1] http://www.earthmicrobiome.org/
    """
    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    reduce_f = _reduce if reduce_f is None else reduce_f
    map_f = _map if map_f is None else map_f

    # Blocks are addressed with numeric positions so numpy fancy indexing
    # can be used; the caller's ids are restored on the final matrix.
    # Validation is disabled downstream because it has already happened
    # here if it was requested.
    kwargs.update(ids=np.arange(len(counts)), metric=metric, counts=counts,
                  k=k, validate=False)

    partial_results = map_f(_block_compute, _block_kwargs(**kwargs))
    dm = reduce_f(partial_results)
    dm.ids = ids

    return dm
Esempio n. 15
0
def block_beta_diversity(metric,
                         counts,
                         ids,
                         validate=True,
                         k=64,
                         reduce_f=None,
                         map_f=None,
                         **kwargs):
    """Run a beta diversity calculation split into blocks.

    Parameters
    ----------
    metric : str or callable
        Pairwise distance function to apply. When given as a string it must
        be resolvable by scikit-bio (e.g., UniFrac methods); otherwise it
        must be callable.
    counts : 2D array_like of ints or floats
        Matrix of count/abundance data, one sample per row, each row holding
        that sample's OTU counts.
    ids : iterable of strs
        Identifiers for the samples in ``counts``.
    validate : bool, optional
        See ``skbio.diversity.beta_diversity`` for details.
    reduce_f : function, optional
        Reduces `PartialDistanceMatrix` objects to a single
        `DistanceMatrix`. The expected signature is:

            `f(Iterable of DistanceMatrix) -> DistanceMatrix`

        This is the reduce within a map/reduce. Defaults to ``_reduce``.
    map_f : function, optional
        Receives `_block_compute` together with the result of
        `_block_kwargs`. The expected signature of the mapped function is:

            `f(**kwargs) -> DistanceMatrix`

        NOTE: ipyparallel's `map_async` will not work here as we need to be
        able to pass around ``**kwargs``. Defaults to ``_map``.
    k : int, optional
        The block size used when computing distances.
    kwargs : kwargs, optional
        Metric-specific parameters.

    Returns
    -------
    DistanceMatrix
        Distance matrix relating all samples in ``counts`` to each other.

    Note
    ----
    The purpose of this method is to facilitate computing beta diversity in
    parallel. When processing a few hundred samples or less,
    `skbio.diversity.beta_diversity` will likely be faster. The need that
    motivated it was processing the Earth Microbiome Project [1]_ dataset,
    which at the time spanned over 25,000 samples and 7.5 million open
    reference OTUs.

    See Also
    --------
    skbio.diversity.beta_diversity
    skbio.diversity.partial_beta_diversity

    References
    ----------
    .. [1] http://www.earthmicrobiome.org/
    """
    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    if reduce_f is None:
        reduce_f = _reduce
    if map_f is None:
        map_f = _map

    # Positional integer ids let the block code use numpy fancy indexing.
    positional_ids = np.arange(len(counts))

    block_settings = {
        'ids': positional_ids,
        'metric': metric,
        'counts': counts,
        'k': k,
        # validation, if requested, has already run above
        'validate': False,
    }
    kwargs.update(block_settings)

    mapped = map_f(_block_compute, _block_kwargs(**kwargs))
    dm = reduce_f(mapped)

    # swap the positional ids back for the caller's identifiers
    dm.ids = ids
    return dm
Esempio n. 16
0
def partial_beta_diversity(metric, counts, ids, id_pairs, validate=True,
                           **kwargs):
    """Compute distances for a chosen set of sample ID pairs only.

    Parameters
    ----------
    metric : str or callable
        Pairwise distance function to apply. Only ``'unweighted_unifrac'``,
        ``'weighted_unifrac'`` and callables are accepted.
    counts : 2D array_like of ints or floats
        Count/abundance matrix in which every row holds the OTU counts of
        one sample.
    ids : iterable of strs
        Sample identifiers, one per row of ``counts``.
    id_pairs : iterable of tuple
        Pairs of IDs to compare (e.g., ``[('a', 'b'), ('a', 'c'), ...]``).
        Every ID used must also appear in ``ids``.
    validate : bool, optional
        See ``skbio.diversity.beta_diversity`` for details.
    kwargs : kwargs, optional
        Parameters specific to the chosen metric.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between the sample pairs named in ``id_pairs``. Pairs that
        were not requested are reported as 0.0, so use the result with
        caution: 0.0 is also a valid distance.

    Raises
    ------
    ValueError
        If ``id_pairs`` are not a subset of ``ids``.
        If ``id_pairs`` contains a duplicate pair (in either orientation) or
        a self-self pair.
        If ``metric`` is neither callable nor one of the supported UniFrac
        names.

    See Also
    --------
    skbio.diversity.beta_diversity
    skbio.diversity.get_beta_diversity_metrics

    """
    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    id_pairs = list(id_pairs)
    referenced_ids = set(itertools.chain.from_iterable(id_pairs))
    if not referenced_ids.issubset(ids):
        raise ValueError("`id_pairs` are not a subset of `ids`")

    # Each pair and its mirror image must all be distinct; a repeated pair
    # (in either orientation) or a self-self pair shrinks the combined set.
    as_given = set(id_pairs)
    mirrored = {pair[::-1] for pair in id_pairs}
    if len(as_given | mirrored) != 2 * len(id_pairs):
        raise ValueError("A duplicate or a self-self pair was observed.")

    if metric == 'unweighted_unifrac':
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts = _setup_multiple_unweighted_unifrac(
            counts, otu_ids=otu_ids, tree=tree, validate=validate)
    elif metric == 'weighted_unifrac':
        # ``normalized`` is taken out of kwargs here; when the caller did
        # not supply it, the module-level default is used instead.
        normalized = kwargs.pop('normalized',
                                _normalize_weighted_unifrac_by_default)
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts = _setup_multiple_weighted_unifrac(
            counts, otu_ids=otu_ids, tree=tree, normalized=normalized,
            validate=validate)
    elif callable(metric):
        # The partial already carries every keyword argument, so none are
        # passed again when the metric is called below.
        metric = functools.partial(metric, **kwargs)
        kwargs = {}
    else:
        raise ValueError("partial_beta_diversity is only compatible with "
                         "optimized unifrac methods and callable functions.")

    n_samples = len(ids)
    dm = np.zeros((n_samples, n_samples), dtype=float)
    row_of = {sample_id: row for row, sample_id in enumerate(ids)}

    # Only one triangle entry is filled per pair; adding the transpose on
    # return mirrors it into the other triangle.
    for first_id, second_id in id_pairs:
        u = row_of[first_id]
        v = row_of[second_id]
        dm[u, v] = metric(counts[u], counts[v], **kwargs)

    return DistanceMatrix(dm + dm.T, ids)
Esempio n. 17
0
def beta_diversity(metric, counts, ids=None, validate=True, pairwise_func=None,
                   **kwargs):
    """Compute distances between all pairs of samples

    Parameters
    ----------
    metric : str, callable
        The pairwise distance function to apply. See the scipy ``pdist`` docs
        and the scikit-bio functions linked under *See Also* for available
        metrics. Passing the metric as a string is preferable as this often
        results in an optimized version of the metric being used.
    counts : 2D array_like of ints or floats
        Matrix containing count/abundance data where each row contains counts
        of OTUs in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``. By default, samples will be
        assigned integer identifiers in the order that they were provided
        (where the type of the identifiers will be ``str``).
    validate : bool, optional
        If `False`, validation of the input won't be performed. This step can
        be slow, so if validation is run elsewhere it can be disabled here.
        However, invalid input data can lead to invalid results or error
        messages that are hard to interpret, so this step should not be
        bypassed if you're not certain that your input data are valid. See
        :mod:`skbio.diversity` for the description of what validation entails
        so you can determine if you can safely disable validation.
    pairwise_func : callable, optional
        The function to use for computing pairwise distances. This function
        must take ``counts`` and ``metric`` and return a square, hollow, 2-D
        ``numpy.ndarray`` of dissimilarities (floats). Examples of functions
        that can be provided are ``scipy.spatial.distance.pdist`` and
        ``sklearn.metrics.pairwise_distances``. By default,
        ``sklearn.metrics.pairwise_distances`` will be used.
    kwargs : kwargs, optional
        Metric-specific parameters.

    Returns
    -------
    skbio.DistanceMatrix
        Distances between all pairs of samples (i.e., rows). The number of
        rows and columns will be equal to the number of rows in ``counts``.
        If ``counts`` is empty along any axis, an all-zero matrix is
        returned without calling ``pairwise_func``.

    Raises
    ------
    ValueError, MissingNodeError, DuplicateNodeError
        If validation fails. Exact error will depend on what was invalid.
    TypeError
        If invalid method-specific parameters are provided.

    See Also
    --------
    skbio.diversity
    skbio.diversity.beta
    skbio.diversity.get_beta_diversity_metrics
    skbio.diversity.alpha_diversity
    scipy.spatial.distance.pdist
    sklearn.metrics.pairwise_distances

    """
    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    # np.shape is used instead of ``counts.shape`` so that plain sequences,
    # which can reach this point when validation is skipped, are handled too.
    counts_shape = np.shape(counts)
    if 0 in counts_shape:
        # if the input counts are empty, return an empty DistanceMatrix.
        # this check is not necessary for scipy.spatial.distance.pdist but
        # it is necessary for sklearn.metrics.pairwise_distances where the
        # latter raises an exception over empty data.
        # ``ids`` defaults to None, so fall back to the number of rows in
        # ``counts`` rather than calling len() on None.
        n_samples = counts_shape[0] if ids is None else len(ids)
        return DistanceMatrix(np.zeros((n_samples, n_samples)), ids)

    if metric == 'unweighted_unifrac':
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts_by_node = _setup_multiple_unweighted_unifrac(
                counts, otu_ids=otu_ids, tree=tree, validate=validate)
        counts = counts_by_node
    elif metric == 'weighted_unifrac':
        # get the value for normalized. if it was not provided, fall back to
        # the module-level default
        normalized = kwargs.pop('normalized',
                                _normalize_weighted_unifrac_by_default)
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        metric, counts_by_node = _setup_multiple_weighted_unifrac(
                counts, otu_ids=otu_ids, tree=tree, normalized=normalized,
                validate=validate)
        counts = counts_by_node
    elif callable(metric):
        metric = functools.partial(metric, **kwargs)
        # remove all values from kwargs, since they have already been provided
        # through the partial
        kwargs = {}
    else:
        # metric is a string that scikit-bio doesn't know about, for
        # example one of the SciPy metrics
        pass

    if pairwise_func is None:
        pairwise_func = sklearn.metrics.pairwise_distances

    distances = pairwise_func(counts, metric=metric, **kwargs)
    return DistanceMatrix(distances, ids)
Esempio n. 18
0
def alpha_diversity(metric, counts, ids=None, validate=True, **kwargs):
    """ Compute alpha diversity for one or more samples

    Parameters
    ----------
    metric : str, callable
        The alpha diversity metric to apply to the sample(s). Passing metric as
        a string is preferable as this often results in an optimized version of
        the metric being used.
    counts : 1D or 2D array_like of ints or floats
        Vector or matrix containing count/abundance data. If a matrix, each row
        should contain counts of OTUs in a given sample.
    ids : iterable of strs, optional
        Identifiers for each sample in ``counts``. By default, samples will be
        assigned integer identifiers in the order that they were provided.
    validate : bool, optional
        If `False`, validation of the input won't be performed. This step can
        be slow, so if validation is run elsewhere it can be disabled here.
        However, invalid input data can lead to invalid results or error
        messages that are hard to interpret, so this step should not be
        bypassed if you're not certain that your input data are valid. See
        Notes for the description of what validation entails so you can
        determine if you can safely disable validation.
    kwargs : kwargs, optional
        Metric-specific parameters.

    Returns
    -------
    pd.Series
        Values of ``metric`` for all vectors provided in ``counts``. The index
        will be ``ids``, if provided.

    Raises
    ------
    ValueError, MissingNodeError, DuplicateNodeError
        If validation fails (see description of validation in Notes). Exact
        error will depend on what was invalid. ``ValueError`` is also raised
        when ``metric`` is neither callable nor a known metric name.
    TypeError
        If invalid method-specific parameters are provided.

    See Also
    --------
    skbio.diversity.alpha
    skbio.diversity.beta_diversity

    Notes
    -----
    The value that you provide for ``metric`` can be either a string (e.g.,
    ``"faith_pd"``) or a function (e.g., ``skbio.diversity.alpha.faith_pd``).
    The metric should generally be passed as a string, as this often uses an
    optimized version of the metric. For example, passing  ``"faith_pd"`` (a
    string) will be tens of times faster than passing the function
    ``skbio.diversity.alpha.faith_pd``. The latter may be faster if computing
    alpha diversity for only one or a few samples, but in these cases the
    difference in runtime is negligible, so it's safer to just err on the side
    of passing ``metric`` as a string.

    Validation of input data confirms the following:
     * ``counts`` data can be safely cast to integers
     * there are no negative values in ``counts``
     * ``counts`` has the correct number of dimensions
     * if ``counts`` is 2-D, all vectors are of equal length
     * the correct number of ``ids`` is provided (if any are provided)

    For phylogenetic diversity metrics, validation additionally confirms that:
     * ``otu_ids`` does not contain duplicate values
     * the length of each ``counts`` vector is equal to ``len(otu_ids)``
     * ``tree`` is rooted
     * ``tree`` has more than one node
     * all nodes in ``tree`` except for the root node have branch lengths
     * all tip names in ``tree`` are unique
     * all ``otu_ids`` correspond to tip names in ``tree``

    """
    metric_map = _get_alpha_diversity_metric_map()

    if validate:
        counts = _validate_counts_matrix(counts, ids=ids)

    if metric == 'faith_pd':
        otu_ids, tree, kwargs = _get_phylogenetic_kwargs(counts, **kwargs)
        counts_by_node, branch_lengths = _setup_faith_pd(
            counts, otu_ids, tree, validate, single_sample=False)
        counts = counts_by_node
        metric = functools.partial(_faith_pd, branch_lengths=branch_lengths)
    elif callable(metric):
        metric = functools.partial(metric, **kwargs)
    elif metric in metric_map:
        metric = functools.partial(metric_map[metric], **kwargs)
    else:
        raise ValueError('Unknown metric provided: %r.' % metric)

    # kwargs is forwarded on every call so that extra keyword arguments raise
    # an error instead of being dropped. On the faith_pd path kwargs holds
    # whatever _get_phylogenetic_kwargs handed back (presumably the parameters
    # left over after otu_ids and tree were extracted -- TODO confirm), which
    # the partial above does not carry. On the other paths the same values
    # are already bound in the partial, so passing them again changes
    # nothing.
    results = [metric(c, **kwargs) for c in counts]
    return pd.Series(results, index=ids)