def get_group(self, ids: Iterable[str], name: str = None) -> GroupTaxonomy:
        """Get taxonomic detail for a given group

        Parameters
        ----------
        ids : iterable of str
            The identifiers of a group to obtain. Any iterable is accepted;
            it is materialized into a list once so it can be validated,
            counted and indexed (a generator would otherwise be exhausted
            by the validation loop).
        name : str
            The name of the set of group. It must be provided if multiple
            IDs are requested. When a single ID is requested, the ID itself
            is used as the name and any supplied name is ignored.

        Raises
        ------
        UnknownID
            If an identifier is not present in the data.
        ValueError
            If no identifiers are given, or if a name is not specified when
            asking for multiple IDs

        Returns
        -------
        GroupTaxonomy
            Taxonomic detail associated with the ID
        """
        # materialize so len() and indexing below work for any iterable
        ids = list(ids)
        if not ids:
            # previously surfaced as a bare IndexError from ids[0]
            raise ValueError("No IDs specified.")

        for i in ids:
            if i not in self._group_id_lookup:
                raise UnknownID('%s does not exist' % i)

        if len(ids) > 1:
            if name is None:
                raise ValueError("Name not specified.")

            # restrict the table to the requested IDs, then collapse to a
            # single per-feature vector normalized to sum to one
            table = self._table.filter(set(ids), inplace=False).remove_empty()
            features = table.ids(axis='observation')
            feature_values = table.sum('observation')
            feature_values /= feature_values.sum()
            # variances are reported as zeros for an aggregated group
            feature_variances = [0.] * len(feature_values)
        else:
            id_ = ids[0]
            name = id_

            # get data, pull feature ids out. Zeros are not an issue here as
            # if it were zero, that means the feature isn't present
            group_vec = self._table.data(id_, dense=False)
            features = self._feature_order[group_vec.indices]
            feature_values = group_vec.data

            # handle variances, which may have zeros; take the dense vector
            # and keep only the positions of the features that are present
            feature_variances = self._variances.data(id_, dense=True)
            feature_variances = feature_variances[group_vec.indices]

        # construct the group specific taxonomy
        taxonomy = self._taxonomy_tree_from_features(features)

        return GroupTaxonomy(
            name=name,
            taxonomy=str(taxonomy).strip(),
            features=list(features),
            feature_values=list(feature_values),
            feature_variances=list(feature_variances),
        )
    def __init__(self,
                 *args,
                 name=None,
                 taxonomy=None,
                 features=None,
                 feature_values=None,
                 feature_variances=None):
        """Validate the keyword-only components of the model

        Parameters
        ----------
        name : optional
            Not inspected by the checks performed here.
        taxonomy : optional
            Container each feature is tested against with ``in``. Must
            support ``in`` whenever `features` is non-empty.
        features : iterable
            The features to validate. Although the default is None, it is
            iterated unconditionally, so a value must be supplied.
        feature_values : sized
            Must be provided and have the same length as `features`.
        feature_variances : sized, optional
            If provided, must have the same length as `features`.

        Raises
        ------
        NotImplementedError
            If any positional arguments are given.
        UnknownID
            If a feature is not found in the taxonomy.
        ValueError
            If `feature_values` is missing or its length differs from
            `features`, or if `feature_variances` is given with a length
            different from `features`.
        """
        if args:
            raise NotImplementedError("%s only supports kwargs" %
                                      str(self.__class__))

        for k in features:
            # a _very_ lightweight check to avoid expense of full newick parse.
            # this is not a perfect sanity test
            if k not in taxonomy:
                raise UnknownID("%s is not in the taxonomy." % k)

        # test for None first: calling len(None) when features was empty
        # used to escape as a TypeError instead of the ValueError below
        if feature_values is None or len(features) != len(feature_values):
            raise ValueError("features and feature_values have a length "
                             "mismatch")

        if feature_variances is not None and len(features) != len(
                feature_variances):
            raise ValueError("features and feature_variances have a length "
                             "mismatch")

        super().__init__()
    def ranks_order(self, taxa: Iterable[str] = None) -> List:
        """Return taxa names sorted by their rank

        Parameters
        ----------
        taxa : Iterable[str], optional
            The names to order. When omitted, every ranked name is returned.

        Raises
        ------
        UnknownID
            If any requested name is absent from the ranked names

        Returns
        -------
        list
            The requested names in ranked order: position 0 holds the
            highest ranked name, position 1 the next, and so on
        """
        ranked_names = self._ranked_order.index

        if taxa is None:
            wanted = set(ranked_names)
        else:
            wanted = set(taxa)
            unranked = wanted - set(ranked_names)
            if len(unranked) > 0:
                raise UnknownID("One or more names are not in the top "
                                "ranks: %s" % ",".join(unranked))

        # walk the ranked names so the output follows rank order, not the
        # order in which the caller supplied the names
        ordered = []
        for candidate in ranked_names:
            if candidate in wanted:
                ordered.append(candidate)
        return ordered
Exemple #4
0
def _filter_ids(metadata_repo, alpha_repo, alpha_metric, query, sample_id):
    """Return the sample IDs matching `query` that have `alpha_metric` data.

    IDs come from ``metadata_repo.sample_id_matches(query)`` and are kept
    only where ``alpha_repo.exists`` reports a truthy value for them. If a
    truthy `sample_id` is given, it must also exist for the metric,
    otherwise UnknownID is raised.
    """
    candidates = metadata_repo.sample_id_matches(query)
    has_alpha = alpha_repo.exists(candidates, alpha_metric)

    kept = []
    for candidate, present in zip(candidates, has_alpha):
        if present:
            kept.append(candidate)

    # the extra sample is only validated; it is not added to the result
    if sample_id and not all(alpha_repo.exists([sample_id], alpha_metric)):
        raise UnknownID(sample_id)
    return kept
    def k_nearest(self, sample_id, metric, k=1):
        """Return the first `k` stored neighbors of `sample_id` for `metric`.

        The resource for `metric` is indexed by sample ID via ``.loc``, and
        its number of columns is the maximum allowed `k`.

        Raises
        ------
        UnknownID
            If ``self.exists(sample_id, metric)`` is falsy
        InvalidParameter
            If `k` exceeds the number of columns in the resource
        """
        neighbor_table = self._get_resource(metric)
        if not self.exists(sample_id, metric):
            raise UnknownID(sample_id)

        max_k = len(neighbor_table.columns)
        if k > max_k:
            raise InvalidParameter(
                f"k={k} is greater than the maximum ({max_k}).")

        return neighbor_table.loc[sample_id][:k].to_list()
def _get_alpha(alpha_repo, alpha_metric, sample_id):
    """Build the alpha diversity payload for a single sample.

    Raises UnknownID when `alpha_repo` does not report `sample_id` as
    existing for `alpha_metric`. Otherwise returns a dict with the keys
    'sample_id', 'alpha_metric' and 'data', where 'data' is the sample's
    entry in the raw group's 'alpha_diversity' mapping.
    """
    found = alpha_repo.exists([sample_id], alpha_metric)
    if not all(found):
        raise UnknownID(f"Sample ID not found. Got: {sample_id}")

    series = alpha_repo.get_alpha_diversity([sample_id], alpha_metric)
    raw = Alpha(series).get_group_raw().to_dict()

    return {
        'sample_id': sample_id,
        'alpha_metric': raw['alpha_metric'],
        'data': raw['alpha_diversity'][sample_id],
    }
def _get_metadata_values(body, cat, repo):
    """Fetch metadata for the sample IDs in `body` over the categories `cat`.

    Categories are validated first, then sample IDs. The first failing
    check raises (UnknownCategory, then UnknownID) and lists every
    offending value. On success the result of ``repo.get_metadata`` is
    returned unchanged.
    """
    unknown_categories = [c for c in cat if not repo.has_category(c)]
    if unknown_categories:
        raise UnknownCategory(
            f"Cannot find metadata categories corresponding to: "
            f"{unknown_categories}")

    # the request body is the collection of sample IDs
    unknown_ids = [s for s in body if not repo.has_sample_id(s)]
    if unknown_ids:
        raise UnknownID(
            f"Cannot find sample ID's corresponding to: {unknown_ids}")

    return repo.get_metadata(cat, sample_ids=body, fillna=None)
    def rare_unique(self, id_, rare_threshold=0.1):
        """Report which of an ID's features are rare and which are unique

        Parameters
        ----------
        id_ : str
            The identifier to report on
        rare_threshold : float
            A feature is rare when its prevalence is strictly below this
            value. Defaults to 0.1, the historical rare value from the AGP

        Raises
        ------
        UnknownID
            If the identifier is not present

        Returns
        -------
        dict
            {'rare': {feature: prevalence}, 'unique': [feature, ]}. Either
            value is None when the ID has no features of that kind.
        """
        if id_ not in self._group_id_lookup:
            raise UnknownID('%s does not exist' % id_)

        # positions of the features present for this ID; the prevalence and
        # uniqueness series are derived from self._table, so the same
        # positions index into both of them
        present = self._table.data(id_, dense=False).indices
        prevalences = self.feature_prevalence.iloc[present]
        unique_flags = self.feature_uniques.iloc[present]

        rare_mask = prevalences < rare_threshold
        rares = None
        if rare_mask.sum() != 0:
            rares = prevalences[rare_mask].to_dict()

        uniques = None
        if unique_flags.sum() != 0:
            uniques = list(unique_flags[unique_flags].index)

        return {'rare': rares, 'unique': uniques}
Exemple #9
0
    def get_group_raw(self,
                      ids: List[str] = None,
                      name: str = None) -> GroupAlphaRaw:
        """Return the raw per-ID values for a group

        Parameters
        ----------
        ids : list of str, optional
            The IDs to look up. When omitted, the IDs come from
            ``self._get_sample_ids()``
        name : str, optional
            The group name. Required whenever ids is given.

        Raises
        ------
        UnknownID
            If the lookup of the IDs raises a KeyError
        ValueError
            If ids is given without a name

        Returns
        -------
        GroupAlphaRaw
            The name, the metric (the series name) and an ID -> value
            mapping for the selected IDs.
        """
        if ids is not None and name is None:
            raise ValueError("Name not specified.")
        if ids is None:
            ids = self._get_sample_ids()

        try:
            selected = self._series.loc[ids]
        except KeyError:
            raise UnknownID('Identifier not found.')

        return GroupAlphaRaw(
            name=name,
            alpha_metric=self._series.name,
            alpha_diversity=selected.to_dict(),
        )
    def get_alpha_diversity(self, sample_ids, metric):
        """Look up the alpha diversity of the given samples for one metric.

        Parameters
        ----------
        sample_ids : str or list of str
            Ids to look up. A single string is treated as one id.

        metric : str
            Alpha diversity metric.

        Returns
        -------
        pandas.Series
            The entries of the metric's resource selected by the requested
            ids, in the order requested.

        Raises
        ------

        UnknownMetric
            If the metric is not in the repo's resources
        UnknownID
            If any requested id is missing from the metric's index

        """
        # this could raise an UnknownMetric or ConfigurationError
        alpha_series = self._get_resource(metric)

        # wrap a lone id so it is not iterated character by character
        wrapped = [sample_ids] if isinstance(sample_ids, str) else sample_ids
        ids = pd.Series(wrapped)

        missing_mask = ~ids.isin(alpha_series.index)
        if any(missing_mask):
            raise UnknownID(f"For metric='{metric}', unknown ids: "
                            f"{ids.loc[missing_mask]}")
        return alpha_series.loc[ids]
    def ranks_specific(self, sample_id: str) -> pd.DataFrame:
        """Return the ranked rows that belong to one sample

        Parameters
        ----------
        sample_id : str
            The value to match against the 'Sample ID' column

        Raises
        ------
        UnknownID
            If no row matches the sample

        Returns
        -------
        pd.DataFrame
            A copy of the matching rows of the ranked data
        """
        belongs_to_sample = self._ranked['Sample ID'] == sample_id
        subset = self._ranked[belongs_to_sample]

        if len(subset) == 0:
            raise UnknownID("%s not found" % sample_id)

        # hand back a copy, as the original did, rather than the selection
        return subset.copy()
 def k_nearest(self, sample_id, metric, k=1):
     """Return the IDs of the `k` samples closest to `sample_id`.

     Parameters
     ----------
     sample_id : str
         The sample whose neighbors are requested
     metric : str
         The metric whose distance matrix resource is used
     k : int
         The number of neighbors to return. Defaults to 1.

     Raises
     ------
     UnknownID
         If ``self.exists(sample_id, metric)`` is falsy
     InvalidParameter
         If `k` exceeds the number of other samples in the matrix

     Returns
     -------
     list
         Neighbor IDs ordered from closest to farthest. The queried
         sample itself is never included, even when another sample is at
         distance zero from it. Equal distances are ordered by position
         in the matrix.
     """
     distance_matrix = self._get_resource(metric)
     if not self.exists(sample_id, metric):
         raise UnknownID(sample_id)
     n_neighbors = len(distance_matrix.ids) - 1
     if k > n_neighbors:
         raise InvalidParameter(
             f"k={k} is greater than the number of neighbors of the "
             f"sample ID. Number of neighbors: {n_neighbors}")
     # row of distances from the requested sample to every sample
     sample_idx = distance_matrix.index(sample_id)
     distances = distance_matrix[sample_idx]
     # partition around the `kth` entry so the first k + 1 positions hold
     # the k + 1 smallest distances (normally the sample itself plus its k
     # neighbors); put them in index order so ties resolve deterministically
     candidates = np.sort(np.argpartition(distances, kth=k)[:k + 1])
     # order the candidates from closest to farthest
     closest_first = np.argsort(distances[candidates], kind='stable')
     candidates = candidates[closest_first]
     # remove the sample itself by index rather than assuming it sorts
     # first: a duplicate sample at distance zero can tie with it
     neighbors = candidates[candidates != sample_idx][:k]
     return [distance_matrix.ids[idx] for idx in neighbors]
Exemple #13
0
    def get_group(self, ids: List[str] = None, name: str = None) -> GroupAlpha:
        """Summarize the values of a group of IDs

        Parameters
        ----------
        ids : list of str
            The IDs to summarize. When omitted, the IDs come from
            ``self._get_sample_ids()``
        name : str
            The name of the group. Required when more than one ID is
            summarized. When exactly one ID is passed explicitly, that ID
            is used as the name.

        Raises
        ------
        UnknownID
            If the lookup of the IDs raises a KeyError
        ValueError
            If more than one ID is summarized and no name is given.

        Returns
        -------
        GroupAlpha
            For a single ID: its value as mean and median, a std of zero
            and no percentiles. Otherwise the mean, median, population std
            (ddof=0) and the values at ``self._percentiles``.
        """
        if ids is None:
            ids = self._get_sample_ids()
        elif len(ids) == 1:
            name = ids[0]
        elif name is None:
            raise ValueError("Name not specified.")

        try:
            vals = self._series.loc[ids]
        except KeyError:
            raise UnknownID('Identifier not found.')

        center = vals.mean()
        middle = vals.median()

        if len(ids) == 1:
            spread = 0.
            size = 1
            percentiles = None
            percentile_values = None
        else:
            # still required here: when ids was omitted the name has not
            # been checked yet
            if name is None:
                raise ValueError("Name not specified.")

            spread = vals.std(ddof=0)
            percentile_values = list(np.percentile(vals, self._percentiles))
            size = len(vals)
            percentiles = self._percentiles

        return GroupAlpha(name=name,
                          alpha_metric=self._series.name,
                          mean=center,
                          median=middle,
                          std=spread,
                          group_size=size,
                          percentile=percentiles,
                          percentile_values=percentile_values)
Exemple #14
0
def check_missing_ids_alt(missing_ids, alpha_metric, type_):
    """Raise UnknownID when `missing_ids` is non-empty; otherwise do nothing.

    `type_` and `alpha_metric` are only used to build the error message.
    """
    if len(missing_ids) == 0:
        return
    raise UnknownID(f"Sample ID(s) not found for {type_}: {alpha_metric}. "
                    f"Unknown IDs: {missing_ids}")