def get_group(self, ids: Iterable[str], name: str = None) -> GroupTaxonomy:
    """Get taxonomic detail for a given group

    Parameters
    ----------
    ids : list of str
        The identifiers of a group to obtain
    name : str
        The name of the set of group. It must be provided if multiple
        IDs are requested.

    Raises
    ------
    UnknownID
        If an identifier is not present in the data.
    ValueError
        If a name is not specified when asking for multiple IDs

    Returns
    -------
    GroupTaxonomy
        Taxonomic detail associated with the ID
    """
    # validate every requested ID up front so no partial work is done
    # before an unknown identifier is reported
    for i in ids:
        if i not in self._group_id_lookup:
            raise UnknownID('%s does not exist' % i)

    # NOTE(review): len() and ids[0] below require ids to be a sequence,
    # not an arbitrary Iterable as the annotation suggests — confirm
    # callers always pass a list.
    if len(ids) > 1:
        if name is None:
            raise ValueError("Name not specified.")

        # collapse the group to one profile: per-feature sums over the
        # selected samples, normalized to relative abundance; features
        # absent from every selected sample are dropped by remove_empty()
        table = self._table.filter(set(ids), inplace=False).remove_empty()
        features = table.ids(axis='observation')
        feature_values = table.sum('observation')
        feature_values /= feature_values.sum()
        # no per-feature variance is defined for an aggregated group
        feature_variances = [0.] * len(feature_values)
    else:
        id_ = ids[0]
        # a single ID acts as its own group name
        name = id_

        # get data, pull feature ids out. Zeros are not an issue here as
        # if it were zero, that means the feature isn't present
        group_vec = self._table.data(id_, dense=False)
        features = self._feature_order[group_vec.indices]
        feature_values = group_vec.data

        # handle variances, which may have zeros
        feature_variances = self._variances.data(id_, dense=True)
        feature_variances = feature_variances[group_vec.indices]

    # construct the group specific taxonomy
    taxonomy = self._taxonomy_tree_from_features(features)

    return GroupTaxonomy(
        name=name,
        taxonomy=str(taxonomy).strip(),
        features=list(features),
        feature_values=list(feature_values),
        feature_variances=list(feature_variances),
    )
def __init__(self, *args, name=None, taxonomy=None, features=None,
             feature_values=None, feature_variances=None):
    """Validate construction arguments before delegating to the base class.

    Parameters
    ----------
    name : str, optional
        Display name for the group.
    taxonomy : str, optional
        Newick-like taxonomy string; each feature must appear within it.
    features : list of str, optional
        Feature identifiers.
    feature_values : list, optional
        One value per feature.
    feature_variances : list, optional
        One variance per feature, when provided.

    Raises
    ------
    NotImplementedError
        If any positional arguments are supplied.
    UnknownID
        If a feature is not present in the taxonomy string.
    ValueError
        If features and feature_values (or feature_variances) disagree
        in length.

    NOTE(review): arguments are validated here but not assigned to the
    instance; presumably the base class handles storage — confirm.
    """
    if args:
        raise NotImplementedError("%s only supports kwargs"
                                  % str(self.__class__))

    # substring membership against the taxonomy string is a cheap sanity
    # check; it is not a full newick parse and is not a perfect test
    for feature in features:
        if feature not in taxonomy:
            raise UnknownID("%s is not in the taxonomy." % feature)

    values_absent = features and (feature_values is None)
    if values_absent or len(features) != len(feature_values):
        raise ValueError("features and feature_values have a length "
                         "mismatch")

    if feature_variances is not None:
        if len(features) != len(feature_variances):
            raise ValueError("features and feature_variances have a length "
                             "mismatch")

    super().__init__()
def ranks_order(self, taxa: Iterable[str] = None) -> List:
    """Return the requested taxa names ordered by rank.

    Parameters
    ----------
    taxa : Iterable[str], optional
        Taxa to order. When omitted, all contained taxa are returned.

    Raises
    ------
    UnknownID
        If any requested taxon is not among the ranked taxa.

    Returns
    -------
    list
        Taxa ordered highest rank first.
    """
    ranked_names = self._ranked_order.index
    requested = set(ranked_names) if taxa is None else set(taxa)

    unrecognized = requested - set(ranked_names)
    if unrecognized:
        raise UnknownID("One or more names are not in the top "
                        "ranks: %s" % ",".join(unrecognized))

    # preserve the stored rank order, filtered to the request
    return [name for name in ranked_names if name in requested]
def _filter_ids(metadata_repo, alpha_repo, alpha_metric, query, sample_id): matching_ids = metadata_repo.sample_id_matches(query) matches_alpha = alpha_repo.exists(matching_ids, alpha_metric) matching_ids = [ id_ for id_, exists_ in zip(matching_ids, matches_alpha) if exists_ ] if sample_id: if not all(alpha_repo.exists([sample_id], alpha_metric)): raise UnknownID(sample_id) return matching_ids
def k_nearest(self, sample_id, metric, k=1):
    """Return the IDs of the k precomputed nearest neighbors of a sample.

    Parameters
    ----------
    sample_id : str
        The query sample.
    metric : str
        The metric whose neighbor table should be consulted.
    k : int, optional
        Number of neighbors to return (default 1).

    Raises
    ------
    UnknownID
        If the sample is not present for the metric.
    InvalidParameter
        If k exceeds the number of stored neighbors.
    """
    neighbors_table = self._get_resource(metric)

    if not self.exists(sample_id, metric):
        raise UnknownID(sample_id)

    max_k = len(neighbors_table.columns)
    if k > max_k:
        raise InvalidParameter(
            f"k={k} is greater than the maximum ({max_k}).")

    # columns are assumed ordered nearest-first, so a positional prefix
    # of the sample's row is exactly the k nearest neighbors
    row = neighbors_table.loc[sample_id]
    return list(row.iloc[:k])
def _get_alpha(alpha_repo, alpha_metric, sample_id):
    """Fetch a single sample's alpha diversity value as a response dict.

    Parameters
    ----------
    alpha_repo
        Repository answering exists() and get_alpha_diversity().
    alpha_metric : str
        The alpha diversity metric to retrieve.
    sample_id : str
        The sample to look up.

    Raises
    ------
    UnknownID
        If the sample has no value for the metric.

    Returns
    -------
    dict
        Keys: 'sample_id', 'alpha_metric', 'data'.
    """
    if not all(alpha_repo.exists([sample_id], alpha_metric)):
        raise UnknownID(f"Sample ID not found. Got: {sample_id}")

    series = alpha_repo.get_alpha_diversity([sample_id], alpha_metric)
    # Alpha wraps the series; its raw group form carries the metric name
    # and the per-sample values
    payload = Alpha(series).get_group_raw().to_dict()

    return {
        'sample_id': sample_id,
        'alpha_metric': payload['alpha_metric'],
        'data': payload['alpha_diversity'][sample_id],
    }
def _get_metadata_values(body, cat, repo): invalid_categories = list(filter(lambda x: not repo.has_category(x), cat)) if invalid_categories: raise UnknownCategory( f"Cannot find metadata categories corresponding to: " f"{invalid_categories}") # check all sample ID's are valid sample_ids = body invalid_ids = list(filter(lambda x: not repo.has_sample_id(x), sample_ids)) if invalid_ids: raise UnknownID( f"Cannot find sample ID's corresponding to: {invalid_ids}") metadata = repo.get_metadata( cat, sample_ids=sample_ids, fillna=None, ) return metadata
def rare_unique(self, id_, rare_threshold=0.1):
    """Obtain the rare and unique features for an ID

    Parameters
    ----------
    id_ : str
        The identifier to obtain rare/unique information for
    rare_threshold : float
        The threshold to consider a feature rare. Defaults to 0.1,
        which is the historical rare value from the AGP

    Raises
    ------
    UnknownID
        If the requested sample is not present

    Returns
    -------
    dict
        {'rare': {feature: prevalence}, 'unique': [feature, ]}
    """
    if id_ not in self._group_id_lookup:
        raise UnknownID('%s does not exist' % id_)

    observed = self._table.data(id_, dense=False)

    # feature_prevalence and feature_uniques are derived from self._table,
    # so the sparse indices select this sample's observed features
    prevalences = self.feature_prevalence.iloc[observed.indices]
    unique_mask = self.feature_uniques.iloc[observed.indices]

    below_threshold = prevalences < rare_threshold
    if below_threshold.sum():
        rares = prevalences[below_threshold].to_dict()
    else:
        rares = None

    if unique_mask.sum():
        uniques = list(unique_mask[unique_mask].index)
    else:
        uniques = None

    return {'rare': rares, 'unique': uniques}
def get_group_raw(self, ids: List[str] = None,
                  name: str = None) -> GroupAlphaRaw:
    """Get raw alpha values for a set of IDs.

    Parameters
    ----------
    ids : list of str, optional
        The IDs to obtain data for. When omitted, all IDs are used.
    name : str, optional
        The name of the group. Required whenever ids is provided.

    Raises
    ------
    UnknownID
        If a requested ID is not present.
    ValueError
        If ids are specified without a name.

    Returns
    -------
    GroupAlphaRaw
        The corresponding values for the requested IDs.
    """
    if ids is None:
        ids = self._get_sample_ids()
    elif name is None:
        raise ValueError("Name not specified.")

    # pandas raises KeyError for any label missing from the series
    try:
        values = self._series.loc[ids]
    except KeyError:
        raise UnknownID('Identifier not found.')

    return GroupAlphaRaw(name=name,
                         alpha_metric=self._series.name,
                         alpha_diversity=values.to_dict())
def get_alpha_diversity(self, sample_ids, metric):
    """Obtain alpha diversity values for the requested samples.

    Parameters
    ----------
    sample_ids : str or list of str
        ID(s) for which to obtain the alpha diversity measure.
    metric : str
        Alpha diversity metric.

    Returns
    -------
    pandas.Series
        Alpha diversity values for the requested IDs, carrying the
        resource series' name.

    Raises
    ------
    UnknownMetric
        If the metric is not in the repo's resources.
    UnknownID
        If any requested ID has no value for the requested metric.
    """
    # this could raise an UnknownMetric or ConfigurationError
    alpha_series = self._get_resource(metric)

    if isinstance(sample_ids, str):
        requested = pd.Series([sample_ids])
    else:
        requested = pd.Series(sample_ids)

    missing = ~requested.isin(alpha_series.index)
    if any(missing):
        raise UnknownID(f"For metric='{metric}', unknown ids: "
                        f"{requested.loc[missing]}")

    return alpha_series.loc[requested]
def ranks_specific(self, sample_id: str) -> pd.DataFrame:
    """Obtain the taxonomy rank information for a specific sample.

    Parameters
    ----------
    sample_id : str
        The sample identifier to obtain ranks for.

    Raises
    ------
    UnknownID
        If the requested sample is not present.

    Returns
    -------
    pd.DataFrame
        The subset of .ranked for the sample.
    """
    mask = self._ranked['Sample ID'] == sample_id
    sample_rows = self._ranked[mask]

    if len(sample_rows) == 0:
        raise UnknownID("%s not found" % sample_id)

    # copy so callers cannot mutate the cached frame
    return sample_rows.copy()
def k_nearest(self, sample_id, metric, k=1):
    """Find the k closest samples to a sample in a distance matrix.

    Parameters
    ----------
    sample_id : str
        The query sample.
    metric : str
        The beta diversity metric whose distance matrix is used.
    k : int, optional
        Number of neighbors to return (default 1).

    Raises
    ------
    UnknownID
        If the sample is not present for the metric.
    InvalidParameter
        If k exceeds the number of possible neighbors.
    """
    dm = self._get_resource(metric)

    if not self.exists(sample_id, metric):
        raise UnknownID(sample_id)

    n_neighbors = len(dm.ids) - 1
    if k > n_neighbors:
        raise InvalidParameter(
            f"k={k} is greater than the number of neighbors of the "
            f"sample ID. Number of neighbors: {n_neighbors}")

    row = dm[dm.index(sample_id)]

    # argpartition puts the k+1 smallest distances — which include the
    # query sample itself at distance zero — in the leading positions
    candidate_idx = np.argpartition(row, kth=k)[:k + 1]

    # order candidates by distance; the query sample sorts first, so
    # drop it before mapping back to IDs
    order = np.argsort(row[candidate_idx])[1:]
    neighbor_idx = candidate_idx[order]

    return [dm.ids[i] for i in neighbor_idx]
def get_group(self, ids: List[str] = None, name: str = None) -> GroupAlpha:
    """Get summary alpha diversity statistics for a group of samples.

    Parameters
    ----------
    ids : list of str
        The IDs to represent the distribution.
    name : str
        The name of the group. Required when requesting multiple IDs.

    Raises
    ------
    UnknownID
        If a requested ID is not present.
    ValueError
        If a name is not specified when asking for multiple IDs.

    Returns
    -------
    GroupAlpha
        The corresponding distribution or individual data.
    """
    if ids is None:
        ids = self._get_sample_ids()
    elif len(ids) == 1:
        # a single ID acts as its own group name
        name = ids[0]
    elif name is None:
        raise ValueError("Name not specified.")

    try:
        vals = self._series.loc[ids]
    except KeyError:
        raise UnknownID('Identifier not found.')

    mean = vals.mean()
    median = vals.median()

    if len(ids) == 1:
        # a singleton has no spread and no percentile summary
        return GroupAlpha(name=name,
                          alpha_metric=self._series.name,
                          mean=mean,
                          median=median,
                          std=0.,
                          group_size=1,
                          percentile=None,
                          percentile_values=None)

    # reachable when ids came from _get_sample_ids() without a name
    if name is None:
        raise ValueError("Name not specified.")

    std = vals.std(ddof=0)
    percentile_values = np.percentile(vals, self._percentiles)
    return GroupAlpha(name=name,
                      alpha_metric=self._series.name,
                      mean=mean,
                      median=median,
                      std=std,
                      group_size=len(vals),
                      percentile=self._percentiles,
                      percentile_values=list(percentile_values))
def check_missing_ids_alt(missing_ids, alpha_metric, type_):
    """Raise UnknownID when any requested sample IDs were not found.

    Parameters
    ----------
    missing_ids : sized collection of str
        IDs that could not be resolved; empty means all were found.
    alpha_metric : str
        The metric the IDs were checked against.
    type_ : str
        Label for the resource type, used in the error message.

    Raises
    ------
    UnknownID
        If missing_ids is non-empty.
    """
    # len() (rather than truthiness) keeps compatibility with array-like
    # inputs whose truth value is ambiguous
    if len(missing_ids) == 0:
        return
    raise UnknownID(f"Sample ID(s) not found for {type_}: {alpha_metric}. "
                    f"Unknown IDs: {missing_ids}")