def generate_class_weights(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        samples: biom.Table, taxonomy_classification: DataFrame,
        unobserved_weight: float = 1e-6, normalise: bool = False,
        allow_weight_outside_reference: bool = False) \
        -> biom.Table:
    weights = {reference_taxonomy[seq.metadata['id']]: 0.
               for seq in reference_sequences}
    if normalise:
        samples.norm()

    tax_map = taxonomy_classification['Taxon']
    try:
        taxa = [tax_map[s] for s in samples.ids(axis='observation')]
    except KeyError as s:
        raise ValueError(str(s) + ' not in taxonomy_classification')
    if not allow_weight_outside_reference and not set(taxa).issubset(weights):
        raise ValueError(
            'taxonomy_classification does not match reference_taxonomy')

    for taxon, count in zip(taxa, samples.sum('observation')):
        if taxon in weights:
            weights[taxon] += count
    taxa, weights = zip(*weights.items())
    weights = array(weights)
    weights /= weights.sum()
    weights = \
        (1. - unobserved_weight) * weights + unobserved_weight / len(weights)
    weights /= weights.sum()

    return biom.Table(weights[None].T, taxa, sample_ids=['Weight'])
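# Hedged usage sketch for generate_class_weights (q2-clawback-style inputs).
# The toy sequence IDs, taxa, and counts below are hypothetical and only
# illustrate the expected types; DNAIterator comes from q2_types.
import biom
import numpy as np
import pandas as pd
import skbio
from q2_types.feature_data import DNAIterator

ref_tax = pd.Series({'s1': 'k__Bacteria; p__Firmicutes',
                     's2': 'k__Bacteria; p__Bacteroidetes'})
ref_seqs = DNAIterator(
    skbio.DNA('ACGT', metadata={'id': i}) for i in ref_tax.index)
counts = biom.Table(np.array([[5., 3.], [2., 0.]]),
                    observation_ids=['s1', 's2'],
                    sample_ids=['sampleA', 'sampleB'])
classification = pd.DataFrame({'Taxon': ref_tax.values}, index=ref_tax.index)
weights = generate_class_weights(ref_tax, ref_seqs, counts, classification)
print(weights)  # a one-sample biom.Table of per-taxon weights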
def rpca(table: biom.Table,
         rank: int = 3,
         min_sample_count: int = 500,
         min_feature_count: int = 10,
         iterations: int = 5) -> (skbio.OrdinationResults,
                                  skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step."""

    # filter samples to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    table = table.T[table.sum() > min_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank, iteration=iterations).fit(
        rclr().fit_transform(table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # feature loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # sample loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    # proportion of variance explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues, index=list(rename_cols.values()))

    # if the rank is two, add a PC3 of zeros
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    return ord_res, dist_res
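# Hedged usage sketch for rpca: given a biom.Table of counts, it returns an
# skbio ordination (biplot) plus a robust Aitchison distance matrix.
# 'example.biom' is a placeholder path.
import biom

bt = biom.load_table('example.biom')  # placeholder input
ordination, distance = rpca(bt, rank=3, min_sample_count=500,
                            min_feature_count=10, iterations=5)
ordination.write('rpca_ordination.txt')  # skbio OrdinationResults
distance.write('rpca_distance.txt')      # skbio DistanceMatrix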
def cluster_features(query_table: biom.Table,
                     closed_reference_table: biom.Table,
                     query_sequences: DNAFASTAFormat,
                     reference_sequences: pd.Series, thr: float = 0.97,
                     threads: int = 1,
                     output_log_file: str = None) -> (biom.Table,
                                                      DNAFASTAFormat,
                                                      DNAFASTAFormat):
    reference_sequences_fasta = get_reference_seqs_from_ids(
        closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(
        sequences=query_sequences, table=query_table,
        reference_sequences=reference_sequences_fasta,
        perc_identity=thr, threads=threads)
    clustered_table_biom = results[0]
    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    # tempfile.mktemp() returns a path string, not a context manager
    tmp_fp = tempfile.mktemp()
    logger_ins = LOG(tmp_fp).get_logger('clustering_features')
    # assumption: a standard logging.Logger, so %-style placeholders are
    # needed for interpolation; the reference OTU count is taken from the
    # closed-reference table's observation axis
    logger_ins.info("The number of OTUs in the reference database is %d",
                    len(closed_reference_table.ids('observation')))
    logger_ins.info("The number of sequences unmatched to the reference "
                    "alignment is %d", unmatched_sequences_pd.size)
    logger_ins.info("The number of sequences matched to the reference "
                    "alignment is %d", clustered_sequences_pd.size)
    logger_ins.info("Before applying clustering, the total number of counts "
                    "in the original feature table was %d",
                    np.sum(query_table.sum()))
    logger_ins.info("Before applying clustering, the number of non-zero "
                    "elements of the underlying feature table is %d",
                    query_table.nnz)
    logger_ins.info("After applying clustering, the total number of counts "
                    "in the feature table is %d",
                    np.sum(clustered_table_biom.sum()))
    logger_ins.info("After applying clustering, the number of non-zero "
                    "elements of the underlying feature table is %d",
                    clustered_table_biom.nnz)
    # fraction of the original counts retained after clustering
    logger_ins.info("The percent of total counts retained is %.2f%%",
                    np.sum(clustered_table_biom.sum()) /
                    np.sum(query_table.sum()) * 100)

    query_samples = clustered_table_biom.ids('sample')
    closed_reference_features = closed_reference_table.ids('observation')
    clustered_table_biom = closed_reference_table.merge(clustered_table_biom)
    clustered_table_biom.filter(ids_to_keep=query_samples, axis='sample',
                                inplace=True)
    # every closed-reference feature should survive the merge
    if len(set(closed_reference_features) -
           set(clustered_table_biom.ids('observation'))) != 0:
        raise ValueError("Merging the two tables failed! There are fewer "
                         "features in the final table than expected!")
    if output_log_file:
        shutil.copy(tmp_fp, output_log_file)
    return clustered_table_biom, results[1], results[2]
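# Hypothetical invocation of cluster_features; the paths are placeholders and
# the inputs are assumed to follow q2-vsearch's closed-reference conventions
# (a query table/sequences plus a closed-reference table and a pd.Series of
# reference sequences).
import biom
import pandas as pd
from q2_types.feature_data import DNAFASTAFormat
from qiime2 import Artifact

query_bt = biom.load_table('query_table.biom')            # placeholder
ref_bt = biom.load_table('closed_reference_table.biom')   # placeholder
query_seqs = DNAFASTAFormat('query_seqs.fna', mode='r')   # placeholder
ref_seqs = Artifact.load('ref_seqs.qza').view(pd.Series)  # placeholder
clustered, matched, unmatched = cluster_features(
    query_bt, ref_bt, query_seqs, ref_seqs, thr=0.97, threads=4,
    output_log_file='clustering.log')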
def __init__(self, table: biom.Table,
             metadata: pd.DataFrame = None,
             batch_category: str = None):
    super().__init__()
    if np.any(table.sum(axis='sample') <= 0):
        raise ValueError('Biom table has zero counts.')
    self.table = table
    self.metadata = metadata
    self.batch_category = batch_category
    self.populate()
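# Minimal construction sketch for BiomDataset, assuming the class is a
# torch-style Dataset whose populate() wires up internal indexes; the
# 'batch' metadata column is hypothetical.
import biom
import numpy as np
import pandas as pd

bt = biom.Table(np.array([[1., 2.], [3., 4.]]),
                observation_ids=['f1', 'f2'],
                sample_ids=['sampleA', 'sampleB'])
md = pd.DataFrame({'batch': ['b1', 'b2']}, index=['sampleA', 'sampleB'])
dataset = BiomDataset(bt, metadata=md, batch_category='batch')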
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column, columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
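# Minimal sketch of calling the alpha_rarefaction visualizer directly; the
# paths are placeholders, and the output directory must already exist since
# QIIME 2 normally manages it for visualizers.
import os
import biom
import qiime2

bt = biom.load_table('feature_table.biom')        # placeholder
md = qiime2.Metadata.load('sample_metadata.tsv')  # placeholder
os.makedirs('alpha_rarefaction_viz', exist_ok=True)
alpha_rarefaction('alpha_rarefaction_viz', bt, max_depth=1000,
                  metadata=md, min_depth=1, steps=10, iterations=10)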
def ctf_helper(table: biom.Table,
               sample_metadata: DataFrame,
               individual_id_column: str,
               state_columns: list,
               n_components: int = DEFAULT_COMP,
               min_sample_count: int = DEFAULT_MSC,
               min_feature_count: int = DEFAULT_MFC,
               max_iterations_als: int = DEFAULT_MAXITER,
               max_iterations_rptm: int = DEFAULT_MAXITER,
               n_initializations: int = DEFAULT_MAXITER,
               feature_metadata: DataFrame = DEFFM) -> (
                   dict, OrdinationResults, dict, tuple):
    """Runs Compositional Tensor Factorization (CTF)."""

    # validate the sample metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the feature metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left. Check to make "
                              "sure that the feature names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more samples left. Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis, inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(),
                      table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata,
                     individual_id_column, state_columns)

    # factorize
    TF = TensorFactorization(
        n_components=n_components,
        max_als_iterations=max_iterations_als,
        max_rtpm_iterations=max_iterations_rptm,
        n_initializations=n_initializations).fit(rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC columns -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)
    # save a distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(
            short_method_name,
            long_method_name,
            TF.eigvals,
            samples=cond[keep_PC].dropna(axis=0),
            features=TF.features[keep_PC].dropna(axis=0),
            proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output;
        # additionally, only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat([straj.reindex(all_sample_metadata.index),
                        all_sample_metadata],
                       axis=1, sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure the index name for q2
        straj.index.name = "#SampleID"
        # save trajectories
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances,
            subject_trajectories, feature_trajectories)
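# Hedged usage sketch for ctf_helper (gemelli-style inputs); the metadata
# column names are hypothetical. Note the function returns five values, so
# unpack all five despite the four-type annotation.
import biom
import pandas as pd

bt = biom.load_table('feature_table.biom')                      # placeholder
md = pd.read_csv('sample_metadata.tsv', sep='\t', index_col=0)  # placeholder
state_ordn, subj_ordin, distances, subj_trajs, feat_trajs = ctf_helper(
    bt, md, individual_id_column='host_subject_id',
    state_columns=['timepoint'], n_components=3)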
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column, columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))