def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    """Subsample IDs (without regard to counts) along ``axis``.

    Parameters
    ----------
    table : biom.Table
        Table to subsample.
    subsampling_depth : int
        Number of IDs requested on the chosen axis.
    axis : str
        ``'feature'`` subsamples features; any other value leaves the table
        untransposed, so the sample axis is subsampled.

    Returns
    -------
    biom.Table
        The subsampled table, with all-zero vectors on the other axis
        removed.

    Raises
    ------
    ValueError
        If ``subsampling_depth`` exceeds the number of IDs on the chosen
        axis, or if the resulting table is empty.
    """
    by_feature = axis == 'feature'

    # Features are handled by transposing first (biocore/biom-format#759),
    # so everything below always works on the 'sample' axis.
    if by_feature:
        table = table.transpose()

    available = len(table.ids())
    if available < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % available)

    table = table.subsample(subsampling_depth, axis='sample', by_id=True)

    def _has_counts(values, id_, md):
        return values.sum() > 0

    # After the (possible) transpose the opposite axis is 'observation'.
    table.filter(_has_counts, axis='observation')

    # Undo the transpose applied for feature subsampling.
    if by_feature:
        table = table.transpose()

    if table.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')

    return table
def cscs(features: biom.Table, css_edges: str, cosine_threshold: float = 0.6,
         normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    """Compute a distance matrix from a feature table and an edge file.

    Parameters
    ----------
    features : biom.Table
        Feature table; its observation IDs must match the IDs in the first
        two columns of ``css_edges``.
    css_edges : str
        Path to a tab-separated edge file. Lines containing ``CLUSTERID1``
        are skipped (header); columns 0 and 1 are feature IDs and column 4
        is the cosine score.
    cosine_threshold : float
        Scores below this value are stored as 0.
    normalization : bool
        When true, the table is normalized per sample (on a copy).
    weighted : bool
        When false, the table is converted to presence/absence (on a copy).

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - similarity`` as returned by ``parallel_make_distance_matrix``.
    """
    observationids = {
        x: index
        for index, x in enumerate(features.ids(axis='observation'))
    }
    n_features = features.shape[0]
    edgesdok = dok_matrix((n_features, n_features), dtype=np.float32)

    # Use a context manager so the edge file is closed even on a parse error.
    with open(css_edges, "r") as edges_fh:
        for line in edges_fh:
            if "CLUSTERID1" in line:
                continue
            linesplit = line.split("\t")
            cosine = float(linesplit[4])
            row = observationids[linesplit[0]]
            col = observationids[linesplit[1]]
            if cosine < cosine_threshold:
                edgesdok[row, col] = 0.0
            else:
                # Scores at or above the threshold are stored symmetrically.
                edgesdok[row, col] = cosine
                edgesdok[col, row] = cosine
    edgesdok.setdiag(1)

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # `pa` is a method: the original assigned the bound method itself
        # instead of calling it. inplace=False keeps the caller's table
        # untouched when no normalization copy was made.
        features = features.pa(inplace=False)  # TODO: make new option in cscs()
    sample_names = features.ids()
    distances = parallel_make_distance_matrix(features, edgesdok,
                                              sample_names)
    distances = 1 - distances
    print(distances)
    return skbio.DistanceMatrix(distances, ids=distances.index)
def _read_inputs(biom_table: biom.Table, phylogeny_fp: NewickFormat,
                 meta_data: NumericMetadataColumn = None):
    """Load and align the table, labels and phylogeny.

    Parameters
    ----------
    biom_table : biom.Table
        Input feature table.
    phylogeny_fp : NewickFormat
        Phylogeny file; it is pruned to the table's features before loading.
    meta_data : NumericMetadataColumn, optional
        When given, the strategy is ``"balancing"`` and labels come from the
        first column returned by ``_sort_metada``; otherwise the strategy is
        ``"augmentation"`` and every sample gets the label 1.

    Returns
    -------
    tuple
        ``(table, y, tree, generate_strategy, pruned_phylogeny_fp)``.

    Raises
    ------
    ValueError
        If the sample order of the processed table differs from the expected
        sample IDs.
    """
    if meta_data is not None:
        generate_strategy = "balancing"
        meta, biom_table = _sort_metada(meta_data, biom_table)
        y = meta.iloc[:, 0]
        samples = meta.index
    else:
        generate_strategy = "augmentation"
        y = pd.Series(data=np.ones((len(biom_table.ids('sample')),)),
                      index=biom_table.ids('sample'))
        samples = biom_table.ids('sample')

    _table_tmp = biom_table.sort_order(axis='sample', order=samples)
    _table = _map_observations(_table_tmp)
    pruned_phylogeny_fp = _prune_features_from_phylogeny(_table, phylogeny_fp)
    _tree = dendropy.Tree.get(path=str(pruned_phylogeny_fp),
                              preserve_underscores=False,
                              schema="newick",
                              rooting='default-rooted')

    if sum(samples != _table.ids('sample')) > 0:
        # Build one readable message; passing several positional arguments to
        # ValueError made the error render as a tuple.
        difference = set(samples) - set(_table.ids('sample'))
        raise ValueError(
            "The samples IDs in meta data and biom table are not the same! "
            "The difference is: %s. Please double check."
            % ', '.join(sorted(map(str, difference))))

    return _table, y, _tree, generate_strategy, pruned_phylogeny_fp
def alpha(table: biom.Table):
    """
    Compute every metric in ``ALPHA_DIVERSITY_METHODS`` for each sample.

    :param table: feature table; must not be empty
    :return: the value produced by ``_format_alpha_results_to_json`` from the
        aggregated per-metric results and the sample metadata
    :raises ValueError: if the table is empty
    """
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    table = get_biom_table(table)

    # Dense (samples x features) float matrix.
    counts = table.matrix_data.toarray().astype(float).T
    sample_ids = table.ids(axis='sample')
    sample_metadata = dict(zip(table.ids(), table.metadata()))

    def _named_result(metric):
        series = alpha_diversity(metric=metric, counts=counts,
                                 ids=sample_ids)
        series.name = metric
        return series

    alpha_diversities = [_named_result(metric)
                         for metric in ALPHA_DIVERSITY_METHODS]

    aggregated = aggregate_results(alpha_diversities, sample_ids)
    return _format_alpha_results_to_json(aggregated, sample_metadata)
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Raises ``ValueError`` for an unknown metric, an empty table, or a request
    to run weighted UniFrac with ``n_jobs != 1``. A ``MissingNodeError`` from
    scikit-bio is re-raised with 'otu_ids'/'tree' replaced by
    'feature_ids'/'phylogeny' in its message.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")
    if n_jobs != 1 and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    # Dense integer matrix with samples as rows.
    counts = table.matrix_data.toarray().astype(int).T
    diversity_kwargs = dict(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )

    try:
        return skbio.diversity.beta_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as e:
        message = str(e).replace('otu_ids', 'feature_ids')
        raise skbio.tree.MissingNodeError(
            message.replace('tree', 'phylogeny'))
def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    """Merge two tables that have no sample IDs in common.

    Raises
    ------
    ValueError
        If any sample ID occurs in both tables; the message lists them.
    """
    table1_sids = set(table1.ids(axis='sample'))
    table2_sids = set(table2.ids(axis='sample'))

    shared_sids = table1_sids & table2_sids
    if shared_sids:
        raise ValueError('Some samples are present in both tables: %s' %
                         ', '.join(shared_sids))

    return table1.merge(table2)
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return the phylogenetic beta diversity distances between samples.

    Parameters
    ----------
    table : biom.Table
        Non-empty feature table.
    phylogeny : skbio.TreeNode
        Tree passed to scikit-bio together with the table's feature IDs.
    metric : str
        One of ``phylogenetic_metrics()``.
    n_jobs : int
        Forwarded to the pairwise distance function; must be 1 for
        ``'weighted_unifrac'``.

    Raises
    ------
    ValueError
        Unknown metric, empty table, or parallel weighted UniFrac.
    skbio.tree.MissingNodeError
        Re-raised with the message reworded in terms of
        'feature_ids'/'phylogeny'.
    """
    known_metrics = phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    parallel_requested = n_jobs != 1
    if parallel_requested and metric == 'weighted_unifrac':
        raise ValueError("Weighted UniFrac is not parallelizable")

    dense = table.matrix_data.toarray()
    counts = dense.astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        results = skbio.diversity.beta_diversity(
            metric=metric, counts=counts, ids=sample_ids,
            otu_ids=feature_ids, tree=phylogeny,
            pairwise_func=sklearn.metrics.pairwise_distances,
            n_jobs=n_jobs)
    except skbio.tree.MissingNodeError as e:
        message = str(e)
        for old, new in (('otu_ids', 'feature_ids'), ('tree', 'phylogeny')):
            message = message.replace(old, new)
        raise skbio.tree.MissingNodeError(message)

    return results
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata, factors, edges and ranks into files.

    Parameters
    ----------
    table1 : pd.DataFrame
        Samples-by-features table written to ``table_microbes.*.biom``.
    table2 : pd.DataFrame
        Samples-by-features table written to ``table_metabolites.*.biom``.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Left factor; ``U @ V`` is written as the ranks table.
    V : np.array
        Right factor.
    edges : array_like
        Written to ``edges.*.txt`` after dropping the columns of features
        that are all-zero in ``table1``.
        NOTE(review): assumes a 2-D array whose columns align with the
        features of ``table1`` -- confirm against the caller.
    it : int
        iteration number
    rep : int
        repetition number (index into the letters a-z)
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    suffix = "%d_%s" % (it, choice[rep])
    output_microbes = "%s/table_microbes.%s.biom" % (output_dir, suffix)
    output_metabolites = "%s/table_metabolites.%s.biom" % (output_dir, suffix)
    output_md = "%s/metadata.%s.txt" % (output_dir, suffix)
    output_U = "%s/U.%s.txt" % (output_dir, suffix)
    output_V = "%s/V.%s.txt" % (output_dir, suffix)
    output_B = "%s/edges.%s.txt" % (output_dir, suffix)
    output_ranks = "%s/ranks.%s.txt" % (output_dir, suffix)

    # Keep only features observed at least once.
    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    # biom tables are features x samples, hence the transpose.
    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks, index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # The original read an unbound local `B` here (UnboundLocalError on every
    # call); `edges` is the argument that was otherwise never used.
    B = np.asarray(edges)[:, idx1]
    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)
def rclr_transformation(table: Table) -> Table:
    """
    Apply ``matrix_rclr`` to a biom table and return the result as a new
    biom table with the same observation and sample IDs.
    """
    # matrix_rclr is given the matrix with samples as rows; transpose back
    # afterwards to the observations-by-samples layout of a biom table.
    samples_by_features = table.matrix_data.toarray().T
    transformed = matrix_rclr(samples_by_features).T
    return Table(transformed,
                 table.ids('observation'),
                 table.ids('sample'))
def add_pseudocount(table: biom.Table, pseudocount: int = 1) -> biom.Table:
    """Return a new table with ``pseudocount`` added to every value."""
    # biom.Table.transform only touches non-zero values, so it cannot be used
    # to shift zeros. Instead each observation vector is densified and
    # shifted, which means a dense and a sparse representation are in memory
    # at the same time.
    shifted_rows = []
    for row in table.iter_data(dense=True, axis='observation'):
        shifted_rows.append(row + pseudocount)

    return biom.Table(shifted_rows,
                      table.ids(axis='observation'),
                      table.ids())
def count_vectors(table: biom.Table, phylogeny: skbio.TreeNode,
                  method: str = 'weighted_unifrac') -> biom.Table:
    """Build a table of per-sample vectors computed by ``_run_unifrac``.

    The table's observation IDs are first mapped with ``_map_observations``,
    the phylogeny is pruned to those features and its nodes renamed, and the
    table is sorted along the observation axis before the counts are handed
    to ``_run_unifrac``.
    """
    table = _map_observations(table)
    pruned_phylo = rename_nodes(
        prune_features_from_phylogeny(table, phylogeny))

    table = table.sort(axis='observation')
    otu_ids = np.asarray(table.ids('observation'))

    # Dense samples-by-features array.
    dense = table.matrix_data.todense()
    counts = np.asarray(dense.transpose())

    features, tree_index = _run_unifrac(counts, otu_ids, pruned_phylo, method)

    return biom.Table(data=features.transpose(),
                      observation_ids=rename_otus(tree_index),
                      sample_ids=table.ids())
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn,
          mode: str) -> biom.Table:
    """Collapse IDs on ``axis`` into the groups given by ``metadata``.

    ``mode`` selects the collapse function from ``_mode_lookup``. Raises
    ``ValueError`` when the table is empty.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    # biom calls the feature axis 'observation'.
    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_for(axis_id, _):
        return metadata.get_value(axis_id)

    grouped_table = table.collapse(
        _group_for,
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    group_order = metadata.to_series().unique()
    return grouped_table.sort_order(group_order, axis=biom_axis)
def tree_cluster(
        phylogeny_fp: NewickFormat,
        table: biom.Table,
        method: Str = 'max_clade',
        threshold: Float = 0.0,
        threshold_free: Str = None
) -> (skbio.TreeNode, pd.DataFrame, biom.Table):
    """Cluster the features of ``table`` using the phylogeny.

    The clustering function is looked up in ``METHODS`` by the lower-cased
    ``method``; when ``threshold_free`` is given, the matching entry of
    ``THRESHOLDFREE`` wraps that function instead of calling it directly.

    Returns
    -------
    tuple
        ``(new_tree, cluster_df, clustered_table)`` where ``cluster_df`` is
        indexed by ``'Feature ID'``.
    """
    phylogeny_cleaned_fp = _clean_phylogeny(phylogeny_fp, table)
    n_features = len(table.ids('observation'))
    print("The numbe of features in the original table is", n_features)
    print("The phylogeny loaded and randomly resolved!")

    tree = _read_phylogeny(str(phylogeny_cleaned_fp))
    if threshold_free is not None:
        threshold_free_fn = THRESHOLDFREE[threshold_free]
        clusters = threshold_free_fn(METHODS[method.lower()], tree,
                                     threshold, float('-inf'))
    else:
        clusters = METHODS[method.lower()](tree, threshold, float('-inf'))

    cluster_df = _convert_result_to_data_frame(clusters)
    print(
        "TreeCluster finished successfully! The number of clusters using the threshold",
        threshold, "is", len(clusters))

    final_table_filtered, pivot_mapping_dct = map_features(cluster_df, table)
    print("The clustered table is generated!")

    new_tree = generate_new_phylogeny(phylogeny_cleaned_fp,
                                      pivot_mapping_dct)
    print("The clustered phylogeny is generated!")

    return new_tree, cluster_df.set_index('Feature ID'), final_table_filtered
def beta_passthrough(table: biom.Table, metric: str, pseudocount: int = 1,
                     n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute beta diversity, substituting local callables for some metrics.

    ``'aitchison'``, ``'canberra_adkins'`` and ``'jensenshannon'`` are
    replaced by the functions defined below; any other metric name is passed
    to scikit-bio unchanged. ``pseudocount`` is added to the counts only for
    ``'aitchison'``.
    """
    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        # Only positions where at least one of the two vectors is non-zero
        # contribute.
        nz = ((x > 0) | (y > 0))
        x_ = x[nz]
        y_ = y[nz]
        nnz = nz.sum()
        return (1. / nnz) * np.sum(np.abs(x_ - y_) / (x_ + y_))

    def jensen_shannon(x, y, **kwds):
        return jensenshannon(x, y)

    custom_metrics = {
        'aitchison': aitchison,
        'canberra_adkins': canberra_adkins,
        'jensenshannon': jensen_shannon,
    }

    counts = table.matrix_data.toarray().T
    sample_ids = table.ids(axis='sample')

    if metric == 'aitchison':
        counts += pseudocount
    metric = custom_metrics.get(metric, metric)

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=sample_ids,
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def get_reference_seqs_from_ids(
        table: biom.Table,
        reference_seqs_pd: pd.Series) -> DNAFASTAFormat:
    """Collect the reference sequences for the table's observation IDs.

    Parameters
    ----------
    table : biom.Table
        Table whose observation IDs select the sequences.
    reference_seqs_pd : pd.Series
        Reference sequences indexed by ID.

    Returns
    -------
    DNAFASTAFormat
        The result of passing the selected sequences to ``_16``.

    Raises
    ------
    KeyError
        If an observation ID is not present in ``reference_seqs_pd``.
    """
    # An explicit dtype avoids the deprecation warning (and the
    # version-dependent default dtype) of a bare ``pd.Series()``.
    output_references = pd.Series(dtype=object)
    for obs in table.ids('observation'):
        output_references[obs] = reference_seqs_pd[obs]
    return _16(output_references)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    For ``'aitchison'`` the ``pseudocount`` is added to every count and a
    local distance function (Euclidean distance of the clr-transformed
    vectors) replaces the metric name.

    Raises
    ------
    ValueError
        If ``metric`` is not in ``non_phylogenetic_metrics()`` or the table
        is empty.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    # Dense samples-by-features matrix.
    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    if metric == 'aitchison':
        counts += pseudocount
        metric = aitchison

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return skbio.diversity.beta_diversity(**diversity_kwargs)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Return pairwise non-phylogenetic beta diversity between samples.

    Parameters
    ----------
    table : biom.Table
        Non-empty feature table.
    metric : str
        One of ``non_phylogenetic_metrics()``.
    pseudocount : int
        Added to all counts, but only when ``metric`` is ``'aitchison'``.
    n_jobs : int
        Forwarded to the pairwise distance function.

    Raises
    ------
    ValueError
        Unknown metric or empty table.
    """
    supported = non_phylogenetic_metrics()
    if not (metric in supported):
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        # Euclidean distance in clr space.
        return euclidean(clr(x), clr(y))

    use_aitchison = metric == 'aitchison'
    if use_aitchison:
        counts += pseudocount
    distance = aitchison if use_aitchison else metric

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    sample_ids = table.ids(axis='sample')
    return skbio.diversity.beta_diversity(
        metric=distance, counts=counts, ids=sample_ids, validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances, n_jobs=n_jobs)
def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):
    """Split a table into features present in / absent from a tree.

    Parameters
    ----------
    table : biom.Table
        Feature table; it is not modified (both filters use
        ``inplace=False``).
    tree : NewickFormat
        Insertion tree; its named tips are compared with the table's
        observation IDs as strings.

    Returns
    -------
    (biom.Table, biom.Table)
        The table restricted to features that are tips of the tree, and the
        table restricted to the remaining features.

    Raises
    ------
    ValueError
        If no feature of the table is a tip of the tree.
    """
    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # collect all tips=inserted fragments+reference taxa names
    fragments_tree = {
        str(tip.name) for tip in tree.tips()
        if tip.name is not None}

    # collect all fragments/features from table
    fragments_table = set(map(str, table.ids(axis='observation')))

    shared_fragments = fragments_table & fragments_tree
    if not shared_fragments:
        raise ValueError(('Not a single fragment of your table is part of your'
                          ' tree. The resulting table would be empty.'))

    tbl_positive = table.filter(shared_fragments,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # The per-sample kept/removed summary that used to be computed here was
    # never printed or returned, so it has been dropped.
    return (tbl_positive, tbl_negative)
def generate_class_weights(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        samples: biom.Table, taxonomy_classification: DataFrame,
        unobserved_weight: float = 1e-6, normalise: bool = False,
        allow_weight_outside_reference: bool = False) \
        -> biom.Table:
    """Turn observed feature totals into per-taxon class weights.

    Every taxon of the reference sequences starts at weight 0; each feature's
    total over ``samples`` is added to the taxon it is classified as. The
    weights are scaled to sum to 1, mixed with a uniform
    ``unobserved_weight`` share, and rescaled.

    Note that ``normalise=True`` calls ``samples.norm()`` on the table that
    was passed in.

    Returns
    -------
    biom.Table
        One observation per reference taxon and a single sample ``'Weight'``.

    Raises
    ------
    ValueError
        If a feature has no entry in ``taxonomy_classification`` or, unless
        ``allow_weight_outside_reference`` is set, a classified taxon is not
        among the reference taxa.
    """
    weights = {}
    for seq in reference_sequences:
        weights[reference_taxonomy[seq.metadata['id']]] = 0.

    if normalise:
        samples.norm()

    tax_map = taxonomy_classification['Taxon']
    try:
        taxa = [tax_map[s] for s in samples.ids(axis='observation')]
    except KeyError as s:
        raise ValueError(str(s) + ' not in taxonomy_classification')

    if not (allow_weight_outside_reference or set(taxa).issubset(weights)):
        raise ValueError(
            'taxonomy_classification does not match reference_taxonomy')

    feature_totals = samples.sum('observation')
    for taxon, count in zip(taxa, feature_totals):
        if taxon in weights:
            weights[taxon] += count

    taxa, weights = zip(*weights.items())
    weights = array(weights)
    weights /= weights.sum()
    uniform_share = unobserved_weight / len(weights)
    weights = (1. - unobserved_weight) * weights + uniform_share
    weights /= weights.sum()

    # Column vector: one row per taxon.
    return biom.Table(weights[:, None], taxa, sample_ids=['Weight'])
def import_shogun_biom(f, annotation_table=None, annotation_type=None,
                       names_to_taxonomy=False):
    """Read a tab-separated table into a biom ``Table``.

    Parameters
    ----------
    f : path or file-like
        Tab-separated table; the first column holds the observation IDs and
        the header row the sample IDs.
    annotation_table : optional
        When given, it is parsed with the function selected by
        ``annotation_type`` and added as observation metadata.
    annotation_type : str, optional
        ``'module'``, ``'pathway'`` or ``'enzyme'``.
    names_to_taxonomy : bool
        When true, each observation ID is split on ``';'`` and stored as its
        ``'taxonomy'`` metadata.

    Returns
    -------
    Table
    """
    import_funcs = {
        'module': shogun_parse_module_table,
        'pathway': shogun_parse_pathway_table,
        'enzyme': shogun_parse_enzyme_table
    }

    table = pd.read_csv(f, sep='\t', index_col=0)
    observation_ids = [str(name) for name in table.index]
    sample_ids = [str(name) for name in table.columns]
    bt = Table(table.values,
               observation_ids=observation_ids,
               sample_ids=sample_ids)

    if names_to_taxonomy:
        taxonomy_md = {}
        for obs_id in bt.ids(axis='observation'):
            taxonomy_md[obs_id] = {'taxonomy': obs_id.split(';')}
        bt.add_metadata(taxonomy_md, axis='observation')

    if annotation_table is not None:
        parse_annotations = import_funcs[annotation_type]
        bt.add_metadata(parse_annotations(annotation_table),
                        axis='observation')

    return bt
def _map_observations(table: biom.Table) -> biom.Table:
    """Return a copy of ``table`` whose observation IDs have every
    underscore replaced by a space."""
    id_map = {
        obs_id: obs_id.replace('_', ' ')
        for obs_id in table.ids('observation')
    }
    return table.update_ids(id_map=id_map, axis='observation',
                            inplace=False)
def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Filter table to remove feature ids that are not tip ids in tree

    The table is filtered in place and also returned.
    """
    tip_ids = {tip.name for tip in tree.tips()}
    feature_ids = set(table.ids(axis='observation'))

    # Only ids that are actually in the table can be kept.
    table.filter(tip_ids & feature_ids, axis='observation', inplace=True)
    return table
def test_collapse_full(self):
    """collapse_full yields the expected single 'average' column and keeps
    the observation IDs and metadata."""
    obs = collapse_full(table)
    exp = Table(array([[0.00769230769231],
                       [0.0282051282051],
                       [0.0487179487179],
                       [0.0692307692308],
                       [0.0897435897436],
                       [0.110256410256],
                       [0.130769230769],
                       [0.151282051282],
                       [0.171794871795],
                       [0.192307692308]]),
                observ_ids, ['average'],
                observation_metadata=observ_metadata)

    for r in range(10):
        assert_almost_equal(obs[r, 0], exp[r, 0])

    # Compare as lists so the assertion does not depend on the truth value
    # of an array comparison.
    self.assertEqual(list(obs.ids()), list(exp.ids()))
    # assertItemsEqual only exists in Python 2's unittest; assertCountEqual
    # is the Python 3 name for the same check.
    self.assertCountEqual(obs.ids('observation'), exp.ids('observation'))

    obs_meta = [m for _, _, m in obs.iter(axis='observation')]
    self.assertCountEqual(obs_meta, observ_metadata)
def pad_features_in_test_data(train_table: biom.Table,
                              test_table: biom.Table) -> biom.Table:
    '''
    Do feature alignment on train and test tables by adding zero-padding
    features that only existed in the train table into test table.

    Parameters
    ----------
    train_table: biom.Table
        A biom table with train data
    test_table: biom.Table
        A biom table with test data. Note that features absent from the
        train table are filtered out of this table in place.

    Returns
    -------
    new_test_biom: biom.Table
        A biom table with the updated test data with identical set of
        features in the train table.

    Raises
    ------
    ValueError
        If the train and test tables share no feature.
    '''
    train_feature_ids = train_table.ids(axis='observation')
    test_feature_ids = test_table.ids(axis='observation')
    sample_ids = test_table.ids(axis='sample')
    # biom.Table.shape is (n_observations, n_samples); the sample count was
    # previously read from shape[0], i.e. the number of features.
    n_samples = len(sample_ids)

    test_features = set(test_feature_ids)
    shared_f = set(train_feature_ids) & test_features
    if not shared_f:
        # Checked before the test table is modified.
        raise ValueError('No feature overlap between train and test table!'
                         'Check the feature-format consistentcy between tables!')

    # features that exist only in the train table, in train-table order
    train_uniq_f = [f for f in train_feature_ids if f not in test_features]

    # create a zero matrix for all features uniquely existed in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)

    # filter out features that don't exist in the train table in the test table
    test_table.filter(shared_f, axis='observation')

    # merge the two tables
    new_test_table = test_table.merge(padding_table)
    return new_test_table
def plot(output_dir, table: biom.Table, metadata: q2.Metadata,
         case_where: str, control_where: str,
         feature_tree: skbio.TreeNode = None):
    """Write the visualization assets and data files into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Destination directory; the static assets are copied into it and a
        ``data`` sub-directory is created.
    table : biom.Table
        Feature table. It is filtered in place to the case and control
        samples and emptied of all-zero observations.
    metadata : q2.Metadata
        Sample metadata used to select the case and control samples.
    case_where, control_where : str
        Queries passed to ``metadata.get_ids`` to select each group.
    feature_tree : skbio.TreeNode, optional
        Tree used to order the features; when omitted an empty ``TreeNode``
        is used.
    """
    # NOTE(review): this looks like a leftover debug dump to a fixed path.
    # It is kept, but guarded, because it used to crash with AttributeError
    # whenever feature_tree was left at its default of None.
    if feature_tree is not None:
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    # Entries with zero children are the tips.
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
def beta(table: biom.Table, metric: str) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    Raises ``ValueError`` if ``metric`` is not in
    ``non_phylogenetic_metrics()``.
    """
    known_metrics = non_phylogenetic_metrics()
    if metric not in known_metrics:
        raise ValueError("Unknown metric: %s" % metric)

    # Dense integer matrix with one row per sample.
    dense = table.matrix_data.toarray()
    counts = dense.astype(int).T

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'))
def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
    """Compute ``metric`` per sample with scikit-bio and name the resulting
    series after the metric."""
    # Note: some metrics require ints, but biom.Table seems to default to
    # float (e.g. ace, lladser_pe, michaelis_menten_fit), so the sparse
    # matrix is cast to int before it is densified.
    int_matrix = table.matrix_data.astype(int)
    counts = int_matrix.toarray().T

    result = skbio.diversity.alpha_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'))
    result.name = metric
    return result
def jaccard(table: biom.Table, n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute the Jaccard distance between all pairs of samples.

    ``n_jobs`` is forwarded to ``sklearn.metrics.pairwise_distances``.
    """
    diversity_kwargs = dict(
        metric='jaccard',
        counts=table.matrix_data.toarray().T,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return skbio.diversity.beta_diversity(**diversity_kwargs)
def multinomial(table: biom.Table,
                metadata: Metadata,
                formula: str,
                training_column: str = None,
                num_random_test_examples: int = 10,
                epoch: int = 10,
                batch_size: int = 5,
                beta_prior: float = 1,
                learning_rate: float = 0.1,
                clipnorm: float = 10,
                min_sample_count: int = 10,
                min_feature_count: int = 10,
                summary_interval: int = 60) -> (pd.DataFrame):
    """Fit a ``MultRegression`` model and return its coefficients.

    The table and metadata are matched and filtered with
    ``match_and_filter``, split with ``split_training``, and the model is
    fitted inside a fresh TensorFlow graph and session.

    Returns
    -------
    pd.DataFrame
        Indexed by the table's observation IDs, with one column per column
        of the design matrix. The values are ``clr(clr_inv(...))`` of the
        fitted ``model.B`` with a column of zeros prepended.
    """
    # load metadata and tables
    metadata = metadata.to_dataframe()

    # match them
    table, metadata, design = match_and_filter(
        table, metadata, formula,
        training_column, num_random_test_examples,
        min_sample_count, min_feature_count)

    # convert to dense representation (samples as rows)
    dense_table = table.to_dataframe().to_dense().T

    # split up training and testing
    trainX, testX, trainY, testY = split_training(
        dense_table, metadata, design,
        training_column, num_random_test_examples)

    model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
                           beta_mean=beta_prior,
                           batch_size=batch_size,
                           save_path=None)

    with tf.Graph().as_default(), tf.Session() as session:
        model(session, trainX, trainY, testX, testY)
        model.fit(epoch=epoch,
                  summary_interval=summary_interval,
                  checkpoint_interval=None)

        md_ids = np.array(design.columns)
        obs_ids = table.ids(axis='observation')

        zero_column = np.zeros((model.p, 1))
        coefficients = clr(clr_inv(np.hstack((zero_column, model.B))))
        beta_ = pd.DataFrame(coefficients.T, columns=md_ids, index=obs_ids)

    return beta_
def reorder_feature_table(query_table: biom.Table,
                          reference_table: biom.Table) -> biom.Table:
    """Give the query table the reference table's features, in its order.

    The two tables are merged, the result is restricted to the reference
    table's features and to the query table's samples, and the features are
    sorted in the order of the reference table.

    Raises
    ------
    ValueError
        If a sample ID occurs in both tables.
    """
    query_samples = query_table.ids()
    ref_samples = set(reference_table.ids())
    ref_features = reference_table.ids('observation')

    def _check_disjoint(sample_ids):
        for sample in sample_ids:
            if sample in ref_samples:
                # One formatted message; passing several positional
                # arguments made the error render as a tuple.
                raise ValueError(
                    "The sample %s from your reference data found in your "
                    "query one, while the two tables should be disjoint."
                    % sample)

    _check_disjoint(query_samples)

    merged_table = reference_table.merge(query_table)
    # filter() and sort_order() default to axis='sample', so the feature
    # axis must be named explicitly when passing observation IDs.
    merged_table.filter(ids_to_keep=ref_features, axis='observation')
    merged_table.filter(ids_to_keep=query_samples, axis='sample')

    _check_disjoint(merged_table.ids())

    return merged_table.sort_order(ref_features, axis='observation')
def alpha(table: biom.Table, metric: str) -> pd.Series:
    """Compute a non-phylogenetic alpha diversity metric per sample.

    Returns a series named after ``metric``. Raises ``ValueError`` if
    ``metric`` is not in ``non_phylogenetic_metrics()``.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    diversity_kwargs = dict(
        metric=metric,
        # Dense integer matrix with one row per sample.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
    )
    result = skbio.diversity.alpha_diversity(**diversity_kwargs)
    result.name = metric
    return result
def shannon_entropy(table: biom.Table,
                    drop_undefined_samples: bool = False) -> pd.Series:
    """Compute the ``'shannon'`` metric per sample.

    When ``drop_undefined_samples`` is true, the counts and IDs are first
    passed through ``_drop_undefined_samples`` with
    ``minimum_nonzero_elements=1``. The returned series is named
    ``'shannon_entropy'``.
    """
    sample_ids = table.ids(axis='sample')
    counts = table.matrix_data.toarray().T

    if drop_undefined_samples:
        counts, sample_ids = _drop_undefined_samples(
            counts, sample_ids, minimum_nonzero_elements=1)

    entropy = skbio.diversity.alpha_diversity(
        metric='shannon', counts=counts, ids=sample_ids)
    entropy.name = 'shannon_entropy'
    return entropy
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str) -> skbio.DistanceMatrix:
    """Compute a phylogenetic beta diversity distance matrix.

    Raises ``ValueError`` for a metric outside ``phylogenetic_metrics()``.
    A ``MissingNodeError`` from scikit-bio is re-raised with
    'otu_ids'/'tree' replaced by 'feature_ids'/'phylogeny' in its message.
    """
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    diversity_kwargs = dict(
        metric=metric,
        # Dense integer matrix with one row per sample.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        otu_ids=table.ids(axis='observation'),
        tree=phylogeny,
    )

    try:
        return skbio.diversity.beta_diversity(**diversity_kwargs)
    except skbio.tree.MissingNodeError as e:
        reworded = str(e).replace('otu_ids', 'feature_ids')
        reworded = reworded.replace('tree', 'phylogeny')
        raise skbio.tree.MissingNodeError(reworded)
def alpha_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                       metric: str) -> pd.Series:
    """Compute a phylogenetic alpha diversity metric per sample.

    Returns a series named after ``metric``. Raises ``ValueError`` for a
    metric outside ``phylogenetic_metrics()``; a ``MissingNodeError`` from
    scikit-bio is re-raised with 'otu_ids'/'tree' replaced by
    'feature_ids'/'phylogeny' in its message.
    """
    supported = phylogenetic_metrics()
    if metric not in supported:
        raise ValueError("Unknown phylogenetic metric: %s" % metric)

    dense = table.matrix_data.toarray()
    counts = dense.astype(int).T
    sample_ids = table.ids(axis='sample')
    feature_ids = table.ids(axis='observation')

    try:
        result = skbio.diversity.alpha_diversity(
            metric=metric, counts=counts, ids=sample_ids,
            otu_ids=feature_ids, tree=phylogeny)
    except skbio.tree.MissingNodeError as e:
        message = str(e)
        for old, new in (('otu_ids', 'feature_ids'), ('tree', 'phylogeny')):
            message = message.replace(old, new)
        raise skbio.tree.MissingNodeError(message)

    result.name = metric
    return result
def beta(table: biom.Table, metric: str,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``n_jobs`` is forwarded to ``sklearn.metrics.pairwise_distances``.

    Raises
    ------
    ValueError
        If ``metric`` is not in ``non_phylogenetic_metrics()`` or the table
        is empty.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    diversity_kwargs = dict(
        metric=metric,
        # Dense integer matrix with one row per sample.
        counts=table.matrix_data.toarray().astype(int).T,
        ids=table.ids(axis='sample'),
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs,
    )
    return skbio.diversity.beta_diversity(**diversity_kwargs)
def beta(table: biom.Table, metric: str, pseudocount: int = 1,
         n_jobs: int = 1) -> skbio.DistanceMatrix:
    """Compute a non-phylogenetic beta diversity distance matrix.

    ``'aitchison'`` and ``'canberra_adkins'`` are computed with the local
    functions defined below; every other metric name is passed to scikit-bio
    unchanged. ``pseudocount`` is added to the counts only for
    ``'aitchison'``.

    Raises
    ------
    ValueError
        If ``metric`` is not in ``non_phylogenetic_metrics()``, the table is
        empty, or Canberra-Adkins meets a negative value.
    """
    if metric not in non_phylogenetic_metrics():
        raise ValueError("Unknown metric: %s" % metric)

    counts = table.matrix_data.toarray().T

    def aitchison(x, y, **kwds):
        return euclidean(clr(x), clr(y))

    def canberra_adkins(x, y, **kwds):
        if (x < 0).any() or (y < 0).any():
            raise ValueError("Canberra-Adkins is only defined over positive "
                             "values.")

        # Only positions where at least one vector is non-zero contribute.
        nz = ((x > 0) | (y > 0))
        x_ = x[nz]
        y_ = y[nz]
        nnz = nz.sum()

        return (1. / nnz) * np.sum(np.abs(x_ - y_) / (x_ + y_))

    local_metrics = {
        'aitchison': aitchison,
        'canberra_adkins': canberra_adkins,
    }
    if metric == 'aitchison':
        counts += pseudocount
    metric = local_metrics.get(metric, metric)

    if table.is_empty():
        raise ValueError("The provided table object is empty")

    return skbio.diversity.beta_diversity(
        metric=metric,
        counts=counts,
        ids=table.ids(axis='sample'),
        validate=True,
        pairwise_func=sklearn.metrics.pairwise_distances,
        n_jobs=n_jobs)
def filter_seqs(data: pd.Series, table: biom.Table = None,
                metadata: qiime2.Metadata = None, where: str = None,
                exclude_ids: bool = False) -> pd.Series:
    """Filter sequences by the IDs of a table or of metadata.

    Exactly one of ``table`` and ``metadata`` must be given. With
    ``exclude_ids=True`` the selected IDs are removed instead of kept.

    Raises
    ------
    ValueError
        If both or neither of ``table`` and ``metadata`` are given, or if
        nothing is left after filtering.
    """
    have_table = table is not None
    have_metadata = metadata is not None

    if have_table and have_metadata:
        raise ValueError('Filtering with metadata and filtering with a table '
                         'are mutually exclusive.')
    if not (have_table or have_metadata):
        raise ValueError('No filtering requested. Must provide either table '
                         'or metadata.')

    if have_table:
        ids_to_keep = table.ids(axis='observation')
    else:
        # Note, no need to check for missing feature IDs in the metadata,
        # because that is basically the point of this method.
        ids_to_keep = metadata.get_ids(where=where)

    if exclude_ids is True:
        ids_to_keep = set(data.index) - set(ids_to_keep)

    keep_mask = data.index.isin(ids_to_keep)
    filtered = data[keep_mask]
    if filtered.empty is True:
        raise ValueError('All features were filtered out of the data.')
    return filtered
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:
    """Write the alpha rarefaction visualization into ``output_dir``.

    For every metric a ``<metric>.csv`` file and one or more ``.jsonp``
    summary files are written, then the index template is rendered and the
    ``dist`` assets are copied.

    Parameters
    ----------
    output_dir : str
        Directory that receives all output files.
    table : biom.Table
        Non-empty feature table.
    max_depth : int
        Must be greater than ``min_depth`` and not exceed the largest
        per-sample total of the table.
    phylogeny : skbio.TreeNode, optional
        Required when a phylogenetic metric is requested; when ``metrics``
        is None it also adds 'faith_pd' to the defaults.
    metrics : set, optional
        Metrics to compute; defaults to {'observed_otus', 'shannon'}.
    metadata : qiime2.Metadata, optional
        When given it must cover every sample of the table; summaries are
        then produced per metadata column instead of per sample.
    min_depth, steps, iterations : int
        Forwarded to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        For an empty ``metrics`` set, a phylogenetic metric without a
        phylogeny, inconsistent depths/steps, an empty table, or samples
        missing from the metadata.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    # `categories` and `empty_columns` keep these defaults when no metadata
    # is given; otherwise the values from the last loop iteration are used
    # for rendering below.
    filenames, categories, empty_columns = [], [], []
    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)
    # NOTE: the loop variable rebinds `data` from the per-metric mapping to
    # the current metric's value; the items() iterator was created before
    # the first rebinding, so the iteration itself is unaffected.
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            # Without metadata there is a single summary per metric.
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            # Remember which columns are entirely empty for these samples.
            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

            # Two-level column labels so the frame can be joined with `data`.
            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            # One summary file per metric and metadata column.
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            # Flatten the (depth, iteration) column labels for the CSV.
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def beta_rarefaction(output_dir: str, table: biom.Table, metric: str,
                     clustering_method: str, metadata: qiime2.Metadata,
                     sampling_depth: int, iterations: int = 10,
                     phylogeny: skbio.TreeNode = None,
                     correlation_method: str = 'spearman',
                     color_scheme: str = 'BrBG') -> None:
    """Write a beta rarefaction visualization into ``output_dir``.

    The table is rarefied repeatedly through ``_get_multiple_rarefaction``;
    the first resulting distance matrix is used as the primary result and
    the remaining ones as support for the clustering and the Emperor plot.

    Files written under ``output_dir``: ``heatmap.svg``,
    ``rarefaction-iteration-correlation.tsv``,
    ``sample-clustering-<clustering_method>.tre``, an ``emperor`` directory
    with its ``index.html``, and the rendered ``beta_rarefaction_assets``
    templates.

    Raises
    ------
    ValueError
        If ``metric`` is a phylogenetic metric and ``phylogeny`` is None, or
        if ``table`` is empty.
    """
    uses_phylogeny = metric in phylogenetic_metrics()
    if uses_phylogeny and phylogeny is None:
        raise ValueError("A phylogenetic metric (%s) was requested, "
                         "but a phylogenetic tree was not provided. "
                         "Phylogeny must be provided when using a "
                         "phylogenetic diversity metric." % metric)
    if uses_phylogeny:
        beta_func = functools.partial(beta_phylogenetic, phylogeny=phylogeny)
    else:
        beta_func = beta

    if table.is_empty():
        raise ValueError("Input feature table is empty.")

    # Restrict the metadata to the samples of the feature table; this also
    # ensures every feature table sample ID is present in the metadata.
    sample_ids = table.ids(axis='sample')
    metadata = metadata.filter_ids(sample_ids)

    matrices = _get_multiple_rarefaction(
        beta_func, metric, iterations, table, sampling_depth)
    primary, support = matrices[0], matrices[1:]

    # Correlation heatmap between rarefaction iterations, plus its raw data.
    figure, correlations = _make_heatmap(
        matrices, metric, correlation_method, color_scheme)
    heatmap_fp = os.path.join(output_dir, 'heatmap.svg')
    figure.savefig(heatmap_fp)
    correlations_fp = os.path.join(
        output_dir, 'rarefaction-iteration-correlation.tsv')
    correlations.to_csv(correlations_fp, sep='\t')

    # Sample clustering tree.
    tree = _cluster_samples(primary, support, clustering_method)
    tree_fp = os.path.join(
        output_dir, 'sample-clustering-%s.tre' % clustering_method)
    tree.write(tree_fp)

    # Jackknifed Emperor plot.
    emperor = _jackknifed_emperor(primary, support, metadata)
    emperor_dir = os.path.join(output_dir, 'emperor')
    emperor.copy_support_files(emperor_dir)
    with open(os.path.join(emperor_dir, 'index.html'), 'w') as fh:
        fh.write(emperor.make_emperor(standalone=True))

    pages = ['index.html', 'heatmap.html', 'tree.html', 'emperor.html']
    templates = [
        os.path.join(TEMPLATES, 'beta_rarefaction_assets', page)
        for page in pages
    ]
    tabs = [
        {'url': 'emperor.html', 'title': 'PCoA'},
        {'url': 'heatmap.html', 'title': 'Heatmap'},
        {'url': 'tree.html', 'title': 'Clustering'},
    ]
    context = {
        'metric': metric,
        'clustering_method': clustering_method,
        'tabs': tabs,
    }
    q2templates.render(templates, output_dir, context=context)
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:
    """Write an alpha rarefaction visualization into ``output_dir``.

    For every requested metric a ``<metric>.csv`` file is written, together
    with one ``.jsonp`` summary (a single one when ``metadata`` is None,
    otherwise one per retained metadata column). Finally the
    ``alpha_rarefaction_assets`` index template is rendered and its ``dist``
    directory is copied next to it.

    Parameters
    ----------
    output_dir : str
        Directory that receives all generated files.
    table : biom.Table
        Feature table to rarefy.
    max_depth : int
        Largest rarefaction depth; must be greater than ``min_depth`` and
        not exceed the largest per-sample total frequency of ``table``.
    phylogeny : skbio.TreeNode, optional
        Required when a phylogenetic metric is requested. When ``metrics``
        is None and a phylogeny is given, ``faith_pd`` is added to the
        default metrics.
    metrics : set, optional
        Metrics to compute. Defaults to ``{'observed_otus', 'shannon'}``.
    metadata : qiime2.Metadata, optional
        Sample metadata; only categorical columns that are not entirely
        missing are kept.
    min_depth : int
        Smallest rarefaction depth.
    steps : int
        Number of rarefaction steps; may not exceed
        ``max_depth - min_depth``.
    iterations : int
        Forwarded to ``_compute_rarefaction_data``.

    Raises
    ------
    ValueError
        If ``metrics`` is an empty set, a phylogenetic metric is requested
        without ``phylogeny``, the depth/steps arguments are inconsistent,
        ``table`` is empty, ``max_depth`` exceeds the largest sample
        frequency, or no metadata is left after column filtering.
    """
    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    # Flat-column metadata joined onto every exported CSV; stays None when
    # no metadata was provided.
    csv_metadata_df = None
    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")

        # The filtered metadata does not change per metric, so build the
        # CSV-side frame once instead of once per loop iteration.
        csv_metadata_df = metadata.to_dataframe()

        # Two-level column labels so the frame can be joined with the
        # rarefaction data, whose column labels are tuples.
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    rarefaction_data = _compute_rarefaction_data(
        table, min_depth, max_depth, steps, iterations, phylogeny, metrics)

    filenames = []
    for metric, metric_df in rarefaction_data.items():
        metric_name = quote(metric)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(metric_df, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = metric_df.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(
                    column, columns, merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        # Flatten the tuple column labels for the CSV export. This is done
        # only after the summaries above, which use the tuple labels.
        metric_df.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                             for t in metric_df.columns.values]
        csv_df = metric_df
        if metadata is not None:
            csv_df = metric_df.join(csv_metadata_df, how='left')
        # Prepare the frame before opening the file so a failure above does
        # not leave an empty CSV behind.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            csv_df.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    The per-artifact BIOM tables of the analysis are filtered to the samples
    selected for the analysis and merged into one table. That table is
    written to ``biom_table.biom`` in a per-artifact directory under the
    'BIOM' mountpoint, and rows are inserted into ``qiita.artifact``,
    ``qiita.analysis_artifact``, ``qiita.filepath`` and
    ``qiita.artifact_filepath``.

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information. The keys read here are
        'analysis_id' and 'timestamp'.
    biom_data : dict
        Dictionary with the biom file information. The key read here is
        'data_type_id'.
    rarefied_table : biom.Table
        The rarefied BIOM table. Only its ids are read, to decide whether the
        sample ids of the merged table need to be renamed.

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is on the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id) FROM qiita.analysis_sample JOIN qiita.artifact USING (artifact_id) WHERE analysis_id = %s AND data_type_id = %s GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        # Maps each original sample id to "<artifact id>.<sample id>"
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            # NOTE(review): there is no break, so if an artifact lists more
            # than one 'biom' filepath the last one is used.
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp

            # Note that we are sure that the biom table exists for sure, so
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            # Keep only the selected samples that are present in this table
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid) for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact (generated_timestamp, data_type_id, visibility_id, artifact_type_id, submitted_to_vamps) VALUES (%s, %s, %s, %s, %s) RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact (analysis_id, artifact_id) VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        # The file lives in a directory named after the new artifact id
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES (%s, %s, %s, %s, %s) RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7, compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()

        sql = """INSERT INTO qiita.artifact_filepath (artifact_id, filepath_id) VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
def _table_to_dataframe(table: biom.Table) -> pd.DataFrame:
    """Return ``table`` as a dense DataFrame.

    The table's ``matrix_data`` is densified and transposed, so the result
    is indexed by sample IDs and has one column per observation ID.
    """
    return pd.DataFrame(
        table.matrix_data.toarray().T,
        index=table.ids(axis='sample'),
        columns=table.ids(axis='observation'),
    )