def import_shogun_biom(f, annotation_table=None, annotation_type=None,
                       names_to_taxonomy=False):
    """Read a tab-separated SHOGUN table into a BIOM ``Table``.

    Parameters
    ----------
    f : str or file-like
        Source handed to ``pandas.read_csv``. The first column supplies the
        observation IDs and the header row supplies the sample IDs; both are
        converted to ``str``.
    annotation_table : optional
        When not ``None``, it is handed to the parser selected by
        `annotation_type` and the result is attached as observation metadata.
    annotation_type : str, optional
        One of ``'module'``, ``'pathway'`` or ``'enzyme'``. Only consulted
        when `annotation_table` is given; any other value raises ``KeyError``.
    names_to_taxonomy : bool, optional
        When true, every observation ID is split on ``';'`` and the pieces
        are stored under the ``'taxonomy'`` metadata key.

    Returns
    -------
    Table
        The table built from `f`, with any requested metadata attached.
    """
    parsers = {
        'module': shogun_parse_module_table,
        'pathway': shogun_parse_pathway_table,
        'enzyme': shogun_parse_enzyme_table,
    }

    frame = pd.read_csv(f, sep='\t', index_col=0)
    obs_ids = [str(label) for label in frame.index]
    samp_ids = [str(label) for label in frame.columns]
    bt = Table(frame.values, observation_ids=obs_ids, sample_ids=samp_ids)

    if names_to_taxonomy:
        taxonomy_md = {}
        for obs_id in bt.ids(axis='observation'):
            taxonomy_md[obs_id] = {'taxonomy': obs_id.split(';')}
        bt.add_metadata(taxonomy_md, axis='observation')

    if annotation_table is not None:
        parse = parsers[annotation_type]
        bt.add_metadata(parse(annotation_table), axis='observation')

    return bt
def rm_sparse_obs(self, prevalence_thres: float = 0.05, abundance_thres: float = 0.01) -> "Otu":
    """
    Merge low-prevalence / low-abundance observations into a single row

    An observation is kept only if it passes both tests:

    1. after truncating its counts to integers, it is non-zero in at least
       `prevalence_thres` of the samples, and
    2. its relative abundance exceeds `abundance_thres` in at least one
       sample (relative abundance is computed among the observations that
       passed the first test).

    All remaining observations are summed per sample into one row named
    ``"otu_merged"``. That row is always appended, even when nothing was
    merged into it.

    Parameters
    ----------
    prevalence_thres : float
        Minimum fraction of samples the observation must be present in
    abundance_thres : float
        Relative abundance the observation must exceed in at least one sample

    Returns
    -------
    Otu
        New Otu instance holding the kept observations plus ``"otu_merged"``
    """

    def is_prevalent(values, *_):
        present = values.astype(int).astype(bool)
        return present.mean() >= prevalence_thres

    prevalent = self.otu_data.filter(is_prevalent, axis="observation", inplace=False)
    counts = prevalent.to_dataframe()

    # Sparse-backed frames have to be densified before the comparison below.
    has_sparse_cols = counts.apply(pd.api.types.is_sparse).any()
    rel_abund = counts / counts.sum(axis=0)
    if has_sparse_cols:
        rel_abund = rel_abund.sparse.to_dense()

    passes_abundance = (rel_abund > abundance_thres).any(axis=1)
    keep_ids = rel_abund.index[passes_abundance]
    kept = self.otu_data.filter(keep_ids, axis="observation", inplace=False)

    # Everything not kept -- whichever of the two tests it failed -- is merged.
    merge_ids = set(self.otu_data.ids("observation")) - set(keep_ids)
    to_merge = self.otu_data.filter(merge_ids, axis="observation", inplace=False)
    merged_row = Table(
        to_merge.sum(axis="sample"),
        ["otu_merged"],
        self.otu_data.ids(axis="sample"),
    )

    # The merged row reuses the first observation's metadata as a template,
    # with the lineage entries overridden by an "Unclassified" lineage.
    tax_level = self.tax_level
    template_md = dict(self.otu_data.metadata(axis="observation")[0])
    merged_md = {**template_md, **Lineage("Unclassified").to_dict(tax_level)}
    merged_row.add_metadata({"otu_merged": merged_md}, axis="observation")

    return Otu(kept.concat([merged_row], axis="observation"))
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse the features of a BIOM table through a many-to-many mapping.

    Parameters
    ----------
    table : biom.Table
        Table to collapse. It is not modified; a filtered copy is used.
    mapping : dict of list of str
        Source-to-target(s) mapping. Features absent from it are dropped.
    normalize : bool, optional
        If true, a source's counts are divided among its targets and the
        result is rounded with ``round_biom``; otherwise the full count is
        added to every target.

    Returns
    -------
    biom.Table
        Collapsed BIOM table, or the empty filtered table if no feature of
        `table` occurs in `mapping`.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    def in_mapping(data, id_, md):
        return id_ in mapping

    # keep only the features that can be mapped
    table = table.filter(in_mapping, axis='observation', inplace=False)
    if table.is_empty():
        return table

    # hand the targets to `collapse` through the observation metadata
    part_md = {}
    for source, targets in mapping.items():
        part_md[source] = dict(part=targets)
    table.add_metadata(part_md, axis='observation')

    def pair_targets(id_, md):
        # one-to-many collapsing consumes (pathway, partition) pairs; the
        # target name is used for both
        targets = md['part']
        return zip(targets, targets)

    mode = 'divide' if normalize else 'add'
    table = table.collapse(pair_targets, norm=False, one_to_many=True,
                           axis='observation', one_to_many_mode=mode)

    # dividing may produce fractions; round them back to integers
    if normalize:
        round_biom(table)

    # drop the metadata field left behind by the collapse
    table.del_metadata(keys=['Path'])
    return table
def rename_deblur_biom(biom, name_stub='deblur', metadata_name='deblurred_seq'):
    """Give observations sequential names, keeping the old IDs as metadata.

    Parameters
    ----------
    biom : Table
        Table whose observation IDs are to be replaced (for deblur output
        these are presumably the sequences themselves). Not modified.
    name_stub : str, optional
        Prefix of the new IDs; observation ``i`` becomes ``name_stub + str(i)``.
    metadata_name : str, optional
        Observation-metadata key under which the original ID is stored.

    Returns
    -------
    Table
        New table with the same data, sample IDs and existing metadata, the
        renamed observations, and the original IDs added as metadata. Its
        ``table_id`` is the input's with ``' renamed'`` appended, or ``None``
        when the input has no ``table_id``.
    """
    seqs = biom.ids(axis='observation')
    seqnames = ['{0}{1}'.format(name_stub, i) for i, _ in enumerate(seqs)]
    seq_metadata = {seqname: {metadata_name: seq}
                    for seq, seqname in zip(seqs, seqnames)}

    # `table_id` is optional on a Table and defaults to None; appending a
    # suffix to None would raise TypeError, so id-less tables stay id-less.
    table_id = biom.table_id
    if table_id is not None:
        table_id = table_id + ' renamed'

    renamed_biom = Table(biom.matrix_data,
                         seqnames,
                         biom.ids(axis='sample'),
                         biom.metadata(axis='observation'),
                         biom.metadata(axis='sample'),
                         table_id=table_id)
    renamed_biom.add_metadata(seq_metadata, axis='observation')
    return renamed_biom
def biom_add_metacol(table: biom.Table, dic, name, missing=''):
    """Attach one observation-metadata column to a table, in place.

    Parameters
    ----------
    table : biom.Table
        Table to receive the metadata column.
    dic : dict
        Metadata column (feature-to-value mapping).
    name : str
        Metadata column name.
    missing : any type, optional
        Value stored for features that are not keys of `dic`.
    """
    column = {}
    for feature in table.ids('observation'):
        column[feature] = {name: dic.get(feature, missing)}
    table.add_metadata(column, axis='observation')
def trim_dada2_posthoc(
        table: biom.Table,
        representative_sequences: pd.Series,
        trim_length: int = 0,
        hashed_feature_ids: bool = True) -> (biom.Table, pd.Series):
    """
    Trims ASVs to a common length and merges the ones that become identical

    Parameters
    ----------
    table : biom.Table
        The feature table. NOTE: it is filtered and annotated in place.
    representative_sequences : pd.Series
        The sequences which correspond to the ASV table, indexed by
        feature ID
    trim_length : int
        The length to trim the ASVs to. If the length is 0, the minimum
        sequence length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed. When false, the
        trimmed sequence itself is used as the ID.

    Returns
    -------
    biom.Table
        Table collapsed over the trimmed sequences
    pd.Series
        The unique trimmed sequences, relabelled to match the table

    Warns
    -----
    UserWarning
        If some sequences are shorter than the trim length; those
        sequences are discarded.
    """
    def _length(seq):
        return len(seq)

    # Decide on the trim length and drop whatever is too short for it
    lengths = representative_sequences.apply(_length)
    if trim_length == 0:
        trim_length = lengths.min()
    if (lengths < trim_length).any():
        warnings.warn(
            "There are ASVs shorter than the trim length. "
            "These sequences will be discarded.",
            UserWarning)

    long_enough = representative_sequences.astype(str)
    long_enough = long_enough.loc[lengths >= trim_length].copy()
    trimmed = long_enough.apply(lambda x: x[:trim_length])
    rep_seqs = pd.DataFrame(data=[trimmed], index=['sequence']).T

    # Collapse the table on the trimmed sequence of every feature
    def has_sequence(v, id_, md):
        return id_ in rep_seqs.index

    table.filter(has_sequence, axis='observation', inplace=True)
    obs_ids = table.ids(axis='observation')
    table.add_metadata(rep_seqs.loc[obs_ids].to_dict(orient='index'),
                       axis='observation')

    def by_sequence(id_, md):
        return md['sequence']

    table2 = table.collapse(by_sequence, norm=False, axis='observation')
    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    # Relabel: the collapsed table is keyed by sequence at this point, the
    # series still by the original feature ID
    if hashed_feature_ids:
        table_relabel = {seq_: _hash_seq(seq_)
                         for seq_ in table2.ids(axis='observation')}
        table2.update_ids(table_relabel, axis='observation', inplace=True)
        seq_relabel = {id_: _hash_seq(seq_) for id_, seq_ in seqs2.items()}
    else:
        seq_relabel = dict(seqs2.items())
    seqs2.rename(seq_relabel, inplace=True)

    return table2, seqs2
def main():
    """Normalize an OTU table by per-OTU copy number and write the result.

    Steps:

    1. Load the OTU table named by ``opts.input_otu_fp``.
    2. Load the copy-number (trait) table from ``opts.input_count_fp`` or,
       if that is not given, from the precalculated file for
       ``opts.gg_version`` inside the PICRUSt data directory.
    3. Keep only the OTUs that appear as samples in the trait table.
    4. Round each OTU's copy number to an integer, store it as observation
       metadata under ``opts.metadata_identifer`` and divide the OTU's
       counts by it.
    5. Copy the observation metadata of the input table over and write the
       normalized table to ``opts.output_otu_fp``.

    Raises
    ------
    ValueError
        If a copy number cannot be converted to a number, or is below 1
        after rounding.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)
    ids_to_load = otu_table.ids(axis='observation')

    if opts.input_count_fp is None:
        #precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = '_'.join(['16S', opts.gg_version, 'precalculated.tab.gz'])
        input_count_table = join(get_picrust_project_dir(), 'picrust', 'data', precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        #one pre-formatted argument prints the same text under the Python 2
        #print statement and the Python 3 print function
        print("Loading trait table:  %s" % input_count_table)

    ext = path.splitext(input_count_table)[1]
    if ext == '.gz':
        count_table_fh = gzip.open(input_count_table, 'rb')
    else:
        count_table_fh = open(input_count_table, 'U')

    #NOTE(review): closing here assumes both loaders have read the whole
    #file by the time they return -- confirm for convert_precalc_to_biom
    try:
        if opts.load_precalc_file_in_biom:
            count_table = load_table(count_table_fh)
        else:
            count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)
    finally:
        count_table_fh.close()

    #Need to only keep data relevant to our otu list
    ids = [str(x[1]) for x in otu_table.iter(axis='observation')]

    ob_id = count_table.ids(axis='observation')[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis='sample'):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis='observation'))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            #data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            #report the OTU ID, which is what the message announces
            raise ValueError(
                "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (x))
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis='observation')

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])
    normalized_table = filtered_otu_table.transform(metadata_norm, axis='observation')

    #move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, 'observation')

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
def test_import_shogun_biom(self):
    """import_shogun_biom handles taxonomy, annotated and empty tables."""
    sample_ids = ['1450', '2563']
    two_rows = [[26, 25], [3, 5]]
    three_rows = [[26, 25], [3, 5], [1, 25]]

    # plain taxonomy table, no metadata requested
    shogun_table = ('#OTU ID\t1450\t2563\n'
                    'k__Archaea\t26\t25\n'
                    'k__Archaea;p__Crenarchaeota\t3\t5\n'
                    'k__Archaea;p__Crenarchaeota;c__Thermoprotei\t1\t25\n')
    tax_ids = ['k__Archaea',
               'k__Archaea;p__Crenarchaeota',
               'k__Archaea;p__Crenarchaeota;c__Thermoprotei']

    exp_biom = Table(np.array(three_rows), tax_ids, sample_ids)
    obs_biom = import_shogun_biom(StringIO(shogun_table))
    self.assertEqual(exp_biom, obs_biom)

    # same table, observation names split into taxonomy metadata
    tax_metadata = {
        'k__Archaea': {
            'taxonomy': ['k__Archaea']},
        'k__Archaea;p__Crenarchaeota': {
            'taxonomy': ['k__Archaea',
                         'p__Crenarchaeota']},
        'k__Archaea;p__Crenarchaeota;c__Thermoprotei': {
            'taxonomy': ['k__Archaea',
                         'p__Crenarchaeota',
                         'c__Thermoprotei']},
    }
    exp_biom_tax = Table(np.array(three_rows), tax_ids, sample_ids)
    exp_biom_tax.add_metadata(tax_metadata, axis='observation')
    obs_biom_tax = import_shogun_biom(StringIO(shogun_table),
                                      names_to_taxonomy=True)
    self.assertEqual(exp_biom_tax, obs_biom_tax)

    # test modules
    module_table = ('#MODULE ID\t1450\t2563\n'
                    'M00017\t26\t25\n'
                    'M00018\t3\t5\n')
    exp_m_biom = Table(np.array(two_rows),
                       ['M00017', 'M00018'],
                       sample_ids)
    exp_m_biom.add_metadata(self.mod_md, axis='observation')
    obs_m_biom = import_shogun_biom(StringIO(module_table),
                                    annotation_table=StringIO(self.modules),
                                    annotation_type='module')
    self.assertEqual(exp_m_biom, obs_m_biom)

    # test pathways
    path_table = ('#PATHWAY ID\t1450\t2563\n'
                  '1.4.1 With NAD+ or NADP+ as acceptor\t26\t25\n'
                  '1.4.3 With oxygen as acceptor\t3\t5\n')
    exp_p_biom = Table(np.array(two_rows),
                       ['1.4.1 With NAD+ or NADP+ as acceptor',
                        '1.4.3 With oxygen as acceptor'],
                       sample_ids)
    exp_p_biom.add_metadata(self.path_md, axis='observation')
    obs_p_biom = import_shogun_biom(StringIO(path_table),
                                    annotation_table=StringIO(self.pathways),
                                    annotation_type='pathway')
    self.assertEqual(exp_p_biom, obs_p_biom)

    # test enzymes
    enzyme_table = ('#KEGG ID\t1450\t2563\n'
                    'K00001\t26\t25\n'
                    'K00002\t3\t5\n'
                    'K00003\t1\t25\n')
    exp_e_biom = Table(np.array(three_rows),
                       ['K00001', 'K00002', 'K00003'],
                       sample_ids)
    exp_e_biom.add_metadata(self.enz_md, axis='observation')
    obs_e_biom = import_shogun_biom(StringIO(enzyme_table),
                                    annotation_table=StringIO(self.enzymes),
                                    annotation_type='enzyme')
    self.assertEqual(exp_e_biom, obs_e_biom)

    # test empty: a header-only file yields a table with no observations
    empty_table = ('#KEGG ID\t1450\t2563\n')
    exp_empty_biom = Table(np.zeros((0, 2)), [], sample_ids)
    obs_empty_biom = import_shogun_biom(StringIO(empty_table),
                                        annotation_table=StringIO(self.enzymes),
                                        annotation_type='enzyme')
    self.assertEqual(exp_empty_biom, obs_empty_biom)
def main():
    """Divide OTU counts by their copy numbers and write the new table.

    The OTU table is read from ``opts.input_otu_fp``. Copy numbers come from
    ``opts.input_count_fp`` or, when that option is unset, from the
    precalculated file for ``opts.gg_version`` shipped in the PICRUSt data
    directory. OTUs that are not samples of the copy-number table are
    dropped. Each remaining OTU's copy number is rounded to an integer,
    recorded as observation metadata under ``opts.metadata_identifer`` and
    used as the divisor for that OTU's counts. The observation metadata of
    the input table is then transferred and the result is written to
    ``opts.output_otu_fp``.

    Raises
    ------
    ValueError
        If a copy number cannot be converted to a number, or is below 1
        after rounding.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table = load_table(opts.input_otu_fp)
    ids_to_load = otu_table.ids(axis="observation")

    if opts.input_count_fp is None:
        # precalc file has specific name (e.g. 16S_13_5_precalculated.tab.gz)
        precalc_file_name = "_".join(["16S", opts.gg_version, "precalculated.tab.gz"])
        input_count_table = join(get_picrust_project_dir(), "picrust", "data", precalc_file_name)
    else:
        input_count_table = opts.input_count_fp

    if opts.verbose:
        # A single pre-formatted argument gives identical output whether
        # `print` is the Python 2 statement or the Python 3 function.
        print("Loading trait table:  %s" % input_count_table)

    ext = path.splitext(input_count_table)[1]
    if ext == ".gz":
        count_table_fh = gzip.open(input_count_table, "rb")
    else:
        count_table_fh = open(input_count_table, "U")

    # The handle used to be left open for the rest of the run.
    # NOTE(review): closing it here assumes both loaders consume the file
    # completely before returning -- confirm for convert_precalc_to_biom.
    try:
        if opts.load_precalc_file_in_biom:
            count_table = load_table(count_table_fh)
        else:
            count_table = convert_precalc_to_biom(count_table_fh, ids_to_load)
    finally:
        count_table_fh.close()

    # Need to only keep data relevant to our otu list
    ids = [str(x[1]) for x in otu_table.iter(axis="observation")]

    ob_id = count_table.ids(axis="observation")[0]

    filtered_otus = []
    filtered_values = []
    for x in ids:
        if count_table.exists(x, axis="sample"):
            filtered_otus.append(x)
            filtered_values.append(otu_table.data(x, axis="observation"))

    filtered_otu_table = Table(filtered_values, filtered_otus, otu_table.ids())

    copy_numbers_filtered = {}
    for x in filtered_otus:
        value = count_table.get_value_by_ids(ob_id, x)
        try:
            # data can be floats so round them and make them integers
            value = int(round(float(value)))
        except ValueError:
            # The message announces an OTU ID, so interpolate the ID (the
            # unparseable value used to be interpolated instead).
            raise ValueError(
                "Invalid type passed as copy number for OTU ID %s. Must be int-able." % (x)
            )
        if value < 1:
            raise ValueError("Copy numbers must be greater than or equal to 1.")

        copy_numbers_filtered[x] = {opts.metadata_identifer: value}

    filtered_otu_table.add_metadata(copy_numbers_filtered, axis="observation")

    def metadata_norm(v, i, md):
        return v / float(md[opts.metadata_identifer])

    normalized_table = filtered_otu_table.transform(metadata_norm, axis="observation")

    # move Observation Metadata from original to filtered OTU table
    normalized_table = transfer_observation_metadata(otu_table, normalized_table, "observation")

    make_output_dir_for_file(opts.output_otu_fp)
    write_biom_table(normalized_table, opts.output_otu_fp)
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse. It is not modified; a filtered copy is used.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether divide per-target counts by number of targets per source.
    field : int, optional
        Index of field to be collapsed in a stratified table, i.e. one whose
        feature IDs consist of "|"-separated fields. Only that field is
        looked up in `mapping` and replaced; the other fields are kept.
        ``None`` (default) collapses the entire feature ID. Note that ``0``
        is a valid index and selects the first field.

    Returns
    -------
    biom.Table
        Collapsed BIOM table, or the empty filtered table if no feature
        could be mapped.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # Test against None rather than truthiness: a plain `if field:` would
    # treat field=0 as "unstratified" and silently drop every feature.
    stratified = field is not None

    # generate metadata: feature ID -> list of target IDs
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if stratified:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if stratified:
                # substitute the target into the full stratified ID
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation', inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False, one_to_many=True, axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up the metadata field left behind by the collapse
    table.del_metadata(keys=['Path'])
    return table