def main(table_in, table_out, pathways, to_classic):
    """Write a copy of a BIOM table restricted to the KOs of given pathways.

    Parameters
    ----------
    table_in : str
        Path of the BIOM table to load.
    table_out : str
        Path the filtered table is written to.
    pathways : iterable of str
        Pathway identifiers. Each entry is stripped and its last five
        characters are used as the key into the mapping returned by
        ``get_pathway2kos()``.
    to_classic : bool
        When true the table is written as tab-delimited text, otherwise as
        BIOM JSON.

    Raises
    ------
    KeyError
        If a pathway key is not present in the pathway-to-KO mapping.
    EmptySetERROR
        If no observation id of the table belongs to the selected pathways.
    """
    # setup
    table = load_table(table_in)
    pathway_dict = get_pathway2kos()

    # get set of kos from pathways
    pathways_kos = set()
    for pathway in pathways:
        pathways_kos |= pathway_dict[pathway.strip()[-5:]]

    # get selected kos; walking the table's own ids keeps the rows in input
    # order instead of the arbitrary (hash-dependent) order of a set
    kept = [obs for obs in table.ids('observation') if obs in pathways_kos]
    if not kept:
        raise EmptySetERROR('Intersection created empty set')

    obs_ids = np.array(kept)
    data = np.empty([len(obs_ids), len(table.ids('sample'))])
    for i, obs in enumerate(obs_ids):
        data[i] = table.data(obs, 'observation')

    # output
    new_table = Table(data, obs_ids, table.ids('sample'), type="OTU table")
    # the context manager guarantees the output is flushed and closed
    with open(table_out, 'w') as out_fh:
        if to_classic:
            # print to tab delimited biom table
            out_fh.write(new_table.to_tsv())
        else:
            # print json biom table
            new_table.to_json("filter_KOs_by_pathway.py", out_fh)
def main():
    """Zero out low-abundance entries of a BIOM table and write it as JSON.

    Command-line arguments are read from the module-level ``parser``. For
    every vector yielded by ``input_table.iter_data()`` each entry that is
    not strictly greater than the threshold is set to 0. With
    ``abundance_as_fraction`` set, the comparison is made on the entry's
    share of the vector total; otherwise it is made on the raw value after
    the threshold has been converted with ``int()``.

    Raises
    ------
    ValueError
        If the threshold is outside [0, 1] in fraction mode, or does not
        look like a non-negative number in absolute mode.
    """
    args = parser.parse_args()
    input_fp = args.input_biom
    output_fp = args.output_biom
    threshold = args.abundance_threshold
    as_fraction = args.abundance_as_fraction

    if as_fraction:
        if not 0 <= threshold <= 1:
            raise ValueError("The value passed for -n "
                             "(--abundance_as_fraction) must be in the "
                             "interval [0, 1]")
    else:
        if not str(threshold).replace('.', '', 1).isdigit():
            raise ValueError("If you want to express the minimum threshold as "
                             "a fraction of the total sequences in a sample, "
                             "use -n in combination with -f. Otherwise, if "
                             "you want to express the minimum threshold as an "
                             "absolute sequence count minimum, the value "
                             "passed for -n must be an integer.")
        threshold = int(threshold)

    input_table = load_table(input_fp)

    new_data = []
    for abundances in input_table.iter_data():
        if as_fraction:
            compared = abundances.astype(float) / abundances.sum()
        else:
            compared = abundances
        # ndarray.itemset was removed in NumPy 2.0; a boolean mask zeroes the
        # same entries (everything not strictly above the threshold,
        # including NaN fractions from an all-zero vector) in one step.
        abundances[~(compared > threshold)] = 0
        new_data.append(abundances)

    # vectors were collected one per sample; transpose so observations are rows
    new_data = array(new_data).transpose()
    new_table = Table(new_data,
                      input_table.ids('observation'),
                      input_table.ids(),
                      input_table.metadata(axis='observation'),
                      input_table.metadata())

    with open(output_fp, 'w') as output_fd:
        new_table.to_json('one-time generation', output_fd)
def _1(data: biom.Table) -> BIOMV100Format:
    """Serialize a BIOM table into a new ``BIOMV100Format`` file.

    The table is first passed through ``_drop_axis_metadata``; its JSON
    representation, stamped with ``_get_generated_by()``, is then written
    through the handle returned by the format object's ``open()``.
    """
    stripped = _drop_axis_metadata(data)
    result = BIOMV100Format()
    with result.open() as handle:
        payload = stripped.to_json(generated_by=_get_generated_by())
        handle.write(payload)
    return result
def generate_per_sample_biom(biom_file, limit):
    """Yield a single-sample table for each sample of a BIOM file

    Parameters
    ----------
    biom_file : str
        A filepath to a BIOM table
    limit : int or None
        Maximum number of per-sample tables to produce; None means no limit

    Yields
    ------
    str
        The sample ID
    str
        The single-sample table serialized with ``Table.to_json``
    str
        The single-sample table serialized with ``Table.to_tsv``, with a
        ``taxonomy`` column built from the observation metadata
    """
    def _is_positive(v_, i, md):
        # predicate handed to Table.filter for the observation axis
        return v_ > 0

    full_table = load_table(biom_file)
    observation_ids = full_table.ids(axis='observation')
    observation_md = full_table.metadata(axis='observation')

    # an infinite cap makes the comparison below a no-op when no limit is set
    max_tables = np.inf if limit is None else limit

    for emitted, (values, sample_id, _) in enumerate(full_table.iter()):
        if emitted >= max_tables:
            break

        # reshape the sample vector into a one-column matrix
        column = values[:, np.newaxis]
        per_sample = Table(column, observation_ids, [sample_id],
                           observation_md)
        per_sample.filter(_is_positive, axis='observation')

        as_json = per_sample.to_json('AG')
        as_tsv = per_sample.to_tsv(header_key='taxonomy',
                                   header_value='taxonomy',
                                   metadata_formatter='; '.join)

        yield (sample_id, as_json, as_tsv)