def run(self, **kwargs):
    """Subset a BIOM table along one axis.

    Expects ``json_table_str``, ``hdf5_table``, ``axis`` and ``ids`` in
    ``kwargs``; exactly one of the two table inputs must be provided.
    Returns ``{'subsetted_table': (table, format)}`` where ``format`` is
    ``'json'`` or ``'hdf5'``.
    """
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    if axis not in self.Axes:
        valid = ' or '.join("'%s'" % a for a in self.Axes)
        raise CommandError(
            "Invalid axis '%s'. Must be either %s." % (axis, valid))

    # Exactly one input form is allowed.
    if hdf5_biom is None and json_table_str is None:
        raise CommandError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise CommandError("Can only specify one input table")

    if json_table_str is None:
        # HDF5 input: the Table class can subset directly on load.
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        fmt = 'hdf5'
    else:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # Re-emit the JSON table piecewise. Each key is parsed straight out
        # of the original string, so the string is walked multiple times —
        # bad form, but easy right now; a yield_and_ignore parser would be
        # the cleaner fix.
        def subset_generator():
            yield "{"
            for key in ("id", "format", "format_url", "type",
                        "generated_by", "date", "matrix_type",
                        "matrix_element_type"):
                yield direct_parse_key(json_table_str, key)
                yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","
            # The axis we did NOT subset is copied through verbatim.
            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        table = subset_generator()
        fmt = 'json'

    return {'subsetted_table': (table, fmt)}
def run(self, **kwargs):
    """Subset a BIOM table along one axis.

    Reads ``json_table_str``, ``hdf5_table``, ``axis`` and ``ids`` from
    ``kwargs``; exactly one of the two table inputs must be provided.
    Returns ``{'subsetted_table': (table, format_)}`` where ``format_`` is
    ``'json'`` or ``'hdf5'``.
    """
    json_table_str = kwargs['json_table_str']
    hdf5_biom = kwargs['hdf5_table']
    axis = kwargs['axis']
    ids = kwargs['ids']

    # Validate the requested axis against the command's declared axes.
    if axis not in self.Axes:
        raise CommandError("Invalid axis '%s'. Must be either %s." % (
            axis,
            ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

    # Exactly one input form is allowed.
    if hdf5_biom is None and json_table_str is None:
        raise CommandError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise CommandError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            # Lazily re-emit the JSON table, splicing in the sliced data
            # and metadata for the subsetted axis.
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","
            # The axis we did NOT subset is copied through verbatim.
            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        # HDF5 input: the Table class can subset directly on load.
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return {'subsetted_table': (table, format_)}
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    """Subset a BIOM table along ``axis``, keeping only ``ids``.

    Exactly one of ``hdf5_biom`` (a filepath) or ``json_table_str`` (the
    raw JSON string) must be provided. Returns ``(table, format)`` where
    ``format`` is ``'hdf5'`` or ``'json'``; for JSON input the table is a
    generator of string fragments.
    """
    if axis not in ('sample', 'observation'):
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    # Exactly one input form is allowed.
    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is None:
        # HDF5 input: the Table class can subset directly on load.
        with biom_open(hdf5_biom) as f:
            return Table.from_hdf5(f, ids=ids, axis=axis), 'hdf5'

    idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
    new_data = direct_slice_data(json_table_str, idxs, axis)

    # Re-emit the JSON table piecewise. Each key is parsed straight out of
    # the original string, so the string is walked multiple times — bad
    # form, but easy right now; a yield_and_ignore parser would be cleaner.
    def subset_generator():
        yield "{"
        for key in ("id", "format", "format_url", "type", "generated_by",
                    "date", "matrix_type", "matrix_element_type"):
            yield direct_parse_key(json_table_str, key)
            yield ","
        yield new_data
        yield ","
        yield new_axis_md
        yield ","
        # The axis we did NOT subset is copied through verbatim.
        if axis == "observation":
            yield direct_parse_key(json_table_str, "columns")
        else:
            yield direct_parse_key(json_table_str, "rows")
        yield "}"

    return subset_generator(), 'json'
def parse_biom_table(fp, input_is_dense=False):
    """Parse a BIOM table from ``fp``, trying HDF5 first and then JSON.

    Parameters
    ----------
    fp : file-like, list of str, or str
        The BIOM table: an open file/HDF5 handle, the lines of a JSON
        table, or a JSON string.
    input_is_dense : bool
        Whether a JSON table's matrix is dense. Ignored for HDF5 input.

    Returns
    -------
    Table
        The parsed BIOM table.
    """
    # Probe for HDF5 first. The original used a bare ``except:``, which
    # also swallowed KeyboardInterrupt/SystemExit; Exception is the widest
    # catch that is still safe here, since from_hdf5's failure mode on
    # non-HDF5 input is not a single exception type.
    try:
        return Table.from_hdf5(fp)
    except Exception:
        pass

    # Fall back to JSON, accepting a file-like object, a list of lines,
    # or a raw string.
    if hasattr(fp, 'read'):
        return Table.from_json(json.load(fp), input_is_dense=input_is_dense)
    elif isinstance(fp, list):
        return Table.from_json(json.loads(''.join(fp)),
                               input_is_dense=input_is_dense)
    else:
        return Table.from_json(json.loads(fp), input_is_dense=input_is_dense)
def test_rarefy_to_files2(self):
    """rarefy_to_files should write valid files with some metadata on otus
    """
    rarefier = RarefactionMaker(self.otu_table_meta_fp, 0, 1, 1, 1)
    rarefier.rarefy_to_files(self.rare_dir,
                             include_full=True,
                             include_lineages=False)

    # Load the first rarefied output and check its ids survived intact.
    out_path = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
    with biom_open(out_path, 'U') as fh:
        rarefied = Table.from_hdf5(fh)
        self.assertItemsEqual(rarefied.ids(), self.otu_table.ids()[:2])
def test_rarefy_to_files(self):
    """rarefy_to_files should write valid files
    """
    maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
    maker.rarefy_to_files(
        self.rare_dir,
        include_full=True,
        include_lineages=False)

    # Load the first rarefied output and check its sample ids survived
    # intact.
    fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
    with biom_open(fname, 'U') as biom_file:
        otu_table = Table.from_hdf5(biom_file)
        self.assertItemsEqual(
            otu_table.sample_ids,
            self.otu_table.sample_ids[:2])
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file like
        File alike object storing the BIOM table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    UnknownAxisError
        If `axis` is not 'sample' or 'observation'

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        # BUG FIX: the error was previously constructed but never raised,
        # silently accepting invalid axes.
        raise UnknownAxisError(axis)

    # Probe for HDF5 first; from_hdf5 signals non-HDF5 input via
    # ValueError or RuntimeError.
    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except (ValueError, RuntimeError):
        pass

    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        # Read in characters until first non-whitespace.
        # If it is a {, then this is (most likely) JSON.
        c = fp.read(1)
        while c.isspace():
            c = fp.read(1)
        if c == '{':
            fp.seek(old_pos)
            t = Table.from_json(json.load(fp, object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        else:
            # Otherwise assume classic tab-delimited format.
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp),
                                           object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp, object_pairs_hook=OrderedDict),
                            input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
        # Keep only the requested ids on the subset axis.
        return id_ in ids

    def gt_zero(vals, id_, md):
        # Drop entries on the opposite axis that became all-zero.
        return np.any(vals)

    if ids is not None:
        t.filter(subset_ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(gt_zero, axis=axis)

    return t
def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file like
        File alike object storing the BIOM table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    UnknownAxisError
        If `axis` is not 'sample' or 'observation'

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        # BUG FIX: the error was previously constructed but never raised,
        # silently accepting invalid axes.
        raise UnknownAxisError(axis)

    # Probe for HDF5 first. The original used a bare ``except:``; narrow to
    # the exceptions from_hdf5 raises on non-HDF5 input so real errors
    # (including KeyboardInterrupt) propagate.
    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except (ValueError, RuntimeError):
        pass

    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        try:
            t = Table.from_json(json.load(fp), input_is_dense=input_is_dense)
        except ValueError:
            # Not JSON; rewind and try classic tab-delimited format.
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp)),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp), input_is_dense=input_is_dense)

    if ids is not None:
        # Keep only the requested ids, then drop entries on the opposite
        # axis that became all-zero.
        t.filter(lambda data, id_, md: id_ in ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(lambda vals, id_, md: np.any(vals), axis=axis)

    return t
def parse_biom_table(file_obj, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in `file_obj`

    Parameters
    ----------
    file_obj : file-like object, or list
        file-like object storing the BIOM table (tab-delimited or JSON), or
        a list of lines of the BIOM table in tab-delimited or JSON format
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at file_obj

    Raises
    ------
    UnknownAxisError
        If `axis` is not 'sample' or 'observation'

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        # BUG FIX: the error was previously constructed but never raised,
        # silently accepting invalid axes.
        raise UnknownAxisError(axis)

    # Probe for HDF5 first; from_hdf5 signals non-HDF5 input via
    # ValueError or RuntimeError.
    try:
        return Table.from_hdf5(file_obj, ids=ids, axis=axis)
    except (ValueError, RuntimeError):
        pass

    if hasattr(file_obj, 'read'):
        old_pos = file_obj.tell()
        # Read in characters until first non-whitespace.
        # If it is a {, then this is (most likely) JSON.
        c = file_obj.read(1)
        while c.isspace():
            c = file_obj.read(1)
        if c == '{':
            file_obj.seek(old_pos)
            t = Table.from_json(json.load(file_obj,
                                          object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        else:
            # Otherwise assume classic tab-delimited format.
            file_obj.seek(old_pos)
            t = Table.from_tsv(file_obj, None, None, lambda x: x)
    elif isinstance(file_obj, list):
        try:
            t = Table.from_json(json.loads(''.join(file_obj),
                                           object_pairs_hook=OrderedDict),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(file_obj, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(file_obj,
                                       object_pairs_hook=OrderedDict),
                            input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
        # Keep only the requested ids on the subset axis.
        return id_ in ids

    def gt_zero(vals, id_, md):
        # Drop entries on the opposite axis that became all-zero.
        return np.any(vals)

    if ids is not None:
        t.filter(subset_ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(gt_zero, axis=axis)

    return t