def create_biom_file(vcf_fps, output_fp, mapping_fp=None, zip=None):
    """Merge tables built from ``vcf_fps`` and write two BIOM files.

    Two files are written inside the directory ``output_fp``: the full
    merged ("master") table, and a filtered table that keeps only the
    observations whose id is in the id collection returned by
    ``merge_otu_tables``.

    Parameters
    ----------
    vcf_fps
        Passed unchanged to ``merge_otu_tables``.
    output_fp
        Directory the two output files are created in.
    mapping_fp
        Optional mapping file; when given it is loaded with
        ``MetadataMap.fromFile`` and attached as sample metadata.
    zip
        When equal to ``'gz'`` the outputs are gzip-compressed.
    """
    master_table, master_observation_ids = merge_otu_tables(vcf_fps)

    if zip == 'gz':
        # NOTE: the payload is gzip-compressed even though the file names
        # end in ".zip"; the names are kept for backward compatibility.
        open_output = gzip.open
        mode = 'wb'
        master_name = 'master_table.biom.zip'
        filtered_name = 'filtered_table.biom.zip'
    else:
        open_output = open
        mode = 'w'
        master_name = 'master_table.biom'
        filtered_name = 'filtered_table.biom'

    # The context managers guarantee both handles are closed even when
    # metadata loading, filtering or serialization raises.
    with open_output(join(output_fp, master_name), mode) as output_master_f, \
            open_output(join(output_fp, filtered_name),
                        mode) as output_filtered_f:
        if mapping_fp is not None:
            mapping_f = MetadataMap.fromFile(mapping_fp)
            master_table.addSampleMetadata(mapping_f)

        master_table.getBiomFormatJsonString(generatedby(),
                                             direct_io=output_master_f)

        # Predicate handed to filterObservations: keep an observation only
        # if its id is one of the ids reported by merge_otu_tables.
        def filter_function(values, id, md):
            return id in master_observation_ids

        filtered_table = master_table.filterObservations(filter_function)
        filtered_table.getBiomFormatJsonString(generatedby(),
                                               direct_io=output_filtered_f)
def write_biom_table(result_key, data, option_value=None):
    """Write a ``(table, format)`` pair to the path ``option_value``.

    ``data`` is unpacked as ``(table, fmt)`` where ``fmt`` is one of
    ``'hdf5'``, ``'json'`` or ``'tsv'``. An ``'hdf5'`` request is written
    as JSON instead when ``HAVE_H5PY`` is false. ``result_key`` is not
    used.

    Raises
    ------
    IncompetentDeveloperError
        If ``option_value`` is None or ``fmt`` is not a known format.
    IOError
        If ``option_value`` already exists.
    """
    if option_value is None:
        raise IncompetentDeveloperError(
            "Cannot write output without a filepath.")

    if exists(option_value):
        raise IOError("Output path '%s' already exists." % option_value)

    payload, output_format = data

    if output_format not in ('hdf5', 'json', 'tsv'):
        raise IncompetentDeveloperError("Unknown file format")

    # Fall back to JSON when HDF5 was requested but h5py is unavailable.
    if output_format == 'hdf5' and not HAVE_H5PY:
        output_format = 'json'

    if output_format == 'tsv':
        # The TSV payload is already a string; a trailing newline is added.
        with open(option_value, 'w') as out:
            out.write(payload)
            out.write('\n')
    elif output_format == 'json':
        with open(option_value, 'w') as out:
            out.write(payload.to_json(generatedby()))
    else:
        import h5py
        with h5py.File(option_value, 'w') as out:
            payload.to_hdf5(out, generatedby())
def write_biom_table(result_key, data, option_value=None):
    """Write a ``(table, format)`` pair to the path ``option_value``.

    ``data`` is unpacked as ``(table, fmt)`` where ``fmt`` is one of
    ``'hdf5'``, ``'json'`` or ``'tsv'``. ``result_key`` is not used.

    Raises
    ------
    IncompetentDeveloperError
        If ``option_value`` is None or ``fmt`` is not a known format.
    IOError
        If ``option_value`` already exists.
    ImportError
        If HDF5 output is requested while ``HAVE_H5PY`` is false.
    """
    if option_value is None:
        raise IncompetentDeveloperError(
            "Cannot write output without a filepath.")

    if exists(option_value):
        raise IOError("Output path '%s' already exists." % option_value)

    payload, output_format = data

    if output_format not in ('hdf5', 'json', 'tsv'):
        raise IncompetentDeveloperError("Unknown file format")

    if output_format == 'tsv':
        # The TSV payload is already a string; a trailing newline is added.
        with open(option_value, 'w') as out:
            out.write(payload)
            out.write('\n')
        return

    if output_format == 'json':
        with open(option_value, 'w') as out:
            out.write(payload.to_json(generatedby()))
        return

    # Only 'hdf5' can reach this point.
    if not HAVE_H5PY:
        raise ImportError("h5py is not available, cannot write HDF5!")
    import h5py

    with h5py.File(option_value, 'w') as out:
        payload.to_hdf5(out, generatedby())
def write_subsetted_biom_table(result_key, data, option_value=None):
    """Write a subsetted table, given as ``(table, format)``, to a path.

    ``data`` is unpacked as ``(table, fmt)`` where ``fmt`` is ``'json'``
    or ``'hdf5'``. JSON output is delegated to ``write_list_of_strings``;
    HDF5 output is written through ``table.to_hdf5``.

    Raises
    ------
    IncompetentDeveloperError
        If ``option_value`` is None or ``fmt`` is not a known format.
    IOError
        If ``option_value`` already exists.
    ImportError
        If HDF5 output is requested while ``HAVE_H5PY`` is false.
    """
    if option_value is None:
        raise IncompetentDeveloperError(
            "Cannot write output without a filepath.")

    if exists(option_value):
        raise IOError("Output path '%s' already exists." % option_value)

    payload, output_format = data

    if output_format not in ('hdf5', 'json'):
        raise IncompetentDeveloperError("Unknown file format")

    if output_format == 'json':
        write_list_of_strings(result_key, payload, option_value)
        return

    if not HAVE_H5PY:
        # This should never be raised here
        raise ImportError("h5py is not available, cannot write HDF5!")
    import h5py

    with h5py.File(option_value, 'w') as out:
        payload.to_hdf5(out, generatedby())
def write_biom_table(result_key, data, option_value=None):
    """Write ``data`` as a BIOM JSON string to the path ``option_value``.

    ``data`` must provide ``getBiomFormatJsonString``. ``result_key`` is
    not used.

    Raises
    ------
    IncompetentDeveloperError
        If ``option_value`` is None.
    IOError
        If ``option_value`` already exists.
    """
    if option_value is None:
        raise IncompetentDeveloperError(
            "Cannot write output without a filepath.")

    if exists(option_value):
        raise IOError("Output path '%s' already exists." % option_value)

    with open(option_value, "w") as out:
        biom_json = data.getBiomFormatJsonString(generatedby())
        out.write(biom_json)
def create_biom_file(vcf_fps, output_fp, mapping_fp=None, zip=None):
    """Merge tables built from ``vcf_fps`` and write two BIOM files.

    Two files are written inside the directory ``output_fp``: the full
    merged ("master") table, and a filtered table that keeps only the
    observations whose id is in the id collection returned by
    ``merge_otu_tables``.

    Parameters
    ----------
    vcf_fps
        Passed unchanged to ``merge_otu_tables``.
    output_fp
        Directory the two output files are created in.
    mapping_fp
        Optional mapping file; when given it is loaded with
        ``MetadataMap.fromFile`` and attached as sample metadata.
    zip
        When equal to ``'gz'`` the outputs are gzip-compressed.
    """
    master_table, master_observation_ids = merge_otu_tables(vcf_fps)

    if zip == 'gz':
        # NOTE: the payload is gzip-compressed even though the file names
        # end in ".zip"; the names are kept for backward compatibility.
        open_output = gzip.open
        mode = 'wb'
        master_name = 'master_table.biom.zip'
        filtered_name = 'filtered_table.biom.zip'
    else:
        open_output = open
        mode = 'w'
        master_name = 'master_table.biom'
        filtered_name = 'filtered_table.biom'

    # The context managers guarantee both handles are closed even when
    # metadata loading, filtering or serialization raises.
    with open_output(join(output_fp, master_name), mode) as output_master_f, \
            open_output(join(output_fp, filtered_name),
                        mode) as output_filtered_f:
        if mapping_fp is not None:
            mapping_f = MetadataMap.fromFile(mapping_fp)
            master_table.addSampleMetadata(mapping_f)

        master_table.getBiomFormatJsonString(generatedby(),
                                             direct_io=output_master_f)

        # Predicate handed to filterObservations: keep an observation only
        # if its id is one of the ids reported by merge_otu_tables.
        def filter_function(values, id, md):
            return id in master_observation_ids

        filtered_table = master_table.filterObservations(filter_function)
        filtered_table.getBiomFormatJsonString(generatedby(),
                                               direct_io=output_filtered_f)
def subset_table(input_hdf5_fp, input_json_fp, axis, ids, output_fp):
    """Subset a BIOM table.

    Subset a BIOM table, over either observations or samples, without fully
    parsing it. This command is intended to assist in working with very large
    tables when tight on memory, or as a lightweight way to subset a full
    table. Currently, it is possible to produce tables with rows or columns
    (observations or samples) that are fully zeroed.

    Example usage:

    Choose a subset of the observations in table.biom (JSON) and write them to
    subset.biom:

    $ biom subset-table -j table.biom -a observations -s observation_ids.txt \
      -o subset.biom

    Choose a subset of the observations in table.biom (HDF5) and write them to
    subset.biom:

    $ biom subset-table -i table.biom -a observations -s observation_ids.txt \
      -o subset.biom

    """
    # input_hdf5_fp: forwarded unchanged to _subset_table.
    # input_json_fp: path of a JSON table, or None; when given, the file's
    #     text (not the path) is what gets forwarded to _subset_table.
    # axis:          forwarded unchanged to _subset_table.
    # ids:           path of a text file with one id per line; lines that
    #     start with '#' are skipped and only the first tab-separated field
    #     of each remaining line is used.
    # output_fp:     path the subsetted table is written to.
    #
    # Files are opened in plain text mode: the legacy 'U' flag was removed
    # in Python 3.11, and text mode already translates newlines on Python 3.
    if input_json_fp is not None:
        with open(input_json_fp, 'r') as f:
            input_json_fp = f.read()

    with open(ids, 'r') as f:
        ids = [line.strip().split('\t')[0]
               for line in f if not line.startswith('#')]

    table, format_ = _subset_table(input_hdf5_fp, input_json_fp, axis, ids)

    if format_ == 'json':
        # The JSON result is an iterable of strings, written one per line.
        with open(output_fp, 'w') as f:
            for line in table:
                f.write(line)
                f.write('\n')
    else:
        if HAVE_H5PY:
            import h5py
        else:
            # This should never be raised here
            raise ImportError("h5py is not available, cannot write HDF5!")

        with h5py.File(output_fp, 'w') as f:
            table.to_hdf5(f, generatedby())
def create_biom_file(vcf_fp, output_fp, mapping_fp=None, zip=None):
    """Build a sparse OTU table from one VCF file and write it as BIOM JSON.

    Parameters
    ----------
    vcf_fp
        Input path. A name ending in ``'gz'`` is opened with ``gzip``; a
        name ending in ``'vcf'`` is opened as plain text.
    output_fp
        Output path. When ``zip == 'gz'`` the file actually written is
        ``'<output_fp>.gz'``.
    mapping_fp
        Optional mapping file; when given it is loaded with
        ``MetadataMap.fromFile`` and attached as sample metadata.
    zip
        When equal to ``'gz'`` the output is gzip-compressed.

    Raises
    ------
    ValueError
        If ``vcf_fp`` ends in neither ``'gz'`` nor ``'vcf'``.
    """
    if vcf_fp.endswith('gz'):
        vcf_f = gzip.open(vcf_fp)
    elif vcf_fp.endswith('vcf'):
        # Plain text mode: the legacy 'U' flag was removed in Python 3.11.
        vcf_f = open(vcf_fp, 'r')
    else:
        raise ValueError("Invalid file format or extension, only '.vcf' or "
                         "'.vcf.gz' are accepted")

    # The input stays open until the table has been written, in case the
    # parsed data is read lazily; it is always closed afterwards.
    try:
        data, observation_ids, sample_ids = \
            create_table_factory_objects(vcf_f)
        table = table_factory(data, sample_ids=sample_ids,
                              observation_ids=observation_ids,
                              constructor=SparseOTUTable)

        if mapping_fp is not None:
            mapping_f = MetadataMap.fromFile(mapping_fp)
            # The original referenced an undefined name ``biom_table`` here.
            table.addSampleMetadata(mapping_f)

        if zip == 'gz':
            output_f = gzip.open('%s.%s' % (output_fp, zip), 'wb')
        else:
            output_f = open(output_fp, 'w')
        try:
            table.getBiomFormatJsonString(generatedby(), direct_io=output_f)
        finally:
            output_f.close()
    finally:
        vcf_f.close()
def run(self, **kwargs):
    """Convert a table between classic/BIOM or sparse/dense forms.

    All inputs are read from ``kwargs``. At most one of the flags
    ``biom_to_classic_table``, ``sparse_biom_to_dense_biom`` and
    ``dense_biom_to_sparse_biom`` may be truthy; when none is set the
    input is treated as a classic table and converted to BIOM, which
    additionally requires ``table_type``, ``matrix_type`` and
    ``process_obs_metadata`` to be valid.

    Returns a dict ``{'table_str': <converted table>}``.

    Raises ``CommandError`` for conflicting flags, unknown option values,
    or input that cannot be parsed in the expected format.
    """
    table_file = kwargs['table_file']
    matrix_type = kwargs['matrix_type']
    to_classic = kwargs['biom_to_classic_table']
    to_dense = kwargs['sparse_biom_to_dense_biom']
    to_sparse = kwargs['dense_biom_to_sparse_biom']
    sample_md = kwargs['sample_metadata']
    obs_md = kwargs['observation_metadata']
    header_key = kwargs['header_key']
    metadata_column = kwargs['output_metadata_id']
    obs_md_method = kwargs['process_obs_metadata']
    table_type = kwargs['table_type']

    if sum([to_classic, to_dense, to_sparse]) > 1:
        raise CommandError("Converting between classic/BIOM formats and "
                           "sparse/dense representations are mutually "
                           "exclusive. You may only specify a single "
                           "operation at a time.")

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    metadata_column = metadata_column or header_key

    not_biom_msg = ("Input does not look like a BIOM-formatted file. "
                    "Did you accidentally specify that a classic "
                    "table file should be created from a BIOM table "
                    "file?")

    def rebuild_with(constructor_slot):
        # Parse the BIOM input and rebuild it with the constructor stored
        # at ``constructor_slot`` of the table type's TableTypes entry.
        try:
            parsed = parse_biom_table(table_file)
        except ValueError:
            raise CommandError(not_biom_msg)

        target_cls = \
            self.TableTypes[parsed._biom_type.lower()][constructor_slot]
        rebuilt = table_factory(parsed._data, parsed.SampleIds,
                                parsed.ObservationIds,
                                parsed.SampleMetadata,
                                parsed.ObservationMetadata,
                                parsed.TableId,
                                constructor=target_cls)
        return rebuilt.getBiomFormatJsonString(generatedby())

    if to_classic:
        try:
            converted = convert_biom_to_table(table_file, header_key,
                                              metadata_column)
        except ValueError:
            raise CommandError(not_biom_msg)
        return {'table_str': converted}

    if to_dense:
        return {'table_str': rebuild_with(1)}

    if to_sparse:
        return {'table_str': rebuild_with(0)}

    # No flag set: classic table -> BIOM.
    if table_type is None:
        raise CommandError("Must specify the BIOM table type: %s" %
                           ', '.join(self.TableTypes.keys()))
    table_type = table_type.lower()

    if table_type not in self.TableTypes:
        raise CommandError("Unknown BIOM table type, must be one of: "
                           "%s" % ', '.join(self.TableTypes.keys()))

    if matrix_type not in self.MatrixTypes:
        raise CommandError("Unknown BIOM matrix type, must be one of: "
                           "%s" % ', '.join(self.MatrixTypes))

    if obs_md_method not in self.ObservationMetadataTypes.keys():
        raise CommandError("Unknown observation metadata processing "
                           "method, must be one of: %s" %
                           ', '.join(self.ObservationMetadataTypes.keys()))

    # Slot 0 is used for 'sparse', slot 1 for every other matrix type.
    if matrix_type == 'sparse':
        constructor = self.TableTypes[table_type][0]
    else:
        constructor = self.TableTypes[table_type][1]

    not_classic_msg = ("Input does not look like a classic table. "
                       "Did you forget to specify that a classic "
                       "table file should be created from a BIOM "
                       "table file?")

    try:
        converted = convert_table_to_biom(
            table_file, sample_md, obs_md,
            self.ObservationMetadataTypes[obs_md_method], constructor)
    except (ValueError, IndexError):
        raise CommandError(not_classic_msg)

    return {'table_str': converted}
def tojson(input_data):
    """Parse ``input_data`` as a BIOM table and return it as JSON.

    The parsed table's ``type`` attribute is set to ``"Table"`` before it
    is serialized with ``to_json``.
    """
    biom_table = parse_biom_table(input_data)
    biom_table.type = "Table"
    return biom_table.to_json(generatedby())
def run(self, **kwargs):
    """Convert a table between classic/BIOM or sparse/dense forms.

    All inputs are read from ``kwargs``. At most one of the flags
    ``biom_to_classic_table``, ``sparse_biom_to_dense_biom`` and
    ``dense_biom_to_sparse_biom`` may be truthy; when none is set the
    input is treated as a classic table and converted to BIOM using the
    processing function selected by ``process_obs_metadata``.

    Returns a dict ``{"table_str": <converted table>}``.

    Raises ``CommandError`` for conflicting flags, an unknown
    ``process_obs_metadata`` value, or input that cannot be parsed in the
    expected format.
    """
    table_file = kwargs["table_file"]
    biom_to_classic_table = kwargs["biom_to_classic_table"]
    sparse_biom_to_dense_biom = kwargs["sparse_biom_to_dense_biom"]
    dense_biom_to_sparse_biom = kwargs["dense_biom_to_sparse_biom"]
    sample_metadata = kwargs["sample_metadata"]
    observation_metadata = kwargs["observation_metadata"]
    header_key = kwargs["header_key"]
    output_metadata_id = kwargs["output_metadata_id"]
    process_obs_metadata = kwargs["process_obs_metadata"]

    if sum([biom_to_classic_table, sparse_biom_to_dense_biom,
            dense_biom_to_sparse_biom]) > 1:
        raise CommandError(
            "Converting between classic/BIOM formats and "
            "sparse/dense representations are mutually "
            "exclusive. You may only specify a single "
            "operation at a time."
        )

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    output_metadata_id = output_metadata_id or header_key

    convert_error_msg = (
        "Input does not look like a BIOM-formatted file. "
        "Did you accidentally specify that a classic "
        "table file should be created from a BIOM table "
        "file?"
    )

    if biom_to_classic_table:
        try:
            result = convert_biom_to_table(table_file, header_key,
                                           output_metadata_id)
        except (ValueError, TypeError):
            raise CommandError(convert_error_msg)
    elif sparse_biom_to_dense_biom or dense_biom_to_sparse_biom:
        # Both representation conversions perform the same steps, so they
        # share one branch. The original sparse->dense branch read
        # ``table.TableId`` while its twin read ``table.table_id``; the
        # snake_case attribute matches every other attribute used here.
        try:
            table = parse_biom_table(table_file)
        except (ValueError, TypeError):
            raise CommandError(convert_error_msg)

        conv_table = table_factory(
            table._data,
            table.sample_ids,
            table.observation_ids,
            table.sample_metadata,
            table.observation_metadata,
            table.table_id,
        )
        result = conv_table.get_biom_format_json_string(generatedby())
    else:
        if process_obs_metadata not in self.ObservationMetadataTypes.keys():
            raise CommandError(
                "Unknown observation metadata processing method, must be "
                "one of: %s" % ", ".join(self.ObservationMetadataTypes.keys())
            )

        convert_error_msg = (
            "Input does not look like a classic table. "
            "Did you forget to specify that a classic "
            "table file should be created from a BIOM "
            "table file?"
        )

        try:
            result = convert_table_to_biom(
                table_file,
                sample_metadata,
                observation_metadata,
                self.ObservationMetadataTypes[process_obs_metadata],
            )
        except (ValueError, TypeError, IndexError):
            raise CommandError(convert_error_msg)

    return {"table_str": result}
def main():
    """Command-line entry point: convert between classic and BIOM tables.

    Options come from the module-level ``parser``. Exactly one conversion
    is performed: BIOM -> classic, sparse BIOM -> dense BIOM, dense BIOM ->
    sparse BIOM, or (when no flag is given) classic -> BIOM. The result is
    written to ``opts.output_fp``.

    Invalid option combinations are reported through ``parser.error``;
    input that cannot be parsed in the expected format raises ValueError.
    """
    opts, args = parser.parse_args()

    if opts.input_fp is None:
        parser.print_help()
        parser.error('Must specify an input file!')
    if opts.output_fp is None:
        parser.print_help()
        parser.error('Must specify an output file!')

    biom_to_classic_table = opts.biom_to_classic_table
    sparse_biom_to_dense_biom = opts.sparse_biom_to_dense_biom
    dense_biom_to_sparse_biom = opts.dense_biom_to_sparse_biom
    process_obs_metadata = opts.process_obs_metadata

    if sum([biom_to_classic_table, sparse_biom_to_dense_biom,
            dense_biom_to_sparse_biom]) > 1:
        parser.print_help()
        # The original called an undefined ``option_parser`` here.
        parser.error("The --biom_to_classic_table, "
                     "--sparse_biom_to_dense_biom, "
                     "and --dense_biom_to_sparse_biom options are mutually "
                     "exclusive. Pass only one at a time.")

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    header_key = opts.header_key
    output_metadata_id = opts.output_metadata_id or header_key

    not_biom_msg = ("Input does not look like a .biom file. "
                    "Did you accidentally specify -b?")

    # Plain text mode replaces the legacy 'U' flag (removed in Python 3.11).
    # The context managers close both files on every exit path, including
    # the SystemExit raised by parser.error.
    with open(opts.input_fp, 'r') as input_f, \
            open(opts.output_fp, 'w') as output_f:
        # NOTE(review): assumes parse_mapping consumes the whole file
        # before returning, so the handle can be closed right away.
        sample_mapping = None
        if opts.sample_mapping_fp is not None:
            with open(opts.sample_mapping_fp, 'r') as mapping_f:
                sample_mapping = parse_mapping(mapping_f)

        obs_mapping = None
        if opts.observation_mapping_fp is not None:
            with open(opts.observation_mapping_fp, 'r') as mapping_f:
                obs_mapping = parse_mapping(mapping_f)

        if biom_to_classic_table:
            try:
                output_f.write(convert_biom_to_table(
                    input_f, header_key, output_metadata_id))
            except ValueError:
                raise ValueError(not_biom_msg)
        elif sparse_biom_to_dense_biom or dense_biom_to_sparse_biom:
            try:
                table = parse_biom_table(input_f)
            except ValueError:
                raise ValueError(not_biom_msg)

            # Slot 1 of a BIOM_TYPES entry is used for sparse -> dense,
            # slot 0 for dense -> sparse.
            idx = 1 if sparse_biom_to_dense_biom else 0
            conv_constructor = BIOM_TYPES[table._biom_type.lower()][idx]
            conv_table = table_factory(table._data, table.SampleIds,
                                       table.ObservationIds,
                                       table.SampleMetadata,
                                       table.ObservationMetadata,
                                       table.TableId,
                                       constructor=conv_constructor)
            output_f.write(conv_table.getBiomFormatJsonString(generatedby()))
        else:
            if opts.biom_table_type is None:
                parser.error('Must specify the BIOM table type: %s' %
                             ', '.join(BIOM_TYPES.keys()))
            else:
                biom_table_type = opts.biom_table_type.lower()

            if biom_table_type not in BIOM_TYPES:
                parser.error('Unknown BIOM table type, must be one of: %s' %
                             ', '.join(BIOM_TYPES.keys()))

            if opts.biom_type is None or \
                    opts.biom_type not in ['dense', 'sparse']:
                parser.error('Must specify the BIOM matrix type, ' +
                             'either "dense" or "sparse"')

            idx = 0 if opts.biom_type == 'sparse' else 1
            constructor = BIOM_TYPES[biom_table_type][idx]

            try:
                output_f.write(convert_table_to_biom(
                    input_f, sample_mapping, obs_mapping,
                    OBS_META_TYPES[process_obs_metadata], constructor))
            except ValueError:
                raise ValueError("Input does not look like a classic table. "
                                 "Do you need to pass -b?")
def run(self, **kwargs):
    """Process a table's metadata and prepare it for TSV/JSON/HDF5 output.

    All inputs are read from ``kwargs``. Exactly one of ``to_tsv``,
    ``to_hdf5`` and ``to_json`` must be truthy. Sample metadata, when
    given, is added to the table; the table's single observation metadata
    field is passed through the function selected by
    ``process_obs_metadata`` and merged with ``observation_metadata``.

    Returns a dict ``{'table': (result, fmt)}`` where ``fmt`` is
    ``'tsv'``, ``'json'`` or ``'hdf5'``; for ``'hdf5'`` the result is the
    table object itself, otherwise it is the serialized output.

    Raises ``CommandError`` when no or several output formats are given,
    for an unknown formatter or processing method, or when the table has
    no observation metadata.
    """
    table = kwargs['table']
    sample_metadata = kwargs['sample_metadata']
    observation_metadata = kwargs['observation_metadata']
    header_key = kwargs['header_key']
    output_metadata_id = kwargs['output_metadata_id']
    process_obs_metadata = kwargs['process_obs_metadata']
    obs_md_fmt = kwargs['tsv_metadata_formatter']
    to_tsv = kwargs['to_tsv']
    to_hdf5 = kwargs['to_hdf5']
    to_json = kwargs['to_json']

    requested_formats = sum([to_tsv, to_hdf5, to_json])
    if requested_formats == 0:
        raise CommandError("Must specify an output format")
    elif requested_formats > 1:
        raise CommandError("Can only specify a single output format")

    if obs_md_fmt not in self.ObservationMetadataFormatters:
        raise CommandError("Unknown tsv_metadata_formatter: %s" %
                           obs_md_fmt)
    obs_md_fmt_f = self.ObservationMetadataFormatters[obs_md_fmt]

    if sample_metadata is not None:
        table.add_metadata(sample_metadata)

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    output_metadata_id = output_metadata_id or header_key

    if process_obs_metadata not in self.ObservationMetadataTypes:
        raise CommandError(
            "Unknown observation metadata processing method, must be "
            "one of: %s" %
            ', '.join(self.ObservationMetadataTypes.keys()))

    # assume we had a table coming in as TSV
    # NOTE(review): this rejects every table without observation metadata,
    # whatever processing method was chosen -- confirm that is intended.
    if table.observation_metadata is None:
        raise CommandError("Obseration metadata processing requested "
                           "but it doesn't appear that there is any "
                           "metadata to operate on!")

    # and if this came in as TSV, then we expect only a single type of
    # metadata. ``next(iter(...))`` yields the first key on both Python 2
    # and 3; ``dict.keys()[0]`` raises TypeError on Python 3.
    md_key = next(iter(table.observation_metadata[0]))

    process_f = self.ObservationMetadataTypes[process_obs_metadata]
    it = zip(table.observation_ids, table.observation_metadata)
    new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it}

    if observation_metadata:
        for k, v in observation_metadata.items():
            new_md[k].update(v)

    table.add_metadata(new_md, 'observation')

    if to_tsv:
        result = table.to_tsv(header_key=header_key,
                              header_value=output_metadata_id,
                              metadata_formatter=obs_md_fmt_f)
        fmt = 'tsv'
    elif to_json:
        result = table.to_json(generatedby())
        fmt = 'json'
    elif to_hdf5:
        result = table
        fmt = 'hdf5'

    return {'table': (result, fmt)}
def main():
    """Command-line entry point: add metadata to a BIOM table.

    Options come from the module-level ``parser``. Sample and/or
    observation mapping files are parsed with ``parse_mapping`` (with
    optional per-field processing functions and headers), attached to the
    table parsed from ``opts.input_fp``, and the result is written as a
    BIOM JSON string to ``opts.output_fp``.

    Missing required options are reported through ``parser.error``.
    """
    opts, args = parser.parse_args()

    if opts.input_fp is None:
        parser.print_help()
        parser.error('Must specify an input file!')
    if opts.output_fp is None:
        parser.print_help()
        parser.error('Must specify an output file!')

    # process header information, if provided
    observation_header = opts.observation_header
    sample_header = opts.sample_header
    if observation_header is not None:
        observation_header = observation_header.split(',')
    if sample_header is not None:
        sample_header = sample_header.split(',')

    # define metadata processing functions, if any: each option is a
    # comma-separated list of field names mapped to one converter
    process_fns = {}
    if opts.sc_separated is not None:
        process_fns.update(dict.fromkeys(opts.sc_separated.split(','),
                                         split_on_semicolons))
    if opts.int_fields is not None:
        process_fns.update(dict.fromkeys(opts.int_fields.split(','), int_))
    if opts.float_fields is not None:
        process_fns.update(dict.fromkeys(opts.float_fields.split(','),
                                         float_))

    # parse mapping files. Plain text mode replaces the legacy 'U' flag
    # (removed in Python 3.11) and ``with`` closes each handle.
    # NOTE(review): assumes parse_mapping consumes the whole file before
    # returning, so the handle can be closed right away.
    sample_mapping = None
    if opts.sample_mapping_fp is not None:
        with open(opts.sample_mapping_fp, 'r') as mapping_f:
            sample_mapping = parse_mapping(mapping_f,
                                           process_fns=process_fns,
                                           header=sample_header)

    obs_mapping = None
    if opts.observation_mapping_fp is not None:
        with open(opts.observation_mapping_fp, 'r') as mapping_f:
            obs_mapping = parse_mapping(mapping_f,
                                        process_fns=process_fns,
                                        header=observation_header)

    if sample_mapping is None and obs_mapping is None:
        parser.print_help()
        parser.error('Must specify sample_mapping and/or obs_mapping.')

    # parse the table
    with open(opts.input_fp, 'r') as input_f:
        table = parse_biom_table(input_f)

    # add metadata as necessary
    if sample_mapping:
        table.addSampleMetadata(sample_mapping)
    if obs_mapping:
        table.addObservationMetadata(obs_mapping)

    # Serialize before opening the output so that a failure while parsing
    # or serializing does not leave behind a truncated output file.
    biom_json = table.getBiomFormatJsonString(generatedby())
    with open(opts.output_fp, 'w') as output_f:
        output_f.write(biom_json)