def test_import_biom_obs(self):
    """
    Tests if the import_biom function reads the correct database file,
    and imports the biom file, by querying the counts table.

    The temporary BIOM file, the database connection and the tables created
    for the test are released in ``finally`` clauses, so a failing step does
    not leave state behind for later tests.

    :return:
    """
    conn_object = BiomConnection()
    conn_object.create_tables()
    try:
        try:
            write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
            import_biom("test.biom", mapping=None)
        finally:
            # The file may not exist if writing itself failed.
            if os.path.exists("test.biom"):
                os.remove("test.biom")
        conn = psycopg2.connect(
            **{
                "host": "localhost",
                "database": "test",
                "user": "******",
                "password": "******"
            })
        try:
            cur = conn.cursor()
            cur.execute("SELECT sampleid "
                        "FROM counts "
                        "LIMIT 5;")
            result = cur.fetchall()
            cur.close()
        finally:
            conn.close()
    finally:
        # Always drop the tables created at the start of the test.
        conn_object.delete_tables()
    result = [x[0] for x in result]
    self.assertCountEqual(
        result,
        ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5'])
def test_import_biom(self):
    """
    Tests if the import_biom function reads the correct database file,
    and imports the biom file, by querying the bioms table.

    The temporary BIOM file, the database connection and the tables created
    for the test are released in ``finally`` clauses, so a failing step does
    not leave state behind for later tests.

    :return:
    """
    conn_object = BiomConnection()
    conn_object.create_tables()
    try:
        try:
            write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
            import_biom("test.biom", mapping=None)
        finally:
            # The file may not exist if writing itself failed.
            if os.path.exists("test.biom"):
                os.remove("test.biom")
        conn = psycopg2.connect(
            **{
                "host": "localhost",
                "database": "test",
                "user": "******",
                "password": "******"
            })
        try:
            cur = conn.cursor()
            cur.execute("SELECT studyID "
                        "FROM bioms "
                        "LIMIT 1;")
            result = cur.fetchall()
            cur.close()
        finally:
            conn.close()
    finally:
        # Always drop the tables created at the start of the test.
        conn_object.delete_tables()
    self.assertEqual(result[0][0], 'test')
def add_metadata_to_biom_table(biom_input_fp, taxonomy_map_fp,
                               biom_output_fp):
    '''Attach observation metadata from a taxonomy map to a biom table.

    The table at biom_input_fp is loaded, the metadata parsed from
    taxonomy_map_fp is added on the observation axis (an empty mapping is
    used when that file has zero size), and the result is written in JSON
    format to biom_output_fp.
    '''
    table = load_table(biom_input_fp)
    map_is_empty = stat(taxonomy_map_fp).st_size == 0
    if map_is_empty:
        observation_md = {}
    else:
        observation_md = MetadataMap.from_file(
            taxonomy_map_fp, header=['Sample ID', 'taxonomy', 'c'])
    table.add_metadata(observation_md, 'observation')
    write_biom_table(table, 'json', biom_output_fp)
def add_metadata(input_fp, output_fp, sample_metadata_fp,
                 observation_metadata_fp, sc_separated, sc_pipe_separated,
                 int_fields, float_fields, sample_header, observation_header,
                 output_as_json):
    """Add metadata to a BIOM table.

    Add sample and/or observation metadata to BIOM-formatted files. See
    examples here: http://biom-format.org/documentation/adding_metadata.html

    Example usage:

    Add sample metadata to a BIOM table:

    $ biom add-metadata -i otu_table.biom
      -o table_with_sample_metadata.biom -m sample_metadata.txt

    """
    def _split_csv(value):
        # The list-valued options arrive as comma-separated strings or None.
        return None if value is None else value.split(',')

    table = load_table(input_fp)

    sample_metadata_f = None
    observation_metadata_f = None
    try:
        # The 'U' open mode was removed in Python 3.11; universal newlines
        # are already the default for text-mode files.
        if sample_metadata_fp is not None:
            sample_metadata_f = open(sample_metadata_fp)
        if observation_metadata_fp is not None:
            observation_metadata_f = open(observation_metadata_fp)

        result = _add_metadata(table, sample_metadata_f,
                               observation_metadata_f,
                               _split_csv(sc_separated),
                               _split_csv(sc_pipe_separated),
                               _split_csv(int_fields),
                               _split_csv(float_fields),
                               _split_csv(sample_header),
                               _split_csv(observation_header))

        fmt = 'json' if output_as_json else 'hdf5'
        write_biom_table(result, fmt, output_fp)
    finally:
        # Handles stay open until the table is written, then are always
        # released, including on error.
        for handle in (sample_metadata_f, observation_metadata_f):
            if handle is not None:
                handle.close()
def write_bioms(self, fmt='hdf5'):
    """
    Writes one BIOM file per (name, level) combination listed in
    self.inputs. Each table is taken from self.levels[level][name] and
    written to '<fp>/<name>_<level>.hdf5'. A failed write is logged and
    the remaining tables are still processed.

    :param fmt: Format for writing; 'hdf5' or 'json'.
    :return:
    """
    for name in self.inputs['name']:
        for level in self.inputs['levels']:
            # The '.hdf5' suffix is used regardless of fmt.
            target = ''.join(
                (self.inputs['fp'], '/', name, '_', level, '.hdf5'))
            try:
                write_biom_table(self.levels[level][name], fmt, target)
            except Exception:
                logger.error("Cannot write " + str(name) + " to disk",
                             exc_info=True)
def denoise_to_feature_table(demux_seqs, trim_left, trunc_len, community_dir,
                             rep_seqs_fn='rep_seqs',
                             feature_table_fn='feature_table.qza',
                             biom_table_fn='feature_table.biom',
                             summary_fn='feature_table_summary.qzv'):
    '''SampleData[SequencesWithQuality] -> FeatureData[Sequence] +
    FeatureTable[Frequency]

    Denoise fastqs with dada2, then save the representative sequences, the
    feature table (as Artifact and as hdf5 biom) and a summary visualization
    into community_dir.

    demux_seqs = SampleData[SequencesWithQuality]
        demultiplexed seqs output from qiime2.demux.methods.emp()
    trim_left = int
        trim X bases from 5' end
    trunc_len = int
        length to truncate all sequences
    community_dir: path
        destination directory to print results
    rep_seqs_fn = str
        filename of representative sequences output Artifact
    feature_table_fn = str
        filename of feature table output Artifact
    biom_table_fn = str
        filename of the hdf5 biom copy of the feature table
    summary_fn = str
        filename of feature table summary output visualization

    Returns the (feature table, representative sequences) pair produced by
    dada2.
    '''
    def in_community_dir(filename):
        return join(community_dir, filename)

    table_artifact, seqs_artifact = dada2.methods.denoise_single(
        demux_seqs, trim_left=trim_left, trunc_len=trunc_len)

    # Artifacts first, then the plain biom export of the same table.
    seqs_artifact.save(in_community_dir(rep_seqs_fn))
    table_artifact.save(in_community_dir(feature_table_fn))
    biom_out_fp = in_community_dir(biom_table_fn)
    write_biom_table(table_artifact.view(Table), 'hdf5', biom_out_fp)

    # Summary visualization of the feature table.
    summary = feature_table.visualizers.summarize(table_artifact)
    summary.visualization.save(in_community_dir(summary_fn))

    return table_artifact, seqs_artifact
def test_run_network(self):
    """
    Writes the test BIOM file, runs get_input and run_network on it and
    checks that the CoNet output file exists in the output directory.
    The files produced during the run are removed afterwards.
    """
    data_root = testloc[0]
    inputs = {
        'biom_file': [(data_root + '/data/test.biom')],
        'cluster': None,
        'otu_meta': None,
        'prefix': None,
        'sample_data': None,
        'split': None,
        'tax_table': None,
        'fp': data_root,
        'otu_table': None,
        'tools': ['spiec-easi', 'conet'],
        'spiec': None,
        'spar': None,
        'conet': (os.path.dirname(massoc.__file__)[:-6] + 'tests\\CoNet3'),
        'conet_bash': None,
        'spar_pval': None,
        'spar_boot': None,
        'levels': ['family'],
        'prev': 20,
        'min': None,
        'rar': None,
        'name': ['test'],
        'cores': None
    }
    write_biom_table(testbiom['test'], fmt='hdf5',
                     filepath=(inputs['biom_file'][0]))
    get_input(inputs)
    inputs['settings'] = inputs['fp'] + '/settings.json'
    run_network(inputs)

    out_dir = inputs['fp']
    # NOTE(review): the file asserted here is "conet_family_test.txt" while
    # the cleanup below removes "conet_test_family.txt" -- confirm which
    # name run_network actually produces.
    test = Path(out_dir + "/conet_family_test.txt")
    self.assertTrue(test.is_file())

    leftovers = [inputs['biom_file'][0]]
    leftovers.extend(out_dir + suffix for suffix in (
        "/settings.json",
        "/test_family.hdf5",
        "/test_otu.hdf5",
        "/conet_test_family.txt",
        "/spiec-easi_test_family.txt",
    ))
    for leftover in leftovers:
        call(("rm " + leftover))
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch UC file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    # The 'U' open mode was removed in Python 3.11; universal newlines are
    # already the default for text-mode files.
    rep_set_f = None
    input_f = open(input_fp)
    try:
        if rep_set_fp is not None:
            rep_set_f = open(rep_set_fp)
        table = _from_uc(input_f, rep_set_f)
        write_biom_table(table, 'hdf5', output_fp)
    finally:
        # Handles stay open until the table is written, then are always
        # released, including on error.
        input_f.close()
        if rep_set_f is not None:
            rep_set_f.close()
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch UC file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    # The "U" open mode was removed in Python 3.11; universal newlines are
    # already the default for text-mode files.
    rep_set_f = None
    input_f = open(input_fp)
    try:
        if rep_set_fp is not None:
            rep_set_f = open(rep_set_fp)
        table = _from_uc(input_f, rep_set_f)
        write_biom_table(table, "hdf5", output_fp)
    finally:
        # Handles stay open until the table is written, then are always
        # released, including on error.
        input_f.close()
        if rep_set_f is not None:
            rep_set_f.close()
def normalize_table(input_fp, output_fp, relative_abund, presence_absence,
                    axis):
    """Normalize a BIOM table.

    Normalize the values of a BIOM table through various methods. Relative
    abundance will take the relative abundance of each observation in terms
    of samples or observations. Presence absence will convert observations
    to 1's and 0's based on presence of the observation.

    Example usage:

    Normalizing a BIOM table to relative abundance:

    $ biom normalize-table -i table.biom -r -o normalized_table.biom

    Converting a BIOM table to a presence/absence table:

    $ biom normalize-table -i table.biom -p -o converted_table.biom

    """
    normalized = _normalize_table(load_table(input_fp), relative_abund,
                                  presence_absence, axis)

    # HDF5 output when h5py is available, JSON otherwise.
    if HAVE_H5PY:
        out_fmt = "hdf5"
    else:
        out_fmt = "json"
    write_biom_table(normalized, out_fmt, output_fp)
def normalize_table(input_fp, output_fp, relative_abund, presence_absence,
                    axis):
    """Normalize a BIOM table.

    Normalize the values of a BIOM table through various methods. Relative
    abundance will take the relative abundance of each observation in terms
    of samples or observations. Presence absence will convert observations
    to 1's and 0's based on presence of the observation.

    Example usage:

    Normalizing a BIOM table to relative abundance:

    $ biom normalize-table -i table.biom -r -o normalized_table.biom

    Converting a BIOM table to a presence/absence table:

    $ biom normalize-table -i table.biom -p -o converted_table.biom

    """
    normalized = _normalize_table(load_table(input_fp), relative_abund,
                                  presence_absence, axis)

    # HDF5 output when h5py is available, JSON otherwise.
    if HAVE_H5PY:
        out_fmt = 'hdf5'
    else:
        out_fmt = 'json'
    write_biom_table(normalized, out_fmt, output_fp)
def test_get_input(self):
    """
    Writes the test BIOM file, calls get_input and checks that a
    settings.json file exists in the output directory afterwards.
    The files produced during the run are removed at the end.
    """
    data_root = testloc[0]
    inputs = {
        'biom_file': [(data_root + '/data/test.biom')],
        'cluster': None,
        'otu_meta': None,
        'prefix': None,
        'sample_data': None,
        'split': None,
        'tax_table': None,
        'fp': data_root,
        'otu_table': None,
        'tools': ['spiec-easi', 'conet'],
        'spiec': None,
        'conet': None,
        'spar_pval': None,
        'spar_boot': None,
        'levels': ['family'],
        'prev': 20,
        'min': None,
        'rar': None,
        'name': ['test'],
        'cores': None
    }
    write_biom_table(testbiom['test'], fmt='hdf5',
                     filepath=(inputs['biom_file'][0]))
    get_input(inputs)

    out_dir = inputs['fp']
    test = Path(out_dir + "/settings.json")
    self.assertTrue(test.is_file())

    leftovers = [inputs['biom_file'][0]]
    leftovers.extend(out_dir + suffix for suffix in (
        "/settings.json",
        "/test_family.hdf5",
        "/test_otu.hdf5",
    ))
    for leftover in leftovers:
        call(("rm " + leftover))
def test_import_biom_mapping(self):
    """
    Tests if the BiomConnection uses the mapping dict.

    The temporary BIOM file, the database connection and the tables created
    for the test are released in ``finally`` clauses, so a failing step does
    not leave state behind for later tests.

    :return:
    """
    conn_object = BiomConnection()
    conn_object.create_tables()
    try:
        try:
            write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
            import_biom("test.biom", mapping={'test': 'banana'})
        finally:
            # The file may not exist if writing itself failed.
            if os.path.exists("test.biom"):
                os.remove("test.biom")
        conn = psycopg2.connect(
            **{
                "host": "localhost",
                "database": "test",
                "user": "******",
                "password": "******"
            })
        try:
            cur = conn.cursor()
            cur.execute("SELECT studyID "
                        "FROM bioms "
                        "LIMIT 1;")
            result = cur.fetchall()
            cur.close()
        finally:
            conn.close()
    finally:
        # Always drop the tables created at the start of the test.
        conn_object.delete_tables()
    self.assertEqual(result[0][0], 'banana')
def test_run_netstats(self):
    """
    Runs get_input and run_network on the test BIOM file, starts neo4j and
    uploads to it, runs run_netstats, then clears and quits neo4j. Checks
    that difference_network.graphml exists in the output directory and
    removes the files produced during the run.
    """
    data_root = testloc[0]
    package_root = os.path.dirname(massoc.__file__)[:-6]
    inputs = {
        'biom_file': [(data_root + '/data/test.biom')],
        'cluster': None,
        'otu_meta': None,
        'prefix': None,
        'sample_data': None,
        'split': None,
        'tax_table': None,
        'fp': data_root,
        'otu_table': None,
        'tools': ['spiec-easi', 'conet'],
        'spiec': None,
        'spar': None,
        'conet': (package_root + 'tests\\CoNet3'),
        'conet_bash': None,
        'spar_pval': None,
        'spar_boot': None,
        'levels': ['family'],
        'prev': 20,
        'min': None,
        'rar': None,
        'name': ['test'],
        'cores': None,
        'address': 'bolt://localhost:7687',
        'username': '******',
        'password': '******',
        'quit': False,
        'clear': False,
        'write': False,
        'add': False,
        'output': 'network',
        'logic': ['union', 'difference', 'intersection'],
        'neo4j': (package_root + 'tests\\neo4j')
    }
    write_biom_table(testbiom['test'], fmt='hdf5',
                     filepath=(inputs['biom_file'][0]))
    get_input(inputs)
    inputs['settings'] = inputs['fp'] + '/settings.json'
    run_network(inputs)

    # neo4j is driven through the 'job' entry of the inputs dict.
    for job in ('start', 'upload'):
        inputs['job'] = job
        run_neo4j(inputs)
    run_netstats(inputs)
    for job in ('clear', 'quit'):
        inputs['job'] = job
        run_neo4j(inputs)

    out_dir = inputs['fp']
    test = Path(out_dir + "/difference_network.graphml")
    self.assertTrue(test.is_file())

    leftovers = [inputs['biom_file'][0]]
    leftovers.extend(out_dir + suffix for suffix in (
        "/settings.json",
        "/test_family.hdf5",
        "/test_otu.hdf5",
        "/conet_test_family.txt",
        "/spiec-easi_test_family.txt",
        "/network.graphml",
        "/difference_network.graphml",
        "/union_network.graphml",
        "/intersection_network.graphml",
    ))
    for leftover in leftovers:
        call(("rm " + leftover))
def _convert(table, output_filepath, sample_metadata=None, observation_metadata=None, to_json=False, to_hdf5=False, to_tsv=False, collapsed_samples=False, collapsed_observations=False, header_key=None, output_metadata_id=None, table_type=None, process_obs_metadata=None, tsv_metadata_formatter='sc_separated'): if sum([to_tsv, to_hdf5, to_json]) == 0: raise ValueError("Must specify an output format") elif sum([to_tsv, to_hdf5, to_json]) > 1: raise ValueError("Can only specify a single output format") if table_type is None: if table.type in [None, "None"]: table.type = "Table" else: pass else: table.type = table_type if tsv_metadata_formatter is not None: obs_md_fmt_f = observation_metadata_formatters[tsv_metadata_formatter] if sample_metadata is not None: table.add_metadata(sample_metadata) # if the user does not specify a name for the output metadata column, # set it to the same as the header key output_metadata_id = output_metadata_id or header_key if process_obs_metadata is not None and not to_tsv: if table.metadata(axis='observation') is None: raise ValueError("Observation metadata processing requested " "but it doesn't appear that there is any " "metadata to operate on!") # and if this came in as TSV, then we expect only a single type of # metadata md_key = list(table.metadata(axis='observation')[0].keys())[0] process_f = observation_metadata_types[process_obs_metadata] it = zip(table.ids(axis='observation'), table.metadata(axis='observation')) new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it} if observation_metadata: for k, v in observation_metadata.items(): new_md[k].update(v) table.add_metadata(new_md, 'observation') if to_tsv: result = table.to_tsv(header_key=header_key, header_value=output_metadata_id, metadata_formatter=obs_md_fmt_f) with open(output_filepath, 'w') as f: f.write(result) return elif to_json: fmt = 'json' result = table elif to_hdf5: fmt = 'hdf5' result = table if collapsed_observations: metadata = [{ 'collapsed_ids': 
sorted(md.keys()) } for md in result.metadata(axis='observation')] result._observation_metadata = metadata if collapsed_samples: metadata = [{ 'collapsed_ids': sorted(md.keys()) } for md in result.metadata()] result._sample_metadata = metadata if collapsed_observations or collapsed_samples: # We have changed the metadata, it is safer to make sure that # it is correct result._cast_metadata() write_biom_table(result, fmt, output_filepath) return
def setUpClass(cls):
    """Build the fixtures shared by the tests of this class.

    Sets cls.table1 (a DataFrame), cls.table2 (a biom Table also written to
    a temporary directory), cls.tmpdir, and the distance matrix / sample
    metadata / per-method distances computed from the written table.
    """
    # Tab-separated rows: group, dataset, level, x, y, c
    rows = [
        'a\ta\t1\t0.0\t0.5\t0.1',
        'a\ta\t1\t1.0\t1.0\t0.2',
        'a\ta\t1\t2.0\t1.5\t0.2',
        'a\tb\t1\t3.0\t2.0\t8.',
        'a\tb\t1\t4.0\t2.5\t9.',
        'a\tb\t1\t5.0\t3.0\t10.',
        'b\ta\t1\t0.0\t2.0\t0.1',
        'b\ta\t1\t1.0\t3.0\t0.3',
        'b\ta\t1\t2.0\t4.0\t0.1',
        'b\tb\t1\t3.0\t5.0\t9.',
        'b\tb\t1\t4.0\t6.0\t11.',
        'b\tb\t1\t5.0\t7.0\t10.'
    ]
    cls.table1 = pd.DataFrame(
        [(row.split('\t')) for row in rows],
        columns=['group', 'dataset', 'level', 'x', 'y', 'c'],
        dtype=float)

    table2_json = """{"id": "None",
                      "format": "Biological Observation Matrix 1.0.0",
                      "format_url": "http:\/\/biom-format.org",
                      "type": "OTU table",
                      "generated_by": "greg",
                      "date": "2013-08-22T13:10:23.907145",
                      "matrix_type": "sparse",
                      "matrix_element_type": "float",
                      "shape": [ 3, 4 ],
                      "data": [ [ 0, 0, 1 ],
                                [ 0, 1, 2 ],
                                [ 0, 2, 3 ],
                                [ 0, 3, 4 ],
                                [ 1, 0, 2 ],
                                [ 1, 1, 0 ],
                                [ 1, 2, 7 ],
                                [ 1, 3, 8 ],
                                [ 2, 0, 9 ],
                                [ 2, 1, 10 ],
                                [ 2, 2, 11 ],
                                [ 2, 3, 12 ] ],
                      "rows": [ { "id": "o1",
                                  "metadata": { "domain": "Archaea" } },
                                { "id": "o2",
                                  "metadata": { "domain": "Bacteria" } },
                                { "id": "o3",
                                  "metadata": { "domain": "Bacteria" } } ],
                      "columns": [ { "id": "s1",
                                     "metadata": { "method": "A",
                                                   "Sample": "A",
                                                   "parameters": "A" } },
                                   { "id": "s2",
                                     "metadata": { "method": "A",
                                                   "Sample": "A",
                                                   "parameters": "B" } },
                                   { "id": "s3",
                                     "metadata": { "method": "A",
                                                   "Sample": "A",
                                                   "parameters": "C" } },
                                   { "id": "s4",
                                     "metadata": { "method": "B",
                                                   "Sample": "A",
                                                   "parameters": "D" } } ]
                      }"""

    # table 2
    # OTU ID    s1      s2      s3      s4
    # o1        1.0     2.0     3.0     4.0
    # o2        2.0     0.0     7.0     8.0
    # o3        9.0     10.0    11.0    12.0

    cls.tmpdir = mkdtemp()
    cls.table2 = Table.from_json(json.loads(table2_json))
    table2_fp = join(cls.tmpdir, 'table2.biom')
    write_biom_table(cls.table2, 'hdf5', table2_fp)

    cls.dm, cls.s_md = make_distance_matrix(table2_fp, method="braycurtis")
    cls.dist = per_method_distance(cls.dm, cls.s_md, group_by='method',
                                   standard='B', metric='distance',
                                   sample='Sample')
def merge_expected_and_observed_tables(expected_results_dir, results_dirs,
                                       md_key='taxonomy', min_count=0,
                                       taxonomy_level=6, taxa_to_keep=None,
                                       biom_fp='merged_table.biom',
                                       filename_pattern='table.L{0}-taxa.biom',
                                       dataset_ids=None, reference_ids=None,
                                       method_ids=None, parameter_ids=None,
                                       force=False):
    '''For each dataset in expected_results_dir, merge expected and observed
    taxonomy compositions.

    Each merged table is written in hdf5 format to
    expected_results_dir/<dataset_id>/<reference_id>/<biom_fp>.

    expected_results_dir: path
        directory searched for expected tables; also the root of the merged
        table destinations.
    results_dirs:
        passed to seek_results to locate observed result tables.
    md_key: str
        metadata key forwarded to add_sample_metadata_to_table.
    min_count, taxonomy_level, taxa_to_keep:
        forwarded unchanged to add_sample_metadata_to_table.
    biom_fp: str
        filename of the merged table.
    filename_pattern: str
        pattern used to look up expected tables.
    dataset_ids: list
        dataset ids (mock community study ID) to process. Defaults to None
        (process all).
    reference_ids: list
        reference database data to process. Defaults to None (process all).
    method_ids: list
        methods to process. Defaults to None (process all).
    parameter_ids: list
        parameters to process. Defaults to None (process all).
    force: bool
        must be set to True for the merge to run; with force=False the
        function raises SystemExit.

    Raises KeyError when an observed table has no matching expected table.
    '''
    # Quick and dirty way to keep merge from running automatically in notebooks
    # when users "run all" cells. This is really just a convenience function
    # that is meant to be called from the tax-credit notebooks and causing
    # force=False to kill the function is the best simple control. The
    # alternative is to work out a way to weed out expected_tables that have a
    # merged biom, and just load that biom instead of overwriting if
    # force=False. Then do the same for result_tables. If any new result_tables
    # exist, perform merge if force=True. The only time force=False should
    # result in a new table is when a new mock community/reference dataset
    # combo is added — so just let users set force=True if that's the case.
    if force is False:
        # Raised directly instead of calling exit(), which is only present
        # when the site module has installed it.
        raise SystemExit('Skipping merge. Set force=True if you intend to '
                         'generate new merged tables.')

    # Find expected tables, add sample metadata
    expected_table_lookup = get_expected_tables_lookup(
        expected_results_dir, filename_pattern=filename_pattern)

    expected_tables = {}
    for dataset_id, expected_dict in expected_table_lookup.items():
        expected_tables[dataset_id] = {}
        for reference_id, expected_table_fp in expected_dict.items():
            if not exists(join(expected_results_dir, dataset_id,
                               reference_id, biom_fp)) or force is True:
                expected_tables[dataset_id][reference_id] = \
                    add_sample_metadata_to_table(expected_table_fp,
                                                 dataset_id=dataset_id,
                                                 reference_id=reference_id,
                                                 min_count=min_count,
                                                 taxonomy_level=taxonomy_level,
                                                 taxa_to_keep=taxa_to_keep,
                                                 md_key=md_key,
                                                 method='expected',
                                                 params='expected')

    # Find observed results tables, add sample metadata
    result_tables = seek_results(
        results_dirs, dataset_ids, reference_ids, method_ids, parameter_ids)

    for dataset_id, ref_id, method, params, actual_table_fp in result_tables:

        biom_destination = join(expected_results_dir, dataset_id, ref_id,
                                biom_fp)
        if not exists(biom_destination) or force is True:
            try:
                expected_table_fp = \
                    expected_table_lookup[dataset_id][ref_id]
            except KeyError:
                raise KeyError("Can't find expected table for "
                               "({0}, {1}).".format(dataset_id, ref_id))

            # import observed table, amend sample ids
            actual_table = \
                add_sample_metadata_to_table(actual_table_fp,
                                             dataset_id=dataset_id,
                                             reference_id=ref_id,
                                             min_count=min_count,
                                             taxonomy_level=taxonomy_level,
                                             taxa_to_keep=taxa_to_keep,
                                             md_key=md_key,
                                             method=method,
                                             params=params)

            # merge expected and results tables
            expected_tables[dataset_id][ref_id] = \
                expected_tables[dataset_id][ref_id].merge(actual_table)

            # write biom table to destination
            write_biom_table(expected_tables[dataset_id][ref_id],
                             'hdf5', biom_destination)
def _convert(table, output_filepath, sample_metadata=None, observation_metadata=None, to_json=False, to_hdf5=False, to_tsv=False, collapsed_samples=False, collapsed_observations=False, header_key=None, output_metadata_id=None, table_type=None, process_obs_metadata=None, tsv_metadata_formatter='sc_separated'): if sum([to_tsv, to_hdf5, to_json]) == 0: raise ValueError("Must specify an output format") elif sum([to_tsv, to_hdf5, to_json]) > 1: raise ValueError("Can only specify a single output format") if table_type is None: if table.type in [None, "None"]: table.type = "Table" else: pass else: table.type = table_type if tsv_metadata_formatter is not None: obs_md_fmt_f = observation_metadata_formatters[tsv_metadata_formatter] if sample_metadata is not None: table.add_metadata(sample_metadata) # if the user does not specify a name for the output metadata column, # set it to the same as the header key output_metadata_id = output_metadata_id or header_key if process_obs_metadata is not None and not to_tsv: if table.metadata(axis='observation') is None: raise ValueError("Observation metadata processing requested " "but it doesn't appear that there is any " "metadata to operate on!") # and if this came in as TSV, then we expect only a single type of # metadata md_key = list(table.metadata(axis='observation')[0].keys())[0] process_f = observation_metadata_types[process_obs_metadata] it = zip(table.ids(axis='observation'), table.metadata(axis='observation')) new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it} if observation_metadata: for k, v in observation_metadata.items(): new_md[k].update(v) table.add_metadata(new_md, 'observation') if to_tsv: result = table.to_tsv(header_key=header_key, header_value=output_metadata_id, metadata_formatter=obs_md_fmt_f) with open(output_filepath, 'w') as f: f.write(result) return elif to_json: fmt = 'json' result = table elif to_hdf5: fmt = 'hdf5' result = table if collapsed_observations: metadata = [{'collapsed_ids': 
sorted(md.keys())} for md in result.metadata(axis='observation')] result._observation_metadata = metadata if collapsed_samples: metadata = [{'collapsed_ids': sorted(md.keys())} for md in result.metadata()] result._sample_metadata = metadata if collapsed_observations or collapsed_samples: # We have changed the metadata, it is safer to make sure that # it is correct result._cast_metadata() write_biom_table(result, fmt, output_filepath) return
def extract_mockrobiota_data(communities, community_md, ref_dbs,
                             mockrobiota_dir, mock_data_dir,
                             expected_data_dir, biom_fn='table.L6-taxa.biom'):
    '''Extract sample metadata, raw data files, and expected taxonomy from
    mockrobiota, copy to new destination

    communities: LIST of mock communities to extract
    community_md: DICT of metadata for mock community.
        see extract_mockrobiota_dataset_metadata()
    ref_dbs = DICT mapping marker_gene to reference set names
    mockrobiota_dir = PATH to mockrobiota repo directory
    mock_data_dir = PATH to destination directory
    expected_data_dir = PATH to destination for expected taxonomy files
    biom_fn = STR filename of the hdf5 biom table written next to each
        copied expected-taxonomy.tsv
    '''
    for community in communities:
        # dataset metadata/params for this community
        forward_read_url, index_read_url, marker_gene = community_md[community]
        ref_outdir, ref_indir, ref_version, otu_id = ref_dbs[marker_gene][0:4]

        # source directory inside the mockrobiota repo
        source_dir = join(mockrobiota_dir, "data", community)

        # destination directories for this community
        community_dir = join(mock_data_dir, community)
        seqs_dir = join(community_dir, 'raw_seqs')
        if not exists(seqs_dir):
            makedirs(seqs_dir)

        # copy sample-metadata.tsv
        copyfile(join(source_dir, 'sample-metadata.tsv'),
                 join(community_dir, 'sample-metadata.tsv'))

        # download raw data files that are missing and have a real URL
        downloads = ((forward_read_url, 'sequences.fastq.gz'),
                     (index_read_url, 'barcodes.fastq.gz'))
        for url, local_name in downloads:
            destination = join(seqs_dir, local_name)
            if exists(destination) or url == 'NA':
                continue
            try:
                urlretrieve(url, destination)
            except ValueError:
                print('Error retrieving {0}'.format(url))

        # directory holding the expected taxonomy assignments
        expected_taxa_dir = join(expected_data_dir, community, ref_outdir,
                                 "expected")
        if not exists(expected_taxa_dir):
            makedirs(expected_taxa_dir)

        # copy expected taxonomy.tsv and convert to biom
        exp_taxa_fp = join(expected_taxa_dir, 'expected-taxonomy.tsv')
        exp_biom_fp = join(expected_taxa_dir, biom_fn)
        copyfile(join(source_dir, ref_indir, ref_version, otu_id,
                      'expected-taxonomy.tsv'),
                 exp_taxa_fp)
        newbiom = amend_biom_taxonomy_ids(load_table(exp_taxa_fp))

        # add taxonomy ids (names) as observation metadata
        metadata = {}
        for sid in newbiom.ids(axis='observation'):
            metadata[sid] = {'taxonomy': sid.split(';')}
        newbiom.add_metadata(metadata, 'observation')
        write_biom_table(newbiom, 'hdf5', exp_biom_fp)