Example #1
 def test_import_biom_obs(self):
     """
     Tests if the import_biom function reads the correct database file,
     and imports the biom file, by querying the counts table.
     :return:
     """
     conn_object = BiomConnection()
     conn_object.create_tables()
     write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
     import_biom("test.biom", mapping=None)
     os.remove("test.biom")
     conn = psycopg2.connect(
         **{
             "host": "localhost",
             "database": "test",
             "user": "******",
             "password": "******"
         })
     cur = conn.cursor()
     cur.execute("SELECT sampleid " "FROM counts " "LIMIT 5;")
     result = cur.fetchall()
     cur.close()
     conn.close()
     conn_object.delete_tables()
     result = [x[0] for x in result]
     self.assertCountEqual(
         result, ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5'])
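The tests in these examples rely on a module-level testbiom fixture and a write_biom_table helper with a (table, fmt, filepath) signature. As a minimal sketch, such a fixture could be built with biom's Table class and written to disk like this (the helper import is an assumption; massoc and biom-format each ship their own variant of write_biom_table):

import numpy as np
from biom.table import Table
from biom.cli.util import write_biom_table  # assumed location of the helper; adjust to your project

# Tiny 2-observation x 5-sample table standing in for the testbiom fixture
data = np.arange(10).reshape(2, 5)
testbiom = Table(data,
                 observation_ids=['OTU_1', 'OTU_2'],
                 sample_ids=['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5'])

# Writing the 'hdf5' format requires h5py to be installed
write_biom_table(testbiom, 'hdf5', 'test.biom')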
Example #2
 def test_import_biom(self):
     """
     Tests if the import_biom function reads the correct database file,
     and imports the biom file, by querying the bioms table.
     :return:
     """
     conn_object = BiomConnection()
     conn_object.create_tables()
     write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
     import_biom("test.biom", mapping=None)
     os.remove("test.biom")
     conn = psycopg2.connect(
         **{
             "host": "localhost",
             "database": "test",
             "user": "******",
             "password": "******"
         })
     cur = conn.cursor()
     cur.execute("SELECT studyID " "FROM bioms " "LIMIT 1;")
     result = cur.fetchall()
     cur.close()
     conn.close()
     conn_object.delete_tables()
     self.assertEqual(result[0][0], 'test')
Example #3
def add_metadata_to_biom_table(biom_input_fp, taxonomy_map_fp, biom_output_fp):
    '''Load biom, add metadata, write to new table'''
    newbiom = load_table(biom_input_fp)
    if stat(taxonomy_map_fp).st_size == 0:
        metadata = {}
    else:
        metadata = MetadataMap.from_file(taxonomy_map_fp,
                                         header=['Sample ID', 'taxonomy', 'c'])
    newbiom.add_metadata(metadata, 'observation')
    write_biom_table(newbiom, 'json', biom_output_fp)
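A hypothetical call to the function above, assuming the taxonomy map is a tab-separated file keyed by observation ID (all file names are illustrative):

# Illustrative paths only; the function above writes the result in JSON format
add_metadata_to_biom_table('otu_table.biom',
                           'taxonomy_map.tsv',
                           'otu_table_with_taxonomy.biom')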
Example #4
def add_metadata(input_fp, output_fp, sample_metadata_fp,
                 observation_metadata_fp, sc_separated, sc_pipe_separated,
                 int_fields, float_fields, sample_header, observation_header,
                 output_as_json):
    """Add metadata to a BIOM table.

    Add sample and/or observation metadata to BIOM-formatted files. See
    examples here: http://biom-format.org/documentation/adding_metadata.html

    Example usage:

    Add sample metadata to a BIOM table:

    $ biom add-metadata -i otu_table.biom -o table_with_sample_metadata.biom
      -m sample_metadata.txt
    """
    table = load_table(input_fp)
    if sample_metadata_fp is not None:
        sample_metadata_f = open(sample_metadata_fp, 'U')
    else:
        sample_metadata_f = None
    if observation_metadata_fp is not None:
        observation_metadata_f = open(observation_metadata_fp, 'U')
    else:
        observation_metadata_f = None
    if sc_separated is not None:
        sc_separated = sc_separated.split(',')
    if sc_pipe_separated is not None:
        sc_pipe_separated = sc_pipe_separated.split(',')
    if int_fields is not None:
        int_fields = int_fields.split(',')
    if float_fields is not None:
        float_fields = float_fields.split(',')
    if sample_header is not None:
        sample_header = sample_header.split(',')
    if observation_header is not None:
        observation_header = observation_header.split(',')

    result = _add_metadata(table, sample_metadata_f, observation_metadata_f,
                           sc_separated, sc_pipe_separated, int_fields,
                           float_fields, sample_header, observation_header)

    if output_as_json:
        fmt = 'json'
    else:
        fmt = 'hdf5'

    write_biom_table(result, fmt, output_fp)
Example #5
    def write_bioms(self, fmt='hdf5'):
        """
        Utility function that writes BIOM files
        in a Batch object to HDF5 files.
        OTU files are always written to disk,
        the rest only if required.

        :param fmt: Format for writing; 'hdf5' or 'json'.
        :return:
        """
        for x in self.inputs['name']:
            for level in self.inputs['levels']:
                filename = self.inputs['fp'] + '/' + x + '_' + level + '.hdf5'
                try:
                    write_biom_table(self.levels[level][x], fmt, filename)
                except Exception:
                    logger.error("Cannot write " + str(x) + " to disk", exc_info=True)
Example #6
def denoise_to_feature_table(demux_seqs,
                             trim_left,
                             trunc_len,
                             community_dir,
                             rep_seqs_fn='rep_seqs',
                             feature_table_fn='feature_table.qza',
                             biom_table_fn='feature_table.biom',
                             summary_fn='feature_table_summary.qzv'):
    '''SampleData[SequencesWithQuality] -> FeatureData[Sequence] +
                                           FeatureTable[Frequency]
    denoise fastqs with dada2, create feature table, rep_seqs,
        and view stats.

        demux_seqs = SampleData[SequencesWithQuality]
            demultiplexed seqs output from qiime2.demux.methods.emp()
        trim_left = int
            trim X bases from 5' end
        trunc_len = int
            length to truncate all sequences
        community_dir: path
            destination directory to print results
        rep_seqs_fn = str
            filename of representative sequences output Artifact
        feature_table_fn = str
            filename of feature table output Artifact
        summary_fn = str
            filename of feature table summary output visualization
    '''
    biom_table, rep_seqs = dada2.methods.denoise_single(demux_seqs,
                                                        trim_left=trim_left,
                                                        trunc_len=trunc_len)
    # save Artifact
    rep_seqs.save(join(community_dir, rep_seqs_fn))

    # save biom Artifact
    biom_table.save(join(community_dir, feature_table_fn))
    biom_table_fp = join(community_dir, biom_table_fn)
    write_biom_table(biom_table.view(Table), 'hdf5', biom_table_fp)

    # summarize feature table
    feature_table_summary = feature_table.visualizers.summarize(biom_table)
    feature_table_summary.visualization.save(join(community_dir, summary_fn))

    return biom_table, rep_seqs
Example #7
 def test_run_network(self):
     """
     Checks whether run_network writes network output files
     when inputs are supplied.
     """
     inputs = {
         'biom_file': [(testloc[0] + '/data/test.biom')],
         'cluster': None,
         'otu_meta': None,
         'prefix': None,
         'sample_data': None,
         'split': None,
         'tax_table': None,
         'fp': testloc[0],
         'otu_table': None,
         'tools': ['spiec-easi', 'conet'],
         'spiec': None,
         'spar': None,
         'conet': (os.path.dirname(massoc.__file__)[:-6] + 'tests\\CoNet3'),
         'conet_bash': None,
         'spar_pval': None,
         'spar_boot': None,
         'levels': ['family'],
         'prev': 20,
         'min': None,
         'rar': None,
         'name': ['test'],
         'cores': None
     }
     write_biom_table(testbiom['test'],
                      fmt='hdf5',
                      filepath=(inputs['biom_file'][0]))
     get_input(inputs)
     inputs['settings'] = inputs['fp'] + '/settings.json'
     run_network(inputs)
     test = Path(inputs['fp'] + "/conet_family_test.txt")
     self.assertTrue(test.is_file())
     call(("rm " + inputs['biom_file'][0]))
     call(("rm " + inputs['fp'] + "/settings.json"))
     call(("rm " + inputs['fp'] + "/test_family.hdf5"))
     call(("rm " + inputs['fp'] + "/test_otu.hdf5"))
     call(("rm " + inputs['fp'] + "/conet_test_family.txt"))
     call(("rm " + inputs['fp'] + "/spiec-easi_test_family.txt"))
Example #8
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch BIOM file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    input_f = open(input_fp, 'U')
    if rep_set_fp is not None:
        rep_set_f = open(rep_set_fp, 'U')
    else:
        rep_set_f = None
    table = _from_uc(input_f, rep_set_f)
    write_biom_table(table, 'hdf5', output_fp)
Example #9
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch BIOM file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    input_f = open(input_fp, "U")
    if rep_set_fp is not None:
        rep_set_f = open(rep_set_fp, "U")
    else:
        rep_set_f = None
    table = _from_uc(input_f, rep_set_f)
    write_biom_table(table, "hdf5", output_fp)
Example #10
def normalize_table(input_fp, output_fp, relative_abund, presence_absence, axis):
    """Normalize a BIOM table.

    Normalize the values of a BIOM table through various methods. Relative
    abundance will take the relative abundance of each observation in terms of
    samples or observations. Presence/absence will convert observations to
    1's and 0's based on presence of the observation.

    Example usage:

    Normalizing a BIOM table to relative abundance:

    $ biom normalize-table -i table.biom -r -o normalized_table.biom

    Converting a BIOM table to a presence/absence table:

    $ biom normalize-table -i table.biom -p -o converted_table.biom
    """
    table = load_table(input_fp)
    result = _normalize_table(table, relative_abund, presence_absence, axis)

    write_biom_table(result, "hdf5" if HAVE_H5PY else "json", output_fp)
Example #11
def normalize_table(input_fp, output_fp, relative_abund, presence_absence,
                    axis):
    """Normalize a BIOM table.

    Normalize the values of a BIOM table through various methods. Relative
    abundance will take the relative abundance of each observation in terms of
    samples or observations. Presence/absence will convert observations to
    1's and 0's based on presence of the observation.

    Example usage:

    Normalizing a BIOM table to relative abundance:

    $ biom normalize-table -i table.biom -r -o normalized_table.biom

    Converting a BIOM table to a presence/absence table:

    $ biom normalize-table -i table.biom -p -o converted_table.biom
    """
    table = load_table(input_fp)
    result = _normalize_table(table, relative_abund, presence_absence, axis)

    write_biom_table(result, 'hdf5' if HAVE_H5PY else 'json', output_fp)
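Both normalizations are also available directly on biom's Table objects, which can be more convenient than the CLI wrapper above when already working in Python; a minimal sketch:

import numpy as np
from biom.table import Table

table = Table(np.array([[1.0, 3.0], [3.0, 1.0]]),
              observation_ids=['o1', 'o2'],
              sample_ids=['s1', 's2'])

rel_abund = table.norm(axis='sample', inplace=False)  # per-sample relative abundance
pres_abs = table.pa(inplace=False)                    # presence/absence (1/0) conversion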
Example #12
 def test_get_input(self):
     """
     Checks whether get_input writes a settings file
     if inputs are supplied.
     """
     inputs = {
         'biom_file': [(testloc[0] + '/data/test.biom')],
         'cluster': None,
         'otu_meta': None,
         'prefix': None,
         'sample_data': None,
         'split': None,
         'tax_table': None,
         'fp': testloc[0],
         'otu_table': None,
         'tools': ['spiec-easi', 'conet'],
         'spiec': None,
         'conet': None,
         'spar_pval': None,
         'spar_boot': None,
         'levels': ['family'],
         'prev': 20,
         'min': None,
         'rar': None,
         'name': ['test'],
         'cores': None
     }
     write_biom_table(testbiom['test'],
                      fmt='hdf5',
                      filepath=(inputs['biom_file'][0]))
     get_input(inputs)
     test = Path(inputs['fp'] + "/settings.json")
     self.assertTrue(test.is_file())
     call(("rm " + inputs['biom_file'][0]))
     call(("rm " + inputs['fp'] + "/settings.json"))
     call(("rm " + inputs['fp'] + "/test_family.hdf5"))
     call(("rm " + inputs['fp'] + "/test_otu.hdf5"))
Example #13
 def test_import_biom_mapping(self):
     """
     Tests if the BiomConnection uses the mapping dict.
     :return:
     """
     conn_object = BiomConnection()
     conn_object.create_tables()
     write_biom_table(testbiom, fmt='hdf5', filepath="test.biom")
     import_biom("test.biom", mapping={'test': 'banana'})
     os.remove("test.biom")
     conn = psycopg2.connect(
         **{
             "host": "localhost",
             "database": "test",
             "user": "******",
             "password": "******"
         })
     cur = conn.cursor()
     cur.execute("SELECT studyID " "FROM bioms " "LIMIT 1;")
     result = cur.fetchall()
     cur.close()
     conn.close()
     conn_object.delete_tables()
     self.assertEqual(result[0][0], 'banana')
Example #14
 def test_run_netstats(self):
     """
     Checks whether run_netstats writes the union, difference and
     intersection networks when inputs are supplied.
     """
     inputs = {
         'biom_file': [(testloc[0] + '/data/test.biom')],
         'cluster': None,
         'otu_meta': None,
         'prefix': None,
         'sample_data': None,
         'split': None,
         'tax_table': None,
         'fp': testloc[0],
         'otu_table': None,
         'tools': ['spiec-easi', 'conet'],
         'spiec': None,
         'spar': None,
         'conet': (os.path.dirname(massoc.__file__)[:-6] + 'tests\\CoNet3'),
         'conet_bash': None,
         'spar_pval': None,
         'spar_boot': None,
         'levels': ['family'],
         'prev': 20,
         'min': None,
         'rar': None,
         'name': ['test'],
         'cores': None,
         'address': 'bolt://localhost:7687',
         'username': '******',
         'password': '******',
         'quit': False,
         'clear': False,
         'write': False,
         'add': False,
         'output': 'network',
         'logic': ['union', 'difference', 'intersection'],
         'neo4j': (os.path.dirname(massoc.__file__)[:-6] + 'tests\\neo4j')
     }
     write_biom_table(testbiom['test'],
                      fmt='hdf5',
                      filepath=(inputs['biom_file'][0]))
     get_input(inputs)
     inputs['settings'] = inputs['fp'] + '/settings.json'
     run_network(inputs)
     inputs['job'] = 'start'
     run_neo4j(inputs)
     inputs['job'] = 'upload'
     run_neo4j(inputs)
     run_netstats(inputs)
     inputs['job'] = 'clear'
     run_neo4j(inputs)
     inputs['job'] = 'quit'
     run_neo4j(inputs)
     test = Path(inputs['fp'] + "/difference_network.graphml")
     self.assertTrue(test.is_file())
     call(("rm " + inputs['biom_file'][0]))
     call(("rm " + inputs['fp'] + "/settings.json"))
     call(("rm " + inputs['fp'] + "/test_family.hdf5"))
     call(("rm " + inputs['fp'] + "/test_otu.hdf5"))
     call(("rm " + inputs['fp'] + "/conet_test_family.txt"))
     call(("rm " + inputs['fp'] + "/spiec-easi_test_family.txt"))
     call(("rm " + inputs['fp'] + "/network.graphml"))
     call(("rm " + inputs['fp'] + "/difference_network.graphml"))
     call(("rm " + inputs['fp'] + "/union_network.graphml"))
     call(("rm " + inputs['fp'] + "/intersection_network.graphml"))
Example #15
def _convert(table,
             output_filepath,
             sample_metadata=None,
             observation_metadata=None,
             to_json=False,
             to_hdf5=False,
             to_tsv=False,
             collapsed_samples=False,
             collapsed_observations=False,
             header_key=None,
             output_metadata_id=None,
             table_type=None,
             process_obs_metadata=None,
             tsv_metadata_formatter='sc_separated'):

    if sum([to_tsv, to_hdf5, to_json]) == 0:
        raise ValueError("Must specify an output format")
    elif sum([to_tsv, to_hdf5, to_json]) > 1:
        raise ValueError("Can only specify a single output format")

    if table_type is None:
        if table.type in [None, "None"]:
            table.type = "Table"
        else:
            pass
    else:
        table.type = table_type

    if tsv_metadata_formatter is not None:
        obs_md_fmt_f = observation_metadata_formatters[tsv_metadata_formatter]

    if sample_metadata is not None:
        table.add_metadata(sample_metadata)

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    output_metadata_id = output_metadata_id or header_key

    if process_obs_metadata is not None and not to_tsv:
        if table.metadata(axis='observation') is None:
            raise ValueError("Observation metadata processing requested "
                             "but it doesn't appear that there is any "
                             "metadata to operate on!")

        # and if this came in as TSV, then we expect only a single type of
        # metadata
        md_key = list(table.metadata(axis='observation')[0].keys())[0]

        process_f = observation_metadata_types[process_obs_metadata]
        it = zip(table.ids(axis='observation'),
                 table.metadata(axis='observation'))
        new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it}

        if observation_metadata:
            for k, v in observation_metadata.items():
                new_md[k].update(v)
        table.add_metadata(new_md, 'observation')

    if to_tsv:
        result = table.to_tsv(header_key=header_key,
                              header_value=output_metadata_id,
                              metadata_formatter=obs_md_fmt_f)
        with open(output_filepath, 'w') as f:
            f.write(result)
        return
    elif to_json:
        fmt = 'json'
        result = table
    elif to_hdf5:
        fmt = 'hdf5'
        result = table
        if collapsed_observations:
            metadata = [{
                'collapsed_ids': sorted(md.keys())
            } for md in result.metadata(axis='observation')]
            result._observation_metadata = metadata
        if collapsed_samples:
            metadata = [{
                'collapsed_ids': sorted(md.keys())
            } for md in result.metadata()]
            result._sample_metadata = metadata
        if collapsed_observations or collapsed_samples:
            # We have changed the metadata, it is safer to make sure that
            # it is correct
            result._cast_metadata()
    write_biom_table(result, fmt, output_filepath)

    return
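A hypothetical programmatic use of _convert for a JSON round-trip (the input path is illustrative; load_table is the biom loader used elsewhere in these examples):

from biom import load_table

# Load an existing BIOM file and rewrite it as JSON-formatted BIOM via _convert above
table = load_table('table.biom')
_convert(table, 'table.json.biom', to_json=True)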
Example #16
    @classmethod
    def setUpClass(cls):
        _table1 = [
            'a\ta\t1\t0.0\t0.5\t0.1', 'a\ta\t1\t1.0\t1.0\t0.2',
            'a\ta\t1\t2.0\t1.5\t0.2', 'a\tb\t1\t3.0\t2.0\t8.',
            'a\tb\t1\t4.0\t2.5\t9.', 'a\tb\t1\t5.0\t3.0\t10.',
            'b\ta\t1\t0.0\t2.0\t0.1', 'b\ta\t1\t1.0\t3.0\t0.3',
            'b\ta\t1\t2.0\t4.0\t0.1', 'b\tb\t1\t3.0\t5.0\t9.',
            'b\tb\t1\t4.0\t6.0\t11.', 'b\tb\t1\t5.0\t7.0\t10.'
        ]

        cls.table1 = pd.DataFrame(
            [(n.split('\t')) for n in _table1],
            columns=['group', 'dataset', 'level', 'x', 'y', 'c'],
            dtype=float)

        cls.table2 = """{"id": "None",
                          "format": "Biological Observation Matrix 1.0.0",
                          "format_url": "http:\/\/biom-format.org",
                          "type": "OTU table",
                          "generated_by": "greg",
                          "date": "2013-08-22T13:10:23.907145",
                          "matrix_type": "sparse",
                          "matrix_element_type": "float",
                          "shape": [
                            3,
                            4
                          ],
                          "data": [
                            [
                              0,
                              0,
                              1
                            ],
                            [
                              0,
                              1,
                              2
                            ],
                            [
                              0,
                              2,
                              3
                            ],
                            [
                              0,
                              3,
                              4
                            ],
                            [
                              1,
                              0,
                              2
                            ],
                            [
                              1,
                              1,
                              0
                            ],
                            [
                              1,
                              2,
                              7
                            ],
                            [
                              1,
                              3,
                              8
                            ],
                            [
                              2,
                              0,
                              9
                            ],
                            [
                              2,
                              1,
                              10
                            ],
                            [
                              2,
                              2,
                              11
                            ],
                            [
                              2,
                              3,
                              12
                            ]
                          ],
                          "rows": [
                            {
                              "id": "o1",
                              "metadata": {
                                "domain": "Archaea"
                              }
                            },
                            {
                              "id": "o2",
                              "metadata": {
                                "domain": "Bacteria"
                              }
                            },
                            {
                              "id": "o3",
                              "metadata": {
                                "domain": "Bacteria"
                              }
                            }
                          ],
                          "columns": [
                            {
                              "id": "s1",
                              "metadata": {
                                "method": "A",
                                "Sample": "A",
                                "parameters": "A"
                              }
                            },
                            {
                              "id": "s2",
                              "metadata": {
                                "method": "A",
                                "Sample": "A",
                                "parameters": "B"
                              }
                            },
                            {
                              "id": "s3",
                              "metadata": {
                                "method": "A",
                                "Sample": "A",
                                "parameters": "C"
                              }
                            },
                            {
                              "id": "s4",
                              "metadata": {
                                "method": "B",
                                "Sample": "A",
                                "parameters": "D"
                              }
                            }
                          ]
                        }"""
        # table 2
        # OTU ID    s1      s2      s3      s4
        # o1        1.0     2.0     3.0     4.0
        # o2        2.0     0.0     7.0     8.0
        # o3        9.0     10.0    11.0    12.0

        cls.tmpdir = mkdtemp()
        cls.table2 = Table.from_json(json.loads(cls.table2))
        write_biom_table(cls.table2, 'hdf5', join(cls.tmpdir, 'table2.biom'))
        cls.dm, cls.s_md = make_distance_matrix(join(cls.tmpdir,
                                                     'table2.biom'),
                                                method="braycurtis")
        cls.dist = per_method_distance(cls.dm,
                                       cls.s_md,
                                       group_by='method',
                                       standard='B',
                                       metric='distance',
                                       sample='Sample')
Example #17
def merge_expected_and_observed_tables(expected_results_dir,
                                       results_dirs,
                                       md_key='taxonomy',
                                       min_count=0,
                                       taxonomy_level=6,
                                       taxa_to_keep=None,
                                       biom_fp='merged_table.biom',
                                       filename_pattern='table.L{0}-taxa.biom',
                                       dataset_ids=None,
                                       reference_ids=None,
                                       method_ids=None,
                                       parameter_ids=None,
                                       force=False):
    '''For each dataset in expected_results_dir, merge expected and observed
    taxonomy compositions.

    dataset_ids: list
        dataset ids (mock community study ID) to process. Defaults to None
        (process all).
    reference_ids: list
        reference database data to process. Defaults to None (process all).
    method_ids: list
        methods to process. Defaults to None (process all).
    parameter_ids: list
        parameters to process. Defaults to None (process all).
    '''
    # Quick and dirty way to keep merge from running automatically in notebooks
    # when users "run all" cells. This is really just a convenience function
    # that is meant to be called from the tax-credit notebooks and causing
    # force=False to kill the function is the best simple control. The
    # alternative is to work out a way to weed out expected_tables that have a
    # merged biom, and just load that biom instead of overwriting if
    # force=False. Then do the same for result_tables. If any new result_tables
    # exist, perform merge if force=True. The only time force=False should
    # result in a new table is when a new mock community/reference dataset
    # combo is added — so just let users set force=True if that's the case.
    if force is False:
        exit('Skipping merge. Set force=True if you intend to generate new '
             'merged tables.')

    # Find expected tables, add sample metadata
    expected_table_lookup = get_expected_tables_lookup(
        expected_results_dir, filename_pattern=filename_pattern)

    expected_tables = {}
    for dataset_id, expected_dict in expected_table_lookup.items():
        expected_tables[dataset_id] = {}
        for reference_id, expected_table_fp in expected_dict.items():
            if not exists(
                    join(expected_results_dir, dataset_id, reference_id,
                         biom_fp)) or force is True:
                expected_tables[dataset_id][reference_id] = \
                    add_sample_metadata_to_table(expected_table_fp,
                                                 dataset_id=dataset_id,
                                                 reference_id=reference_id,
                                                 min_count=min_count,
                                                 taxonomy_level=taxonomy_level,
                                                 taxa_to_keep=taxa_to_keep,
                                                 md_key='taxonomy',
                                                 method='expected',
                                                 params='expected')

    # Find observed results tables, add sample metadata
    result_tables = seek_results(results_dirs, dataset_ids, reference_ids,
                                 method_ids, parameter_ids)

    for dataset_id, ref_id, method, params, actual_table_fp in result_tables:

        biom_destination = join(expected_results_dir, dataset_id, ref_id,
                                biom_fp)
        if not exists(biom_destination) or force is True:
            try:
                expected_table_fp = \
                    expected_table_lookup[dataset_id][ref_id]
            except KeyError:
                raise KeyError("Can't find expected table for \
                                ({0}, {1}).".format(dataset_id, ref_id))

            # import expected table, amend sample ids
            actual_table = \
                add_sample_metadata_to_table(actual_table_fp,
                                             dataset_id=dataset_id,
                                             reference_id=ref_id,
                                             min_count=min_count,
                                             taxonomy_level=taxonomy_level,
                                             taxa_to_keep=taxa_to_keep,
                                             md_key='taxonomy',
                                             method=method,
                                             params=params)

            # merge expected and results tables
            expected_tables[dataset_id][ref_id] = \
                expected_tables[dataset_id][ref_id].merge(actual_table)

            # write biom table to destination
            write_biom_table(expected_tables[dataset_id][ref_id], 'hdf5',
                             biom_destination)
Example #18
def _convert(table, output_filepath, sample_metadata=None,
             observation_metadata=None, to_json=False, to_hdf5=False,
             to_tsv=False, collapsed_samples=False,
             collapsed_observations=False, header_key=None,
             output_metadata_id=None, table_type=None,
             process_obs_metadata=None, tsv_metadata_formatter='sc_separated'):

    if sum([to_tsv, to_hdf5, to_json]) == 0:
        raise ValueError("Must specify an output format")
    elif sum([to_tsv, to_hdf5, to_json]) > 1:
        raise ValueError("Can only specify a single output format")

    if table_type is None:
        if table.type in [None, "None"]:
            table.type = "Table"
        else:
            pass
    else:
        table.type = table_type

    if tsv_metadata_formatter is not None:
        obs_md_fmt_f = observation_metadata_formatters[tsv_metadata_formatter]

    if sample_metadata is not None:
        table.add_metadata(sample_metadata)

    # if the user does not specify a name for the output metadata column,
    # set it to the same as the header key
    output_metadata_id = output_metadata_id or header_key

    if process_obs_metadata is not None and not to_tsv:
        if table.metadata(axis='observation') is None:
            raise ValueError("Observation metadata processing requested "
                             "but it doesn't appear that there is any "
                             "metadata to operate on!")

        # and if this came in as TSV, then we expect only a single type of
        # metadata
        md_key = list(table.metadata(axis='observation')[0].keys())[0]

        process_f = observation_metadata_types[process_obs_metadata]
        it = zip(table.ids(axis='observation'),
                 table.metadata(axis='observation'))
        new_md = {id_: {md_key: process_f(md[md_key])} for id_, md in it}

        if observation_metadata:
            for k, v in observation_metadata.items():
                new_md[k].update(v)
        table.add_metadata(new_md, 'observation')

    if to_tsv:
        result = table.to_tsv(header_key=header_key,
                              header_value=output_metadata_id,
                              metadata_formatter=obs_md_fmt_f)
        with open(output_filepath, 'w') as f:
            f.write(result)
        return
    elif to_json:
        fmt = 'json'
        result = table
    elif to_hdf5:
        fmt = 'hdf5'
        result = table
        if collapsed_observations:
            metadata = [{'collapsed_ids': sorted(md.keys())}
                        for md in result.metadata(axis='observation')]
            result._observation_metadata = metadata
        if collapsed_samples:
            metadata = [{'collapsed_ids': sorted(md.keys())}
                        for md in result.metadata()]
            result._sample_metadata = metadata
        if collapsed_observations or collapsed_samples:
            # We have changed the metadata, it is safer to make sure that
            # it is correct
            result._cast_metadata()
    write_biom_table(result, fmt, output_filepath)

    return
Example #19
def extract_mockrobiota_data(communities,
                             community_md,
                             ref_dbs,
                             mockrobiota_dir,
                             mock_data_dir,
                             expected_data_dir,
                             biom_fn='table.L6-taxa.biom'):
    '''Extract sample metadata, raw data files, and expected taxonomy
    from mockrobiota, and copy them to a new destination.

    communities: LIST of mock communities to extract
    community_md: DICT of metadata for mock community.
        see extract_mockrobiota_dataset_metadata()
    ref_dbs: DICT mapping marker_gene to reference set names
    mockrobiota_dir: PATH to mockrobiota repo directory
    mock_data_dir: PATH to destination directory
    expected_data_dir: PATH to destination for expected taxonomy files
    '''
    for community in communities:
        # extract dataset metadata/params
        forward_read_url, index_read_url, marker_gene = community_md[community]
        ref_outdir, ref_indir, ref_version, otu_id = ref_dbs[marker_gene][0:4]

        # mockrobiota source directory
        mockrobiota_community_dir = join(mockrobiota_dir, "data", community)

        # new mock community directory
        community_dir = join(mock_data_dir, community)
        seqs_dir = join(community_dir, 'raw_seqs')
        if not exists(seqs_dir):
            makedirs(seqs_dir)
        # copy sample-metadata.tsv
        copyfile(join(mockrobiota_community_dir, 'sample-metadata.tsv'),
                 join(community_dir, 'sample-metadata.tsv'))
        # download raw data files
        for file_url_dest in [(forward_read_url, 'sequences.fastq.gz'),
                              (index_read_url, 'barcodes.fastq.gz')]:
            destination = join(seqs_dir, file_url_dest[1])
            if not exists(destination) and file_url_dest[0] != 'NA':
                try:
                    urlretrieve(file_url_dest[0], destination)
                except ValueError:
                    print('Error retrieving {0}'.format(file_url_dest[0]))

        # new directory containing expected taxonomy assignments at each level
        expected_taxa_dir = join(expected_data_dir, community, ref_outdir,
                                 "expected")
        if not exists(expected_taxa_dir):
            makedirs(expected_taxa_dir)
        # copy expected taxonomy.tsv and convert to biom
        exp_taxa_fp = join(expected_taxa_dir, 'expected-taxonomy.tsv')
        exp_biom_fp = join(expected_taxa_dir, biom_fn)
        copyfile(
            join(mockrobiota_community_dir, ref_indir, ref_version, otu_id,
                 'expected-taxonomy.tsv'), exp_taxa_fp)
        newbiom = amend_biom_taxonomy_ids(load_table(exp_taxa_fp))
        # add taxonomy ids (names) as observation metadata
        metadata = {
            sid: {
                'taxonomy': sid.split(';')
            }
            for sid in newbiom.ids(axis='observation')
        }
        newbiom.add_metadata(metadata, 'observation')
        write_biom_table(newbiom, 'hdf5', exp_biom_fp)