def _map_observations(table: biom.Table) -> biom.Table:
    # Map each observation (feature) ID to a copy with underscores
    # replaced by spaces, then apply the map without mutating the input.
    obs_dict = {}
    for taxa in table.ids('observation'):
        obs_dict[taxa] = taxa.replace('_', ' ')
    table = table.update_ids(id_map=obs_dict,
                             axis='observation',
                             inplace=False)
    return table
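
For context, a minimal sketch of `_map_observations` in use, on a hypothetical two-feature table whose IDs contain underscores (the table contents and IDs are invented for illustration):

import numpy as np
from biom import Table

table = Table(np.array([[1, 2], [3, 4]]),
              ['k__Bacteria_p__Firmicutes', 'k__Bacteria_p__Bacteroidetes'],
              ['S1', 'S2'])
mapped = _map_observations(table)
print(mapped.ids('observation'))
# ['k__Bacteria p__Firmicutes' 'k__Bacteria p__Bacteroidetes']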
Example 2
def rename_ids(table: biom.Table,
               metadata: qiime2.CategoricalMetadataColumn,
               axis: str = 'sample',
               strict: bool = False) -> biom.Table:

    rename = metadata.to_series()
    if axis == 'feature':
        axis = 'observation'
    old_ids = table.ids(axis=axis)

    new_ids = _generate_new_names(old_ids, rename, strict, False)

    updated = table.update_ids(new_ids, axis=axis, inplace=False)

    return updated
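
A sketch of how `rename_ids` might be invoked, assuming `_generate_new_names` (defined elsewhere in this plugin) maps each old ID to its value in the metadata column; the table, sample IDs, and column name are hypothetical:

import numpy as np
import pandas as pd
import biom
import qiime2

# The column's index holds the current sample IDs; its values hold the
# desired new IDs.
table = biom.Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
new_names = qiime2.CategoricalMetadataColumn(
    pd.Series(['subject-1', 'subject-2'],
              index=pd.Index(['S1', 'S2'], name='sample-id'),
              name='new-id'))
renamed = rename_ids(table, new_names)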
Example 3
File: 54.py Project: tkosciol/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # We know every artifact has a biom filepath, so there is no
            # need to check whether biom_fp is still None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
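
One detail worth calling out from the snippet above is the renaming convention: when tables from several source artifacts are merged, Qiita disambiguates sample IDs by prefixing each one with the ID of the artifact it came from. A minimal standalone sketch of that map (artifact ID 5 and the sample IDs are made up):

import numpy as np
from biom import Table

table = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
ids_map = {sid: '5.%s' % sid for sid in table.ids()}  # hypothetical artifact 5
# Positional arguments mirror the call above:
# axis='sample', strict=True, inplace=True
table.update_ids(ids_map, 'sample', True, True)
print(table.ids())  # ['5.S1' '5.S2']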
Example 5
class TestSculptor(TestCase):
    def setUp(self):

        # small synthetic dataset
        sample_ids = [
            's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11'
        ]
        self.mf = pd.DataFrame(data=[
            ['fasting', '8', 'A'],
            ['fasting', '-1', 'A'],
            ['control', '1', 'B'],
            ['control', '2', 'B'],
            ['control', '3', 'B'],
            ['fasting', '2', 'A'],
            ['fasting', '11', 'A'],
            ['control', '4', 'B'],
            ['control', '5', 'B'],
            ['control', '90', 'B'],
            ['fasting', '19.9', 'A'],
        ],
                               columns=['Treatment', 'Day', 'Host'],
                               index=sample_ids)
        self.mf['Day'] = pd.to_numeric(self.mf['Day'], errors='coerce')

        otu_ids = [str(i) for i in range(1, 8)]
        data = np.array([[0.0, 2.0, 5.0, 5.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 6.0, 9.0, 0.0, 4.0, 0.0],
                         [2.0, 6.0, 0.0, 0.0, 5.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0],
                         [1.0, 0.0, 8.0, 9.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0],
                         [0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 0.0],
                         [9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

        self.bt = Table(data.T, otu_ids, sample_ids)

        tree_string = ("((1:0.2, 2:0.1)3P:0.3, (((7:0.1, 8:0.1)7P:0.8, (5:0.2,"
                       " 6:0.2)8P:0.1)5P:0.1, (3:0.2, 4:0.7)6P:0.9)4P:0.3)"
                       "root;")
        self.tree = TreeNode.read(StringIO(tree_string))

        # assumes to be only directories
        self.to_delete = []

    def tearDown(self):
        for element in self.to_delete:
            shutil.rmtree(element, ignore_errors=True)

        # delete the directory only if it is empty
        try:
            os.rmdir('roc-curves')
        except (OSError, FileNotFoundError):
            pass

    def test_constructor(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host', 'test-name')

        self.assertTrue(obs.mapping_file is None)
        self.assertTrue(obs.biom_table is None)

        self.assertEqual(obs.name, 'test-name')

        self.assertTrue(obs._alpha_diversity_values is None)
        self.assertTrue(obs._beta_diversity_matrices is None)

        pd.util.testing.assert_frame_equal(self.mf, obs._original_mf)

        np.testing.assert_equal(obs._original_bt.ids(), self.bt.ids())
        np.testing.assert_equal(obs._original_bt.ids('observation'),
                                self.bt.ids('observation'))

        a = [self.bt.data(i) for i in self.bt.ids()]
        b = [obs._original_bt.data(i) for i in obs._original_bt.ids()]

        np.testing.assert_allclose(a, b)

        # needed to allow for phylogenetic metrics
        for node in obs.tree.postorder():
            self.assertTrue(node.length is not None)

    def test_constructor_errors(self):
        with self.assertRaisesRegex(ValueError, 'The gradient category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'XXX', 'Host')

        with self.assertRaisesRegex(ValueError, 'The trajectory category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'XXX')

        with self.assertRaisesRegex(ValueError, 'numeric dtype'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Treatment', 'Host')

        # rename the table's sample IDs so they no longer match the metadata
        self.bt.update_ids({i: i + 'xx' for i in self.bt.ids()}, inplace=True)
        with self.assertRaisesRegex(ValueError, 'without metadata'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host')

    def test_random_select(self):
        np.random.seed(0)
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select')

        self.assertTrue(obs.mapping_file is None)
        self.assertTrue(obs.biom_table is None)

        obs.randomly_select(3)

        # if we randomly select three samples there should be 6 in total
        self.assertTrue(len(obs.mapping_file) == 6)
        self.assertEqual(obs.biom_table.shape, (7, 6))

    def test_random_select_errors(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.alpha_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.beta_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.microbes_over_time()

    def test_alpha(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'test-alpha')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.alpha_table(['faith_pd', 'observed_otus'])

        self.assertTrue(skl._alpha_diversity_values is not None)

        columns = [
            'faith_pd_absolute_sum_of_diff', 'faith_pd_abs_mean_diff',
            'faith_pd_variance_larger_than_standard_deviation',
            'faith_pd_abs_energy', 'observed_otus_absolute_sum_of_diff',
            'observed_otus_abs_mean_diff',
            'observed_otus_variance_larger_than_standard_deviation',
            'observed_otus_abs_energy'
        ]
        data = [[2.1999999999999993, 0.5499999999999998, 0.0,
                 23.919999999999995, 2, 0.5, False, 32],
                [2.200000000000001, 0.5500000000000003, 0.0,
                 6.760000000000001, 3, 0.75, False, 22]]

        exp = pd.DataFrame(data=data,
                           index=pd.Index(['A', 'B'], name='Host'),
                           columns=columns)
        pd.util.testing.assert_frame_equal(obs, exp)

    def test_alpha_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.alpha_table(metrics=['does_not_exist'])

    def test_beta(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-test-beta')
        path = 'roc-curves/%s/cached-matrices/' % skl.name

        # avoid any unwanted accidents
        self.to_delete.append('roc-curves/%s/' % skl.name)

        np.random.seed(0)
        skl.randomly_select(5)
        obs = skl.beta_table(['unweighted_unifrac', 'jaccard'])

        data = [[0.3927777777777778, 0.4126532637086283, 0.9375,
                 0.12499999999999999],
                [0.6557886557886559, 0.1365522219610505, 1.0, 0.0]]
        columns = [
            'unweighted_unifrac_mean', 'unweighted_unifrac_std',
            'jaccard_mean', 'jaccard_std'
        ]
        exp = pd.DataFrame(data=data,
                           columns=columns,
                           index=pd.Index(['A', 'B'], name='Host'))

        pd.util.testing.assert_frame_equal(obs, exp)

        self.assertTrue(os.path.exists(path))
        self.assertTrue(os.path.exists(
            os.path.join(path, 'unweighted_unifrac.full.txt')))
        self.assertTrue(os.path.exists(os.path.join(path, 'jaccard.full.txt')))

    def test_beta_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-beta-errors')
        self.to_delete.append('roc-curves/%s' % skl.name)
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.beta_table(metrics=['does_not_exist'])

    def test_microbes_over_time(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'microbes-over-time')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.microbes_over_time()

        metrics = ['mean', 'abs_energy', 'non_zero_samples', 'abs_mean_diff']
        columns = ['%s_%s' % (a, b) for a, b in product(range(1, 8), metrics)]
        index = ['A', 'B']

        self.assertEqual(obs.columns.tolist(), columns)
        self.assertEqual(obs.index.tolist(), index)
        self.assertEqual(obs.values.shape, (2, 28))
def _update_table_sample_ids(mapping: dict, table: biom.Table) -> biom.Table:
    # Return a copy of the table with its sample IDs renamed per `mapping`.
    return table.update_ids(mapping, axis='sample', inplace=False)


def _update_table_feature_ids(mapping: dict, table: biom.Table) -> biom.Table:
    # Return a copy of the table with its feature (observation) IDs renamed.
    return table.update_ids(mapping, axis='observation', inplace=False)
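
For completeness, a quick sketch of the two helpers above on a throwaway table (all IDs invented for illustration):

import numpy as np
import biom

table = biom.Table(np.array([[1, 0], [0, 2]]), ['F1', 'F2'], ['S1', 'S2'])
print(_update_table_sample_ids({'S1': 'A1', 'S2': 'A2'}, table).ids())
# ['A1' 'A2']
print(_update_table_feature_ids({'F1': 'OTU-1', 'F2': 'OTU-2'},
                                table).ids('observation'))
# ['OTU-1' 'OTU-2']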