Ejemplo n.º 1
0
 def __init__(self, table: biom.Table, feature_id: str, **kwargs):
     super().__init__(table=table, **kwargs)
     self.feature_id = feature_id
     values = table.data(id=feature_id, axis="observation",
                         dense=True).astype(int)
     self.add_parameters({"y": values})
Ejemplo n.º 2
0
def beta_phylogenetic(table: biom.Table, phylogeny: skbio.TreeNode,
                      metric: str, weighted: bool)-> skbio.DistanceMatrix:
    if metric not in phylogenetic_metrics():
        raise ValueError("Unknown phylogenetic metric: %s" % metric)
    if table.is_empty():
        raise ValueError("The provided table object is empty")

    # Write table to temp file
    with tempfile.TemporaryDirectory() as temp_dir_name:
        table_fp = os.path.join(temp_dir_name, 'otu_table.tsv')
        newick_fp = os.path.join(temp_dir_name, 'tree.newick')
        with open(table_fp, 'w') as out_table, open(newick_fp, 'w') as newick:
            # This is easy, just write to newick
            phylogeny.write(newick)
            # We have to iterate through each sample
            out_table.write("\t" + "\t".join(table.ids(axis='observation')))
            for sample_id in table.ids(axis='sample'):
                row = table.data(sample_id)
                out_table.write("\n" + str(sample_id) + "\t" + \
                        "\t".join([str(x) for x in row]))
    # Run ExpressBetaDiversity on them
        name_map = {'braycurtis': 'Bray-Curtis',
                    'sorensen': 'Bray-Curtis',
                    'canberra': 'Canberra',
                    'chi_squared': 'Chi-squared',
                    'coeff_similarity': 'CS',
                    'complete_tree': 'CT',
                    'euclidean': 'Euclidean',
                    'f_st': 'Fst',
                    'p_st': 'Fst',
                    'gower': 'Gower',
                    'hellinger': 'Hellinger',
                    'kulczynski': 'Kulczynski',
                    'lennon': 'Lennon',
                    'manhattan': 'Manhattan',
                    'weighted_unifrac': 'Manhattan',
                    'mnnd': 'MNND',
                    'mpd': 'MPD',
                    'morisita_horn': 'Morisita-Horn',
                    'normalized_weighted_unifrac': 'NWU',
                    'pearson': 'Pearson',
                    'raohp': 'RaoHp',
                    'soergel': 'Soergel',
                    'jaccard': 'Soergel',
                    'unweighted_unifrac': 'Soergel',
                    'ruzicka': 'Soergel',
                    'tamas_coeff': 'TC',
                    'weighted_corr': 'WC',
                    'whittaker': 'Whittaker',
                    'yue_clayton': 'Yue-Clayton'
                   }
        if weighted:
            weighted = "-w"
        else:
            weighted = ""
        cmd = 'ExpressBetaDiversity -t tree.newick -s otu_table.tsv %s -c %s' \
                                                  % (weighted, name_map[metric])
        subprocess.run(cmd, cwd=temp_dir_name, shell=True)
        with open(os.path.join(temp_dir_name, 'output.diss'), 'r') as dist_file:
            nsamples = int(dist_file.readline())
            dist_mat = np.zeros((nsamples, nsamples))
            ids = []
            for i, line in enumerate(dist_file):
                ids.append(line.split("\t")[0].strip())
                for j, dist in enumerate(line.split("\t")[1:]):
                    dist_mat[i,j] = float(dist)
                    dist_mat[j,i] = float(dist)

    # Suck the data matrix back in
    # Return a DistanceMatrix object
    results = skbio.DistanceMatrix(dist_mat, ids)
    return results
Ejemplo n.º 3
0
class TestSculptor(TestCase):
    def setUp(self):

        # small synthetic dataset
        sample_ids = [
            's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11'
        ]
        self.mf = pd.DataFrame(data=[
            ['fasting', '8', 'A'],
            ['fasting', '-1', 'A'],
            ['control', '1', 'B'],
            ['control', '2', 'B'],
            ['control', '3', 'B'],
            ['fasting', '2', 'A'],
            ['fasting', '11', 'A'],
            ['control', '4', 'B'],
            ['control', '5', 'B'],
            ['control', '90', 'B'],
            ['fasting', '19.9', 'A'],
        ],
                               columns=['Treatment', 'Day', 'Host'],
                               index=sample_ids)
        self.mf['Day'] = pd.to_numeric(self.mf['Day'], errors='coerce')

        otu_ids = [str(i) for i in range(1, 8)]
        data = np.array([[0.0, 2.0, 5.0, 5.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 6.0, 9.0, 0.0, 4.0, 0.0],
                         [2.0, 6.0, 0.0, 0.0, 5.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0],
                         [1.0, 0.0, 8.0, 9.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0],
                         [0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 0.0],
                         [9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

        self.bt = Table(data.T, otu_ids, sample_ids)

        tree_string = ("((1:0.2, 2:0.1)3P:0.3, (((7:0.1, 8:0.1)7P:0.8, (5:0.2,"
                       " 6:0.2)8P:0.1)5P:0.1, (3:0.2, 4:0.7)6P:0.9)4P:0.3)"
                       "root;")
        self.tree = TreeNode.read(StringIO(tree_string))

        # assumes to be only directories
        self.to_delete = []

    def tearDown(self):
        for element in self.to_delete:
            shutil.rmtree(element, ignore_errors=True)

        # delete the directory only if it is empty
        try:
            os.rmdir('roc-curves')
        except (OSError, FileNotFoundError):
            pass

    def test_constructor(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host', 'test-name')

        self.assertTrue(obs.mapping_file is None)
        self.assertTrue(obs.biom_table is None)

        self.assertEqual(obs.name, 'test-name')

        self.assertTrue(obs._alpha_diversity_values is None)
        self.assertTrue(obs._beta_diversity_matrices is None)

        pd.util.testing.assert_frame_equal(self.mf, obs._original_mf)

        np.testing.assert_equal(obs._original_bt.ids(), self.bt.ids())
        np.testing.assert_equal(obs._original_bt.ids('observation'),
                                self.bt.ids('observation'))

        a = [self.bt.data(i) for i in self.bt.ids()]
        b = [obs._original_bt.data(i) for i in obs._original_bt.ids()]

        np.testing.assert_allclose(a, b)

        # needed to allow for phylogenetic metrics
        for node in obs.tree.postorder():
            self.assertTrue(node.length is not None)

    def test_constructor_errors(self):
        with self.assertRaisesRegex(ValueError, 'The gradient category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'XXX', 'Host')

        with self.assertRaisesRegex(ValueError, 'The trajectory category'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'XXX')

        with self.assertRaisesRegex(ValueError, 'numeric dtype'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Treatment', 'Host')

        # create fake metadata
        self.bt.update_ids({i: i + 'xx' for i in self.bt.ids()}, inplace=True)
        with self.assertRaisesRegex(ValueError, 'without metadata'):
            _ = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host')

    def test_random_select(self):
        np.random.seed(0)
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select')

        self.assertTrue(obs.mapping_file is None)
        self.assertTrue(obs.biom_table is None)

        obs.randomly_select(3)

        # if we randomly select three samples there should be 6 in total
        self.assertTrue(len(obs.mapping_file) == 6)
        self.assertEqual(obs.biom_table.shape, (7, 6))

    def test_random_select_errors(self):
        obs = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.alpha_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.beta_table()

        with self.assertRaisesRegex(ValueError, 'uniformly subsampled'):
            obs.microbes_over_time()

    def test_alpha(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'test-alpha')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.alpha_table(['faith_pd', 'observed_otus'])

        self.assertTrue(skl._alpha_diversity_values is not None)

        columns = [
            'faith_pd_absolute_sum_of_diff', 'faith_pd_abs_mean_diff',
            'faith_pd_variance_larger_than_standard_deviation',
            'faith_pd_abs_energy', 'observed_otus_absolute_sum_of_diff',
            'observed_otus_abs_mean_diff',
            'observed_otus_variance_larger_than_standard_deviation',
            'observed_otus_abs_energy'
        ]
        data = [[
            2.1999999999999993, 0.5499999999999998, 0.0, 23.919999999999995, 2,
            0.5, False, 32
        ],
                [
                    2.200000000000001, 0.5500000000000003, 0.0,
                    6.760000000000001, 3, 0.75, False, 22
                ]]

        exp = pd.DataFrame(data=data,
                           index=pd.Index(['A', 'B'], name='Host'),
                           columns=columns)
        pd.util.testing.assert_frame_equal(obs, exp)

    def test_alpha_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'random-select-errors')
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.alpha_table(metrics=['does_not_exist'])

    def test_beta(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-test-beta')
        path = 'roc-curves/%s/cached-matrices/' % skl.name

        # avoid any unwanted accidents
        self.to_delete.append('roc-curves/%s/' % skl.name)

        np.random.seed(0)
        skl.randomly_select(5)
        obs = skl.beta_table(['unweighted_unifrac', 'jaccard'])

        data = [[
            0.3927777777777778, 0.4126532637086283, 0.9375, 0.12499999999999999
        ], [0.6557886557886559, 0.1365522219610505, 1.0, 0.0]]
        columns = [
            'unweighted_unifrac_mean', 'unweighted_unifrac_std',
            'jaccard_mean', 'jaccard_std'
        ]
        exp = pd.DataFrame(data=data,
                           columns=columns,
                           index=pd.Index(['A', 'B'], name='Host'))

        pd.util.testing.assert_frame_equal(obs, exp)

        self.assertTrue(os.path.exists(path))
        self.assertTrue(
            os.path.exists(os.path.join(path, 'unweighted_unifrac.full.'
                                        'txt')))
        self.assertTrue(os.path.exists(os.path.join(path, 'jaccard.full.txt')))

    def test_beta_errors(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'unittest-beta-errors')
        self.to_delete.append('roc-curves/%s' % skl.name)
        skl.randomly_select(5)
        with self.assertRaisesRegex(ValueError, 'find one or more metrics'):
            skl.beta_table(metrics=['does_not_exist'])

    def test_microbes_over_time(self):
        skl = Sculptor(self.bt, self.mf, self.tree, 'Day', 'Host',
                       'microbes-over-time')
        np.random.seed(0)
        skl.randomly_select(5)

        obs = skl.microbes_over_time()

        metrics = ['mean', 'abs_energy', 'non_zero_samples', 'abs_mean_diff']
        columns = ['%s_%s' % (a, b) for a, b in product(range(1, 8), metrics)]
        index = ['A', 'B']

        self.assertEqual(obs.columns.tolist(), columns)
        self.assertEqual(obs.index.tolist(), index)
        self.assertEqual(obs.values.shape, (2, 28))