Example #1
0
    def setUp(self):
        # Every three-sample fixture below uses this ID ordering unless it
        # states otherwise.
        sample_ids = ['sample1', 'sample2', 'sample3']

        dm1_rows = [[0.00, 0.25, 0.25],
                    [0.25, 0.00, 0.00],
                    [0.25, 0.00, 0.00]]
        self.dm1 = skbio.DistanceMatrix(dm1_rows, ids=sample_ids)

        # Positive correlation with `dm1`
        dm2_rows = [[0.00, 1.00, 2.00],
                    [1.00, 0.00, 1.00],
                    [2.00, 1.00, 0.00]]
        self.dm2 = skbio.DistanceMatrix(dm2_rows, ids=sample_ids)

        # Perfect negative correlation with `dm1`
        dm3_rows = [[0.00, 0.00, 0.00],
                    [0.00, 0.00, 0.25],
                    [0.00, 0.25, 0.00]]
        self.dm3 = skbio.DistanceMatrix(dm3_rows, ids=sample_ids)

        # Same pairwise distances as `dm2`, but with rows/columns listed in
        # a different ID order.
        reordered_rows = [[0.00, 2.00, 1.00],
                          [2.00, 0.00, 1.00],
                          [1.00, 1.00, 0.00]]
        self.dm2_reordered = skbio.DistanceMatrix(
            reordered_rows, ids=['sample3', 'sample1', 'sample2'])

        # Five-sample matrix: contains the three shared IDs plus two extra
        # ones ('foo' and 'x').
        mismatched_rows = [[0.0, 0.0, 0.0, 0.0, 0.0],
                           [0.0, 0.0, 1.0, 2.0, 3.0],
                           [0.0, 1.0, 0.0, 1.0, 2.0],
                           [0.0, 2.0, 1.0, 0.0, 1.0],
                           [0.0, 3.0, 2.0, 1.0, 0.0]]
        self.mismatched_dm = skbio.DistanceMatrix(
            mismatched_rows,
            ids=['foo', 'sample1', 'sample2', 'sample3', 'x'])

        # Keep the TemporaryDirectory object on the instance so the
        # directory outlives this method; expose its path separately.
        temp_dir = tempfile.TemporaryDirectory(
            prefix='q2-diversity-test-temp-')
        self.output_dir_obj = temp_dir
        self.output_dir = temp_dir.name
    def setUp(self):
        ids = ['S1', 'S2', 'S3']
        self.dm = skbio.DistanceMatrix(
            [[0, 1, 2.1], [1, 0, 3], [2.1, 3, 0]], ids=ids)

        # Since support is traditionally held as the name, we'll use only two
        # trees since 1/2 has an exact floating point representation and will
        # look like `"0.5"` on any machine.
        first_support = skbio.DistanceMatrix(
            [[0, 1.1, 2], [1.1, 0, 3], [2, 3, 0]], ids=ids)
        second_support = skbio.DistanceMatrix(
            [[0, 2, 3.1], [2, 0, 1], [3.1, 1, 0]], ids=ids)
        self.support = [first_support, second_support]
Example #3
0
    def test_without_where_some_filtered(self):
        # The metadata lists S1 and S2 only; S3 appears solely in the matrix.
        columns = {'Subject': ['subject-1', 'subject-1'],
                   'SampleType': ['gut', 'tongue']}
        sample_index = pd.Index(['S1', 'S2'], name='id')
        metadata = qiime2.Metadata(pd.DataFrame(columns, index=sample_index))

        full_rows = [[0, 1, 2], [1, 0, 3], [2, 3, 0]]
        dm = skbio.DistanceMatrix(full_rows, ['S1', 'S2', 'S3'])

        observed = filter_distance_matrix(dm, metadata)

        expected = skbio.DistanceMatrix([[0, 1], [1, 0]], ['S1', 'S2'])
        self.assertEqual(self._sorted(observed), expected)
    def test_simple(self):
        # One reference matrix plus three perturbed replicates over the same
        # sample IDs.
        ids = ['S1', 'S2', 'S3']
        reference = skbio.DistanceMatrix(
            [[0, 1, 2], [1, 0, 3], [2, 3, 0]], ids=ids)

        replicate_rows = (
            [[0, 1.1, 2], [1.1, 0, 3], [2, 3, 0]],
            [[0, 1, 2], [1, 0, 3.1], [2, 3.1, 0]],
            [[0, 1.1, 1.9], [1.1, 0, 3], [1.9, 3, 0]],
        )
        replicates = [skbio.DistanceMatrix(rows, ids=ids)
                      for rows in replicate_rows]

        emperor = _jackknifed_emperor(reference, replicates, self.md)

        self.assertEqual(len(emperor.jackknifed), 3)
Example #5
0
    def test_with_exclude_ids_filter_two(self):
        # With exclude_ids=True and no `where`, the two IDs present in the
        # metadata are expected to be dropped, leaving only S3.
        columns = {'Subject': ['subject-1', 'subject-1'],
                   'SampleType': ['gut', 'tongue']}
        metadata = qiime2.Metadata(pd.DataFrame(columns, index=['S1', 'S2']))

        full_rows = [[0, 1, 2], [1, 0, 3], [2, 3, 0]]
        dm = skbio.DistanceMatrix(full_rows, ['S1', 'S2', 'S3'])

        observed = filter_distance_matrix(
            dm, metadata, where=None, exclude_ids=True)

        expected = skbio.DistanceMatrix([[0]], ['S3'])
        self.assertEqual(self._sorted(observed), expected)
Example #6
0
    def test_with_where_some_filtered(self):
        # Only S3 belongs to 'subject-2', so the `where` clause is expected
        # to keep that single sample.
        columns = {'Subject': ['subject-1', 'subject-1', 'subject-2'],
                   'SampleType': ['gut', 'tongue', 'gut']}
        metadata = qiime2.Metadata(
            pd.DataFrame(columns, index=['S1', 'S2', 'S3']))

        full_rows = [[0, 1, 2], [1, 0, 3], [2, 3, 0]]
        dm = skbio.DistanceMatrix(full_rows, ['S1', 'S2', 'S3'])

        observed = filter_distance_matrix(
            dm, metadata, where="Subject='subject-2'")

        expected = skbio.DistanceMatrix([[0]], ['S3'])
        self.assertEqual(observed, expected)
Example #7
0
def distance_matrix(metadata: qiime2.MetadataCategory) -> skbio.DistanceMatrix:
    """Compute pairwise Euclidean distances between numeric metadata values.

    Parameters
    ----------
    metadata : qiime2.MetadataCategory
        Metadata category whose values can be converted to numbers.

    Returns
    -------
    skbio.DistanceMatrix
        Distance matrix built from the pairwise Euclidean distances of the
        category's values, with IDs taken from the category's index.

    Raises
    ------
    ValueError
        If the category contains non-numeric values, is empty, or contains
        missing values.

    """
    try:
        series = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        # Chain the original exception so the full traceback is preserved in
        # addition to embedding its message for the user.
        raise ValueError(
            "Encountered non-numeric values in the metadata category. A "
            "distance matrix can only be computed from numeric metadata. "
            "Original error message:\n\n%s" % e) from e

    # TODO this check can be removed when MetadataCategory is no longer allowed
    # to be empty
    if series.empty:
        raise ValueError(
            "Encountered metadata category that is empty, i.e. there are no "
            "samples or features in the metadata to compute distances "
            "between.")

    if series.hasnans:
        raise ValueError(
            "Encountered missing value(s) in the metadata category. Computing "
            "a distance matrix from missing values is not supported.")

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    #
    # pdist expects a 2-D (observations x features) array, so the 1-D values
    # are turned into a single-column matrix first.
    distances = scipy.spatial.distance.pdist(series.values[:, np.newaxis],
                                             metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)
Example #8
0
def compare(min_hash_signature: MinHashSigJsonDirFormat,
            ksize: int,
            ignore_abundance: bool = True) -> skbio.DistanceMatrix:
    """Run ``sourmash compare`` on a directory of signatures.

    Parameters
    ----------
    min_hash_signature : MinHashSigJsonDirFormat
        Directory format whose string form is the directory holding the
        signature files; every non-hidden entry is passed to sourmash.
    ksize : int
        Value forwarded to sourmash's ``--ksize`` option.
    ignore_abundance : bool, optional
        When true, ``--ignore-abundance`` is appended to the command.

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - similarity`` for the matrix written by sourmash, labelled with
        the base names from sourmash's label file (a trailing ``.fastq.gz``
        is removed when present).

    Raises
    ------
    subprocess.CalledProcessError
        If sourmash exits with a non-zero status.

    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    suffix = '.fastq.gz'
    sig_dir = str(min_hash_signature)
    # Expand the directory here instead of relying on a shell glob: this
    # avoids shell=True (paths with spaces or metacharacters would break or
    # be interpreted by the shell). Like `*`, hidden entries are skipped and
    # the result is sorted.
    signature_fps = sorted(
        os.path.join(sig_dir, entry)
        for entry in os.listdir(sig_dir)
        if not entry.startswith('.'))

    # Write sourmash's outputs into a private scratch directory so parallel
    # invocations cannot clobber each other and nothing is left behind when
    # sourmash or the parsing below fails.
    with tempfile.TemporaryDirectory() as work_dir:
        np_file = os.path.join(work_dir, 'tmp')
        label_file = np_file + '.labels.txt'

        command = ['sourmash', 'compare']
        command.extend(signature_fps)
        command.extend(['--ksize', str(ksize), '-o', np_file])
        if ignore_abundance:
            command.append('--ignore-abundance')
        subprocess.run(command, check=True)

        # load np_file as np.ndarray -> np_sim
        np_sim = numpy.load(np_file)

        # read labels into a list -> labels
        labels = []
        with open(label_file) as label_fh:
            for line in label_fh:
                label = os.path.basename(line).strip()
                # str.strip('.fastq.gz') would remove any of those
                # *characters* from both ends; only drop the exact suffix.
                if label.endswith(suffix):
                    label = label[:-len(suffix)]
                labels.append(label)

    # convert similarity to distance
    np_dis = 1 - np_sim
    return skbio.DistanceMatrix(np_dis, labels)
Example #9
0
    def setUp(self):
        super().setUp()
        # expected computed with diversity.beta_phylogenetic (weighted_unifrac)
        # The values are given in condensed (1-D) form.
        condensed = np.array([
            0.44656238, 0.23771096, 0.30489123, 0.23446002,
            0.65723575, 0.44911772, 0.381904, 0.69144829,
            0.39611776, 0.36568012, 0.53377975, 0.48908025,
            0.35155196, 0.28318669, 0.57376916, 0.23395746,
            0.24658122, 0.60271637, 0.39802552, 0.36567394,
            0.68062701, 0.36862049, 0.48350632, 0.33024631,
            0.33266697, 0.53464744, 0.74605075, 0.53951035,
            0.49680733, 0.79178838, 0.37109012, 0.52629343,
            0.22118218, 0.32400805, 0.43189708, 0.59705893])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        self.expected = skbio.DistanceMatrix(condensed, ids=sample_ids)

        # Frequency table and its relative-frequency counterpart.
        freq_fp = self.get_data_path('crawford.biom')
        self.table_as_BIOMV210Format = BIOMV210Format(freq_fp, mode='r')
        rel_freq_fp = self.get_data_path('crawford_rf.biom')
        self.rf_table_as_BIOMV210Format = BIOMV210Format(
            rel_freq_fp, mode='r')

        newick_fp = self.get_data_path('crawford.nwk')
        self.tree_as_NewickFormat = NewickFormat(newick_fp, mode='r')
Example #10
0
    def setUp(self):
        super().setUp()
        # expected computed with skbio.diversity.beta_diversity
        expected_rows = [[0.00, 0.25, 0.25],
                         [0.25, 0.00, 0.00],
                         [0.25, 0.00, 0.00]]
        self.expected = skbio.DistanceMatrix(expected_rows,
                                             ids=['S1', 'S2', 'S3'])

        # Feature tables, opened read-only as BIOM v2.1.0 format objects.
        freq_fp = self.get_data_path('two_feature_table.biom')
        self.table_as_BIOMV210Format = BIOMV210Format(freq_fp, mode='r')
        rel_freq_fp = self.get_data_path('two_feature_rf_table.biom')
        self.rf_table_as_BIOMV210Format = BIOMV210Format(rel_freq_fp,
                                                         mode='r')
        presence_absence_fp = self.get_data_path('two_feature_p_a_table.biom')
        self.p_a_table_as_BIOMV210Format = BIOMV210Format(
            presence_absence_fp, mode='r')
        self.table_as_artifact = Artifact.import_data(
            'FeatureTable[Frequency]', self.table_as_BIOMV210Format)

        # Phylogeny, both as a format object and as an imported artifact.
        newick_fp = self.get_data_path('three_feature.tree')
        self.tree_as_NewickFormat = NewickFormat(newick_fp, mode='r')
        self.tree_as_artifact = Artifact.import_data(
            'Phylogeny[Rooted]', self.tree_as_NewickFormat)

        # Registered plugin action looked up by name.
        registered_actions = self.plugin.actions
        self.unweighted_unifrac_thru_framework = registered_actions[
            'unweighted_unifrac']
Example #11
0
    def test_generalized_unifrac(self):
        # Generalized UniFrac with alpha=0.5 on the 'vaw' table and tree,
        # checked pair by pair against a fixed reference matrix.
        table_path = self.get_data_path('vaw.biom')
        phylogeny_path = self.get_data_path('vaw.nwk')

        actual = beta_phylogenetic(table=table_path,
                                   phylogeny=phylogeny_path,
                                   metric='generalized_unifrac',
                                   alpha=0.5)

        reference = np.array([
            [0.0000000, 0.4040518, 0.6285560, 0.5869439, 0.4082483, 0.2995673],
            [0.4040518, 0.0000000, 0.4160597, 0.7071068, 0.7302479, 0.4860856],
            [0.6285560, 0.4160597, 0.0000000, 0.8005220, 0.9073159, 0.5218198],
            [0.5869439, 0.7071068, 0.8005220, 0.0000000, 0.4117216, 0.3485667],
            [0.4082483, 0.7302479, 0.9073159, 0.4117216, 0.0000000, 0.6188282],
            [0.2995673, 0.4860856, 0.5218198, 0.3485667, 0.6188282, 0.0000000],
        ])
        sample_ids = ('Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5',
                      'Sample6')
        expected = skbio.DistanceMatrix(reference, ids=sample_ids)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #12
0
    def test_permanova_pairwise(self):
        # Runs beta_group_significance with pairwise=True and checks the
        # files written to the output directory and the report contents.
        dm = skbio.DistanceMatrix(
            [[0.00, 0.25, 0.25], [0.25, 0.00, 0.00], [0.25, 0.00, 0.00]],
            ids=['sample1', 'sample2', 'sample3'])
        md = qiime2.MetadataCategory(
            pd.Series(['a', 'b', 'b'],
                      name='a or b',
                      index=['sample1', 'sample2', 'sample3']))

        with tempfile.TemporaryDirectory() as output_dir:
            beta_group_significance(output_dir, dm, md, pairwise=True)
            index_fp = os.path.join(output_dir, 'index.html')
            self.assertTrue(os.path.exists(index_fp))
            # all expected boxplots are generated
            for boxplot_name in ('a-boxplots.pdf', 'a-boxplots.png',
                                 'b-boxplots.pdf', 'b-boxplots.png'):
                self.assertTrue(
                    os.path.exists(os.path.join(output_dir, boxplot_name)))
            # no extra boxplots are generated
            self.assertEqual(len(glob.glob('%s/*-boxplots.pdf' % output_dir)),
                             2)
            self.assertEqual(len(glob.glob('%s/*-boxplots.png' % output_dir)),
                             2)
            # Read the report once and close the handle deterministically
            # instead of leaking one open file per assertion.
            with open(index_fp) as index_fh:
                index_html = index_fh.read()
            self.assertTrue('PERMANOVA results' in index_html)
            self.assertTrue('Pairwise permanova' in index_html)
            self.assertFalse('Warning' in index_html)
Example #13
0
def _metadata_distance(metadata: pd.Series) -> skbio.DistanceMatrix:
    """Build a Euclidean distance matrix from a series of values.

    The series' index supplies the IDs of the resulting matrix.
    """
    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    #
    # pdist takes a 2-D array, so present the values as a single column.
    as_column = metadata.values[:, numpy.newaxis]
    condensed = scipy.spatial.distance.pdist(as_column, metric='euclidean')
    return skbio.DistanceMatrix(condensed, ids=metadata.index)
Example #14
0
def _bootstrap_dm(ids, dm, new_names=None):
    """Build a resampled distance matrix from selected IDs.

    Parameters
    ----------
    ids : array-like
        IDs to draw from ``dm``. Repeats are allowed.
    dm : DistanceMatrix
        The distance matrix to resample.
    new_names : array-like, optional
        Labels for the resampled matrix; these must be unique. When
        omitted, the integers ``0 .. len(ids) - 1`` are used.

    Returns
    -------
    DistanceMatrix
        Matrix holding the rows and columns of ``dm`` selected by ``ids``
        (in that order), labelled with ``new_names``.

    """
    labels = np.arange(len(ids)) if new_names is None else new_names

    # Translate each requested ID into its position in the source matrix.
    source_ids = dm.ids
    positions = []
    for sample_id in ids:
        positions.append(source_ids.index(sample_id))

    # Select the same positions along both axes.
    resampled = dm.data[np.ix_(positions, positions)]

    return skbio.DistanceMatrix(resampled, labels)
Example #15
0
    def test_with_exclude_ids_where_filter_two(self):
        # The `where` clause matches S2 (tongue) and S3 (gut); with
        # exclude_ids=True only S1 is expected to remain.
        columns = {'Subject': ['subject-1', 'subject-1', 'subject-2'],
                   'SampleType': ['elbow', 'tongue', 'gut']}
        sample_index = pd.Index(['S1', 'S2', 'S3'], name='id')
        metadata = qiime2.Metadata(pd.DataFrame(columns, index=sample_index))

        full_rows = [[0, 1, 2], [1, 0, 3], [2, 3, 0]]
        dm = skbio.DistanceMatrix(full_rows, ['S1', 'S2', 'S3'])

        where = "SampleType='tongue' OR SampleType='gut'"

        observed = filter_distance_matrix(dm, metadata, where,
                                          exclude_ids=True)

        expected = skbio.DistanceMatrix([[0]], ['S1'])
        self.assertEqual(observed, expected)
Example #16
0
    def test_variance_adjusted_normalized(self):
        # Weighted normalized UniFrac with variance adjustment on the 'vaw'
        # table and tree, checked pair by pair against a reference matrix.
        table_path = self.get_data_path('vaw.biom')
        phylogeny_path = self.get_data_path('vaw.nwk')

        actual = beta_phylogenetic(table=table_path,
                                   phylogeny=phylogeny_path,
                                   metric='weighted_normalized_unifrac',
                                   variance_adjusted=True)

        reference = np.array([
            [0.0000000, 0.4086040, 0.6240185, 0.4639481, 0.2857143, 0.2766318],
            [0.4086040, 0.0000000, 0.3798594, 0.6884992, 0.6807616, 0.4735781],
            [0.6240185, 0.3798594, 0.0000000, 0.7713254, 0.8812897, 0.5047114],
            [0.4639481, 0.6884992, 0.7713254, 0.0000000, 0.6666667, 0.2709298],
            [0.2857143, 0.6807616, 0.8812897, 0.6666667, 0.0000000, 0.4735991],
            [0.2766318, 0.4735781, 0.5047114, 0.2709298, 0.4735991, 0.0000000],
        ])
        sample_ids = ('Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5',
                      'Sample6')
        expected = skbio.DistanceMatrix(reference, ids=sample_ids)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #17
0
def cscs(features: biom.Table,
         css_edges: str,
         cosine_threshold: float = 0.6,
         normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    """Compute a sample-by-sample distance matrix from an edges file.

    Parameters
    ----------
    features : biom.Table
        Feature table; its observation IDs index the edge matrix.
    css_edges : str
        Path to a tab-separated edges file. Fields 0 and 1 of each line are
        observation IDs and field 4 is the cosine score of the pair. Any
        line containing ``CLUSTERID1`` is skipped (presumably the header).
    cosine_threshold : float, optional
        Pairs scoring below this value are recorded as ``0.0``.
    normalization : bool, optional
        When true, the table is normalized per sample before use.
    weighted : bool, optional
        When false, the table is converted to presence/absence before use.

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - S`` where ``S`` is the result of
        ``parallel_make_distance_matrix``; IDs come from that result's
        ``index``.

    """
    observationids = {
        x: index
        for index, x in enumerate(features.ids(axis='observation'))
    }
    n_observations = features.shape[0]
    edgesdok = dok_matrix((n_observations, n_observations),
                          dtype=np.float32)

    # The context manager closes the edges file even if parsing fails.
    with open(css_edges, "r") as edges_fh:
        for line in edges_fh:
            if "CLUSTERID1" in line:
                continue
            linesplit = line.split("\t")
            row = observationids[linesplit[0]]
            col = observationids[linesplit[1]]
            cosine = float(linesplit[4])
            if cosine < cosine_threshold:
                edgesdok[row, col] = 0.0
            else:
                edgesdok[row, col] = cosine
                edgesdok[col, row] = cosine

    # Set the diagonal once, after all edges are loaded. Doing it per line
    # repeated O(n) work for every edge and left the diagonal unset when
    # the file contained no data lines.
    edgesdok.setdiag(1)

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # `Table.pa` is a method: it must be called (the bare attribute is
        # just the bound method). inplace=False avoids mutating the
        # caller's table.
        features = features.pa(inplace=False)  # TODO: make new option in cscs()

    sample_names = features.ids()
    similarity = parallel_make_distance_matrix(features, edgesdok,
                                               sample_names)
    # Convert similarity to distance.
    distances = 1 - similarity
    return skbio.DistanceMatrix(distances, ids=distances.index)
    def test_2nn(self):
        # -- setup -- #
        # 2 nearest neighbors of each sample are
        # f1: s1, s2 (classified as skinny)
        # s1: f1, s2 (closer to f1 so fat)
        # s2: f1, (s1 or s3) (closer to f1 so fat)
        # s3: s1, s2 (skinny)
        sample_ids = ('f1', 's1', 's2', 's3')
        rows = [[0, 2, 1, 5],
                [2, 0, 3, 4],
                [1, 3, 0, 3],
                [5, 4, 3, 0]]
        distance_matrix = skbio.DistanceMatrix(rows, ids=sample_ids)
        dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

        labels = ('fat', 'skinny', 'skinny', 'skinny')
        categories = pd.Series(labels, index=sample_ids, name='body_mass')
        categories.index.name = 'SampleID'
        metadata = qiime2.CategoricalMetadataColumn(categories)

        # -- test -- #
        classify = sample_classifier.actions.classify_samples_from_dist
        res = classify(distance_matrix=dm, metadata=metadata, k=2, cv=3,
                       random_state=123)
        pred = res[0].view(pd.Series)
        expected = pd.Series(('skinny', 'fat', 'fat', 'skinny'),
                             index=sample_ids)
        # Sort both sides so the comparison does not depend on row order.
        matches = expected.sort_index().equals(pred.sort_index())
        self.assertTrue(matches)
Example #19
0
    def test_beta_unweighted(self):
        # Unweighted UniFrac on the 'crawford' table and tree, checked pair
        # by pair against stored reference distances.
        table_path = self.get_data_path('crawford.biom')
        phylogeny_path = self.get_data_path('crawford.nwk')

        actual = beta_phylogenetic(table=table_path,
                                   phylogeny=phylogeny_path,
                                   metric='unweighted_unifrac')

        # computed with beta-phylogenetic
        condensed = np.array([
            0.71836067, 0.71317361, 0.69746044, 0.62587207,
            0.72826674, 0.72065895, 0.72640581, 0.73606053,
            0.70302967, 0.73407301, 0.6548042, 0.71547381,
            0.78397813, 0.72318399, 0.76138933, 0.61041275,
            0.62331299, 0.71848305, 0.70416337, 0.75258475,
            0.79249029, 0.64392779, 0.70052733, 0.69832716,
            0.77818938, 0.72959894, 0.75782689, 0.71005144,
            0.75065046, 0.78944369, 0.63593642, 0.71283615,
            0.58314638, 0.69200762, 0.68972056, 0.71514083])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        expected = skbio.DistanceMatrix(condensed, ids=sample_ids)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
    def test_classify_samples_from_dist(self):
        # -- setup -- #
        # 1,2 are a group, 3,4 are a group
        sample_ids = ('f1', 'f2', 's1', 's2')
        rows = [[0, 1, 4, 4],
                [1, 0, 4, 4],
                [4, 4, 0, 1],
                [4, 4, 1, 0]]
        distance_matrix = skbio.DistanceMatrix(rows, ids=sample_ids)
        dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

        # The metadata index is deliberately given in reverse sample order.
        reversed_ids = sample_ids[::-1]
        categories = pd.Series(('skinny', 'skinny', 'fat', 'fat'),
                               index=reversed_ids,
                               name='body_mass')
        categories.index.name = 'SampleID'
        metadata = qiime2.CategoricalMetadataColumn(categories)

        # -- test -- #
        classify = sample_classifier.actions.classify_samples_from_dist
        res = classify(distance_matrix=dm, metadata=metadata, k=1, cv=3,
                       random_state=123)
        pred = res[0].view(pd.Series).sort_values()
        expected = pd.Series(('fat', 'skinny', 'fat', 'skinny'),
                             index=['f1', 's1', 'f2', 's2'])
        not_expected = pd.Series(('fat', 'fat', 'fat', 'skinny'),
                                 index=sample_ids)

        # order matters for pd.Series.equals()
        pred_by_id = pred.sort_index()
        self.assertTrue(expected.sort_index().equals(pred_by_id))
        self.assertFalse(not_expected.sort_index().equals(pred.sort_index()))
    def test_classify_samples_from_dist_with_group_of_single_item(self):
        # -- setup -- #
        # 1 is a group, 2,3,4 are a group
        sample_ids = ('f1', 's1', 's2', 's3')
        rows = [[0, 2, 3, 3],
                [2, 0, 1, 1],
                [3, 1, 0, 1],
                [3, 1, 1, 0]]
        distance_matrix = skbio.DistanceMatrix(rows, ids=sample_ids)
        dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

        labels = ('fat', 'skinny', 'skinny', 'skinny')
        categories = pd.Series(labels, index=sample_ids, name='body_mass')
        categories.index.name = 'SampleID'
        metadata = qiime2.CategoricalMetadataColumn(categories)

        # -- test -- #
        classify = sample_classifier.actions.classify_samples_from_dist
        res = classify(distance_matrix=dm, metadata=metadata, k=1, cv=3,
                       random_state=123)
        pred = res[0].view(pd.Series)
        expected = pd.Series(('skinny', 'skinny', 'skinny', 'skinny'),
                             index=sample_ids)

        # Sort both sides so the comparison does not depend on row order.
        matches = expected.sort_index().equals(pred.sort_index())
        self.assertTrue(matches)
Example #22
0
    def test_generalized_unifrac_no_alpha(self):
        # Generalized UniFrac is requested with alpha left as None.
        actual = self.beta_phylogenetic(table=self.crawford_table,
                                        phylogeny=self.crawford_tree,
                                        metric='generalized_unifrac',
                                        alpha=None)

        # alpha=1 should be equal to weighted normalized UniFrac
        condensed = np.array([
            0.2821874, 0.16148405, 0.20186143, 0.1634832,
            0.40351108, 0.29135056, 0.24790944, 0.41967404,
            0.24642185, 0.22218489, 0.34007547, 0.27722011,
            0.20963881, 0.16897221, 0.3217958, 0.15237816,
            0.16899207, 0.36445044, 0.25408941, 0.23358681,
            0.4069374, 0.24615927, 0.28573888, 0.20578184,
            0.20742006, 0.31249151, 0.46169893, 0.35294595,
            0.32522355, 0.48437103, 0.21534558, 0.30558908,
            0.12091004, 0.19817777, 0.24792853, 0.34293674])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        expected = skbio.DistanceMatrix(condensed, ids=sample_ids)

        # The action yields exactly one output, typed as a DistanceMatrix.
        self.assertEqual(len(actual), 1)
        output_type = repr(actual.distance_matrix.type)
        self.assertEqual(output_type, 'DistanceMatrix')
        actual = actual[0].view(skbio.DistanceMatrix)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #23
0
    def test_single_tree_and_label(self):
        # A single tree with a single label should give a 1x1 zero matrix.
        newick_lines = ['(A:0.2, B:1.5, C, (E, F));']
        trees = [skbio.TreeNode.read(newick_lines)]
        expected = skbio.DistanceMatrix([[0]], ids=['foo'])

        observed = robinson_foulds(trees, labels=['foo'])

        self.assertEqual(observed, expected)
Example #24
0
    def test_anosim_pairwise(self):
        # Runs beta_group_significance with method='anosim', 42 permutations
        # and pairwise=True, then checks the files written to the output
        # directory and the report contents.
        dm = skbio.DistanceMatrix([[0.00, 0.25, 0.25],
                                   [0.25, 0.00, 0.00],
                                   [0.25, 0.00, 0.00]],
                                  ids=['sample1', 'sample2', 'sample3'])
        md = qiime2.CategoricalMetadataColumn(
            pd.Series(['a', 'b', 'b'], name='a or b',
                      index=pd.Index(['sample1', 'sample2', 'sample3'],
                                     name='id')))

        with tempfile.TemporaryDirectory() as output_dir:
            beta_group_significance(output_dir, dm, md, method='anosim',
                                    permutations=42, pairwise=True)
            index_fp = os.path.join(output_dir, 'index.html')
            self.assertTrue(os.path.exists(index_fp))
            # all expected boxplots are generated
            for boxplot_name in ('a-boxplots.pdf', 'a-boxplots.png',
                                 'b-boxplots.pdf', 'b-boxplots.png'):
                self.assertTrue(
                    os.path.exists(os.path.join(output_dir, boxplot_name)))
            # no extra boxplots are generated
            self.assertEqual(len(glob.glob('%s/*-boxplots.pdf' % output_dir)),
                             2)
            self.assertEqual(len(glob.glob('%s/*-boxplots.png' % output_dir)),
                             2)
            # Read the report once and close the handle deterministically
            # instead of leaking one open file per assertion.
            with open(index_fp) as index_fh:
                index_html = index_fh.read()
            self.assertTrue('ANOSIM results' in index_html)
            self.assertTrue('<td>42</td>' in index_html)
            self.assertFalse('Warning' in index_html)
            self.assertTrue('Pairwise anosim' in index_html)
Example #25
0
    def test_generalized_unifrac(self):
        # Generalized UniFrac (alpha=0.5) on artifacts imported from the
        # 'vaw' table and tree, checked against a fixed reference matrix.
        table_path = self.get_data_path('vaw.biom')
        table = Artifact.import_data('FeatureTable[Frequency]', table_path)
        phylogeny_path = self.get_data_path('vaw.nwk')
        phylogeny = Artifact.import_data('Phylogeny[Rooted]', phylogeny_path)

        actual = self.beta_phylogenetic(table=table,
                                        phylogeny=phylogeny,
                                        metric='generalized_unifrac',
                                        alpha=0.5)

        reference = np.array([
            [0.0000000, 0.4040518, 0.6285560, 0.5869439, 0.4082483, 0.2995673],
            [0.4040518, 0.0000000, 0.4160597, 0.7071068, 0.7302479, 0.4860856],
            [0.6285560, 0.4160597, 0.0000000, 0.8005220, 0.9073159, 0.5218198],
            [0.5869439, 0.7071068, 0.8005220, 0.0000000, 0.4117216, 0.3485667],
            [0.4082483, 0.7302479, 0.9073159, 0.4117216, 0.0000000, 0.6188282],
            [0.2995673, 0.4860856, 0.5218198, 0.3485667, 0.6188282, 0.0000000],
        ])
        sample_ids = ('Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5',
                      'Sample6')
        expected = skbio.DistanceMatrix(reference, ids=sample_ids)

        # The action yields exactly one output, typed as a DistanceMatrix.
        self.assertEqual(len(actual), 1)
        output_type = repr(actual.distance_matrix.type)
        self.assertEqual(output_type, 'DistanceMatrix')
        actual = actual[0].view(skbio.DistanceMatrix)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #26
0
    def test_variance_adjusted_normalized(self):
        # Variance-adjusted weighted normalized UniFrac on artifacts imported
        # from the 'vaw' table and tree, checked against a reference matrix.
        table_path = self.get_data_path('vaw.biom')
        table = Artifact.import_data('FeatureTable[Frequency]', table_path)
        phylogeny_path = self.get_data_path('vaw.nwk')
        phylogeny = Artifact.import_data('Phylogeny[Rooted]', phylogeny_path)

        actual = self.beta_phylogenetic(table=table,
                                        phylogeny=phylogeny,
                                        metric='weighted_normalized_unifrac',
                                        variance_adjusted=True)

        reference = np.array([
            [0.0000000, 0.4086040, 0.6240185, 0.4639481, 0.2857143, 0.2766318],
            [0.4086040, 0.0000000, 0.3798594, 0.6884992, 0.6807616, 0.4735781],
            [0.6240185, 0.3798594, 0.0000000, 0.7713254, 0.8812897, 0.5047114],
            [0.4639481, 0.6884992, 0.7713254, 0.0000000, 0.6666667, 0.2709298],
            [0.2857143, 0.6807616, 0.8812897, 0.6666667, 0.0000000, 0.4735991],
            [0.2766318, 0.4735781, 0.5047114, 0.2709298, 0.4735991, 0.0000000],
        ])
        sample_ids = ('Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5',
                      'Sample6')
        expected = skbio.DistanceMatrix(reference, ids=sample_ids)

        # The action yields exactly one output, typed as a DistanceMatrix.
        self.assertEqual(len(actual), 1)
        output_type = repr(actual.distance_matrix.type)
        self.assertEqual(output_type, 'DistanceMatrix')
        actual = actual[0].view(skbio.DistanceMatrix)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #27
0
    def test_beta_weighted(self):
        # Weighted UniFrac on the crawford fixtures held by the test case,
        # checked pair by pair against stored reference distances.
        actual = self.beta_phylogenetic(table=self.crawford_table,
                                        phylogeny=self.crawford_tree,
                                        metric='weighted_unifrac')

        # computed with beta-phylogenetic (weighted_unifrac)
        condensed = np.array([
            0.44656238, 0.23771096, 0.30489123, 0.23446002,
            0.65723575, 0.44911772, 0.381904, 0.69144829,
            0.39611776, 0.36568012, 0.53377975, 0.48908025,
            0.35155196, 0.28318669, 0.57376916, 0.23395746,
            0.24658122, 0.60271637, 0.39802552, 0.36567394,
            0.68062701, 0.36862049, 0.48350632, 0.33024631,
            0.33266697, 0.53464744, 0.74605075, 0.53951035,
            0.49680733, 0.79178838, 0.37109012, 0.52629343,
            0.22118218, 0.32400805, 0.43189708, 0.59705893])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        expected = skbio.DistanceMatrix(condensed, ids=sample_ids)

        # The action yields exactly one output, typed as a DistanceMatrix.
        self.assertEqual(len(actual), 1)
        output_type = repr(actual.distance_matrix.type)
        self.assertEqual(output_type, 'DistanceMatrix')
        actual = actual[0].view(skbio.DistanceMatrix)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #28
0
    def test_beta_unweighted_parallel(self):
        # Unweighted UniFrac with threads=2 on artifacts imported from the
        # 'crawford' table and tree, checked against stored distances.
        table_path = self.get_data_path('crawford.biom')
        table = Artifact.import_data('FeatureTable[Frequency]', table_path)
        phylogeny_path = self.get_data_path('crawford.nwk')
        phylogeny = Artifact.import_data('Phylogeny[Rooted]', phylogeny_path)

        actual = self.beta_phylogenetic(table=table,
                                        phylogeny=phylogeny,
                                        metric='unweighted_unifrac',
                                        threads=2)

        # computed with beta-phylogenetic
        condensed = np.array([
            0.71836067, 0.71317361, 0.69746044, 0.62587207,
            0.72826674, 0.72065895, 0.72640581, 0.73606053,
            0.70302967, 0.73407301, 0.6548042, 0.71547381,
            0.78397813, 0.72318399, 0.76138933, 0.61041275,
            0.62331299, 0.71848305, 0.70416337, 0.75258475,
            0.79249029, 0.64392779, 0.70052733, 0.69832716,
            0.77818938, 0.72959894, 0.75782689, 0.71005144,
            0.75065046, 0.78944369, 0.63593642, 0.71283615,
            0.58314638, 0.69200762, 0.68972056, 0.71514083])
        sample_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                      '10084.PC.355', '10084.PC.354', '10084.PC.636',
                      '10084.PC.635', '10084.PC.607', '10084.PC.634')
        expected = skbio.DistanceMatrix(condensed, ids=sample_ids)

        # The action yields exactly one output, typed as a DistanceMatrix.
        self.assertEqual(len(actual), 1)
        output_type = repr(actual.distance_matrix.type)
        self.assertEqual(output_type, 'DistanceMatrix')
        actual = actual[0].view(skbio.DistanceMatrix)

        self.assertEqual(actual.ids, expected.ids)
        observed_ids = actual.ids
        for row_id in observed_ids:
            for col_id in observed_ids:
                npt.assert_almost_equal(actual[row_id, col_id],
                                        expected[row_id, col_id])
Example #29
0
 def test_metadata_distance_int(self):
     # Integer values 1, 2, 3 should give absolute differences as distances.
     sample_ids = ['sample1', 'sample2', 'sample3']
     md = pd.Series([1, 2, 3], name='number', index=sample_ids)
     expected_rows = [[0, 1, 2], [1, 0, 1], [2, 1, 0]]
     exp = skbio.DistanceMatrix(expected_rows, ids=sample_ids)
     obs = _metadata_distance(md)
     self.assertEqual(exp, obs)
Example #30
0
    def test_one_sample(self):
        # A single-sample category should give a 1x1 zero matrix.
        values = pd.Series([1.5], name='number', index=['sample1'])
        md = qiime2.MetadataCategory(values)
        exp = skbio.DistanceMatrix([[0.0]], ids=['sample1'])

        obs = distance_matrix(md)

        self.assertEqual(exp, obs)