Exemple #1
0
    def setUp(self):
        """Create the distance matrices and PERMANOVA instances under test."""
        # Four samples split into two groups of equal size.
        sample_ids = ['s1', 's2', 's3', 's4']
        equal_groups = ['Control', 'Control', 'Fast', 'Fast']

        # One matrix with ties in the ranks of its distances, one without.
        tied = [[0, 1, 1, 4],
                [1, 0, 3, 2],
                [1, 3, 0, 3],
                [4, 2, 3, 0]]
        untied = [[0, 1, 5, 4],
                  [1, 0, 3, 2],
                  [5, 3, 0, 3],
                  [4, 2, 3, 0]]
        self.dm_ties = DistanceMatrix(tied, sample_ids)
        self.dm_no_ties = DistanceMatrix(untied, sample_ids)

        # Six samples split into three groups of unequal size.
        unequal_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
        unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                          'Treatment1', 'Control', 'Control']
        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            unequal_ids)

        self.permanova_ties = PERMANOVA(self.dm_ties, equal_groups)
        self.permanova_no_ties = PERMANOVA(self.dm_no_ties, equal_groups)
        self.permanova_unequal = PERMANOVA(self.dm_unequal, unequal_groups)
Exemple #2
0
    def setUp(self):
        """Create the distance matrices and ANOSIM instances under test."""
        # Four samples split into two groups of equal size.
        sample_ids = ['s1', 's2', 's3', 's4']
        equal_groups = ['Control', 'Control', 'Fast', 'Fast']

        # One matrix with ties in the ranks of its distances, one without.
        tied = [[0, 1, 1, 4],
                [1, 0, 3, 2],
                [1, 3, 0, 3],
                [4, 2, 3, 0]]
        untied = [[0, 1, 5, 4],
                  [1, 0, 3, 2],
                  [5, 3, 0, 3],
                  [4, 2, 3, 0]]
        self.dm_ties = DistanceMatrix(tied, sample_ids)
        self.dm_no_ties = DistanceMatrix(untied, sample_ids)

        # Six samples split into three groups of unequal size. This data
        # also generates a negative R statistic.
        unequal_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
        unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                          'Treatment1', 'Control', 'Control']
        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            unequal_ids)

        self.anosim_ties = ANOSIM(self.dm_ties, equal_groups)
        self.anosim_no_ties = ANOSIM(self.dm_no_ties, equal_groups)
        self.anosim_unequal = ANOSIM(self.dm_unequal, unequal_groups)
Exemple #3
0
    def setUp(self):
        """Create the matrices, groupings and PERMANOVA instances under test.

        Groupings are supplied both as plain lists and as a data frame
        indexed by sample ID.
        """
        # Four samples split into two groups of equal size.
        sample_ids = ['s1', 's2', 's3', 's4']
        equal_groups = ['Control', 'Control', 'Fast', 'Fast']

        # The same kind of grouping as a data frame. Its rows are in a
        # different order than ``sample_ids`` and include an extra ID (s5).
        grouping_csv = ('ID,Group\n'
                        's2,Control\n'
                        's3,Fast\n'
                        's4,Fast\n'
                        's5,Control\n'
                        's1,Control')
        grouping_df = pd.read_csv(StringIO(grouping_csv), index_col=0)

        # One matrix with ties in the ranks of its distances, one without.
        tied = [[0, 1, 1, 4],
                [1, 0, 3, 2],
                [1, 3, 0, 3],
                [4, 2, 3, 0]]
        untied = [[0, 1, 5, 4],
                  [1, 0, 3, 2],
                  [5, 3, 0, 3],
                  [4, 2, 3, 0]]
        self.dm_ties = DistanceMatrix(tied, sample_ids)
        self.dm_no_ties = DistanceMatrix(untied, sample_ids)

        # Six samples split into three groups of unequal size.
        unequal_ids = ['s1', 's2', 's3', 's4', 's5', 's6']
        unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                          'Treatment1', 'Control', 'Control']
        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            unequal_ids)

        self.permanova_ties = PERMANOVA(self.dm_ties, equal_groups)
        self.permanova_no_ties = PERMANOVA(self.dm_no_ties, equal_groups)
        self.permanova_ties_df = PERMANOVA(self.dm_ties, grouping_df,
                                           column='Group')
        self.permanova_unequal = PERMANOVA(self.dm_unequal, unequal_groups)
    def test_from_file_with_file_path(self):
        """from_file accepts a file path and parses the matrix from it."""
        # A malformed file must surface as a format error.
        self.assertRaises(DissimilarityMatrixFormatError,
                          DistanceMatrix.from_file, self.bad_dm_fp)

        # A well-formed file round-trips to the expected matrix and type.
        parsed = DistanceMatrix.from_file(self.dm_3x3_fp)
        self.assertEqual(self.dm_3x3, parsed)
        self.assertIsInstance(parsed, DistanceMatrix)
Exemple #5
0
    def test_init_invalid_input(self):
        """Invalid distance matrix data / IDs are rejected on construction."""
        # Asymmetric data must be refused by DistanceMatrix itself.
        asymmetric = [[0.0, 2.0], [1.0, 0.0]]
        self.assertRaises(DistanceMatrixError,
                          DistanceMatrix, asymmetric, ['a', 'b'])

        # The superclass validation must still run (here on 1x3 data).
        self.assertRaises(DissimilarityMatrixError,
                          DistanceMatrix, [[1, 2, 3]], ['a'])
    def setUp(self):
        """Build 1x1, 2x2 and 3x3 distance matrices on the parent fixtures."""
        super(DistanceMatrixTests, self).setUp()

        self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
        self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
        self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
        self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]

        # Expected condensed forms, in the same order as ``self.dms``.
        condensed_values = ([], [0.123], [0.01, 4.2, 12.0])
        self.dm_condensed_forms = [np.array(values)
                                   for values in condensed_values]
Exemple #7
0
    def test_from_file_with_file_path(self):
        """from_file accepts a file path and parses the matrix from it."""
        # A malformed file must surface as a format error.
        self.assertRaises(DissimilarityMatrixFormatError,
                          DistanceMatrix.from_file, self.bad_dm_fp)

        # A well-formed file round-trips to the expected matrix and type.
        parsed = DistanceMatrix.from_file(self.dm_3x3_fp)
        self.assertEqual(self.dm_3x3, parsed)
        self.assertIsInstance(parsed, DistanceMatrix)
Exemple #8
0
    def test_distance_matrix_instances_as_input(self):
        """mantel accepts DistanceMatrix inputs whose IDs do not match."""
        # IDs shouldn't matter -- the function should only care about the
        # matrix data, so the two inputs deliberately carry different IDs.
        x_matrix = DistanceMatrix(self.minx)
        y_matrix = DistanceMatrix(self.miny, ['no', 'cog', 'yay'])

        # Fix the seed so the permutation-based p-value is reproducible.
        np.random.seed(0)
        result = mantel(x_matrix, y_matrix, alternative='less')

        self.assertAlmostEqual(result[0], self.exp_x_vs_y)
        self.assertAlmostEqual(result[1], 0.843)
Exemple #9
0
    def setUp(self):
        """Build 1x1, 2x2 and 3x3 distance matrices on the parent fixtures."""
        super(DistanceMatrixTests, self).setUp()

        self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
        self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
        self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
        self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]

        # Expected condensed forms, in the same order as ``self.dms``.
        condensed_values = ([], [0.123], [0.01, 4.2, 12.0])
        self.dm_condensed_forms = [np.array(values)
                                   for values in condensed_values]
Exemple #10
0
    def setUp(self):
        """Create the input matrices and load the expected result tables."""
        self.minx = DistanceMatrix([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
        self.miny = DistanceMatrix([[0, 2, 7], [2, 0, 6], [7, 6, 0]])
        self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                    [0.5, 0, 0.1],
                                    [0.25, 0.1, 0]])
        self.min_dms = (self.minx, self.miny, self.minz)

        # Versions of self.minx and self.minz (above) that each have an extra
        # ID on the end.
        self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                       [1, 0, 3, 2],
                                       [2, 3, 0, 4],
                                       [7, 2, 4, 0]], ['0', '1', '2', 'foo'])
        self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                       [0.5, 0, 0.1, 24],
                                       [0.25, 0.1, 0, 5],
                                       [3, 24, 5, 0]], ['0', '1', '2', 'bar'])

        # The p-value column (column index 3) has to be loaded as a string
        # dtype: the in-memory results format p-values as strings with the
        # correct number of decimal places, so a float-typed column would
        # make the frames compare unequal.
        p_val_conv = {3: str}

        def load_expected(filename):
            # Every expected-results table shares the same layout: tab
            # separated, with the first two columns forming the index.
            return pd.read_csv(get_data_path(filename), sep='\t',
                               index_col=(0, 1), converters=p_val_conv)

        self.exp_results_minimal = load_expected(
            'pwmantel_exp_results_minimal.txt')
        self.exp_results_minimal_with_labels = load_expected(
            'pwmantel_exp_results_minimal_with_labels.txt')
        self.exp_results_duplicate_dms = load_expected(
            'pwmantel_exp_results_duplicate_dms.txt')
        self.exp_results_na_p_value = load_expected(
            'pwmantel_exp_results_na_p_value.txt')
        self.exp_results_too_few_permutations = load_expected(
            'pwmantel_exp_results_too_few_permutations.txt')
        self.exp_results_reordered_distance_matrices = load_expected(
            'pwmantel_exp_results_reordered_distance_matrices'
            '.txt')
Exemple #11
0
    def test_compute_collapsed_dm(self):
        """Collapsing a pair of IDs yields the expected smaller matrix."""
        # First collapse: 'a' and 'b' of self.dm1 merge into 'x'.
        first_expected = DistanceMatrix(
            [[0, 7, 7, 6],
             [7, 0, 8, 7],
             [7, 8, 0, 3],
             [6, 7, 3, 0]],
            ['x', 'c', 'd', 'e'])
        first_observed = _compute_collapsed_dm(self.dm1, 'a', 'b', True, 'x')
        self.assertEqual(first_observed, first_expected)

        # Second collapse, applied to the result above: 'x' and 'c' merge
        # into 'yy'. Expected values were computed manually.
        second_expected = DistanceMatrix(
            [[0, 4, 3],
             [4, 0, 3],
             [3, 3, 0]],
            ['yy', 'd', 'e'])
        second_observed = _compute_collapsed_dm(first_expected, 'x', 'c',
                                                True, 'yy')
        self.assertEqual(second_observed, second_expected)
Exemple #12
0
    def test_compute_q(self):
        """_compute_q returns the expected Q matrix for two inputs."""
        # Five-taxon fixture matrix.
        five_taxa_q = DistanceMatrix(
            [[0, -50, -38, -34, -34],
             [-50, 0, -38, -34, -34],
             [-38, -38, 0, -40, -40],
             [-34, -34, -40, 0, -48],
             [-34, -34, -40, -48, 0]],
            list('abcde'))
        self.assertEqual(_compute_q(self.dm1), five_taxa_q)

        # Three-taxon matrix; the expected values were computed manually.
        three_taxa = DistanceMatrix([[0, 3, 2], [3, 0, 3], [2, 3, 0]],
                                    list('abc'))
        three_taxa_q = DistanceMatrix([[0, -8, -8], [-8, 0, -8], [-8, -8, 0]],
                                      list('abc'))
        self.assertEqual(_compute_q(three_taxa), three_taxa_q)
Exemple #13
0
def get_clusters(x_original, axis='row'):
    """Return the UPGMA leaf ordering of the rows (or columns) of a matrix.

    Pairwise euclidean distances are computed between the rows of the input
    (or between its columns when ``axis == 'column'``), the observations are
    clustered with average linkage, and the order in which they appear as
    tips of the resulting tree is returned.

    Parameters
    ----------
    x_original : array-like exposing ``copy()``, ``T`` and ``shape``
        Data matrix to cluster; presumably a 2-D numpy array (confirm
        against callers). It is copied, never modified.
    axis : {'row', 'column'}, optional
        Which axis to cluster. Any value other than ``'column'`` clusters
        the rows.

    Returns
    -------
    list of int
        Indices along the clustered axis, ordered as the tips of the tree.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    # Build the IDs as a real list: on Python 3 ``map`` returns a one-shot
    # iterator, which is not a safe substitute for a sequence of IDs.
    row_ids = [str(i) for i in range(nr)]
    row_dissims = DistanceMatrix(metric_f(x), row_ids)
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    # Tip names are the stringified indices assigned above.
    row_order = [int(tip.name) for tip in tree.tips()]
    return row_order
Exemple #14
0
def get_clusters(x_original, axis='row'):
    """Return the UPGMA leaf ordering of the rows (or columns) of a matrix.

    Pairwise euclidean distances are computed between the rows of the input
    (or between its columns when ``axis == 'column'``), the observations are
    clustered with average linkage, and the order in which they appear as
    tips of the resulting tree is returned.

    Parameters
    ----------
    x_original : array-like exposing ``copy()``, ``T`` and ``shape``
        Data matrix to cluster; presumably a 2-D numpy array (confirm
        against callers). It is copied, never modified.
    axis : {'row', 'column'}, optional
        Which axis to cluster. Any value other than ``'column'`` clusters
        the rows.

    Returns
    -------
    list of int
        Indices along the clustered axis, ordered as the tips of the tree.
    """
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    # Build the IDs as a real list: on Python 3 ``map`` returns a one-shot
    # iterator, which is not a safe substitute for a sequence of IDs.
    row_ids = [str(i) for i in range(nr)]
    row_dissims = DistanceMatrix(metric_f(x), row_ids)
    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    # Tip names are the stringified indices assigned above.
    row_order = [int(tip.name) for tip in tree.tips()]
    return row_order
 def setup(self):
     """Load the sample distance matrix and build its PCoA ordination."""
     # Open in plain text mode: the legacy 'U' flag raises ValueError on
     # Python 3.11+, and text mode already applies universal newlines on
     # Python 3.
     with open(get_data_path('PCoA_sample_data_3'), 'r') as lines:
         dist_matrix = DistanceMatrix.from_file(lines)
     self.ordination = PCoA(dist_matrix)
     # Sample IDs kept alongside the ordination for the tests to use.
     self.ids = [
         'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
         'PC.355', 'PC.607', 'PC.634'
     ]
 def test_distances(self):
     """distances() returns the expected pairwise distance matrix."""
     seq_ids = ['d1', 'd2', 'd3']
     # Expected pairwise distances, expressed as fractions of 13.
     d1_d2 = 6. / 13
     d1_d3 = 4. / 13
     d2_d3 = 7. / 13
     expected = DistanceMatrix([[0, d1_d2, d1_d3],
                                [d1_d2, 0, d2_d3],
                                [d1_d3, d2_d3, 0]], seq_ids)
     self.assertEqual(self.a1.distances(), expected)
Exemple #17
0
def pcoa(lines):
    """Parse a distance matrix from ``lines`` and return its PCoA scores.

    ``lines`` is handed to ``DistanceMatrix.from_file`` unchanged; the
    return value is whatever ``PCoA(...).scores()`` produces.
    """
    distance_matrix = DistanceMatrix.from_file(lines)
    return PCoA(distance_matrix).scores()
Exemple #18
0
def pcoa(lines):
    """Parse a distance matrix from ``lines`` and return its PCoA scores.

    ``lines`` is handed to ``DistanceMatrix.from_file`` unchanged; the
    return value is whatever ``PCoA(...).scores()`` produces.
    """
    distance_matrix = DistanceMatrix.from_file(lines)
    return PCoA(distance_matrix).scores()
def single_file_nj(input_file, output_file):
    """Build a neighbor-joining tree from a distance matrix file.

    Parameters
    ----------
    input_file : file path or file-like
        Source of the distance matrix; passed to
        ``DistanceMatrix.from_file`` unchanged.
    output_file : str
        Path of the file to (over)write with the tree in Newick format,
        including branch lengths.
    """
    dm = DistanceMatrix.from_file(input_file)

    tree = nj(dm)

    # Serialise before opening the output so a failure here does not leave
    # an empty file behind; the context manager closes the handle even if
    # the write itself fails.
    newick = tree.to_newick(with_distances=True)
    with open(output_file, 'w') as f:
        f.write(newick)
Exemple #20
0
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    """Build a UPGMA guide tree from pairwise sequence distances.

    Parameters
    ----------
    query_sequences : iterable of (sequence id, sequence) pairs
        Sequences to cluster. Materialised once so that one-shot iterables
        (e.g. generators) can be traversed for every row of the matrix.
    distance_fn : callable, optional
        Called as ``distance_fn(seq1, seq2)`` for every ordered pair of
        sequences. The result is used as a distance matrix, so it is
        expected to be symmetric with a zero self-distance. The default is
        presumably 3-mer distance, matching the value that used to be
        hard-coded here -- confirm against its definition.
    display_tree : bool, optional
        If True, also draw a dendrogram of the clustering.

    Returns
    -------
    The tree object returned by ``to_tree`` for the average-linkage
    clustering of the distance matrix.
    """
    query_sequences = list(query_sequences)
    seq_ids = [seq_id for seq_id, _ in query_sequences]
    # Honour the caller-supplied distance function instead of a fixed
    # ``kmer_distance(..., k=3)``.
    distances = [[distance_fn(seq1, seq2) for _, seq2 in query_sequences]
                 for _, seq1 in query_sequences]

    guide_dm = DistanceMatrix(distances, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
Exemple #21
0
def compute_aligned_sequence_distances(seqs, distance_fn=hamming_distance):
    """Compute the distance matrix between all pairs of aligned sequences.

    Parameters
    ----------
    seqs : iterable of (sequence id, sequence) pairs
        Sequences to compare. Materialised once so that one-shot iterables
        (e.g. generators) can be traversed for every row of the matrix.
    distance_fn : callable, optional
        Called as ``distance_fn(seq1, seq2)`` for every ordered pair of
        sequences. The result is used as a distance matrix, so it is
        expected to be symmetric with a zero self-distance.

    Returns
    -------
    DistanceMatrix
        Pairwise distances, labelled with the sequence ids in input order.
    """
    seqs = list(seqs)
    ids = [seq_id for seq_id, _ in seqs]
    # Honour the caller-supplied distance function instead of always using
    # ``hamming_distance``.
    dm = [[distance_fn(seq1, seq2) for _, seq2 in seqs] for _, seq1 in seqs]
    return DistanceMatrix(dm, ids)
Exemple #22
0
    def test_permute_not_condensed(self):
        # Checks permute() when a full (non-condensed) matrix is returned.

        # For the 1x1 and 2x2 matrices the test expects a result equal to
        # the input but held in a new object.
        obs = self.dm_1x1.permute()
        self.assertEqual(obs, self.dm_1x1)
        self.assertFalse(obs is self.dm_1x1)

        obs = self.dm_2x2.permute()
        self.assertEqual(obs, self.dm_2x2)
        self.assertFalse(obs is self.dm_2x2)

        # Fix the seed so the two successive 3x3 permutations below are
        # deterministic; their order matters because each call advances the
        # random state.
        np.random.seed(0)

        # First permutation after seeding. The IDs stay as they were; only
        # the data is rearranged.
        exp = DistanceMatrix([[0, 12, 4.2], [12, 0, 0.01], [4.2, 0.01, 0]],
                             self.dm_3x3.ids)
        obs = self.dm_3x3.permute()
        self.assertEqual(obs, exp)

        # Second permutation after seeding.
        exp = DistanceMatrix([[0, 4.2, 12], [4.2, 0, 0.01], [12, 0.01, 0]],
                             self.dm_3x3.ids)
        obs = self.dm_3x3.permute()
        self.assertEqual(obs, exp)
Exemple #23
0
    def test_init_from_dm(self):
        """Constructs a dm from a dm."""
        # New IDs applied to each matrix built from an existing one.
        ids = ['foo', 'bar', 'baz']

        # DissimilarityMatrix -> DissimilarityMatrix
        exp = DissimilarityMatrix(self.dm_3x3_data, ids)
        obs = DissimilarityMatrix(self.dm_3x3, ids)
        self.assertEqual(obs, exp)
        # Test that copy of data is not made.
        self.assertTrue(obs.data is self.dm_3x3.data)
        # Writing through ``obs`` must therefore be visible via the source
        # matrix as well (this mutates the shared self.dm_3x3 fixture).
        obs.data[0, 1] = 424242
        self.assertTrue(np.array_equal(obs.data, self.dm_3x3.data))

        # DistanceMatrix -> DissimilarityMatrix
        exp = DissimilarityMatrix(self.dm_3x3_data, ids)
        obs = DissimilarityMatrix(
            DistanceMatrix(self.dm_3x3_data, ('a', 'b', 'c')), ids)
        self.assertEqual(obs, exp)

        # DissimilarityMatrix -> DistanceMatrix
        # The asymmetric fixture must be rejected by DistanceMatrix.
        with self.assertRaises(DistanceMatrixError):
            _ = DistanceMatrix(self.dm_2x2_asym, ['foo', 'bar'])
Exemple #24
0
    def test_random_fn(self):
        """randdm uses a caller-supplied random function when given one."""
        def constant_fill(num_rows, num_cols):
            # Not random at all: every cell holds the same value, which
            # makes the expected output fully predictable.
            return np.full((num_rows, num_cols), 42.0)

        expected = DistanceMatrix(
            np.asarray([[0, 42, 42],
                        [42, 0, 42],
                        [42, 42, 0]]),
            ['1', '2', '3'])
        self.assertEqual(randdm(3, random_fn=constant_fill), expected)
Exemple #25
0
    def setUp(self):
        """Create three distance matrices and their expected trees.

        Each matrix is paired with the expected Newick string and the tree
        parsed from that string.
        """
        data1 = [[0, 5, 9, 9, 8], [5, 0, 10, 10, 9], [9, 10, 0, 8, 7],
                 [9, 10, 8, 0, 3], [8, 9, 7, 3, 0]]
        ids1 = list('abcde')
        self.dm1 = DistanceMatrix(data1, ids1)
        # this newick string was confirmed against http://www.trex.uqam.ca/
        # which generated the following (isomorphic) newick string:
        # (d:2.0000,e:1.0000,(c:4.0000,(a:2.0000,b:3.0000):3.0000):2.0000);
        self.expected1_str = ("(d:2.000000, (c:4.000000, (b:3.000000,"
                              " a:2.000000):3.000000):2.000000, e:1.000000);")
        self.expected1_TreeNode = TreeNode.from_newick(self.expected1_str)

        # this example was pulled from the Phylip manual
        # http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
        data2 = [[0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905],
                 [1.6866, 0.0000, 1.5232, 1.4841, 1.4465, 1.4389, 1.4629],
                 [1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583],
                 [1.6606, 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710],
                 [1.5243, 1.4465, 0.5958, 0.4631, 0.0000, 0.3484, 0.3083],
                 [1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692],
                 [1.5905, 1.4629, 0.5583, 0.4710, 0.3083, 0.2692, 0.0000]]
        ids2 = [
            "Bovine", "Mouse", "Gibbon", "Orang", "Gorilla", "Chimp", "Human"
        ]
        self.dm2 = DistanceMatrix(data2, ids2)
        self.expected2_str = ("(Mouse:0.76891, (Gibbon:0.35793, (Orang:0.28469"
                              ", (Gorilla:0.15393, (Chimp:0.15167, Human:0.117"
                              "53):0.03982):0.02696):0.04648):0.42027, Bovine:"
                              "0.91769);")
        self.expected2_TreeNode = TreeNode.from_newick(self.expected2_str)

        data3 = [[0, 5, 4, 7, 6, 8], [5, 0, 7, 10, 9, 11], [4, 7, 0, 7, 6, 8],
                 [7, 10, 7, 0, 5, 8], [6, 9, 6, 5, 0, 8], [8, 11, 8, 8, 8, 0]]
        # Build the IDs as a real list: on Python 3 ``map`` returns a
        # one-shot iterator, which is not a safe substitute for a sequence.
        ids3 = [str(i) for i in range(6)]
        self.dm3 = DistanceMatrix(data3, ids3)
        self.expected3_str = ("((((0:1.000000,1:4.000000):1.000000,2:2.000000"
                              "):1.250000,5:4.750000):0.750000,3:2.750000,4:2."
                              "250000);")
        self.expected3_TreeNode = TreeNode.from_newick(self.expected3_str)
Exemple #26
0
    def test_tip_tip_distances_endpoints(self):
        """tip_tip_distances honours an explicit list of endpoints.

        The endpoints may be given as tip names or as the tip nodes
        themselves; both must produce the same matrix.
        """
        tree = TreeNode.from_newick('((H:1,G:1):2,(R:0.5,M:0.7):3);')
        tip_names = ['H', 'G', 'M']
        tip_nodes = [tree.find(name) for name in tip_names]

        # Tip 'R' is deliberately left out of the requested endpoints.
        expected = DistanceMatrix(
            np.array([[0, 2.0, 6.7],
                      [2.0, 0, 6.7],
                      [6.7, 6.7, 0.0]]),
            ['H', 'G', 'M'])

        by_name = tree.tip_tip_distances(endpoints=tip_names)
        self.assertEqual(by_name, expected)

        by_node = tree.tip_tip_distances(endpoints=tip_nodes)
        self.assertEqual(by_node, expected)
Exemple #27
0
    def distances(self):
        """Compute distances between all pairs of sequences

        Returns
        -------
        skbio.core.distance.DistanceMatrix
            Matrix containing the distances between all pairs of sequences.

        Raises
        ------
        skbio.core.exception.BiologicalSequenceError
            Presumably raised by the per-pair ``distance`` call when two
            sequences differ in length; this method does not raise it
            directly.

        See Also
        --------
        skbio.core.distance.DistanceMatrix
        scipy.spatial.distance.hamming

        Notes
        -----
        Distances between sequences are computed as hamming distances, though
        this will be generalized (see #194).

        Examples
        --------
        >>> from skbio.core.alignment import Alignment
        >>> from skbio.core.sequence import DNA
        >>> seqs = [DNA("A-CCGGG", identifier="s1"),
        ...         DNA("ATCC--G", identifier="s2"),
        ...         DNA("ATCCGGA", identifier="s3")]
        >>> a1 = Alignment(seqs)
        >>> print(a1.distances())
        3x3 distance matrix
        IDs:
        s1, s2, s3
        Data:
        [[ 0.          0.42857143  0.28571429]
         [ 0.42857143  0.          0.42857143]
         [ 0.28571429  0.42857143  0.        ]]
        """
        sequence_count = self.sequence_count()
        dm = np.zeros((sequence_count, sequence_count))
        identifiers = []
        # ``range`` instead of ``xrange`` so this runs on Python 2 and 3.
        for i in range(sequence_count):
            self_i = self[i]
            identifiers.append(self_i.identifier)
            # Only the lower triangle is computed; each value is mirrored
            # into the upper triangle and the diagonal stays at zero.
            for j in range(i):
                dm[i, j] = dm[j, i] = self_i.distance(self[j])
        return DistanceMatrix(dm, identifiers)
Exemple #28
0
 def setUp(self):
     """Create a 3x3 matrix plus list- and frame-based groupings for it."""
     self.dm = DistanceMatrix([[0.0, 1.0, 2.0],
                               [1.0, 0.0, 3.0],
                               [2.0, 3.0, 0.0]],
                              ['a', 'b', 'c'])
     self.grouping = [1, 2, 1]

     # Ordering of IDs shouldn't matter, nor should extra IDs.
     grouping_csv = 'ID,Group\nb,Group1\na,Group2\nc,Group1\nd,Group3'
     self.df = pd.read_csv(StringIO(grouping_csv), index_col=0)

     # A frame that lacks a row for sample 'a'.
     missing_id_csv = 'ID,Group\nb,Group1\nc,Group1'
     self.df_missing_id = pd.read_csv(StringIO(missing_id_csv), index_col=0)

     self.categorical_stats = CategoricalStats(self.dm, self.grouping)
     self.categorical_stats_from_df = CategoricalStats(
         self.dm, self.df, column='Group')
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it out.

    Parameters
    ----------
    input_file : file path or file-like
        Source of the distance matrix; passed to
        ``DistanceMatrix.from_file`` unchanged.
    output_file : str
        Path of the file to (over)write with the tree in Newick format,
        including branch lengths.

    Raises
    ------
    RuntimeError
        If no tree was produced (``tree`` is None), which surfaces as an
        AttributeError when serialising it.
    AttributeError
        Re-raised unchanged when serialising fails for any other reason.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.from_file(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output; the context manager closes the file on error paths too
    with open(output_file, 'w') as f:
        try:
            f.write(tree.to_newick(with_distances=True))
        except AttributeError:
            # Check the tree itself. The previous check used an undefined
            # name, so it raised NameError instead of the intended error.
            if tree is None:
                raise RuntimeError(
                    "input file %s did not make a UPGMA tree.\n"
                    " Ensure it has more than one sample present"
                    % (str(input_file), ))
            raise
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file and write it out.

    Parameters
    ----------
    input_file : file path or file-like
        Source of the distance matrix; passed to
        ``DistanceMatrix.from_file`` unchanged.
    output_file : str
        Path of the file to (over)write with the tree in Newick format,
        including branch lengths.

    Raises
    ------
    RuntimeError
        If no tree was produced (``tree`` is None), which surfaces as an
        AttributeError when serialising it.
    AttributeError
        Re-raised unchanged when serialising fails for any other reason.
    """
    # read in dist matrix
    dist_mat = DistanceMatrix.from_file(input_file)

    # SciPy uses average as UPGMA:
    # http://docs.scipy.org/doc/scipy/reference/generated/
    #    scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    linkage_matrix = linkage(dist_mat.condensed_form(), method='average')

    tree = TreeNode.from_linkage_matrix(linkage_matrix, dist_mat.ids)

    # write output; the context manager closes the file on error paths too
    with open(output_file, 'w') as f:
        try:
            f.write(tree.to_newick(with_distances=True))
        except AttributeError:
            # Check the tree itself. The previous check used an undefined
            # name, so it raised NameError instead of the intended error.
            if tree is None:
                raise RuntimeError(
                    "input file %s did not make a UPGMA tree.\n"
                    " Ensure it has more than one sample present"
                    % (str(input_file),))
            raise
Exemple #31
0
    def test_default_usage(self):
        """randdm with default arguments yields valid, varying matrices."""
        # A 1x1 result is compared against a matrix holding a single zero.
        expected_single = DistanceMatrix(np.asarray([[0.0]]), ['1'])
        self.assertEqual(randdm(1), expected_single)

        # Default IDs are the 1-based positions as strings.
        pair = randdm(2)
        self.assertEqual(pair.shape, (2, 2))
        self.assertEqual(pair.ids, ('1', '2'))

        # At least one of up to ten further draws must differ from the
        # first one; drawing stops as soon as a difference is seen.
        reference = randdm(5)
        max_trials = 10
        saw_difference = any(reference != randdm(5)
                             for _ in range(max_trials))
        self.assertTrue(saw_difference)
Exemple #32
0
def mantel(x, y, method='pearson', permutations=999, alternative='two-sided'):
    """Compute correlation between distance matrices using the Mantel test.

    The Mantel test compares two distance matrices by computing the correlation
    between the distances in the lower (or upper) triangular portions of the
    symmetric distance matrices. Correlation can be computed using Pearson's
    product-moment correlation coefficient or Spearman's rank correlation
    coefficient.

    As defined in [1]_, the Mantel test computes a test statistic :math:`r_M`
    given two symmetric distance matrices :math:`D_X` and :math:`D_Y`.
    :math:`r_M` is defined as

    .. math::

       r_M=\\frac{1}{d-1}\\sum_{i=1}^{n-1}\\sum_{j=i+1}^{n}
       stand(D_X)_{ij}stand(D_Y)_{ij}

    where

    .. math::

       d=\\frac{n(n-1)}{2}

    and :math:`n` is the number of rows/columns in each of the distance
    matrices. :math:`stand(D_X)` and :math:`stand(D_Y)` are distance matrices
    with their upper triangles containing standardized distances. Note that
    since :math:`D_X` and :math:`D_Y` are symmetric, the lower triangular
    portions of the matrices could equivalently have been used instead of the
    upper triangular portions (the current function behaves in this manner).

    If ``method='spearman'``, the above equation operates on ranked distances
    instead of the original distances.

    Statistical significance is assessed via a permutation test. The rows and
    columns of the first distance matrix (`x`) are randomly permuted a
    number of times (controlled via `permutations`). A correlation coefficient
    is computed for each permutation and the p-value is the proportion of
    permuted correlation coefficients that are equal to or more extreme
    than the original (unpermuted) correlation coefficient. Whether a permuted
    correlation coefficient is "more extreme" than the original correlation
    coefficient depends on the alternative hypothesis (controlled via
    `alternative`).

    Parameters
    ----------
    x, y : array_like or DistanceMatrix
        Input distance matrices to compare. Both matrices must have the same
        shape and be at least 3x3 in size. If ``array_like``, will be cast to
        ``DistanceMatrix`` (thus the requirements of a valid ``DistanceMatrix``
        apply to both `x` and `y`, such as symmetry and hollowness). If inputs
        are already ``DistanceMatrix`` instances, the IDs do not need to match
        between them; they are assumed to both be in the same order regardless
        of their IDs (the underlying data matrix is the only thing considered
        by this function).
    method : {'pearson', 'spearman'}
        Method used to compute the correlation between distance matrices.
    permutations : int, optional
        Number of times to randomly permute `x` when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis to use when calculating statistical
        significance. The default ``'two-sided'`` alternative hypothesis
        calculates the proportion of permuted correlation coefficients whose
        magnitude (i.e. after taking the absolute value) is greater than or
        equal to the absolute value of the original correlation coefficient.
        ``'greater'`` calculates the proportion of permuted coefficients that
        are greater than or equal to the original coefficient. ``'less'``
        calculates the proportion of permuted coefficients that are less than
        or equal to the original coefficient.

    Returns
    -------
    tuple of floats
        Correlation coefficient and p-value of the test.

    Raises
    ------
    ValueError
        If `x` and `y` are not the same shape and at least 3x3 in size, or an
        invalid `method`, number of `permutations`, or `alternative` are
        provided.

    See Also
    --------
    DistanceMatrix
    scipy.stats.pearsonr
    scipy.stats.spearmanr

    Notes
    -----
    The Mantel test was first described in [2]_. The general algorithm and
    interface are similar to ``vegan::mantel``, available in R's vegan
    package [3]_.

    ``np.nan`` will be returned for the p-value if `permutations` is zero or if
    the correlation coefficient is ``np.nan``. The correlation coefficient will
    be ``np.nan`` if one or both of the inputs does not have any variation
    (i.e. the distances are all constant) and ``method='spearman'``.

    References
    ----------
    .. [1] Legendre, P. and Legendre, L. (2012) Numerical Ecology. 3rd English
       Edition. Elsevier.

    .. [2] Mantel, N. (1967). "The detection of disease clustering and a
       generalized regression approach". Cancer Research 27 (2): 209-220. PMID
       6018555.

    .. [3] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Define two 3x3 distance matrices:

    >>> x = [[0, 1, 2],
    ...      [1, 0, 3],
    ...      [2, 3, 0]]
    >>> y = [[0, 2, 7],
    ...      [2, 0, 6],
    ...      [7, 6, 0]]

    Compute the Pearson correlation between them and assess significance using
    a two-sided test with 999 permutations:

    >>> coeff, p_value = mantel(x, y)
    >>> round(coeff, 4)
    0.7559

    Thus, we see a moderate-to-strong positive correlation (:math:`r_M=0.7559`)
    between the two matrices.

    """
    # Validate the plain arguments first so bad options fail before any
    # matrix is built.
    if method == 'pearson':
        corr_func = pearsonr
    elif method == 'spearman':
        corr_func = spearmanr
    else:
        raise ValueError("Invalid correlation method '%s'." % method)

    if permutations < 0:
        raise ValueError("Number of permutations must be greater than or "
                         "equal to zero.")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("Invalid alternative hypothesis '%s'." % alternative)

    x = DistanceMatrix(x)
    y = DistanceMatrix(y)

    if x.shape != y.shape:
        raise ValueError("Distance matrices must have the same shape.")
    if x.shape[0] < 3:
        raise ValueError("Distance matrices must be at least 3x3 in size.")

    x_flat = x.condensed_form()
    y_flat = y.condensed_form()

    orig_stat = corr_func(x_flat, y_flat)[0]

    if permutations == 0 or np.isnan(orig_stat):
        p_value = np.nan
    else:
        # Only `x` is permuted; `y` stays fixed across all permutations.
        perm_gen = (corr_func(x.permute(condensed=True), y_flat)[0]
                    for _ in range(permutations))
        # Use the builtin ``float``: the ``np.float`` alias was removed in
        # NumPy 1.24 and named the same type.
        permuted_stats = np.fromiter(perm_gen, float, count=permutations)

        if alternative == 'two-sided':
            count_better = (np.absolute(permuted_stats) >=
                            np.absolute(orig_stat)).sum()
        elif alternative == 'greater':
            count_better = (permuted_stats >= orig_stat).sum()
        else:
            count_better = (permuted_stats <= orig_stat).sum()

        # The +1 terms count the original statistic as one of the outcomes.
        p_value = (count_better + 1) / (permutations + 1)

    return orig_stat, p_value
Exemple #33
0
class PairwiseMantelTests(TestCase):
    # Tests for ``pwmantel`` built on three tiny 3x3 distance matrices plus
    # two 4x4 variants that each carry one extra, non-shared ID.

    def setUp(self):
        self.minx = DistanceMatrix([[0, 1, 2], [1, 0, 3], [2, 3, 0]])
        self.miny = DistanceMatrix([[0, 2, 7], [2, 0, 6], [7, 6, 0]])
        self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                    [0.5, 0, 0.1],
                                    [0.25, 0.1, 0]])
        self.min_dms = (self.minx, self.miny, self.minz)

        # Same data as minx / minz, extended by one trailing ID ('foo' and
        # 'bar' respectively).
        self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                       [1, 0, 3, 2],
                                       [2, 3, 0, 4],
                                       [7, 2, 4, 0]], ['0', '1', '2', 'foo'])
        self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                       [0.5, 0, 0.1, 24],
                                       [0.25, 0.1, 0, 5],
                                       [3, 24, 5, 0]], ['0', '1', '2', 'bar'])

        # Expected results live in tab-separated files named
        # 'pwmantel_<attribute>.txt'. Column 3 (the p-value) is read as str:
        # the observed frames hold p-values formatted as strings with a fixed
        # number of decimals, and a float column would never compare equal.
        p_val_conv = {3: str}
        expected_attrs = ('exp_results_minimal',
                          'exp_results_minimal_with_labels',
                          'exp_results_duplicate_dms',
                          'exp_results_na_p_value',
                          'exp_results_too_few_permutations',
                          'exp_results_reordered_distance_matrices')
        for attr in expected_attrs:
            frame = pd.read_csv(get_data_path('pwmantel_%s.txt' % attr),
                                sep='\t', index_col=(0, 1),
                                converters=p_val_conv)
            setattr(self, attr, frame)

    def test_minimal_compatible_input(self):
        # IDs already match and are in the same order across matrices.
        np.random.seed(0)

        observed = pwmantel(self.min_dms, alternative='greater')
        assert_frame_equal(observed, self.exp_results_minimal)

    def test_minimal_compatible_input_with_labels(self):
        np.random.seed(0)

        observed = pwmantel(self.min_dms, alternative='greater',
                            labels=('minx', 'miny', 'minz'))
        assert_frame_equal(observed, self.exp_results_minimal_with_labels)

    def test_duplicate_dms(self):
        same_dm_thrice = (self.minx, self.minx, self.minx)
        observed = pwmantel(same_dm_thrice, alternative='less')
        assert_frame_equal(observed, self.exp_results_duplicate_dms)

    def test_na_p_value(self):
        observed = pwmantel((self.miny, self.minx), method='spearman',
                            permutations=0)
        assert_frame_equal(observed, self.exp_results_na_p_value)

    def test_too_few_permutations_for_p_value(self):
        observed = pwmantel((self.miny, self.minx), method='spearman',
                            permutations=9)
        assert_frame_equal(observed, self.exp_results_too_few_permutations)

    def test_reordered_distance_matrices(self):
        # Same IDs everywhere, but each matrix lists them in its own order.
        dm_x = self.minx.filter(['1', '0', '2'])
        dm_y = self.miny.filter(['0', '2', '1'])
        dm_z = self.minz.filter(['1', '2', '0'])

        np.random.seed(0)

        observed = pwmantel((dm_x, dm_y, dm_z), alternative='greater')
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

    def test_strict(self):
        # A mix of shared and non-shared IDs, in differing orders.
        dm_x = self.x_extra.filter(['1', '0', 'foo', '2'])
        dm_y = self.miny.filter(['0', '2', '1'])
        dm_z = self.z_extra.filter(['bar', '1', '2', '0'])
        dms = (dm_x, dm_y, dm_z)

        np.random.seed(0)

        # With strict=False the IDs not present in both matrices are dropped.
        observed = pwmantel(dms, alternative='greater', strict=False)
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

        self.assertRaises(ValueError, pwmantel, dms, strict=True)

    def test_id_lookup(self):
        # IDs differ between matrices; a lookup dict reconciles them.
        self.x_extra.ids = ['a', 'b', 'c', 'foo']
        self.z_extra.ids = ['d', 'e', 'f', 'bar']
        lookup = {'a': '0', 'b': '1', 'c': '2', 'foo': 'foo',
                  'd': '0', 'e': '1', 'f': '2', 'bar': 'bar',
                  '0': '0', '1': '1', '2': '2'}

        dm_x = self.x_extra.filter(['b', 'a', 'foo', 'c'])
        dm_y = self.miny.filter(['0', '2', '1'])
        dm_z = self.z_extra.filter(['bar', 'e', 'f', 'd'])

        # Snapshots used below to verify the inputs are left untouched.
        x_before = dm_x.copy()
        y_before = dm_y.copy()
        z_before = dm_z.copy()

        np.random.seed(0)

        observed = pwmantel((dm_x, dm_y, dm_z), alternative='greater',
                            strict=False, lookup=lookup)
        assert_frame_equal(observed,
                           self.exp_results_reordered_distance_matrices)

        self.assertEqual(dm_x, x_before)
        self.assertEqual(dm_y, y_before)
        self.assertEqual(dm_z, z_before)

    def test_too_few_dms(self):
        self.assertRaises(ValueError, pwmantel, [self.miny])

    def test_invalid_input_type(self):
        # The third element is a plain nested list, not a DistanceMatrix.
        mixed_input = [self.miny, self.minx, [[0, 42], [42, 0]]]
        self.assertRaises(TypeError, pwmantel, mixed_input)

    def test_wrong_number_of_labels(self):
        self.assertRaises(ValueError, pwmantel, self.min_dms,
                          labels=['foo', 'bar'])

    def test_duplicate_labels(self):
        self.assertRaises(ValueError, pwmantel, self.min_dms,
                          labels=['foo', 'bar', 'foo'])

    def test_missing_ids_in_lookup(self):
        # The lookup has no entry for ID '1'.
        incomplete_lookup = {'0': 'a', '2': 'c'}

        self.assertRaises(KeyError, pwmantel, self.min_dms,
                          lookup=incomplete_lookup)

    def test_no_matching_ids(self):
        self.minx.ids = ['foo', 'bar', 'baz']
        self.miny.ids = ['bro', 'fist', 'breh']

        self.assertRaises(ValueError, pwmantel,
                          (self.minx, self.miny, self.minz), strict=False)
Exemple #34
0
 def test_from_file_invalid_input(self):
     """Raises error on invalid distance matrix file."""
     # Asymmetric input: parsing it is expected to raise.
     self.assertRaises(DistanceMatrixError, DistanceMatrix.from_file,
                       self.dm_2x2_asym_f)
Exemple #35
0
    def setUp(self):
        """Load distance matrices, metadata frames and expected results."""
        def load_dm(filename):
            return DistanceMatrix.from_file(get_data_path(filename))

        def load_tsv(filename, **read_kwargs):
            return pd.read_csv(get_data_path(filename), sep='\t',
                               **read_kwargs)

        # First dataset: a subset of the Lauber et al. 2009 "88 Soils" data,
        # deliberately altered so that
        #
        # - the distance matrix IDs and the metadata IDs are not in exactly
        #   the same order
        # - the metadata holds one extra sample that is absent from the
        #   distance matrix
        # - that extra sample has non-numeric and missing cells
        #
        # Further variants (below) cover reordered rows/columns and an extra
        # non-numeric metadata column. The data is non-trivial in size (6
        # samples, 11 environment variables) and mixes positive/negative
        # floats and integers.
        self.dm = load_dm('dm.txt')

        # Conceptually the same matrix, with rows and columns (IDs) reordered.
        self.dm_reordered = load_dm('dm_reordered.txt')

        self.df = load_tsv('df.txt', index_col=0)

        # Like self.df, plus one non-numeric column and with some rows and
        # columns shuffled.
        self.df_extra_column = load_tsv('df_extra_column.txt', index_col=0)

        # Every column of the original frame; all of them are numeric.
        self.cols = self.df.columns.tolist()

        # Second dataset: derived from the example data of vegan::bioenv
        # (varespec and varechem). That example consists of a site x species
        # table and a frame of environmental variables. Because the bioenv
        # function under test takes a distance matrix, a Bray-Curtis matrix
        # derived from the site x species table is used (matching what
        # vegan::bioenv does by default when given such a table). Only the
        # numeric environmental variables relevant to these tests are kept:
        # log(N), P, K, Ca, pH, Al
        self.dm_vegan = load_dm('bioenv_dm_vegan.txt')
        vegan_df = load_tsv('bioenv_df_vegan.txt', converters={0: str})
        vegan_df.set_index('#SampleID', inplace=True)
        self.df_vegan = vegan_df

        # Expected results, as (attribute name, data file) pairs.
        expected_files = (
            ('exp_results', 'exp_results.txt'),
            ('exp_results_single_column', 'exp_results_single_column.txt'),
            ('exp_results_different_column_order',
             'exp_results_different_column_order.txt'),
            ('exp_results_vegan', 'bioenv_exp_results_vegan.txt'),
        )
        for attr, filename in expected_files:
            setattr(self, attr, load_tsv(filename, index_col=0))
Exemple #36
0
 def test_nj_error(self):
     # nj is expected to refuse a matrix with only two IDs.
     two_taxa = DistanceMatrix([[0, 3], [3, 0]], list('ab'))
     with self.assertRaises(ValueError):
         nj(two_taxa)
Exemple #37
0
 def test_nj_trivial(self):
     # Smallest accepted case here: three IDs, result rendered as a string.
     three_taxa = DistanceMatrix([[0, 3, 2], [3, 0, 3], [2, 3, 0]],
                                 list('abc'))
     observed = nj(three_taxa, result_constructor=str)
     self.assertEqual(observed, "(b:2.000000, a:1.000000, c:1.000000);")
Exemple #38
0
def run_mantel_correlogram(fps,
                           distmats,
                           num_perms,
                           comment,
                           alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class

    Raises:
        ValueError - if fps and distmats differ in length
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    # Output is accumulated as a list of chunks and joined once at the end.
    chunks = [comment + 'DM1\tDM2\tNumber of entries\t'
                        'Number of permutations\tClass index\t'
                        'Number of distances\tMantel r statistic\t'
                        'p-value\tp-value (Bonferroni corrected)\tTail type\n']
    correlogram_fps = []
    correlograms = []

    # Materialize the pairing so it can be sliced; zip() returns a
    # non-subscriptable iterator on Python 3.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(named_dms):
        for fp2, (dm2_labels, dm2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered versions get their own names:
            # rebinding dm1_labels/dm1_data here would make every later pair
            # involving fp1 start from an already-filtered first matrix.
            (labels1, data1), (labels2, data2) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data),
                                                  lookup=sample_id_map)
            if len(labels1) < 3:
                chunks.append('%s\t%s\t%d\tToo few samples\n'
                              % (fp1, fp2, len(labels1)))
                continue

            dm1 = DistanceMatrix(data1, labels1)
            dm2 = DistanceMatrix(data2, labels2)

            # Create an instance of our Mantel correlogram test and run it with
            # the specified number of permutations.
            mc = MantelCorrelogram(
                dm1,
                dm2,
                alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself. The trailing '.' lets callers append a file
            # extension directly.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file.
            per_class = zip(results['class_index'], results['num_dist'],
                            results['mantel_r'], results['mantel_p'],
                            results['mantel_p_corr'])
            for row, (class_idx, num_dist, r, p, p_corr) in \
                    enumerate(per_class):
                # Format p-values and figure out which tail type we have based
                # on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if row == 0:
                    # Only the first row of a pair carries the pair-level
                    # columns; later rows leave them blank.
                    chunks.append(
                        '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            fp1, fp2, len(labels1), num_perms, class_idx,
                            num_dist, r, p_str, p_corr_str, tail_type))
                else:
                    chunks.append(
                        '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                            class_idx, num_dist, r, p_str, p_corr_str,
                            tail_type))
    return ''.join(chunks), correlogram_fps, correlograms
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Run one statistical method on a distance matrix and write its results.

    Nothing is returned; every result is written to files inside out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method: 'anosim', 'permanova',
            'adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda' or 'best'
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # A bare string would be a common mistake here; insist on a list.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # SampleID is rejected outright: it is not a category, either in the data
    # structure or statistically (every entry is unique, so no groups form).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    with open(dm_fp, 'U') as dm_f:
        dm = DistanceMatrix.from_file(dm_f)

    # ANOSIM and PERMANOVA come from skbio. Other methods still live in
    # qiime.stats; once they are ported much of the logic below can go away.
    if method in ('anosim', 'permanova'):
        method_cls = ANOSIM if method == 'anosim' else PERMANOVA

        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        method_inst = method_cls(dm, df, column=categories[0])
        results = method_inst(num_perms)

        with open(join(out_dir, '%s_results.txt' % method), 'w') as out_f:
            out_f.write(results.summary())
        return

    # Drop mapping-file samples that are missing from the distance matrix
    # (needed by the validation checks). strict=True makes this raise when the
    # distance matrix holds a sample the mapping file lacks.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)
    md_map.filterSamples(dm.ids, strict=True)

    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods run in R, so their input must be validated here first.
        # (The pure-Python implementations validate inside the stats classes.)

        # Every category must exist in the mapping file and must hold more
        # than one distinct value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' "
                                 "cannot operate on a category that "
                                 "creates only a single group of samples "
                                 "(e.g. there are no 'between' distances "
                                 "because there is only a single group)."
                                 % (category, method))

        # Command-line arguments handed to the R script.
        r_args = '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0],
                                              out_dir)

        if method == 'morans_i':
            # Moran's I works on numeric categories only.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. "
                                    "Not all values could be converted to "
                                    "numbers." % category)
        else:
            # All other R methods need groups of samples, so a category whose
            # values are all unique is unusable.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are "
                                     "unique. This statistical method "
                                     "cannot operate on a category with "
                                     "unique values (e.g. there are no "
                                     "'within' distances because each "
                                     "group of samples contains only a "
                                     "single sample)." % category)

            # Moran's I is the only R method without a permutation count.
            if num_perms < 0:
                raise ValueError("The number of permutations must be "
                                 "greater than or equal to zero.")

            r_args += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex([r_args], '%s.r' % method, output_dir=out_dir)
    elif method == 'best':
        best_results = Best(dm, md_map, categories)()

        with open(join(out_dir, '%s_results.txt' % method), 'w') as out_f:
            out_f.write(format_best_results(best_results))
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))
 def test_from_file_invalid_input(self):
     """Raises error on invalid distance matrix file."""
     # Asymmetric input: parsing it is expected to raise.
     self.assertRaises(DistanceMatrixError, DistanceMatrix.from_file,
                       self.dm_2x2_asym_f)
Exemple #41
0
def run_mantel_test(method,
                    fps,
                    distmats,
                    num_perms,
                    tail_type,
                    comment,
                    control_dm_fp=None,
                    control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
            (None is treated as an empty string)
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)

    Raises:
        ValueError - if fps and distmats differ in length, if method is not
            recognized, or if method is partial_mantel and the control matrix
            or its filepath is missing
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''

    if method == 'mantel':
        header = ('DM1\tDM2\tNumber of entries\tMantel r statistic\t'
                  'p-value\tNumber of permutations\tTail type\n')
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        header = ('DM1\tDM2\tCDM\tNumber of entries\t'
                  'Mantel r statistic\tp-value\tNumber of permutations\t'
                  'Tail type\n')
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Output is accumulated as a list of chunks and joined once at the end.
    chunks = [comment + header]

    # Materialize the pairing so it can be sliced; zip() returns a
    # non-subscriptable iterator on Python 3.
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(named_dms):
        for fp2, (dm2_labels, dm2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. The filtered versions get their own names:
            # rebinding dm1_labels/dm1_data here would make every later pair
            # involving fp1 start from an already-filtered first matrix.
            (labels1, data1), (labels2, data2) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data),
                                                  lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices): first
                # against the control matrix, then the second matrix again so
                # that it is reduced to the same samples.
                (labels1, data1), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (labels1, data1), control_dm,
                        lookup=sample_id_map)
                (labels1, data1), (labels2, data2) = \
                    make_compatible_distance_matrices(
                        (labels1, data1), (labels2, data2),
                        lookup=sample_id_map)
                if len(labels1) < 3:
                    chunks.append('%s\t%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, control_dm_fp, len(labels1)))
                    continue
            elif len(labels1) < 3:
                chunks.append('%s\t%s\t%d\tToo few samples\n'
                              % (fp1, fp2, len(labels1)))
                continue

            dm1 = DistanceMatrix(data1, labels1)
            dm2 = DistanceMatrix(data2, labels2)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                chunks.append("%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, len(labels1), results['r_value'], p_str,
                    num_perms, tail_type))
            elif method == 'partial_mantel':
                cdm = DistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                # The tail type column is always reported as 'greater' for
                # the partial test; tail_type is not consulted here.
                chunks.append("%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, control_dm_fp, len(labels1),
                    results['mantel_r'], p_str, num_perms, 'greater'))
    return ''.join(chunks)
Exemple #42
0
def _write_results_file(out_dir, method, text):
    """Write text to '<method>_results.txt' inside out_dir.

    The file is opened in a with-block so that the handle is always closed,
    even if the write itself fails.
    """
    with open(join(out_dir, '%s_results.txt' % method), 'w+') as out_f:
        out_f.write(text)


def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run. 'adonis', 'morans_i',
            'mrpp', 'permdisp' and 'dbrda' are run through the R script
            '<method>.r'; 'anosim', 'best' and 'permanova' are run through
            their Python classes and written to '<method>_results.txt'
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it

    Raises:
        TypeError - if categories is not a list, or (for 'morans_i') if a
            category is not numeric
        ValueError - if 'SampleID' is one of the categories, if method is not
            recognized, or (for the R-based methods) if a category is missing
            from the mapping file, has a single value, has only unique values,
            or num_perms is negative
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, list):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    with open(map_fp, 'U') as map_f:
        md_map = MetadataMap.parseMetadataMap(map_f)

    with open(dm_fp, 'U') as dm_f:
        dm = DistanceMatrix.from_file(dm_f)

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.ids, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)." %
                                 (category, method))

        # Build the command arguments string. Only the first category is
        # handed to the R script.
        command_args = [
            '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0], out_dir)
        ]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError(
                        "The category '%s' is not numeric. Not "
                        "all values could be converted to numbers." % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)
        # Format first so a formatting failure does not leave an empty
        # results file behind.
        _write_results_file(out_dir, method,
                            format_anosim_results(anosim_results))
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()
        _write_results_file(out_dir, method,
                            format_best_results(best_results))
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)
        _write_results_file(out_dir, method,
                            format_permanova_results(permanova_results))
    else:
        # NOTE(review): `methods` is not defined in this function; it is
        # presumably a module-level list of valid method names -- confirm.
        raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                         (method, methods))
class DistanceMatrixTests(DissimilarityMatrixTestData):
    """Tests for DistanceMatrix built on the shared dissimilarity fixtures."""

    def setUp(self):
        """Build 1x1, 2x2 and 3x3 matrices from the inherited fixture data."""
        super(DistanceMatrixTests, self).setUp()

        self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
        self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
        self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])

        # Matrices and their expected condensed forms, in matching order.
        self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]
        self.dm_condensed_forms = [
            np.array(values)
            for values in ([], [0.123], [0.01, 4.2, 12.0])
        ]

    def test_from_file_with_file_path(self):
        """from_file accepts a file path: bad file raises, good file parses."""
        # A malformed file must be rejected with the format error.
        self.assertRaises(DissimilarityMatrixFormatError,
                          DistanceMatrix.from_file, self.bad_dm_fp)

        parsed = DistanceMatrix.from_file(self.dm_3x3_fp)
        self.assertEqual(self.dm_3x3, parsed)
        self.assertIsInstance(parsed, DistanceMatrix)

    def test_from_file_invalid_input(self):
        """from_file raises DistanceMatrixError for an asymmetric matrix."""
        self.assertRaises(DistanceMatrixError,
                          DistanceMatrix.from_file, self.dm_2x2_asym_f)

    def test_init_invalid_input(self):
        """The constructor rejects invalid data / ID combinations."""
        # Asymmetric data is specific to DistanceMatrix validation.
        asymmetric = [[0.0, 2.0], [1.0, 0.0]]
        self.assertRaises(DistanceMatrixError,
                          DistanceMatrix, asymmetric, ['a', 'b'])

        # Superclass validation must still fire as well.
        self.assertRaises(DissimilarityMatrixError,
                          DistanceMatrix, [[1, 2, 3]], ['a'])

    def test_condensed_form(self):
        """condensed_form() matches the expected vector for each fixture."""
        for matrix, expected in zip(self.dms, self.dm_condensed_forms):
            self.assertTrue(np.array_equal(matrix.condensed_form(), expected))

    def test_permute_condensed(self):
        """permute(condensed=True) gives the expected condensed vectors."""
        # 1x1 and 2x2 matrices have nothing to shuffle, so repeated calls
        # keep returning the same condensed vector.
        trivial_cases = ((self.dm_1x1, []), (self.dm_2x2, [0.123]))
        for matrix, unchanged in trivial_cases:
            for _ in range(2):
                npt.assert_equal(matrix.permute(condensed=True),
                                 np.array(unchanged))

        snapshot = self.dm_3x3.copy()

        # Fix the seed so the two consecutive permutations are reproducible.
        np.random.seed(0)
        for expected in ([12.0, 4.2, 0.01], [4.2, 12.0, 0.01]):
            npt.assert_equal(self.dm_3x3.permute(condensed=True),
                             np.array(expected))

        # permute() must leave the matrix it was called on untouched.
        self.assertEqual(self.dm_3x3, snapshot)

    def test_permute_not_condensed(self):
        """permute() returns a new, correctly shuffled DistanceMatrix."""
        # Trivial sizes: equal contents, but a distinct object.
        for matrix in (self.dm_1x1, self.dm_2x2):
            shuffled = matrix.permute()
            self.assertEqual(shuffled, matrix)
            self.assertIsNot(shuffled, matrix)

        np.random.seed(0)

        # Expected results of the first and second permutation after seeding.
        expected_rows = (
            [[0, 12, 4.2],
             [12, 0, 0.01],
             [4.2, 0.01, 0]],
            [[0, 4.2, 12],
             [4.2, 0, 0.01],
             [12, 0.01, 0]],
        )
        for rows in expected_rows:
            exp = DistanceMatrix(rows, self.dm_3x3.ids)
            obs = self.dm_3x3.permute()
            self.assertEqual(obs, exp)

    def test_eq(self):
        """== holds in both directions against an equal DissimilarityMatrix."""
        # Same data and IDs, but the parent matrix type.
        counterpart = DissimilarityMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
        self.assertTrue(self.dm_3x3 == counterpart)
        self.assertTrue(counterpart == self.dm_3x3)

    def test_validate(self):
        """Empty stub: DistanceMatrix._validate tested elsewhere."""
Exemple #44
0
 def setup(self):
     with open(get_data_path('PCoA_sample_data_3'), 'U') as lines:
         dist_matrix = DistanceMatrix.from_file(lines)
     self.ordination = PCoA(dist_matrix)
     self.ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                 'PC.355', 'PC.607', 'PC.634']