def test_matrix_parse(self):
    """Check which inputs treecluster accepts as a data matrix.

    Two well-formed matrices (a numpy array and a nested list) must be
    accepted without raising.  Ragged matrices, matrices with non-numeric
    cells or rows, and objects that are not matrices at all must be
    rejected with TypeError.
    """
    # The implementation under test is selected by TestCluster.module.
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    # Well-formed matrix given as a numpy array
    data1 = numpy.array([[1.1, 1.2],
                         [1.4, 1.3],
                         [1.1, 1.5],
                         [2.0, 1.5],
                         [1.7, 1.9],
                         [1.7, 1.9],
                         [5.7, 5.9],
                         [5.7, 5.9],
                         [3.1, 3.3],
                         [5.4, 5.3],
                         [5.1, 5.5],
                         [5.0, 5.5],
                         [5.1, 5.2]])
    # Well-formed matrix given as a plain nested list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
             [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5],
             [12.1, 2.0, 0.0, 5.0, 0.0]]
    # Rows of unequal length
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
             [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3],
             [12.1, 92.0, 90.0, 95.0, 90.0]]
    # Non-numeric cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
             [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]
    # A row that is not a sequence
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
             None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]
    # Objects that are not matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]

    # Each valid matrix is paired with the failure message reported when
    # treecluster refuses it.
    well_formed = (
        (data1, "treecluster failed to accept matrix data1"),
        (data2, "treecluster failed to accept matrix data2"),
    )
    for matrix, complaint in well_formed:
        try:
            treecluster(matrix)
        except Exception:  # TODO - Which exceptions?
            self.fail(complaint)

    for malformed in (data3, data4, data5, data6, data7, data8, data9):
        self.assertRaises(TypeError, treecluster, malformed)
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Pairwise distances are derived from local pairwise alignment scores and
    fed to hierarchical clustering; the resulting tree is then cut at every
    branch longer than ``dist_threshold``.

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list.
        Both lists are empty when ``seqlist`` is empty.
    """
    n_seqs = len(seqlist)
    if n_seqs == 0:
        # Nothing to cluster; an empty distance matrix cannot be clustered.
        return ([], [])
    if n_seqs == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to a full (square) distance matrix
    distmatrix = []
    for seq1 in seqlist:
        row = []
        for seq2 in seqlist:
            maxlen = max(len(seq1), len(seq2))
            # Identity fraction: alignment score over the longer sequence
            # of the pair (original author's note: match +1, every other
            # operation +0), then converted to a distance fraction.
            idval = aligner.align(seq1, seq2).score / maxlen
            row.append(1 - idval)
        distmatrix.append(row)

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))

    # Number of clusters = number of branches longer than the threshold + 1
    long_branches = sum(
        1 for i in range(len(htree)) if htree[i].distance > dist_threshold
    )
    cuts = 1 + long_branches
    clust_ids = list(htree.cut(cuts))

    # Group the sequences by cluster id, keeping first-seen cluster order
    clust_seqs_dict = defaultdict(list)
    for seq, clust_id in zip(seqlist, clust_ids):
        clust_seqs_dict[clust_id].append(seq)
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
def test_matrix_parse(self):
    """Check which inputs treecluster accepts as a data matrix.

    Two well-formed matrices (a numpy array and a nested list) must be
    accepted; if treecluster raises, the test fails and reports the
    underlying error.  Ragged matrices, matrices with bad cells or rows,
    and objects that are not matrices must raise TypeError.
    """
    # The implementation under test is selected by TestCluster.module.
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    # Normal matrix, no errors
    data1 = numpy.array([[1.1, 1.2],
                         [1.4, 1.3],
                         [1.1, 1.5],
                         [2.0, 1.5],
                         [1.7, 1.9],
                         [1.7, 1.9],
                         [5.7, 5.9],
                         [5.7, 5.9],
                         [3.1, 3.3],
                         [5.4, 5.3],
                         [5.1, 5.5],
                         [5.0, 5.5],
                         [5.1, 5.2]])
    # Another normal matrix, no errors; written as a list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
             [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5],
             [12.1, 2.0, 0.0, 5.0, 0.0]]
    # Ragged matrix
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
             [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3],
             [12.1, 92.0, 90.0, 95.0, 90.0]]
    # Matrix with bad cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
             [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]
    # Matrix with a bad row
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
             None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]
    # Various references that don't point to matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]

    # Catch Exception rather than everything: a bare except would also
    # swallow KeyboardInterrupt/SystemExit.  The cause is appended so a
    # failure is diagnosable from the test report.
    try:
        treecluster(data1)
    except Exception as err:
        self.fail("treecluster failed to accept matrix data1: %s" % err)
    try:
        treecluster(data2)
    except Exception as err:
        self.fail("treecluster failed to accept matrix data2: %s" % err)

    for bad_matrix in (data3, data4, data5, data6, data7, data8, data9):
        self.assertRaises(TypeError, treecluster, bad_matrix)
def test_treecluster(self):
    """Check treecluster argument validation and the trees it builds.

    Invalid ``method`` and ``dist`` codes must raise ValueError.  Two data
    sets are then clustered with average ('a'), single ('s'), centroid
    ('c') and maximum ('m') linkage using dist='e', and every node of each
    resulting tree is compared with its expected (left, right, distance)
    values, distances to three decimal places.
    """
    # The implementation under test is selected by TestCluster.module.
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    def check_tree(data, mask, weight, method, expected):
        # Cluster with the given linkage method and compare each node, in
        # order, with its expected (left, right, distance) triple.
        tree = treecluster(data=data, mask=mask, weight=weight,
                           transpose=0, method=method, dist='e')
        self.assertEqual(len(tree), len(data) - 1)
        for index, (left, right, distance) in enumerate(expected):
            node = tree[index]
            self.assertEqual(node.left, left)
            self.assertEqual(node.right, right)
            self.assertAlmostEqual(node.distance, distance, places=3)

    # First data set
    weight1 = [1, 1, 1, 1, 1]
    data1 = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                         [3.1, 3.2, 1.3, 2.4, 1.5],
                         [4.1, 2.2, 0.3, 5.4, 0.5],
                         [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = numpy.array([[1, 1, 1, 1, 1],
                         [1, 1, 1, 1, 1],
                         [1, 1, 1, 1, 1],
                         [1, 1, 1, 1, 1]], int)

    # TODO - Use a context manager here once we drop Python 2.6
    # Method should be one letter:
    self.assertRaises(ValueError, treecluster,
                      data=data1, mask=mask1, weight=weight1,
                      transpose=0, method="any", dist="e")
    # Distance should be one letter.  The method is valid here so that the
    # ValueError can only come from the dist argument; with an invalid
    # method as well, this check would pass without dist being validated.
    self.assertRaises(ValueError, treecluster,
                      data=data1, mask=mask1, weight=weight1,
                      transpose=0, method="a", dist="euclidean")

    # Test first data set
    # Pairwise average-linkage clustering
    check_tree(data1, mask1, weight1, 'a', [
        (2, 1, 2.600),
        (-1, 0, 7.300),
        (3, -2, 21.348),
    ])
    # Pairwise single-linkage clustering
    check_tree(data1, mask1, weight1, 's', [
        (1, 2, 2.600),
        (0, -1, 5.800),
        (-2, 3, 12.908),
    ])
    # Pairwise centroid-linkage clustering
    check_tree(data1, mask1, weight1, 'c', [
        (1, 2, 2.600),
        (0, -1, 6.650),
        (-2, 3, 19.437),
    ])
    # Pairwise maximum-linkage clustering
    check_tree(data1, mask1, weight1, 'm', [
        (2, 1, 2.600),
        (-1, 0, 8.800),
        (3, -2, 32.508),
    ])

    # Second data set
    weight2 = [1, 1]
    data2 = numpy.array([[0.8223, 0.9295],
                         [1.4365, 1.3223],
                         [1.1623, 1.5364],
                         [2.1826, 1.1934],
                         [1.7763, 1.9352],
                         [1.7215, 1.9912],
                         [2.1812, 5.9935],
                         [5.3290, 5.9452],
                         [3.1491, 3.3454],
                         [5.1923, 5.3156],
                         [4.7735, 5.4012],
                         [5.1297, 5.5645],
                         [5.3934, 5.1823]])
    mask2 = numpy.array([[1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1],
                         [1, 1]], int)

    # Test second data set
    # Pairwise average-linkage clustering
    check_tree(data2, mask2, weight2, 'a', [
        (5, 4, 0.003),
        (9, 12, 0.029),
        (2, 1, 0.061),
        (11, -2, 0.070),
        (-4, 10, 0.128),
        (7, -5, 0.224),
        (-3, 0, 0.254),
        (-1, 3, 0.391),
        (-8, -7, 0.532),
        (8, -9, 3.234),
        (-6, 6, 4.636),
        (-11, -10, 12.741),
    ])
    # Pairwise single-linkage clustering
    check_tree(data2, mask2, weight2, 's', [
        (4, 5, 0.003),
        (9, 12, 0.029),
        (11, -2, 0.033),
        (1, 2, 0.061),
        (10, -3, 0.077),
        (7, -5, 0.092),
        (0, -4, 0.242),
        (-7, -1, 0.246),
        (3, -8, 0.287),
        (-9, 8, 1.936),
        (-10, -6, 3.432),
        (6, -11, 3.535),
    ])
    # Pairwise centroid-linkage clustering
    check_tree(data2, mask2, weight2, 'c', [
        (4, 5, 0.003),
        (12, 9, 0.029),
        (1, 2, 0.061),
        (-2, 11, 0.063),
        (10, -4, 0.109),
        (-5, 7, 0.189),
        (0, -3, 0.239),
        (3, -1, 0.390),
        (-7, -8, 0.382),
        (-9, 8, 3.063),
        (6, -6, 4.578),
        (-10, -11, 11.536),
    ])
    # Pairwise maximum-linkage clustering
    check_tree(data2, mask2, weight2, 'm', [
        (5, 4, 0.003),
        (9, 12, 0.029),
        (2, 1, 0.061),
        (11, 10, 0.077),
        (-2, -4, 0.216),
        (-3, 0, 0.266),
        (-5, 7, 0.302),
        (-1, 3, 0.425),
        (-8, -6, 0.968),
        (8, 6, 3.975),
        (-10, -7, 5.755),
        (-11, -9, 22.734),
    ])
# @Date: 2019-05-27T10:15:26+08:00
# @Email: [email protected]
# @Filename: BioPy_1730416009_0527.py
# @Last modified time: 2019-05-27T14:31:37+08:00
import pandas as pd
from Bio.Cluster import treecluster

# Load the expression table from the Excel workbook.
dfrm = pd.read_excel('./ExpressionData.xlsx')

# Everything except the 'ID' column becomes the numeric data matrix.
data_array = dfrm.drop('ID', axis=1).values

# Cluster the rows (genes/proteins): transpose=0, method='s', dist='e'.
tree_gene = treecluster(data=data_array, transpose=0, method='s', dist='e')

# Cluster the columns (experiment conditions): transpose=1, method='m',
# dist='e'.
# NOTE(review): the earlier comment here described this call as pairwise
# single-linkage, but the code passes method='m' while the gene tree above
# passes method='s' -- confirm which linkage is intended for each tree.
tree_exp = treecluster(data=data_array, transpose=1, method='m', dist='e')

# Write both trees to the results file, condition tree first.
report_lines = [
    "# Cluster Tree of Exp Condiction\n",
    str(tree_exp) + '\n',
    "# Cluster Tree of Gene\n",
    str(tree_gene) + '\n',
]
with open('./Results.txt', 'wt') as out_file:
    out_file.write("".join(report_lines))
def test_matrix_parse(module):
    """Print whether treecluster accepts or refuses a series of matrices.

    ``module`` names the implementation to exercise: 'Bio.Cluster' or
    'Pycluster'.  Any other name raises ValueError.  data1 and data2 are
    well-formed and should be accepted; data3 to data10 are malformed and
    should be refused.  One line is printed per matrix, followed by a
    blank line.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid Python; raise a real exception.
        raise ValueError('Unknown module name: %s' % module)
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function.
    print("test_matrix_parse:")

    # Normal matrix, no errors
    data1 = array([[1.1, 1.2],
                   [1.4, 1.3],
                   [1.1, 1.5],
                   [2.0, 1.5],
                   [1.7, 1.9],
                   [1.7, 1.9],
                   [5.7, 5.9],
                   [5.7, 5.9],
                   [3.1, 3.3],
                   [5.4, 5.3],
                   [5.1, 5.5],
                   [5.0, 5.5],
                   [5.1, 5.2]])
    # Another normal matrix, no errors; written as a list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
             [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5],
             [12.1, 2.0, 0.0, 5.0, 0.0]]
    # Ragged matrix
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
             [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3],
             [12.1, 92.0, 90.0, 95.0, 90.0]]
    # Matrix with bad cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
             [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]
    # Matrix with a bad row
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
             None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]
    # Various references that don't point to matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]
    data10 = [[None]]

    # Well-formed matrices: a refusal is an error and must be reported.
    # (Previously the error text was a bare expression and never printed.)
    for name, matrix in (("data1", data1), ("data2", data2)):
        try:
            treecluster(matrix)
        except Exception:
            print("Error: treecluster failed to accept matrix %s" % name)
        else:
            print("Read %s (correct)" % name)

    # Malformed inputs: acceptance is the error.
    malformed = (("data3", data3), ("data4", data4), ("data5", data5),
                 ("data6", data6), ("data7", data7), ("data8", data8),
                 ("data9", data9), ("data10", data10))
    for name, matrix in malformed:
        try:
            treecluster(matrix)
        except Exception:
            print("Refused incorrect matrix %s" % name)
        else:
            print("Error: treecluster incorrectly accepted %s" % name)
    print("")
def test_treecluster(module):
    """Print the trees treecluster builds for two fixed data sets.

    ``module`` names the implementation to exercise: 'Bio.Cluster' or
    'Pycluster'.  Any other name raises ValueError.  Each data set is
    clustered with average ('a'), single ('s'), centroid ('c') and maximum
    ('m') linkage using dist='e'; for every run the node count, the
    link-distance count and one line per node are printed.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid Python; raise a real exception.
        raise ValueError('Unknown module name: %s' % module)
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function.
    print("test_treecluster:")

    # First data set
    weight1 = [1, 1, 1, 1, 1]
    data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5],
                   [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5],
                   [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = array([[1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1]])

    # Second data set
    weight2 = [1, 1]
    data2 = array([[0.8223, 0.9295],
                   [1.4365, 1.3223],
                   [1.1623, 1.5364],
                   [2.1826, 1.1934],
                   [1.7763, 1.9352],
                   [1.7215, 1.9912],
                   [2.1812, 5.9935],
                   [5.3290, 5.9452],
                   [3.1491, 3.3454],
                   [5.1923, 5.3156],
                   [4.7735, 5.4012],
                   [5.1297, 5.5645],
                   [5.3934, 5.1823]])
    mask2 = array([[1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1]])

    # Heading printed for each linkage method, in the order they are run.
    linkages = (
        ("Pairwise average-linkage clustering", 'a'),
        ("Pairwise single-linkage clustering", 's'),
        ("Pairwise centroid-linkage clustering", 'c'),
        ("Pairwise maximum-linkage clustering", 'm'),
    )

    def report(data, mask, weight):
        # Run every linkage method on one data set and print the result.
        # treecluster is unpacked as (node pairs, link distances), as the
        # original code did.
        expected_nodes = len(data) - 1
        for heading, method in linkages:
            print(heading)
            result, linkdist = treecluster(data=data, mask=mask,
                                           weight=weight, transpose=0,
                                           method=method, dist='e')
            print("Number of nodes is %d (should be %d)"
                  % (len(result), expected_nodes))
            print("Number of link distances is %d (should be %d)"
                  % (len(linkdist), expected_nodes))
            for i in range(len(result)):
                print("Node %3d joins node %3d with node %3d; link distance is %7.3f"
                      % (i, result[i][0], result[i][1], linkdist[i]))

    # test first data set
    print("First data set:")
    print_matrix(data1, mask1)
    report(data1, mask1, weight1)

    # Test second data set
    print("Second data set:")
    report(data2, mask2, weight2)
    print("")
def test_treecluster(self):
    """Compare treecluster output with known trees for two data sets.

    Each data set is clustered with average ("a"), single ("s"), centroid
    ("c") and maximum ("m") linkage using dist="e".  For every tree the
    node count and each node's left, right and distance values are
    checked, distances to three decimal places.
    """
    # The implementation under test is selected by TestCluster.module.
    if TestCluster.module == "Bio.Cluster":
        from Bio.Cluster import treecluster
    elif TestCluster.module == "Pycluster":
        from Pycluster import treecluster

    def verify(data, mask, weight, method, nodes):
        # Build the tree, then walk the expected (left, right, distance)
        # triples in node order.
        tree = treecluster(data=data, mask=mask, weight=weight, transpose=0, method=method, dist="e")
        self.assertEqual(len(tree), len(data) - 1)
        for position, (left, right, distance) in enumerate(nodes):
            self.assertEqual(tree[position].left, left)
            self.assertEqual(tree[position].right, right)
            self.assertAlmostEqual(tree[position].distance, distance, 3)

    # ---- First data set ----
    weight1 = [1, 1, 1, 1, 1]
    data1 = numpy.array(
        [
            [1.1, 2.2, 3.3, 4.4, 5.5],
            [3.1, 3.2, 1.3, 2.4, 1.5],
            [4.1, 2.2, 0.3, 5.4, 0.5],
            [12.1, 2.0, 0.0, 5.0, 0.0],
        ]
    )
    mask1 = numpy.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], int)

    # Expected nodes per linkage method, listed in the order they are run:
    # average, single, centroid, maximum.
    expected_first = (
        ("a", ((2, 1, 2.600), (-1, 0, 7.300), (3, -2, 21.348))),
        ("s", ((1, 2, 2.600), (0, -1, 5.800), (-2, 3, 12.908))),
        ("c", ((1, 2, 2.600), (0, -1, 6.650), (-2, 3, 19.437))),
        ("m", ((2, 1, 2.600), (-1, 0, 8.800), (3, -2, 32.508))),
    )
    for method, nodes in expected_first:
        verify(data1, mask1, weight1, method, nodes)

    # ---- Second data set ----
    weight2 = [1, 1]
    data2 = numpy.array(
        [
            [0.8223, 0.9295],
            [1.4365, 1.3223],
            [1.1623, 1.5364],
            [2.1826, 1.1934],
            [1.7763, 1.9352],
            [1.7215, 1.9912],
            [2.1812, 5.9935],
            [5.3290, 5.9452],
            [3.1491, 3.3454],
            [5.1923, 5.3156],
            [4.7735, 5.4012],
            [5.1297, 5.5645],
            [5.3934, 5.1823],
        ]
    )
    mask2 = numpy.array(
        [[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]],
        int,
    )

    expected_second = (
        # Pairwise average-linkage clustering
        (
            "a",
            (
                (5, 4, 0.003),
                (9, 12, 0.029),
                (2, 1, 0.061),
                (11, -2, 0.070),
                (-4, 10, 0.128),
                (7, -5, 0.224),
                (-3, 0, 0.254),
                (-1, 3, 0.391),
                (-8, -7, 0.532),
                (8, -9, 3.234),
                (-6, 6, 4.636),
                (-11, -10, 12.741),
            ),
        ),
        # Pairwise single-linkage clustering
        (
            "s",
            (
                (4, 5, 0.003),
                (9, 12, 0.029),
                (11, -2, 0.033),
                (1, 2, 0.061),
                (10, -3, 0.077),
                (7, -5, 0.092),
                (0, -4, 0.242),
                (-7, -1, 0.246),
                (3, -8, 0.287),
                (-9, 8, 1.936),
                (-10, -6, 3.432),
                (6, -11, 3.535),
            ),
        ),
        # Pairwise centroid-linkage clustering
        (
            "c",
            (
                (4, 5, 0.003),
                (12, 9, 0.029),
                (1, 2, 0.061),
                (-2, 11, 0.063),
                (10, -4, 0.109),
                (-5, 7, 0.189),
                (0, -3, 0.239),
                (3, -1, 0.390),
                (-7, -8, 0.382),
                (-9, 8, 3.063),
                (6, -6, 4.578),
                (-10, -11, 11.536),
            ),
        ),
        # Pairwise maximum-linkage clustering
        (
            "m",
            (
                (5, 4, 0.003),
                (9, 12, 0.029),
                (2, 1, 0.061),
                (11, 10, 0.077),
                (-2, -4, 0.216),
                (-3, 0, 0.266),
                (-5, 7, 0.302),
                (-1, 3, 0.425),
                (-8, -6, 0.968),
                (8, 6, 3.975),
                (-10, -7, 5.755),
                (-11, -9, 22.734),
            ),
        ),
    )
    for method, nodes in expected_second:
        verify(data2, mask2, weight2, method, nodes)
def do_treecluster_images(txt_dir='/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2',
                          feature_counts=(300, 600)):
    """Plot how the number of selected features affects hierarchical clustering.

    For every value in ``feature_counts`` the corpus is reduced with ``TC``
    and clustered twice: with ``treecluster`` (method 'm', maximum linkage,
    Euclidean distance) and with ``AgglomerativeClustering`` (ward linkage).
    For each k in 2..49 the silhouette score and the size of the largest
    cluster are recorded.  Both curves are drawn into a two-panel figure
    that is saved under ``./treecluster_images`` and then shown.

    Parameters
    ----------
    txt_dir : str
        Directory handed to ``getWordCount``.  Defaults to the path that was
        previously hard-coded.
    feature_counts : iterable of int
        Values passed to ``TC`` as ``topN``; one figure is produced per value.
    """
    import os  # local import: only needed to create the output directory

    txt_dict = getWordCount(txt_dir)

    image_dir = './treecluster_images'
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(image_dir, exist_ok=True)

    ks = range(2, 50)
    for topN in feature_counts:
        data, _ = TC(txt_dict, topN=topN)[:2]

        # treecluster methods: 's' single (minimum distance), 'm' maximum
        # distance, 'c' centroid, 'a' average linkage; dist 'e' is Euclidean.
        tree = treecluster(data=data, method='m', dist='e')

        # Series 0: treecluster cut at k clusters; series 1: sklearn ward.
        silhouettes = [[], []]      # silhouette coefficient per k
        largest_sizes = [[], []]    # size of the largest cluster per k
        for k in ks:
            clusterid = tree.cut(nclusters=k)
            silhouettes[0].append(
                silhouette_score(data, clusterid, metric='euclidean'))
            largest_sizes[0].append(max(size_of_cluster(clusterid)))

            clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)
            clustering.fit(data)
            silhouettes[1].append(
                silhouette_score(data, clustering.labels_, metric='euclidean'))
            largest_sizes[1].append(max(size_of_cluster(clustering.labels_)))

        # A fresh figure per feature count; the axes objects are used
        # directly so nothing depends on which figure is "current".
        fig = plt.figure(figsize=(6, 6))
        ax1 = plt.subplot(211)
        ax2 = plt.subplot(212)
        labels = [str(i) for i in range(len(silhouettes))]

        for series in silhouettes:
            ax1.plot(ks, series, marker='o')
        ax1.legend(labels)
        # Call the label setters; assigning to plt.xlabel / plt.ylabel would
        # replace the pyplot functions and leave the axes unlabelled.
        ax1.set_xlabel('k')
        ax1.set_ylabel('silhouette')
        ax1.set_title('feature number=%d by TC' % topN)

        for series in largest_sizes:
            ax2.plot(ks, series, marker='o')
        ax2.legend(labels)
        ax2.set_xlabel('k')
        ax2.set_ylabel('MAXcluster')
        ax2.set_title("max size of clusters")

        fig.savefig(os.path.join(image_dir,
                                 'feature number=%d by TC 1<k<50' % topN))
        plt.show()
        # Release the figure so the next iteration never draws onto it.
        plt.close(fig)
def test_matrix_parse(self):
    """Check which data matrices treecluster accepts and which it rejects.

    Well-formed matrices (arrays, nested lists, non-contiguous views,
    float32 and int32 arrays) must be accepted without any exception.
    Malformed inputs must raise the exception type listed next to them.
    """
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    # --- inputs that must be accepted ---------------------------------
    plain_array = numpy.array([[1.1, 1.2],
                               [1.4, 1.3],
                               [1.1, 1.5],
                               [2.0, 1.5],
                               [1.7, 1.9],
                               [1.7, 1.9],
                               [5.7, 5.9],
                               [5.7, 5.9],
                               [3.1, 3.3],
                               [5.4, 5.3],
                               [5.1, 5.5],
                               [5.0, 5.5],
                               [5.1, 5.2]])
    nested_list = [[1.1, 2.2, 3.3, 4.4, 5.5],
                   [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5],
                   [2.1, 2.0, 0.0, 5.0, 0.0]]
    accepted = (
        # Normal matrix
        (plain_array, "treecluster failed to accept matrix data1"),
        # Normal matrix written as a list
        (nested_list, "treecluster failed to accept matrix data2"),
        # Rows are not contiguous
        (plain_array[::2, :], "treecluster failed to accept matrix data3"),
        # Columns are not contiguous
        (numpy.array(nested_list)[:, ::2],
         "treecluster failed to accept matrix data4"),
        # Matrix using float32
        (numpy.array(nested_list, numpy.float32),
         "treecluster failed to accept matrix data5"),
        # Matrix using int
        (numpy.array([[1, 2, 3, 4, 5],
                      [3, 3, 1, 2, 1],
                      [4, 2, 0, 5, 0],
                      [2, 2, 0, 5, 0]], numpy.int32),
         "treecluster failed to accept matrix data6"),
    )
    for matrix, complaint in accepted:
        try:
            treecluster(matrix)
        except Exception:
            self.fail(complaint)

    # --- inputs that must be rejected ---------------------------------
    rejected = (
        # Ragged matrix
        (ValueError, [[91.1, 92.2, 93.3, 94.4, 95.5],
                      [93.1, 93.2, 91.3, 92.4],
                      [94.1, 92.2, 90.3],
                      [12.1, 92.0, 90.0, 95.0, 90.0]]),
        # Matrix with bad cells
        (ValueError, [[7.1, 7.2, 7.3, 7.4, 7.5],
                      [7.1, 7.2, 7.3, 7.4, 'snoopy'],
                      [7.1, 7.2, 7.3, None, None]]),
        # Matrix with a bad row
        (ValueError, [[23.1, 23.2, 23.3, 23.4, 23.5],
                      None,
                      [23.1, 23.0, 23.0, 23.0, 23.0]]),
        # Various references that don't point to matrices at all
        (ValueError, "snoopy"),
        (TypeError, {'a': [[2.3, 1.2], [3.3, 5.6]]}),
        (ValueError, []),
        (ValueError, [None]),
        # Array of incorrect rank
        (ValueError, numpy.array([[[1.1, 1.2], [2.3, 1.2], [3.4, 1.6]],
                                  [[1.4, 1.3], [3.2, 4.5], [9.8, 4.9]],
                                  [[1.1, 1.5], [1.1, 2.3], [6.5, 0.4]]])),
        # Array with non-numerical values
        (ValueError, numpy.array([['a', 'b', 'c'], ['e', 'f', 'g']], 'c')),
        # Empty array
        (ValueError, numpy.array([[]], 'd')),
    )
    for expected_error, matrix in rejected:
        self.assertRaises(expected_error, treecluster, matrix)
def test_treecluster(self):
    """Check treecluster node structure, link distances and leaf ordering.

    Two data sets are clustered with Euclidean distance using average ('a'),
    single ('s'), centroid ('c') and maximum ('m') linkage.  For every tree
    the number of nodes, each node's left/right members and distance (to
    three decimal places) and the result of ``tree.sort`` are compared with
    the expected values tabulated below.
    """
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    def verify_nodes(tree, nrows, expected_nodes):
        # One node fewer than rows; then left, right, distance per node.
        self.assertEqual(len(tree), nrows - 1)
        for i, (left, right, distance) in enumerate(expected_nodes):
            self.assertEqual(tree[i].left, left)
            self.assertEqual(tree[i].right, right)
            self.assertAlmostEqual(tree[i].distance, distance, places=3)

    def verify_order(indices, nrows, expected_order):
        # The sorted leaf order has one entry per row of the data set.
        self.assertEqual(len(indices), nrows)
        for position, leaf in enumerate(expected_order):
            self.assertEqual(indices[position], leaf)

    # ------------------------------------------------------------------
    # First data set
    # ------------------------------------------------------------------
    weight1 = [1, 1, 1, 1, 1]
    data1 = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                         [3.1, 3.2, 1.3, 2.4, 1.5],
                         [4.1, 2.2, 0.3, 5.4, 0.5],
                         [9.7, 2.0, 0.0, 5.0, 0.0]])
    mask1 = numpy.ones(data1.shape, int)  # nothing is masked out

    # (method, [(left, right, distance), ...]) in the order they are run:
    # average, single, centroid, maximum linkage.
    first_set = (
        ('a', [(2, 1, 2.600), (-1, 0, 7.300), (3, -2, 13.540)]),
        ('s', [(1, 2, 2.600), (0, -1, 5.800), (-2, 3, 6.380)]),
        ('c', [(1, 2, 2.600), (0, -1, 6.650), (-2, 3, 11.629)]),
        ('m', [(2, 1, 2.600), (-1, 0, 8.800), (3, -2, 23.100)]),
    )
    for method, nodes in first_set:
        tree = treecluster(data=data1, mask=mask1, weight=weight1,
                           transpose=0, method=method, dist='e')
        verify_nodes(tree, len(data1), nodes)
        # The same leaf orders are expected for every linkage method.
        verify_order(tree.sort([0, 1, 2, 3]), len(data1), [0, 1, 2, 3])
        verify_order(tree.sort([0, 3, 2, 1]), len(data1), [3, 0, 2, 1])

    # ------------------------------------------------------------------
    # Second data set
    # ------------------------------------------------------------------
    weight2 = [1, 1]
    data2 = numpy.array([[0.8223, 0.9295],
                         [1.4365, 1.3223],
                         [1.1623, 1.5364],
                         [2.1826, 1.1934],
                         [1.7763, 1.9352],
                         [1.7215, 1.9912],
                         [2.1812, 5.9935],
                         [5.3290, 5.9452],
                         [3.1491, 3.3454],
                         [5.1923, 5.3156],
                         [4.7735, 5.4012],
                         [5.1297, 5.5645],
                         [5.3934, 5.1823]])
    mask2 = numpy.ones(data2.shape, int)  # nothing is masked out

    # (method, expected nodes, expected result of tree.sort())
    second_set = (
        # Pairwise average-linkage clustering
        ('a',
         [(5, 4, 0.003), (9, 12, 0.029), (2, 1, 0.061), (11, -2, 0.070),
          (-4, 10, 0.128), (7, -5, 0.224), (-3, 0, 0.254), (-1, 3, 0.391),
          (-8, -7, 0.532), (8, -9, 3.234), (-6, 6, 4.636),
          (-11, -10, 12.741)],
         [7, 11, 9, 12, 10, 6, 8, 5, 4, 3, 2, 1, 0]),
        # Pairwise single-linkage clustering
        ('s',
         [(4, 5, 0.003), (9, 12, 0.029), (11, -2, 0.033), (1, 2, 0.061),
          (10, -3, 0.077), (7, -5, 0.092), (0, -4, 0.242), (-7, -1, 0.246),
          (3, -8, 0.287), (-9, 8, 1.936), (-10, -6, 3.432),
          (6, -11, 3.535)],
         [6, 3, 0, 1, 2, 4, 5, 8, 7, 10, 11, 9, 12]),
        # Pairwise centroid-linkage clustering
        ('c',
         [(4, 5, 0.003), (12, 9, 0.029), (1, 2, 0.061), (-2, 11, 0.063),
          (10, -4, 0.109), (-5, 7, 0.189), (0, -3, 0.239), (3, -1, 0.390),
          (-7, -8, 0.382), (-9, 8, 3.063), (6, -6, 4.578),
          (-10, -11, 11.536)],
         [0, 1, 2, 3, 4, 5, 8, 6, 10, 12, 9, 11, 7]),
        # Pairwise maximum-linkage clustering
        ('m',
         [(5, 4, 0.003), (9, 12, 0.029), (2, 1, 0.061), (11, 10, 0.077),
          (-2, -4, 0.216), (-3, 0, 0.266), (-5, 7, 0.302), (-1, 3, 0.425),
          (-8, -6, 0.968), (8, 6, 3.975), (-10, -7, 5.755),
          (-11, -9, 22.734)],
         [8, 6, 9, 12, 11, 10, 7, 5, 4, 3, 2, 1, 0]),
    )
    for method, nodes, leaf_order in second_set:
        tree = treecluster(data=data2, mask=mask2, weight=weight2,
                           transpose=0, method=method, dist='e')
        verify_nodes(tree, len(data2), nodes)
        verify_order(tree.sort(), len(data2), leaf_order)
def test_mask_parse(self):
    """Check which mask arguments treecluster accepts and which it rejects.

    A fixed 4x5 data matrix is clustered with a series of masks.  Masks of
    the right shape (arrays, nested lists, non-contiguous views, int16 and
    float arrays) must be accepted without any exception; malformed masks
    must raise ValueError or TypeError as listed at the end of the test.
    """
    if TestCluster.module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif TestCluster.module == 'Pycluster':
        from Pycluster import treecluster

    # data matrix
    data = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                        [3.1, 3.2, 1.3, 2.4, 1.5],
                        [4.1, 2.2, 0.3, 5.4, 0.5],
                        [2.1, 2.0, 0.0, 5.0, 0.0]])

    # Normal mask, no errors
    mask1 = numpy.array([[1, 1, 0, 1, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1],
                         [1, 0, 1, 1, 0]])
    # Same mask, no errors; written as a list
    mask2 = [[1, 1, 0, 1, 0],
             [1, 1, 1, 0, 0],
             [1, 1, 0, 1, 1],
             [1, 0, 1, 1, 0]]
    # Rows are not contiguous
    mask3 = numpy.array([[1, 1, 0, 1, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1],
                         [1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1],
                         [1, 1, 0, 1, 1],
                         [1, 0, 1, 1, 0]])
    mask3 = mask3[::2, :]
    # Columns are not contiguous
    mask4 = numpy.array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
                         [1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
                         [1, 1, 0, 1, 1, 1, 0, 1, 1, 0],
                         [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]])
    mask4 = mask4[:, ::2]
    # Matrix using int16
    mask5 = numpy.array([[1, 1, 0, 1, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1]], numpy.int16)
    # Matrix using float.  The builtin float is used because the
    # numpy.float alias (which pointed to it) was removed in NumPy 1.24.
    mask6 = numpy.array([[1.0, 2.2, 3.1, 4.8, 5.1],
                         [3.3, 3.3, 1.4, 2.4, 1.2],
                         [4.1, 2.2, 0.6, 5.5, 0.6],
                         [2.7, 2.5, 0.4, 5.7, 0.2]], float)

    try:
        treecluster(data, mask1)
    except Exception:
        self.fail("treecluster failed to accept matrix mask1")
    try:
        treecluster(data, mask2)
    except Exception:
        self.fail("treecluster failed to accept matrix mask2")
    try:
        treecluster(data, mask3)
    except Exception:
        self.fail("treecluster failed to accept matrix mask3")
    try:
        treecluster(data, mask4)
    except Exception:
        self.fail("treecluster failed to accept matrix mask4")
    try:
        treecluster(data, mask5)
    except Exception:
        self.fail("treecluster failed to accept matrix mask5")
    try:
        treecluster(data, mask6)
    except Exception:
        self.fail("treecluster failed to accept matrix mask6")

    # Ragged mask
    mask7 = [[1, 1, 0, 1],
             [1, 1, 1, 0, 0],
             [1, 1, 0, 1, 1],
             [1, 1, 0]]
    # Mask with incorrect number of rows
    mask8 = numpy.array([[1, 1, 0, 1, 0],
                         [1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1],
                         [0, 1, 1, 0, 1],
                         [1, 0, 1, 1, 0]])
    # Mask with incorrect number of columns
    mask9 = numpy.array([[1, 1, 0, 1, 0, 1],
                         [1, 1, 1, 0, 0, 0],
                         [0, 1, 1, 0, 1, 1],
                         [1, 0, 1, 1, 0, 1]])
    # Matrix with bad cells
    mask10 = [[1, 1, 0, 1, 0],
              [1, 1, 1, 0, 'snoopy'],
              [1, 1, 0, 1, 1],
              [1, 0, 1, 1, 0]]
    # Matrix with a bad row
    mask11 = [[1, 1, 0, 1, 0],
              None,
              [1, 1, 0, 1, 1],
              [1, 0, 1, 1, 0]]
    # Array with non-numerical values
    mask12 = numpy.array([['a', 'b', 'c'],
                          ['e', 'f', 'g']], 'c')
    # Empty arrays
    mask13 = numpy.array([[]], 'd')
    mask14 = []
    # Array of incorrect rank
    mask15 = numpy.array([[[1, 1], [0, 1], [1, 1]],
                          [[1, 1], [0, 1], [1, 1]],
                          [[1, 1], [1, 1], [1, 0]]])
    # References that cannot be converted to a matrix of int
    mask16 = "snoopy"
    mask17 = {'a': [[1, 0], [1, 1]]}
    mask18 = [None]

    self.assertRaises(ValueError, treecluster, data, mask7)
    self.assertRaises(ValueError, treecluster, data, mask8)
    self.assertRaises(ValueError, treecluster, data, mask9)
    self.assertRaises(ValueError, treecluster, data, mask10)
    self.assertRaises(ValueError, treecluster, data, mask11)
    self.assertRaises(ValueError, treecluster, data, mask12)
    self.assertRaises(ValueError, treecluster, data, mask13)
    self.assertRaises(ValueError, treecluster, data, mask14)
    self.assertRaises(ValueError, treecluster, data, mask15)
    self.assertRaises(ValueError, treecluster, data, mask16)
    self.assertRaises(TypeError, treecluster, data, mask17)
    self.assertRaises(TypeError, treecluster, data, mask18)
def test_matrix_parse(module):
    """Print whether treecluster accepts valid and rejects malformed matrices.

    ``module`` selects the implementation under test: 'Bio.Cluster' or
    'Pycluster'.  One line is printed per input, saying whether treecluster
    read it or refused it; a line starting with "Error:" marks an
    unexpected outcome.  The report ends with an empty line.

    Raises
    ------
    ValueError
        If ``module`` is not one of the two supported names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid Python; raise a real exception.
        raise ValueError('Unknown module name: %s' % module)
    print("test_matrix_parse:")

    # Normal matrix, no errors
    data1 = array([[1.1, 1.2],
                   [1.4, 1.3],
                   [1.1, 1.5],
                   [2.0, 1.5],
                   [1.7, 1.9],
                   [1.7, 1.9],
                   [5.7, 5.9],
                   [5.7, 5.9],
                   [3.1, 3.3],
                   [5.4, 5.3],
                   [5.1, 5.5],
                   [5.0, 5.5],
                   [5.1, 5.2]])
    # Another normal matrix, no errors; written as a list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
             [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5],
             [12.1, 2.0, 0.0, 5.0, 0.0]]
    # Ragged matrix
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
             [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3],
             [12.1, 92.0, 90.0, 95.0, 90.0]]
    # Matrix with bad cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
             [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]
    # Matrix with a bad row
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
             None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]
    # Various references that don't point to matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]
    data10 = [[None]]

    well_formed = (("data1", data1), ("data2", data2))
    malformed = (("data3", data3), ("data4", data4), ("data5", data5),
                 ("data6", data6), ("data7", data7), ("data8", data8),
                 ("data9", data9), ("data10", data10))

    for name, matrix in well_formed:
        try:
            treecluster(matrix)
            print("Read %s (correct)" % name)
        except Exception:
            # The message used to be a bare string expression, so a
            # rejected valid matrix went unreported.
            print("Error: treecluster failed to accept matrix %s" % name)

    for name, matrix in malformed:
        try:
            treecluster(matrix)
            print("Error: treecluster incorrectly accepted %s" % name)
        except Exception:
            print("Refused incorrect matrix %s" % name)

    # Trailing blank line; print("") behaves the same on Python 2 and 3.
    print("")
def test_treecluster(module):
    """Print the trees treecluster builds for two data sets.

    ``module`` selects the implementation under test: 'Bio.Cluster' or
    'Pycluster'.  Each data set is clustered with Euclidean distance using
    average, single, centroid and maximum linkage, and for every tree the
    node count, the link-distance count and one line per node are printed.
    The report ends with an empty line.

    This function unpacks the return value of ``treecluster`` as a
    ``(result, linkdist)`` pair, so it only works with an implementation
    that returns two sequences.

    Raises
    ------
    ValueError
        If ``module`` is not one of the two supported names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid Python; raise a real exception.
        raise ValueError('Unknown module name: %s' % module)
    print("test_treecluster:")

    # First data set
    weight1 = [1, 1, 1, 1, 1]
    data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5],
                   [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5],
                   [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = array([[1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1]])

    # Second data set
    weight2 = [1, 1]
    data2 = array([[0.8223, 0.9295],
                   [1.4365, 1.3223],
                   [1.1623, 1.5364],
                   [2.1826, 1.1934],
                   [1.7763, 1.9352],
                   [1.7215, 1.9912],
                   [2.1812, 5.9935],
                   [5.3290, 5.9452],
                   [3.1491, 3.3454],
                   [5.1923, 5.3156],
                   [4.7735, 5.4012],
                   [5.1297, 5.5645],
                   [5.3934, 5.1823]])
    mask2 = array([[1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1]])

    # (name used in the heading, treecluster method code), in report order
    linkages = (('average', 'a'),
                ('single', 's'),
                ('centroid', 'c'),
                ('maximum', 'm'))

    def report(data, mask, weight):
        # Cluster one data set with every linkage and print each tree.
        for label, method in linkages:
            print("Pairwise %s-linkage clustering" % label)
            result, linkdist = treecluster(data=data, mask=mask,
                                           weight=weight, transpose=0,
                                           method=method, dist='e')
            print("Number of nodes is %d (should be %d)"
                  % (len(result), len(data) - 1))
            print("Number of link distances is %d (should be %d)"
                  % (len(linkdist), len(data) - 1))
            for i in range(len(result)):
                print("Node %3d joins node %3d with node %3d; "
                      "link distance is %7.3f"
                      % (i, result[i][0], result[i][1], linkdist[i]))

    # test first data set
    print("First data set:")
    print_matrix(data1, mask1)
    report(data1, mask1, weight1)

    # Test second data set
    print("Second data set:")
    report(data2, mask2, weight2)

    # Trailing blank line; print("") behaves the same on Python 2 and 3.
    print("")
import numpy as np
from Bio.Cluster import distancematrix, treecluster

# There is no ready-made example data set, so a small matrix is built by hand.
data = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [0, 1, 2, 3],
])

# The tree can be built directly from a data array like this ...
tree = treecluster(data)
print(tree)

# ... and other options can be passed as well.
tree = treecluster(data, dist="b", distancematrix=None)
print(tree)

# The distance matrix can also be computed beforehand and the tree built
# from that instead of from the raw data.
distances = distancematrix(data)
tree = treecluster(data=None, distancematrix=distances)
print(tree)

# One of `data` and `distancematrix` has to be None; otherwise this error
# is raised:
# ValueError: use either data or distancematrix; do not use both