def test_from_iterable_validate_equal_valid_data(self):
    """Valid input yields equal matrices with validate=True and False."""
    def absolute_difference(a, b):
        return abs(b - a)

    with_validation = DistanceMatrix.from_iterable(
        (x for x in range(4)), absolute_difference, validate=True)
    without_validation = DistanceMatrix.from_iterable(
        (x for x in range(4)), absolute_difference, validate=False)

    self.assertEqual(with_validation, without_validation)
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """ Perform progressive msa of sequences and build a UPGMA tree

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    metric : function, optional
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects. It is used to build the guide tree when one
        is not provided, and to build the tree of the final alignment.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print a header and call ``dendrogram`` on the output tree's linkage
        matrix before returning.

    Returns
    -------
    msa
        The alignment returned by ``progressive_msa``.
    skbio.TreeNode
        Tree built from average linkage over the pairwise distances between
        the aligned sequences.

    """
    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
            sequences, metric=metric, key='id')
        guide_lm = average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    msa_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    msa_lm = average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)

    if display_tree:
        print("\nOutput tree:")
        # The value returned by dendrogram was never used, so it is no
        # longer bound to a local name.
        dendrogram(msa_lm, labels=msa_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black', leaf_font_size=24)

    return msa, msa_tree
def aln_distmat(alignment, reps=3):
    """Calculate pairwise distances from a MSA of genomes.

    Parameters
    ----------
    alignment
        Input handed to ``TabularMSA.read``; sequences are built as ``DNA``.
    reps : int, optional
        Not used by this function.

    Returns
    -------
    DistanceMatrix
        Distances computed with ``hamming`` over each sequence's ``values``,
        labelled with the alignment index after it has been reassigned
        from the ``id`` metadata.
    """
    msa = TabularMSA.read(alignment, constructor=DNA)
    msa.reassign_index(minter="id")

    sequence_values = [record.values for record in msa]
    return DistanceMatrix.from_iterable(sequence_values,
                                        metric=hamming,
                                        keys=msa.index)
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Call ``dendrogram`` on the linkage matrix before returning.

    Returns
    -------
    object
        Whatever ``to_tree`` builds from the average-linkage matrix.
        NOTE(review): this is presumably a
        ``scipy.cluster.hierarchy.ClusterNode`` rather than the
        ``skbio.TreeNode`` the previous docstring promised -- confirm
        against the file's imports.

    """
    guide_dm = DistanceMatrix.from_iterable(
        sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)

    if display_tree:
        # The value returned by dendrogram was never used, so it is no
        # longer bound to a local name.
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')

    return guide_tree
def test_from_iterable_no_key(self):
    """from_iterable without key/keys matches the default-id matrix."""
    expected = DistanceMatrix([[0, 1, 2, 3],
                               [1, 0, 1, 2],
                               [2, 1, 0, 1],
                               [3, 2, 1, 0]])

    def absolute_difference(a, b):
        return abs(b - a)

    values = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(values, absolute_difference)

    self.assertEqual(observed, expected)
def test_from_iterable_validate_false_non_symmetric(self):
    """With validate=False an asymmetric metric still gives this matrix."""
    def signed_difference(a, b):
        return a - b

    expected = DistanceMatrix([[0, 1, 2, 3],
                               [1, 0, 1, 2],
                               [2, 1, 0, 1],
                               [3, 2, 1, 0]])
    observed = DistanceMatrix.from_iterable((x for x in range(4)),
                                            signed_difference,
                                            validate=False)

    self.assertEqual(observed, expected)
def test_from_iterable_with_keys(self):
    """IDs supplied through an iterator in ``keys`` label the matrix."""
    expected = DistanceMatrix(
        [[0, 1, 2, 3],
         [1, 0, 1, 2],
         [2, 1, 0, 1],
         [3, 2, 1, 0]],
        ['0', '1', '4', '9'])

    def absolute_difference(a, b):
        return abs(b - a)

    values = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(
        values, absolute_difference, keys=iter(['0', '1', '4', '9']))

    self.assertEqual(observed, expected)
def test_from_iterable_with_keys(self):
    """An iterator passed as ``keys`` provides the matrix IDs."""
    id_labels = ['0', '1', '4', '9']
    expected_rows = [[0, 1, 2, 3],
                     [1, 0, 1, 2],
                     [2, 1, 0, 1],
                     [3, 2, 1, 0]]
    expected = DistanceMatrix(expected_rows, id_labels)

    source = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(
        source,
        lambda a, b: abs(b - a),
        keys=iter(['0', '1', '4', '9']))

    self.assertEqual(observed, expected)
def setUp(self):
    """Build a seeded Ward-linkage tree over 10 random values.

    Stores the tree on ``self.tree`` with every node's length set to 1 and
    every internal node named ``y<postorder index>``.
    """
    np.random.seed(0)
    x = np.random.rand(10)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # The ``np.str`` alias has been removed from NumPy; the builtin ``str``
    # is what it stood for.
    ids = np.arange(len(x)).astype(str)
    self.tree = TreeNode.from_linkage_matrix(lm, ids)

    # initialize tree with branch length and named internal nodes
    for i, n in enumerate(self.tree.postorder(include_self=True)):
        n.length = 1
        if not n.is_tip():
            n.name = "y%d" % i
def setUp(self):
    """Create ``self.tree``: a seeded Ward-linkage tree of 10 random values.

    All nodes get length 1; internal nodes are named ``y<postorder index>``.
    """
    np.random.seed(0)
    x = np.random.rand(10)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` was only an alias of the builtin ``str`` and no longer
    # exists in current NumPy releases.
    ids = np.arange(len(x)).astype(str)
    self.tree = TreeNode.from_linkage_matrix(lm, ids)

    # initialize tree with branch length and named internal nodes
    for i, n in enumerate(self.tree.postorder(include_self=True)):
        n.length = 1
        if not n.is_tip():
            n.name = "y%d" % i
def gradient_linkage(X, y, method='average'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering on known
    gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined
    as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector of gradient values. It is aligned with ``X`` via
        ``match`` before use (presumably one value per sample -- confirm
        against callers).
    method : str
        Clustering method, forwarded to ``linkage``.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis; tips are labelled
        with ``X.columns``.

    See Also
    --------
    mean_niche_estimator
    """
    # The docstring must be a raw string: in a normal string literal the
    # ``\f`` of ``\frac`` is a form-feed escape and ``\s`` is invalid.
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
def setUp(self):
    """Create a seeded 5x5 random table and a Ward-linkage dendrogram.

    ``self.table`` holds the random data frame and ``self.tree`` the
    ``SquareDendrogram`` built from the tree. After the dendrogram is
    created, the source tree's internal nodes are named
    ``y<postorder index>`` and every visited node gets a random length.
    """
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)))
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; use the builtin ``str``.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.tree = SquareDendrogram.from_tree(t)

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand()*3
def main():
    """Read FASTA-style records and write their pairwise distance matrix.

    Lines come from ``fileinput.input()``. A line starting with ``>``
    opens a new record whose ID is the rest of that line; every other line
    is appended to the current record's sequence. Distances are computed
    with ``hamming_no_gap`` (no validation) and written to standard output.
    """
    ids = []
    seqs = []
    for raw_line in fileinput.input():
        text = raw_line.rstrip('\r\n')
        if not text.startswith('>'):
            seqs[-1] += text
            continue
        ids.append(text[1:])
        seqs.append('')

    mat = DistanceMatrix.from_iterable(seqs, hamming_no_gap,
                                       keys=ids, validate=False)
    mat.write(sys.stdout)
def setUp(self):
    """Prepare ``self.table`` (5x5 random frame) and ``self.tree``.

    ``self.tree`` is a ``SquareDendrogram`` built from a seeded
    Ward-linkage tree. Once it exists, the source tree's internal nodes
    are named ``y<postorder index>`` and every visited node receives a
    random length.
    """
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)))
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # The ``np.str`` alias no longer exists in NumPy; the builtin ``str``
    # is its exact replacement.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.tree = SquareDendrogram.from_tree(t)

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3
def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    A tree is built from the rank values in the input vector `r`.
    The distance between two features :math:`x` and :math:`y` can be
    defined as

    .. math::
        d(x, y) = (r(x) - r(y))^2

    Where :math:`r(x)` is the rank of the features.  Hierarchical
    clustering is then performed using :math:`d(x, y)` as the distance
    metric. This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
    method : str
        Clustering method, forwarded to ``linkage``.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances; tips are labelled with
        ``r.index`` and the tree is passed through
        ``rename_internal_nodes``.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    rank_distances = DistanceMatrix.from_iterable(r, euclidean)
    condensed = rank_distances.condensed_form()
    linkage_matrix = linkage(condensed, method)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, r.index)
    return rename_internal_nodes(tree)
def test_cache_ntips(self):
    """_cache_ntips stores the tip count of every node in ``leafcount``."""
    dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                      lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; the builtin ``str`` is the
    # type it aliased.
    ids = np.arange(4).astype(str)
    t = mock.from_linkage_matrix(lm, ids)

    t._cache_ntips()

    self.assertEqual(t.leafcount, 4)
    self.assertEqual(t.children[0].leafcount, 2)
    self.assertEqual(t.children[1].leafcount, 2)
    self.assertEqual(t.children[0].children[0].leafcount, 1)
    self.assertEqual(t.children[0].children[1].leafcount, 1)
    self.assertEqual(t.children[1].children[0].leafcount, 1)
    self.assertEqual(t.children[1].children[1].leafcount, 1)
def test_cache_ntips(self):
    """_cache_ntips stores the tip count of every node in ``leafcount``."""
    dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                      lambda x, y: np.abs(x-y))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; use the builtin ``str``.
    ids = np.arange(4).astype(str)
    t = mock.from_linkage_matrix(lm, ids)

    t._cache_ntips()

    # ``assertEquals`` is a deprecated alias that newer unittest versions
    # no longer provide; ``assertEqual`` is the supported spelling.
    self.assertEqual(t.leafcount, 4)
    self.assertEqual(t.children[0].leafcount, 2)
    self.assertEqual(t.children[1].leafcount, 2)
    self.assertEqual(t.children[0].children[0].leafcount, 1)
    self.assertEqual(t.children[0].children[1].leafcount, 1)
    self.assertEqual(t.children[1].children[0].leafcount, 1)
    self.assertEqual(t.children[1].children[1].leafcount, 1)
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """ Perform progressive msa of sequences and build a UPGMA tree

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    metric : function, optional
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects. It is forwarded to ``progressive_msa`` (which
        uses it to build a guide tree if one is not provided) and is also
        used to build the tree of the final alignment.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print a header and call ``dendrogram`` on the output tree's linkage
        matrix before returning.

    Returns
    -------
    msa
        The alignment returned by ``progressive_msa``.
    skbio.TreeNode
        Tree built from average linkage over the pairwise distances between
        the aligned sequences.

    """
    # Forward ``metric`` so that a caller-supplied metric is honoured when
    # progressive_msa has to build the guide tree itself; previously its
    # own default metric was silently used instead.
    msa = progressive_msa(sequences, pairwise_aligner=pairwise_aligner,
                          guide_tree=guide_tree, metric=metric)

    if display_aln:
        print(msa)

    msa_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    msa_lm = sp.cluster.hierarchy.average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)

    if display_tree:
        print("\nOutput tree:")
        # The value returned by dendrogram was never used, so it is no
        # longer bound to a local name.
        sp.cluster.hierarchy.dendrogram(
            msa_lm, labels=msa_dm.ids, orientation='right',
            link_color_func=lambda x: 'black')

    return msa, msa_tree
def geodesic_distance(metadata: qiime2.Metadata,
                      latitude: str = 'Latitude',
                      longitude: str = 'Longitude',
                      missing_data: str = 'error') -> DistanceMatrix:
    """Compute pairwise geodesic distances (in meters) between samples.

    The latitude and longitude columns are loaded through
    ``_load_and_validate``, each (latitude, longitude) pair is wrapped in a
    ``Point``, and ``distance.geodesic(a, b).meters`` is evaluated for
    every pair of points. The matrix IDs are the index of the loaded
    sample metadata.
    """
    coordinate_columns = [latitude, longitude]
    sample_md = _load_and_validate(
        metadata, coordinate_columns, ['latitude', 'longitude'],
        missing_data=missing_data)

    # Collect geocoordinate points
    coordinate_pairs = zip(sample_md[latitude], sample_md[longitude])
    points = [Point(pair) for pair in coordinate_pairs]

    # Compute pairwise distances between all points
    def meters_between(a, b):
        return distance.geodesic(a, b).meters

    return DistanceMatrix.from_iterable(
        points, metric=meters_between, keys=sample_md.index)
def test_from_iterable_skbio_hamming_metric_with_metadata(self):
    """skbio's hamming metric works on sequences with differing metadata."""
    # test for #1254
    plain = Sequence('ACGT')
    first_with_id = Sequence('ACGA', metadata={'id': 'seq1'})
    second_with_id = Sequence('AAAA', metadata={'id': 'seq2'})
    with_positional = Sequence('AAAA',
                               positional_metadata={'qual': range(4)})
    seqs = [plain, first_with_id, second_with_id, with_positional]

    expected = DistanceMatrix([[0, 0.25, 0.75, 0.75],
                               [0.25, 0.0, 0.5, 0.5],
                               [0.75, 0.5, 0.0, 0.0],
                               [0.75, 0.5, 0.0, 0.0]],
                              ['a', 'b', 'c', 'd'])

    observed = DistanceMatrix.from_iterable(
        seqs,
        metric=skbio.sequence.distance.hamming,
        keys=['a', 'b', 'c', 'd'])

    self.assertEqual(observed, expected)
def hamming_distance_matrix(msa, ignore_sequence_ids=False):
    """Compute Hamming distance matrix of an MSA.

    Parameters
    ----------
    msa : skbio TabularMSA
        Aligned sequences for calculating pairwise Hamming distances
    ignore_sequence_ids : bool
        Default is False. If true, ignore sequence identifier of alignment.
        Useful if identifier got truncated by alignment producing program
        such that different sequences collapse to the same identifier.

    Returns
    -------
    skbio DistanceMatrix
        Built with ``validate=False``; IDs come from the ``id`` key unless
        ``ignore_sequence_ids`` is set, in which case no key is passed.
    """
    id_key = None if ignore_sequence_ids else 'id'
    return DistanceMatrix.from_iterable(
        msa, hamming, key=id_key, validate=False)
def random_tree(n):
    """ Generates a tree with random topology.

    Parameters
    ----------
    n : int
        Number of random values drawn; each one becomes a tip of the tree,
        labelled '0' .. str(n - 1).

    Returns
    -------
    skbio.TreeNode
        Random tree built by Ward linkage over the absolute differences of
        the random values, passed through ``rename_internal_nodes``.
    """
    x = np.random.rand(n)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; the builtin ``str`` is the
    # type it aliased.
    ids = np.arange(len(x)).astype(str)
    t = TreeNode.from_linkage_matrix(lm, ids)
    t = rename_internal_nodes(t)
    return t
def setUp(self):
    """Create the table, dendrogram, metadata and highlight fixtures.

    * ``self.table``: seeded 5x5 random frame indexed '0'..'4'.
    * ``self.t``: ``SquareDendrogram`` built from a Ward-linkage tree.
    * ``self.md``: categorical series over the same labels.
    * ``self.highlights``: colour pairs for nodes 'y8' and 'y6'.

    After the dendrogram is created, the source tree's internal nodes are
    named ``y<postorder index>`` and every visited node gets a random
    length.
    """
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)),
                              index=['0', '1', '2', '3', '4'],
                              columns=['0', '1', '2', '3', '4'])
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; use the builtin ``str``.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.t = SquareDendrogram.from_tree(t)
    self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                        index=['0', '1', '2', '3', '4'])

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand()*3

    self.highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'],
                                    'y6': ['#0000FF', '#F0000F']}).T
def test_from_iterable_skbio_hamming_metric_with_metadata(self):
    """Hamming distances are computed despite per-sequence metadata."""
    # test for #1254
    expected_rows = [
        [0, 0.25, 0.75, 0.75],
        [0.25, 0.0, 0.5, 0.5],
        [0.75, 0.5, 0.0, 0.0],
        [0.75, 0.5, 0.0, 0.0],
    ]
    expected = DistanceMatrix(expected_rows, ['a', 'b', 'c', 'd'])

    sequences_under_test = [
        Sequence('ACGT'),
        Sequence('ACGA', metadata={'id': 'seq1'}),
        Sequence('AAAA', metadata={'id': 'seq2'}),
        Sequence('AAAA', positional_metadata={'qual': range(4)}),
    ]
    observed = DistanceMatrix.from_iterable(
        sequences_under_test,
        metric=skbio.sequence.distance.hamming,
        keys=['a', 'b', 'c', 'd'])

    self.assertEqual(observed, expected)
def setUp(self):
    """Build the fixtures shared by the tests.

    Sets ``self.table`` (seeded 5x5 random frame indexed '0'..'4'),
    ``self.t`` (``SquareDendrogram`` from a Ward-linkage tree),
    ``self.md`` (categorical series) and ``self.highlights`` (colour pairs
    for nodes 'y8' and 'y6'). After the dendrogram is created, the source
    tree's internal nodes are named ``y<postorder index>`` and every
    visited node gets a random length.
    """
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)),
                              index=['0', '1', '2', '3', '4'],
                              columns=['0', '1', '2', '3', '4'])
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # The ``np.str`` alias no longer exists in NumPy; the builtin ``str``
    # is its exact replacement.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    self.t = SquareDendrogram.from_tree(t)
    self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                        index=['0', '1', '2', '3', '4'])

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand() * 3

    self.highlights = pd.DataFrame({
        'y8': ['#FF0000', '#00FF00'],
        'y6': ['#0000FF', '#F0000F']
    }).T
def embad(table):
    """ Calculates the pairwise Earth Mover's distance.

    Assumes that the table is sorted.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where the columns are features and the rows
        are samples.

    Returns
    -------
    skbio.DistanceMatrix
        Pairwise distance matrix of Earth Mover's distances, with IDs taken
        from ``table.index``.
    """
    numsamples, numfeatures = table.shape
    # Identity ordering of the samples, used to select the matrix IDs.
    sample_permutation = range(numsamples)

    def emd_dist_matrix(numfeatures):
        # Ground distance between feature positions i and j is |i - j|,
        # computed by broadcasting instead of a Python double loop.
        positions = np.arange(numfeatures)
        D = np.abs(positions[:, None] - positions[None, :])
        return D.astype(np.float64)

    D = emd_dist_matrix(numfeatures)
    distance_metric = partial(emd, distance_matrix=D)

    # ``np.float`` has been removed from NumPy. It was an alias of the
    # builtin ``float``, so ``np.float64`` yields the same dtype.
    table_values = table.values.astype(np.float64)
    sample_distance = DistanceMatrix.from_iterable(
        np.ascontiguousarray(table_values), distance_metric)
    sample_distance.ids = table.index[sample_permutation]
    return sample_distance
def test_from_iterable_empty(self):
    """An empty iterable is rejected with DissimilarityMatrixError."""
    no_items = []
    self.assertRaises(DissimilarityMatrixError,
                      DistanceMatrix.from_iterable,
                      no_items,
                      lambda x: x)
def test_from_iterable_validate_asym(self):
    """With default validation an asymmetric metric raises an error."""
    def signed_difference(a, b):
        return b - a

    values = (x for x in range(4))
    self.assertRaises(DistanceMatrixError,
                      DistanceMatrix.from_iterable,
                      values,
                      signed_difference)
# We can apply this tree style to a random tree as follows. t = ete3.Tree() t.populate(10) t.render("%%inline", tree_style=ts) # distance based methods to phylogenetic reconstruction # for the next approach, we will rely on computing the distances between the sequences. # We will use dissimilarity distance between two objects x and y. Literature on this can be found online, for now # we are going to show the code. from skbio import DistanceMatrix dm = DistanceMatrix([[0.0, 1.0, 2.0], [1.0, 0.0, 3.0], [2.0, 3.0, 0.0]], ids=['a', 'b', 'c']) _ = dm.plot(cmap='Greens') # We will use the scikit-bio to create a skbio.distancematrix object. These objects can be viewed as heatmaps. from BioinformaticsCode.algorithms import kmer_distance kmer_dm = DistanceMatrix.from_iterable(sequences, metric=kmer_distance, key='id') _ = kmer_dm.plot(cmap='Greens', title='3mer distances between sequences') kmer_dm.plot
# ``sys`` is needed for ``sys.argv`` below. Re-importing it here is harmless
# if the file already imports it elsewhere.
import sys

from skbio import DistanceMatrix
from skbio.tree import nj

# Nested mapping: distance_matrix[code_a][code_b] -> distance.
distance_matrix = {}
codes_set = set()
fname = sys.argv[1]

# Read the fixed-width input: columns 0-40 hold the first code, columns
# 41-80 the second code and the remainder a value that is converted to a
# distance as ``1 - value``. The ``with`` block guarantees the file is
# closed, which the bare ``open()`` in a ``for`` statement did not.
with open(fname) as handle:
    for line in handle:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # on ``float('')``.
            continue
        code1 = line[0:41].strip()
        code2 = line[41:81].strip()
        distance = 1 - float(line[81:].strip())

        # Store the distance in both directions.
        distance_matrix.setdefault(code1, {})[code2] = distance
        distance_matrix.setdefault(code2, {})[code1] = distance
        codes_set.add(code1)
        codes_set.add(code2)

        # Every code is at distance zero from itself.
        distance_matrix[code1][code1] = 0.0
        distance_matrix[code2][code2] = 0.0


def distance_function(x, y):
    """Look up the stored distance between codes ``x`` and ``y``."""
    return distance_matrix[x][y]


def label_function(x):
    """Return the code with all spaces removed, used as the matrix ID."""
    return x.replace(' ', '')


dm = DistanceMatrix.from_iterable(codes_set, distance_function, label_function)
tree = nj(dm, True)
print(tree.ascii_art())
def test_basic_plot(self):
    """radialplot exposes the expected edge and node data sources.

    A seeded three-tip Ward tree is turned into an ``UnrootedDendrogram``,
    decorated with colours and sizes, and plotted. Renderer 0 is compared
    against the expected edge columns and renderer 1 against the expected
    node columns.
    """
    self.maxDiff = None
    exp_edges = {'dest_node': ['0', '1', '2', 'y3'],
                 'edge_color': ['#00FF00', '#00FF00',
                                '#00FF00', '#FF0000'],
                 'edge_width': [2, 2, 2, 2],
                 'src_node': ['y3', 'y4', 'y3', 'y4'],
                 'x0': [338.2612593838583,
                        193.1688862557773,
                        338.2612593838583,
                        193.1688862557773],
                 'x1': [487.5,
                        12.499999999999972,
                        324.89684138234867,
                        338.2612593838583],
                 'y0': [271.7282256126416,
                        365.95231443706376,
                        271.7282256126416,
                        365.95231443706376],
                 'y1': [347.7691620070637,
                        483.2800610261029,
                        16.719938973897143,
                        271.7282256126416]}

    exp_nodes = {'child0': [np.nan, np.nan, np.nan, '0', '1'],
                 'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
                 'color': ['#1C9099', '#1C9099', '#1C9099',
                           '#FF999F', '#FF999F'],
                 'hover_var': [None, None, None, None, None],
                 'is_tip': [True, True, True, False, False],
                 'node_size': [10, 10, 10, 10, 10],
                 'x': [487.5,
                       12.499999999999972,
                       324.89684138234867,
                       338.26125938385832,
                       193.16888625577729],
                 'y': [347.7691620070637,
                       483.28006102610289,
                       16.719938973897143,
                       271.72822561264161,
                       365.95231443706376]}

    np.random.seed(0)
    num_otus = 3  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; use the builtin ``str``.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    t = UnrootedDendrogram.from_tree(t)

    # incorporate colors in tree
    for i, n in enumerate(t.postorder(include_self=True)):
        if not n.is_tip():
            n.name = "y%d" % i
            n.color = '#FF999F'
            n.edge_color = '#FF0000'
            n.node_size = 10
        else:
            n.color = '#1C9099'
            n.edge_color = '#00FF00'
            n.node_size = 10
        n.length = np.random.rand()*3
        n.edge_width = 2

    p = radialplot(t, node_color='color', edge_color='edge_color',
                   node_size='node_size', edge_width='edge_width')

    for e in exp_edges.keys():
        self.assertListEqual(
            list(p.renderers[0].data_source.data[e]), exp_edges[e])

    for e in exp_nodes.keys():
        self.assertListEqual(
            list(p.renderers[1].data_source.data[e]), exp_nodes[e])

    self.assertTrue(isinstance(t, TreeNode))
def test_from_iterable_with_key_and_keys(self):
    """Supplying both ``key`` and ``keys`` raises ValueError."""
    def absolute_difference(a, b):
        return abs(b - a)

    values = (x for x in range(4))
    self.assertRaises(ValueError,
                      DistanceMatrix.from_iterable,
                      values,
                      absolute_difference,
                      key=str,
                      keys=['1', '2', '3', '4'])
def test_from_iterable_single(self):
    """A one-element iterable produces the 1x1 zero matrix."""
    def constant_hundred(_):
        return 100

    expected = DistanceMatrix([[0]])
    observed = DistanceMatrix.from_iterable(["boo"], constant_hundred)

    self.assertEqual(observed, expected)
def progressive_msa(sequences, pairwise_aligner, guide_tree=None,
                    metric=kmer_distance):
    """ Perform progressive msa of sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process. Its
        root must have exactly two children.
    metric : function, optional
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects. This will be used to build a guide tree if
        one is not provided.

    Returns
    -------
    skbio.TabularMSA

    """
    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
            sequences, metric=metric, key='id')
        guide_lm = sp.cluster.hierarchy.average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    seq_lookup = {s.metadata['id']: s for s in sequences}

    # Resolve each child to either a single sequence (tip) or the
    # alignment of its subtree (recursive call with the child as guide).
    c1, c2 = guide_tree.children
    if c1.is_tip():
        c1_aln = seq_lookup[c1.name]
    else:
        c1_aln = progressive_msa(sequences, pairwise_aligner, c1)

    if c2.is_tip():
        c2_aln = seq_lookup[c2.name]
    else:
        c2_aln = progressive_msa(sequences, pairwise_aligner, c2)

    alignment, _, _ = pairwise_aligner(c1_aln, c2_aln)

    # this is a temporary hack as the aligners in skbio 0.4.1 are dropping
    # metadata - this makes sure that the right metadata is associated with
    # the sequence after alignment
    if isinstance(c1_aln, Sequence):
        alignment[0].metadata = c1_aln.metadata
        len_c1_aln = 1
    else:
        for i in range(len(c1_aln)):
            alignment[i].metadata = c1_aln[i].metadata
        len_c1_aln = len(c1_aln)

    if isinstance(c2_aln, Sequence):
        # The rows contributed by c2 start right after those of c1. The
        # former hard-coded index 1 overwrote a c1 row whenever c1 was an
        # alignment of more than one sequence.
        alignment[len_c1_aln].metadata = c2_aln.metadata
    else:
        for i in range(len(c2_aln)):
            alignment[len_c1_aln + i].metadata = c2_aln[i].metadata

    return alignment
# -*- coding: utf-8 -*- """ Created on Wed Jul 20 14:18:58 2016 @author: virginiasaulnier """ from io import StringIO from skbio import DistanceMatrix from skbio import fisher_alpha dm_fh =StringIO("\ta\tb\tc\n" "a\t0.0\t0.5\t1.0\n" "b\t0.5\t0.0\t0.75\n" "c\t1.0\t0.75\t0.0\n") dm = DistanceMatrix.read(dm_fh) print(dm) my_pairs= StringIO("ac,gt,cg,gc,at,ta,gc,ta,tg") dm2 = DistanceMatrix.from_iterable(my_pairs,metric= fisher_alpha(),key=id)
if os.path.isfile(degapped_alignment_fn): angio_msa_nogap_noshort = TabularMSA.read(degapped_alignment_fn, constructor=DNA) sys.stderr.write("Read in degapped alignment: {}\n".format( angio_msa_nogap_noshort.shape)) else: angio_msa_nogap_noshort = get_reduced_alignment( "genes/{}/FNA2AA-upp-masked.fasta".format(gene), angio_1kp_ids) if os.path.isfile(distance_matrix_fn): p_dm = DistanceMatrix.read(distance_matrix_fn) p_dm_df = p_dm.to_data_frame() sys.stderr.write("Read in pre-determined distance matrix!\n") else: p_dm = DistanceMatrix.from_iterable(angio_msa_nogap_noshort, metric=p_distance, key="id") p_dm_df = p_dm.to_data_frame() p_dm_df.to_csv( "onekp_only_angios_pdistance/{}_angio_p_dm.csv".format(gene)) # Cluster sequences divergent_seqs_medoids = [] runs = {} best_run = len(p_dm_df) best_run_idx = (6, 0) for k, i in itertools.product(range(6, 16), range(100)): try: medoids, membership = kMedoids(p_dm, k) medoid_dist = p_dm_df[p_dm_df.ix[medoids].index].apply(min, 1)
def test_basic_plot(self):
    """radialplot exposes the expected edge and node data sources.

    A seeded three-tip Ward tree is turned into an ``UnrootedDendrogram``,
    decorated with colours and sizes, and plotted. Renderer 0 is compared
    against the expected edge columns (float columns with a tolerance) and
    renderer 1 against the expected node columns.
    """
    self.maxDiff = None
    exp_edges = {
        'dest_node': ['0', '1', '2', 'y3'],
        'edge_color': ['#00FF00', '#00FF00', '#00FF00', '#FF0000'],
        'edge_width': [2, 2, 2, 2],
        'src_node': ['y3', 'y4', 'y3', 'y4'],
        'x0': [
            338.2612593838583, 193.1688862557773, 338.2612593838583,
            193.1688862557773
        ],
        'x1': [487.5, 12.499999999999972, 324.89684138234867,
               338.2612593838583],
        'y0': [
            271.7282256126416, 365.95231443706376, 271.7282256126416,
            365.95231443706376
        ],
        'y1': [
            347.7691620070637, 483.2800610261029, 16.719938973897143,
            271.7282256126416
        ]
    }

    exp_nodes = {
        'child0': [np.nan, np.nan, np.nan, '0', '1'],
        'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
        'color': ['#1C9099', '#1C9099', '#1C9099', '#FF999F', '#FF999F'],
        'hover_var': [None, None, None, None, None],
        'is_tip': [True, True, True, False, False],
        'node_size': [10, 10, 10, 10, 10],
        'x': [
            12.499999999999972, 487.5, 324.89684138234867,
            338.26125938385832, 193.16888625577729
        ],
        'y': [
            483.28006102610289, 347.7691620070637, 16.719938973897143,
            271.72822561264161, 365.95231443706376
        ]
    }

    np.random.seed(0)
    num_otus = 3  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
    lm = ward(dm.condensed_form())
    # ``np.str`` has been removed from NumPy; use the builtin ``str``.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    t = UnrootedDendrogram.from_tree(t)

    # incorporate colors in tree
    for i, n in enumerate(t.postorder(include_self=True)):
        if not n.is_tip():
            n.name = "y%d" % i
            n.color = '#FF999F'
            n.edge_color = '#FF0000'
            n.node_size = 10
        else:
            n.color = '#1C9099'
            n.edge_color = '#00FF00'
            n.node_size = 10
        n.length = np.random.rand() * 3
        n.edge_width = 2

    p = radialplot(t,
                   node_color='color',
                   edge_color='edge_color',
                   node_size='node_size',
                   edge_width='edge_width')

    for e in exp_edges.keys():
        # The expected values are lists, so the float check has to look at
        # an element. Testing the list itself was always False and left
        # the tolerant comparison unreachable.
        if isinstance(exp_edges[e][0], float):
            npt.assert_allclose(p.renderers[0].data_source.data[e],
                                np.array(exp_edges[e]))
        else:
            self.assertListEqual(list(p.renderers[0].data_source.data[e]),
                                 exp_edges[e])

    for e in exp_nodes.keys():
        self.assertListEqual(list(p.renderers[1].data_source.data[e]),
                             exp_nodes[e])

    self.assertTrue(isinstance(t, TreeNode))
def test_from_iterable_single(self):
    """A single item yields the 1x1 zero distance matrix."""
    def always_zero(a, b):
        return 0

    expected = DistanceMatrix([[0]])
    observed = DistanceMatrix.from_iterable(["boo"], always_zero)

    self.assertEqual(observed, expected)
# Build one list of floats per column of `rows`, skipping column 0 and,
# within each column, skipping row 0.
# NOTE(review): presumably row 0 and column 0 hold header labels -- confirm
# against the code that fills `rows`.
for a in range(len(rows[0])):
    if a > 0:
        this_sample = []
        for b in range(len(rows)):
            if b > 0:
                this_sample.append(float(rows[b][a]))
        samples.append(this_sample)

# The string literal below is never executed. It holds disabled code that
# would restrict the analysis to the samples named 'LR' and 'SR'.
"""
only_samples = ['LR', 'SR']
new_samples, new_names = [], []
for a in range(len(sample_names)):
    for b in range(len(only_samples)):
        if sample_names[a] == only_samples[b]:
            new_samples.append(samples[a])
            new_names.append(sample_names[a])
samples = new_samples
sample_names = new_names
print(len(samples), len(sample_names))
"""

# Pairwise distances between the samples using the `braycurtis` metric.
sam_dm = dm.from_iterable(samples, metric=braycurtis)

# Run the three tests on the same matrix, with `sample_names` as the
# grouping and 999 permutations each, and print each result.
pdisp = permdisp(sam_dm, sample_names, column=None, test='median', permutations=999)
print(pdisp)
asim = anosim(sam_dm, sample_names, column=None, permutations=999)
print(asim)
perm = permanova(sam_dm, sample_names, column=None, permutations=999)
print(perm)