def main():
    """Print the distance between the two trees named on the command line.

    The arguments returned by ``parse_args()`` must expose ``tree1_file`` and
    ``tree2_file``; each is parsed with ``TreeNode.read`` and the pair is
    handed to ``calc_tree_distance``. The result is printed with ``%d``
    formatting.
    """
    args = parse_args()
    # Use context managers so both tree files are closed as soon as they have
    # been parsed, instead of leaking the handles until garbage collection.
    with open(args.tree1_file) as tree1_fh:
        tree1 = TreeNode.read(tree1_fh)
    with open(args.tree2_file) as tree2_fh:
        tree2 = TreeNode.read(tree2_fh)
    tree_dist = calc_tree_distance(tree1, tree2)
    print("Tree distance: %d" % tree_dist)
def test_continous(self):
    """Check phylofactor output for 'Continuous~Data' (poisson family)
    against the stored ``expected/numeric_*`` fixtures."""
    def load_expected(relative_path):
        # All expected tables are tab-separated files under the data path.
        return pd.read_csv(self.get_data_path(relative_path), sep='\t')

    exp_basis = load_expected('expected/numeric_basis.tsv')
    exp_data = load_expected('expected/numeric_data.tsv')
    exp_groups = load_expected('expected/numeric_groups.tsv')
    exp_factors = load_expected('expected/numeric_factors.tsv')
    exp_tree = TreeNode.read(
        self.get_data_path('expected/numeric_tree.nwk'))

    data, basis, out_tree, groups, factors = phylofactor(
        self.table, self.phylogeny, self.metadata,
        formula='Continuous~Data', nfactors=3, family='poisson')

    comparisons = (
        (basis, exp_basis),
        (groups, exp_groups),
        (factors, exp_factors),
        (data, exp_data),
    )
    for observed, expected in comparisons:
        assert_frame_equal(observed, expected)
    # A Robinson-Foulds distance of zero is required between the trees.
    self.assertEqual(TreeNode.compare_rfd(exp_tree, out_tree), 0)
def main_calc_tree_distance(lang_set_mat, dist_metric="rfd"):
    """Calculate Tree Distance.

    Builds a tree from the linkage matrix of ``lang_set_mat``, rewrites every
    decimal branch length in its serialised form to ``1`` and compares the
    result with ``GT_INDO_EUROPEAN_TREE``.

    With ``dist_metric == "rfd"`` the comparison is ``compare_rfd``; for any
    other value it is the sum of squared differences between the two
    tip-to-tip distance matrices.

    Returns a ``(tree_dist, pred_tree)`` tuple.
    """
    linkage = get_linkage_matrix(lang_set_mat)
    weighted_tree = TreeNode.from_linkage_matrix(linkage,
                                                 INDO_EURO_LANG_NAMES)

    # Serialise the tree so the branch lengths can be edited textually.
    newick_buffer = StringIO()
    weighted_tree.write(newick_buffer)
    # Replace distances with 1
    unit_length_newick = re.sub(r"\d+\.\d+", "1", newick_buffer.getvalue())
    pred_tree = TreeNode.read(StringIO(unit_length_newick))

    if dist_metric == "rfd":
        return pred_tree.compare_rfd(GT_INDO_EUROPEAN_TREE), pred_tree

    gt_struct = GT_INDO_EUROPEAN_TREE.tip_tip_distances()
    gt_matrix = gt_struct.data
    # Request the predicted distances in the ground-truth tip order.
    pred_matrix = pred_tree.tip_tip_distances(
        endpoints=list(gt_struct.ids)).data
    squared_error = np.sum((gt_matrix - pred_matrix) ** 2)
    return squared_error, pred_tree
def testSimpleTwice(self):
    """Passing the threshold 0.25 twice must give the same cluster set for
    each entry."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    thresholds = [0.25, 0.25]
    observed = Tree2Tax().named_clusters_for_several_thresholds(
        tree, thresholds)
    expected = [
        [0.25, [['A', 'B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected, observed)
def build_base_silva_taxonomy(tree_file, tax_dict):
    """Map every non-root node name of the tree to its allowed-rank taxonomy.

    For each node (postorder, stopping at the root) the node's own
    ``(rank, taxonomy)`` entry from ``tax_dict`` and those of its non-root
    ancestors are collected. Entries whose rank is in ``allowed_ranks`` are
    kept, keyed by ``allowed_ranks_dict[rank]`` with the taxonomy string
    passed through ``filter_characters``.

    Returns {stripped node name : {rank key : cleaned taxonomy}}.
    """
    print("Building base SILVA taxonomy...")
    tree = TreeNode.read(tree_file)
    taxonomy_by_node = {}
    for node in tree.postorder():
        if node.is_root():
            break

        rank_pairs = []
        rank, taxonomy = tax_dict[node.name]
        clean_taxonomy_str = filter_characters(taxonomy)
        if rank in allowed_ranks:
            rank_pairs.append((allowed_ranks_dict[rank], clean_taxonomy_str))

        for ancestor in node.ancestors():
            if ancestor.is_root():
                break
            arank, ataxonomy = tax_dict[ancestor.name]
            cleaned_ataxonomy = filter_characters(ataxonomy)
            if arank in allowed_ranks:
                rank_pairs.append(
                    (allowed_ranks_dict[arank], cleaned_ataxonomy))

        # Later pairs win on duplicate rank keys when converted to a dict.
        taxonomy_by_node[node.name.strip()] = dict(rank_pairs)
    return taxonomy_by_node
def testNoPairs(self):
    """A tree with one multiply-labelled clade must yield no 'f'/'g'
    examples."""
    newick = "(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;"
    tree = TreeNode.read(StringIO(newick))
    found = ThresholdFinder().find_examples(tree, 'f', 'g')
    self.assertSameCladeDistanceSet([], found)
def testClusterNamingWithBootstraps(self):
    """Clusters under a node labelled '0.7:G' are expected to be named
    G.2 and G.1 at threshold 40."""
    newick = "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 40)
    self.assertSameClusters([['F'], _('A B D H')], clusters)
    expected_names = _('G.2 G.1')
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(expected_names, observed_names)
def test__generate_html_summary_phylogeny(self):
    """The placed-fragments rows must appear in the HTML summary only when a
    tree is passed to ``_generate_html_summary``."""
    def read_html(index_fp):
        with open(index_fp) as handle:
            return ''.join(handle.readlines())

    fp_biom = join('qtp_biom', 'support_files', 'sepp.biom')
    fp_tree = join('qtp_biom', 'support_files', 'sepp.tre')

    # load metadata
    qurl = '/qiita_db/analysis/%s/metadata/' % 1
    md = self.qclient.get(qurl)

    # load phylogeny
    tree = TreeNode.read(fp_tree)

    # test if two expected tags show up in the html summary page
    obs_index_fp, obs_viz_fp, qza_fp = _generate_html_summary(
        fp_biom, md, self.out_dir, True, tree=tree)
    obs_html = read_html(obs_index_fp)
    self.assertTrue('<th>Number placed fragments</th>' in obs_html)
    self.assertTrue('<td>434</td>' in obs_html)

    # test that phylogeny specific html content does not show up if no
    # tree is given
    obs_index_fp, obs_viz_fp, qza_fp = _generate_html_summary(
        fp_biom, md, self.out_dir, True, tree=None)
    obs_html = read_html(obs_index_fp)
    self.assertTrue('<th>Number placed fragments</th>' not in obs_html)
def depth_partition(self, input_tree, percentile, output_tree):
    '''
    Partition a tree into clusters of nodes whose median tip-to-tip
    distance is <= an nth percentile cutoff of the whole-tree distance
    distribution. A fuller description can be found in the citation below.

    The tree is midpoint-rooted before clustering, clustered nodes are
    renamed through self._rename, and the resulting tree is written to
    output_tree in newick format.

    Parameters
    ----------
    input_tree:
        Source handed to skbio TreeNode.read
        http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html
        #skbio.tree.TreeNode
    percentile: float
        The percentile of the whole-tree distribution used as the cutoff
        for clading from a given node.
    output_tree:
        Destination handed to TreeNode.write.

    Returns
    -------
    dict mapping "partition_<n>" (and self.UNCLUSTERED) to lists of tip
    names with spaces replaced by underscores.

    Clustering method modified from Prosperi et al method:
    Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny
    partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
    http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
    '''
    tree = TreeNode.read(input_tree).root_at_midpoint()
    next_cluster_id = 1
    clustered = set()
    clusters = {}

    logging.debug("Calculating %ith percentile cutoff from root"
                  % (percentile))
    cutoff = np.percentile(self._node_dist(tree), percentile)
    logging.debug("Cutoff (%ith percentile): %f" % (percentile, cutoff))

    for node in tree.preorder():
        # Skip anything already assigned to a cluster, and bare tips.
        if node in clustered or node.is_tip():
            continue

        median = np.median(self._node_dist(node))
        logging.debug("Median of node: %f" % median)
        if median <= cutoff:
            logging.debug("Cluster found!")
            cluster_name = "partition_%i" % (next_cluster_id)
            clusters[cluster_name] = [
                tip.name.replace(' ', '_') for tip in node.tips()
            ]
            self._rename(node, cluster_name)
            next_cluster_id += 1
            # Mark the whole traversal of this node as consumed.
            clustered.update(node.traverse())

    logging.info("%i depth cluster(s) found in tree" % (next_cluster_id - 1))
    tree.write(output_tree, "newick")

    logging.debug("Recording tips that were not partitioned")
    clusters[self.UNCLUSTERED] = [
        tip.name.replace(' ', '_')
        for tip in tree.tips()
        if tip not in clustered
    ]
    return clusters
def testSistersOneIncompleteSister(self):
    """Only the named clade 'g3' is expected as a sister of tip B."""
    newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
    annotator = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    sisters = annotator.find_sisters(tree, tree.find('B'))
    sister_names = [sister.name for sister in sisters]
    assert_equals(sorted(['g3']), sorted(sister_names))
def test_ilr_ordination(self):
    """Check samples, tree and feature metadata returned by
    ``ilr_phylogenetic_ordination`` with ``top_k_var=3``."""
    np.random.seed(0)
    counts = [[1, 1, 2, 2],
              [1, 2, 2, 1],
              [2, 2, 1, 1]]
    table = pd.DataFrame(counts, index=[1, 2, 3],
                         columns=['a', 'b', 'c', 'd'])
    shuffled_columns = np.random.permutation(table.columns)
    table = table.reindex(columns=shuffled_columns)
    # The tree carries tips 'f' and 'e' that are absent from the table.
    tree = TreeNode.read([
        '((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])

    res_ord, res_tree, res_md = ilr_phylogenetic_ordination(
        table, tree, top_k_var=3)

    balance_names = ['y0', 'y1', 'y2']
    exp_balances = pd.DataFrame(
        [[0.693147, 0.0, 3.892122e-17],
         [0.0, -4.901291e-01, -4.901291e-01],
         [-0.693147, -5.551115e-17, -3.892122e-17]],
        columns=['y0', 'y1', 'y2'],
        index=[1, 2, 3])
    exp_balances = exp_balances[balance_names]
    exp_balances.index.name = 'sampleid'
    pdt.assert_frame_equal(res_ord.samples, exp_balances)

    exp_tree_str = ('((b:0.025,a:0.025)y1:0.2,'
                    '(c:0.025,d:0.025)y2:0.2)y0;\n')
    self.assertEqual(str(res_tree), exp_tree_str)

    exp_md = pd.DataFrame(
        [[-0.5, -0.707107, 0.000000],
         [-0.5, 0.707107, 0.000000],
         [0.5, 0.000000, -0.707107],
         [0.5, 0.000000, 0.707107]],
        columns=['y0', 'y1', 'y2'],
        index=['b', 'a', 'c', 'd'])
    exp_md.index.name = 'featureid'
    pdt.assert_frame_equal(res_md, exp_md)
def test_defaults(self):
    """Check phylofactor output for 'Categorical~Data' (binomial family)
    against the stored ``expected/categorical_*`` fixtures."""
    def load_expected(relative_path):
        # All expected tables are tab-separated files under the data path.
        return pd.read_csv(self.get_data_path(relative_path), sep='\t')

    exp_basis = load_expected('expected/categorical_basis.tsv')
    exp_data = load_expected('expected/categorical_data.tsv')
    exp_groups = load_expected('expected/categorical_groups.tsv')
    exp_factors = load_expected('expected/categorical_factors.tsv')
    exp_tree = TreeNode.read(
        self.get_data_path('expected/categorical_tree.nwk'))

    data, basis, out_tree, groups, factors = phylofactor(
        self.table, self.phylogeny, self.metadata,
        formula='Categorical~Data', nfactors=3, family='binomial')

    comparisons = (
        (basis, exp_basis),
        (groups, exp_groups),
        (factors, exp_factors),
        (data, exp_data),
    )
    for observed, expected in comparisons:
        assert_frame_equal(observed, expected)
    # A Robinson-Foulds distance of zero is required between the trees.
    self.assertEqual(TreeNode.compare_rfd(exp_tree, out_tree), 0)
def depth_partition(self, input_tree, percentile, output_tree):
    '''
    Cluster a tree by depth: a node becomes a partition when the median of
    its tip-to-tip distance distribution is <= the nth percentile of the
    whole-tree distribution. A fuller description can be found in the
    citation below.

    The tree read from input_tree is rooted at its midpoint first; the
    tree with renamed partition nodes is written to output_tree as newick.

    Parameters
    ----------
    input_tree:
        Source handed to skbio TreeNode.read
        http://scikit-bio.org/docs/latest/generated/skbio.tree.TreeNode.html
        #skbio.tree.TreeNode
    percentile: float
        The percentile cutoff used to decide whether to clade from a given
        node.
    output_tree:
        Destination handed to TreeNode.write.

    Returns
    -------
    dict of partition name -> list of tip names (spaces replaced with
    underscores); unpartitioned tips are listed under self.UNCLUSTERED.

    Clustering method modified from Prosperi et al method:
    Prosperi, M.C.F., et al. A novel methodology for large-scale phylogeny
    partition. Nat. Commun. 2:321 doi: 10.1038/ncomms1325 (2011).
    http://www.nature.com/ncomms/journal/v2/n5/full/ncomms1325.html
    '''
    rooted = TreeNode.read(input_tree).root_at_midpoint()
    partition_number = 1
    seen = set()
    partitions = {}

    logging.debug("Calculating %ith percentile cutoff from root"
                  % (percentile))
    whole_tree_distances = self._node_dist(rooted)
    cutoff = np.percentile(whole_tree_distances, percentile)
    logging.debug("Cutoff (%ith percentile): %f" % (percentile, cutoff))

    for candidate in rooted.preorder():
        if candidate in seen or candidate.is_tip():
            continue

        median = np.median(self._node_dist(candidate))
        logging.debug("Median of node: %f" % median)
        if median <= cutoff:
            logging.debug("Cluster found!")
            label = "partition_%i" % (partition_number)
            partitions[label] = [
                leaf.name.replace(' ', '_') for leaf in candidate.tips()
            ]
            self._rename(candidate, label)
            partition_number += 1
            # Everything visited below this node now belongs to a cluster.
            seen.update(candidate.traverse())

    logging.info("%i depth cluster(s) found in tree"
                 % (partition_number - 1))
    rooted.write(output_tree, "newick")

    logging.debug("Recording tips that were not partitioned")
    partitions[self.UNCLUSTERED] = [
        leaf.name.replace(' ', '_')
        for leaf in rooted.tips()
        if leaf not in seen
    ]
    return partitions
def testMultiplyNamedNode(self):
    """A node labelled 'g__genus2; s__spec' must still be reported as a
    genus example, with distance 16.0."""
    newick = ("(((A:1, B:2)'g__genus1':3, "
              "(C:4, D:5)'g__genus2; s__spec':6)'f__family':10)root;")
    tree = TreeNode.read(StringIO(newick))
    found = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2; s__spec', 16.0]]
    self.assertSameCladeDistanceSet(expected, found)
def testFullTaxonomy(self):
    """Tip D is expected to resolve to 'f__family; g__genus2'."""
    newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    annotator = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    tip = tree.find('D')
    assert_equals('f__family; g__genus2',
                  annotator.full_taxonomy(tree, tip))
def testClusterNamingConventionsWithSomeUnnamed(self):
    """With threshold 0.05 each tip is expected in its own cluster, named
    G.1 through G.4."""
    newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    # i.e. everything is a separate cluster
    clusters = Tree2Tax().named_clusters(tree, 0.05)
    self.assertSameClusters([['A'], ['B'], ['D'], ['F']], clusters)
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], observed_names)
def testFindParents(self):
    """Exercise ``find_named_parent`` for named, unnamed and near-root
    starting nodes."""
    ann = TreeAnnotator()

    def parse(newick):
        return TreeNode.read(StringIO(newick))

    tree = parse("(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    start = tree.find('B')
    assert_equals('g__genus1', ann.find_named_parent(tree, start).name,
                  'self is named')

    tree = parse("(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    start = tree.find('2475')
    assert_equals('g__genus1', ann.find_named_parent(tree, start).name,
                  'parent directly above')

    tree = parse("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    start = tree.find('2475')
    assert_equals('f__family', ann.find_named_parent(tree, start).name,
                  'parent 2 above')

    tree = parse("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);")
    start = tree.find('f__family').parent
    assert_equals(None, ann.find_named_parent(tree, start),
                  'parent of root')

    tree = parse("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);")
    start = tree.find('g__genus2').parent
    assert_equals(None, ann.find_named_parent(tree, start),
                  'no parent before root')
def testSistersSelfNoParent(self):
    """No sisters are expected for tip B in this fully named tree."""
    newick = ("(((A:1, B:2)'g__genus1':3, "
              "(C:4, D:5)'g__genus2':6)'f__family':10)root;")
    annotator = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    sisters = annotator.find_sisters(tree, tree.find('B'))
    sister_names = [sister.name for sister in sisters]
    assert_equals([], sister_names)
def generate_html_summary(qclient, job_id, parameters, out_dir):
    """Generates the HTML summary of a BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values; ``parameters['input_data']`` is the artifact id
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, None, str
        Whether the job is successful
        Ignored
        The error message, if not successful
    """
    # Step 1: gather file information from qiita using REST api
    artifact_id = parameters['input_data']
    qclient_url = "/qiita_db/artifacts/%s/" % artifact_id
    artifact_info = qclient.get(qclient_url)

    # Step 2: get the mapping file, depends if analysis or not
    is_analysis = artifact_info['analysis'] is not None
    if is_analysis:
        qurl = '/qiita_db/analysis/%s/metadata/' % artifact_info['analysis']
        md = qclient.get(qurl)
    else:
        qurl = ('/qiita_db/prep_template/%s/' %
                artifact_info['prep_information'][0])
        md = qclient.get(qurl)['qiime-map']

    # A phylogeny is optional; it is read only when a plain_text file exists.
    tree = None
    if 'plain_text' in artifact_info['files']:
        tree = TreeNode.read(artifact_info['files']['plain_text'][0])

    # Step 3: generate HTML summary
    # if we get to this point of the code we are sure that this is a biom file
    # and that it only has one element
    index_fp, viz_fp, qza_fp = _generate_html_summary(
        artifact_info['files']['biom'][0], md, out_dir, is_analysis, tree)

    # Step 4: add the new file to the artifact using REST api
    try:
        qclient.patch(qclient_url, 'add', '/html_summary/',
                      value=dumps({'html': index_fp, 'dir': viz_fp}))
    except Exception as e:
        # Any failure while patching is reported back instead of raised.
        return False, None, str(e)

    return True, None, ""
def testSistersSisterWithDescendentNames(self):
    """Sisters of tip B are expected to be 'g2' and 'g3'; the nested 's1'
    is not expected."""
    newick = ("((A:1, B:2):3, "
              "(((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;")
    annotator = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    sisters = annotator.find_sisters(tree, tree.find('B'))
    sister_names = [sister.name for sister in sisters]
    assert_equals(sorted(['g2', 'g3']), sorted(sister_names))
def testTreeSubtree2(self):
    '''one genus is a subtree of another, and the longest branch is in both
    subtrees; the expected distance is 105.0'''
    newick = ("((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)"
              "'f__family':10)root;")
    tree = TreeNode.read(StringIO(newick))
    found = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2', 105.0]]
    self.assertSameCladeDistanceSet(expected, found)
def testOppositeSorting(self):
    """Cluster with thresholds 0.05 and 0.25 and check the sets and the
    names of the first set."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [0.05, 0.25])
    expected_sets = [
        [0.05, [['A'], ['B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected_sets, cluster_sets)
    first_set_names = [c.name() for c in cluster_sets[0].clusters]
    assert_equals(_('C.1 C.2 Root'), first_set_names)
def testNaming(self):
    """Cluster with thresholds [40, 25] and check the membership and names
    of both resulting sets."""
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [40, 25])
    expected_sets = [
        [25, [['F'], _('A B'), _('D H')]],
        [40, [['F'], _('A B D H')]],
    ]
    self.assertSameClusterSets(expected_sets, cluster_sets)

    def names_of(cluster_set):
        return [c.name() for c in cluster_set.clusters]

    assert_equals(_('G.3 G.1 G.2'), names_of(cluster_sets[0]))
    assert_equals(_('G.2 G.1'), names_of(cluster_sets[1]))
def testFindParents(self):
    """Walk through five trees checking which named ancestor
    ``find_named_parent`` reports for a chosen node."""
    ann = TreeAnnotator()

    def load(newick):
        return TreeNode.read(StringIO(newick))

    tree = load(
        "(((A:1, B:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    node = tree.find('B')
    assert_equals('g__genus1', ann.find_named_parent(tree, node).name,
                  'self is named')

    tree = load(
        "(((A:1, 2475:2)'g__genus1':3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    node = tree.find('2475')
    assert_equals('g__genus1', ann.find_named_parent(tree, node).name,
                  'parent directly above')

    tree = load(
        "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;")
    node = tree.find('2475')
    assert_equals('f__family', ann.find_named_parent(tree, node).name,
                  'parent 2 above')

    tree = load(
        "(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6)'f__family':10);")
    node = tree.find('f__family').parent
    assert_equals(None, ann.find_named_parent(tree, node),
                  'parent of root')

    tree = load("(((A:1, 2475:2):3, (C:4, D:5)'g__genus2':6):10);")
    node = tree.find('g__genus2').parent
    assert_equals(None, ann.find_named_parent(tree, node),
                  'no parent before root')
def test_missing_taxonomy(self):
    """Check the names ``missing_taxonomy`` reports between tip A and three
    different second nodes."""
    newick = '((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    # (expected names, first node name, second node name)
    cases = (
        (['C'], 'A', 'E'),
        ([], 'A', 'A'),
        (['E', 'C'], 'A', 'G'),
    )
    for expected, first_name, second_name in cases:
        observed = TaxonomyFunctions().missing_taxonomy(
            tree, tree.find(first_name), tree.find(second_name))
        assert_equals(expected, observed)
def testTipToCluster(self):
    """After clustering at thresholds [40, 25], look up the cluster of tips
    F and D in each resulting set."""
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [40, 25])
    expected_sets = [
        [25, [['F'], _('A B'), _('D H')]],
        [40, [['F'], _('A B D H')]],
    ]
    self.assertSameClusterSets(expected_sets, cluster_sets)
    assert_equals(_('G.3 G.1 G.2'),
                  [c.name() for c in cluster_sets[0].clusters])
    assert_equals(_('G.2 G.1'),
                  [c.name() for c in cluster_sets[1].clusters])

    first_set, second_set = cluster_sets[0], cluster_sets[1]

    tip = tree.find('F')
    assert_equals('G.3', first_set.tip_to_cluster(tip).name())
    assert_equals('G.2', second_set.tip_to_cluster(tip).name())

    tip = tree.find('D')
    assert_equals('G.2', first_set.tip_to_cluster(tip).name())
    assert_equals('G.1', second_set.tip_to_cluster(tip).name())
def get_dist_matrix_from_tree(file_path):
    """Return the tip-to-tip distance matrix of the tree stored in a file.

    The file content is read with newlines removed, parsed with
    ``TreeNode.read`` and converted to a data frame of tip-to-tip distances.
    Rows and columns are sorted by label, the frame is printed, and its
    values are returned as an array.

    Parameters
    ----------
    file_path : str
        Path of the tree file to read.
    """
    with open(file_path, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    t = TreeNode.read(StringIO(data))
    df = t.tip_tip_distances().to_data_frame()
    # sort rows and cols
    df.sort_index(inplace=True)
    df = df[sorted(df.columns)]
    print(df)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values gives the same array on both old and new pandas versions.
    return df.values
def setUp(self):
    """Locate the SEPP reference artifacts and load the fragment table, the
    expected taxonomy and the expected tree."""
    def read_table(relative_path):
        # Both tables are tab-separated with the first column as index.
        return pd.read_csv(get_data_path(relative_path),
                           sep='\t', index_col=0)

    self.file_ref_phylo = get_data_path(
        'analyses/sepp/reference_phylogeny_small.qza')
    self.file_ref_aln = get_data_path(
        'analyses/sepp/reference_alignment_small.qza')
    self.fragments = read_table('analyses/sepp/fragments.tsv')
    self.exp_taxonomy = read_table('analyses/sepp/exp_taxonomy.tsv')
    exp_tree_fp = get_data_path('analyses/sepp/exp_tree.nwk')
    self.exp_tree = TreeNode.read(exp_tree_fp)
def _prune_features_from_phylogeny(table: biom.Table,
                                   phylogeny_fp: NewickFormat) -> NewickFormat:
    """Shear the phylogeny down to the observations present in the table.

    Parameters
    ----------
    table : biom.Table
        Feature table; its 'observation' ids are the features to keep.
    phylogeny_fp : NewickFormat
        Phylogeny to read; ``str(phylogeny_fp)`` is passed to
        ``TreeNode.read``.

    Returns
    -------
    The result of passing the (possibly sheared) tree to ``_1``.

    Raises
    ------
    ValueError
        If the table has features that are not tips of the phylogeny, or if
        the tips of the sheared tree do not match the table's features.
    """
    print('Will prune the phylogeny')
    tree = TreeNode.read(str(phylogeny_fp))
    obs = table.ids('observation')
    obs_set = set(obs)
    tip_names_set = {tip.name for tip in tree.tips()}
    to_delete_set = tip_names_set - obs_set
    missing_from_tree = obs_set - tip_names_set

    if missing_from_tree:
        # Build one message string: passing several positional arguments to
        # ValueError renders the message as a tuple.
        raise ValueError(
            "There are %d features in the feature table not present "
            "in the phylogeny! Please check your tree"
            % len(missing_from_tree)
        )
    print("All", len(obs), "features present in the "
          "feature table are also in the phylogeny.")

    if to_delete_set:
        t0 = time()
        print("The set of features in the phylogeny and the table "
              "are not the same.", len(to_delete_set),
              "features will be pruned from the tree.")
        tree_pruned = tree.shear(obs_set)
        print("It takes", time()-t0, "seconds to prune the phylogeny")

        # Verify that shearing left exactly the table's features as tips.
        pruned_tip_names = {tip.name for tip in tree_pruned.tips()}
        extra_tips = pruned_tip_names - obs_set
        missing_tips = obs_set - pruned_tip_names
        if extra_tips or missing_tips:
            raise ValueError(
                "Pruning the phylogeny failed! There are %d features in the "
                "phylogeny not present in the feature table, and %d features "
                "in the feature table not available in the phylogeny! "
                "Both should be 0" % (len(extra_tips), len(missing_tips))
            )
        print("The phylogeny was pruned successfully!")
    else:
        print("The set of features in the phylogeny and the table "
              "are the same. No feature will be pruned from the tree.")
        tree_pruned = tree

    tree_pruned_out = _1(tree_pruned)
    return tree_pruned_out
def shear_tree(self):
    """Write a sheared copy of the 'wol' tree for each dataset that uses it
    and queue the matching import command.

    A dataset is processed when its ``phylo`` starts with 'wol', it is not in
    ``Datasets.filt_raw``, and either ``config.force`` is set or the file at
    ``data.tree[1]`` does not exist yet.
    """
    self.project.set_tree_paths(self.config)
    if len(Data.wols):
        wol = TreeNode.read(get_wol_tree(self.config.i_wol_tree))
        for dat, data in self.project.datasets.items():
            if dat in Datasets.filt_raw:
                continue
            if not (data.phylo and data.phylo[0] == 'wol'):
                continue
            if not self.config.force and isfile(data.tree[1]):
                continue

            # Keep only this dataset's features, then rename the tips using
            # the dataset's feature mapping.
            wol_features = wol.shear(list(data.features.keys()))
            for tip in wol_features.tips():
                tip.name = data.features[tip.name]
            wol_features.write(data.tree[2])

            cmd = run_import(data.tree[2], data.tree[1],
                             "Phylogeny[Rooted]")
            self.cmds.setdefault(dat, []).append(cmd)
    self.register_command('wol')
def read_tree(nwk_path, leaf_names=None, trim_src_tag=False):
    """
    Read a tree in Newick format.

    The tree is passed through ``swap_space``, optionally sheared to
    ``leaf_names``, converted with ``unrooted_copy`` and given node ids.
    With ``trim_src_tag`` every tip name is replaced by
    ``check_accession(name)``.

    Returns: TreeNode object for the root of the tree
    """
    tree = TreeNode.read(nwk_path, format='newick')
    swap_space(tree)

    restrict_to_leaves = leaf_names is not None
    if restrict_to_leaves:
        tree = tree.shear(leaf_names)

    tree = tree.unrooted_copy()
    tree.assign_ids()

    if not trim_src_tag:
        return tree
    for tip in tree.tips():
        tip.name = check_accession(tip.name)
    return tree
def read_tree(tree_path):
    """
    Read Newick formatted tree from GTDB.

    Only tips whose name contains 'GC' (i.e. that have a NCBI accession) are
    kept; their first three characters are dropped and spaces become
    underscores. GTDB MAGs are therefore pruned from the tree. Spaces in the
    names of the remaining non-tip nodes are replaced with underscores too.
    """
    tree = TreeNode.read(tree_path)

    kept_names = []
    for tip in tree.tips():
        if 'GC' not in tip.name:
            continue
        tip.name = tip.name[3:].replace(' ', '_')
        kept_names.append(tip.name)

    tree = tree.shear(kept_names)
    tree.prune()

    for internal in tree.non_tips():
        internal.name = internal.name.replace(' ', '_')
    return tree
def test_ilr_phylogenetic(self):
    """Check the balances and the tree returned by ``ilr_phylogenetic``."""
    np.random.seed(0)
    counts = [[1, 1, 2, 2],
              [1, 2, 2, 1],
              [2, 2, 1, 1]]
    table = pd.DataFrame(counts, index=[1, 2, 3],
                         columns=['a', 'b', 'c', 'd'])
    shuffled_columns = np.random.permutation(table.columns)
    table = table.reindex(columns=shuffled_columns)
    # The tree carries tips 'f' and 'e' that are absent from the table.
    tree = TreeNode.read(
        ['((c:0.025,d:0.025,f:0.1,e:0.025):0.2,(b:0.025,a:0.025):0.2);'])

    res_balances, res_tree = ilr_phylogenetic(table, tree)

    exp_balances = pd.DataFrame(
        [[0.693147, 0.0, 3.892122e-17],
         [0.0, -4.901291e-01, -4.901291e-01],
         [-0.693147, -5.551115e-17, -3.892122e-17]],
        columns=['y0', 'y1', 'y2'],
        index=[1, 2, 3])
    pdt.assert_frame_equal(res_balances, exp_balances)

    exp_tree_str = ('((b:0.025,a:0.025)y1:0.2,'
                    '(c:0.025,d:0.025)y2:0.2)y0;\n')
    self.assertEqual(str(res_tree), exp_tree_str)
def get_unifrac(
    otu_file_1: pathlib.Path,
    otu_file_2: pathlib.Path,
    tree_file: pathlib.Path,
    weighted: bool,
    threshold: int,
):
    """Compute a UniFrac value for each vector pair from two OTU tables.

    Both tables are loaded as dense data frames and the tree is read from
    ``tree_file``. For each ``(u, v, otu_ids, col)`` yielded by
    ``get_vectors`` the normalized weighted UniFrac is computed when
    ``weighted`` is true, and the unweighted UniFrac otherwise.

    Returns a tuple of (Series of UniFrac values keyed by ``col``, number of
    rows of the first table, number of rows of the second table).
    """
    otu_1 = load_table(str(otu_file_1)).to_dataframe(dense=True)
    otu_2 = load_table(str(otu_file_2)).to_dataframe(dense=True)
    tree = TreeNode.read(str(tree_file))

    unifrac_data = {}
    for u, v, otu_ids, col in get_vectors(otu_1, otu_2, threshold):
        unifrac_data[col] = (
            weighted_unifrac(
                u, v, otu_ids, tree, normalized=True, validate=True
            )
            if weighted
            else unweighted_unifrac(u, v, otu_ids, tree, validate=True)
        )

    n_rows_1 = otu_1.shape[0]
    n_rows_2 = otu_2.shape[0]
    return pd.Series(unifrac_data), n_rows_1, n_rows_2
def _open_tree(self, tree_path):
    '''
    Open a tree file and determine what decorations are already present.

    Every named non-tip node is checked: if its name does not parse as a
    float the tree is considered already decorated and is passed through
    self._strip_tree, after which the scan stops. Unnamed non-tip nodes
    trigger a single warning.

    Parameters
    ----------
    tree_path: str
        Path to a file containing a phylogenetic tree, in Newick format.

    Returns
    -------
    skbio TreeNode object
    '''
    # Close the file as soon as the tree has been parsed.
    with open(tree_path) as tree_io:
        tree_obj = TreeNode.read(tree_io)

    bootstrapped = True
    for node in tree_obj.non_tips():
        if node.name:
            try:
                float(node.name)
            except ValueError:
                # Only a failed numeric parse means "decorated"; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                logging.debug("Tree is decorated already. Stripping all "
                              "previous decoration from the tree.")
                bootstrapped = False
                tree_obj = self._strip_tree(tree_obj)
                break
        elif bootstrapped:
            # Warn once; the flag is cleared so later bare nodes stay quiet.
            logging.warning("This tree doesn't appear correctly "
                            "formatted or there is information missing. "
                            "No boostrap value or decoration "
                            "found for bare node. ")
            bootstrapped = False

    if bootstrapped:
        logging.debug("Tree is bootstrap or has confidence values "
                      "assigned to the nodes.")
    return tree_obj
def testNoClusering(self):
    """With threshold 0.05 every tip is expected in its own cluster, named
    C.1, C.2 and Root."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 0.05)
    self.assertSameClusters([['A'], ['B'], ['D']], clusters)
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(_('C.1 C.2 Root'), observed_names)
def testTreeSubtree2(self):
    '''one genus is a subtree of another, and the longest branch is in both
    subtrees; the expected distance is 105.0'''
    newick = ("((((A:1, B:52)'g__genus1':3, D:50)'g__genus2':6)"
              "'f__family':10)root;")
    tree = TreeNode.read(StringIO(newick))
    examples = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2', 105.0]]
    self.assertSameCladeDistanceSet(expected, examples)
def testClusterEverything(self):
    """With threshold 0.5 all tips are expected in one cluster named
    'Root'."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 0.5)
    self.assertSameClusters([['A', 'B', 'D']], clusters)
    only_cluster = clusters[0]
    assert_equals('Root', only_cluster.name())
def testFullTaxonomy(self):
    """``full_taxonomy`` for tip D is expected to be
    'f__family; g__genus2'."""
    newick = "(((A:1, B:2):3, (C:4, D:5)'g__genus2':6)'f__family':10)root;"
    ann = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    observed = ann.full_taxonomy(tree, tree.find('D'))
    assert_equals('f__family; g__genus2', observed)
def testMultiplyNamedNode(self):
    """A node carrying both genus and species labels is expected as one of
    the two genus examples, 16.0 apart."""
    newick = ("(((A:1, B:2)'g__genus1':3, "
              "(C:4, D:5)'g__genus2; s__spec':6)'f__family':10)root;")
    tree = TreeNode.read(StringIO(newick))
    examples = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = [['f__family', 'g__genus1', 'g__genus2; s__spec', 16.0]]
    self.assertSameCladeDistanceSet(expected, examples)
def testClusterNamingOnTwoInternalNodesReverseOrder(self):
    """At threshold 40 the clusters [F] and [A B D H] are expected to be
    named G.2 and G.1."""
    newick = '((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 40)
    self.assertSameClusters([['F'], _('A B D H')], clusters)
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(_('G.2 G.1'), observed_names)
def testSistersSisterWithDescendentNames(self):
    """'g2' and 'g3' are the expected sisters of tip B; 's1', nested under
    'g2', is not."""
    newick = ("((A:1, B:2):3, "
              "(((a:1,b:1)'s1':1,D:1)'g2':1, (E:1,F:5)'g3':6):10)root;")
    ann = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    print(tree.ascii_art())
    found = ann.find_sisters(tree, tree.find('B'))
    found_names = [node.name for node in found]
    assert_equals(sorted(['g2', 'g3']), sorted(found_names))
def testClusterOnTwoInternalNodes(self):
    """At threshold 40 the expected clusters are [A B D H] and [F]."""
    newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 40)
    expected = [_('A B D H'), ['F']]
    self.assertSameClusters(expected, clusters)
def testSistersSelfNoParent(self):
    """``find_sisters`` is expected to return nothing for tip B here."""
    newick = ("(((A:1, B:2)'g__genus1':3, "
              "(C:4, D:5)'g__genus2':6)'f__family':10)root;")
    ann = TreeAnnotator()
    tree = TreeNode.read(StringIO(newick))
    found = ann.find_sisters(tree, tree.find('B'))
    found_names = [node.name for node in found]
    assert_equals([], found_names)
def testNoPairs(self):
    """``find_examples`` is expected to return an empty set for a tree
    whose only labelled clade is 'f__family; g__genoos'."""
    newick = "(((A:1, B:2):3, (C:4, D:5):6)'f__family; g__genoos':10)root;"
    tree = TreeNode.read(StringIO(newick))
    examples = ThresholdFinder().find_examples(tree, 'f', 'g')
    expected = []
    self.assertSameCladeDistanceSet(expected, examples)
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    The BIOM sample ids are checked against the metadata ids and, when they
    differ, remapped (through the ``run_prefix`` column or by prepending the
    metadata prefix) and written to a new BIOM file in ``out_dir``. An
    optional representative set (``preprocessed_fasta``) and an optional
    phylogeny (``plain_text``) are validated as well, and an HTML summary is
    generated.

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo , str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    # 'files' arrives JSON-encoded; it maps a filepath type to a list of paths
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None, "Unknown artifact type %s. Supported types: BIOM"
                % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    # The metadata comes from the prep template when one is given, otherwise
    # from the analysis; with neither there is nothing to validate against
    if prep_id is not None:
        is_analysis = False
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']

        qurl = ('/qiita_db/prep_template/%s/' % prep_id)
        md = qclient.get(qurl)['qiime-map']
    elif analysis_id is not None:
        is_analysis = True
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/" % analysis_id)

        md = metadata
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validating BIOM file")
    # new_biom_fp only diverges from biom_fp if the sample ids get fixed below
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same as
            # those in the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the BIOM table do not match '
                             'the ones in the prep information. Please, '
                             'provide the column "run_prefix" in the prep '
                             'information to map the existing sample ids to '
                             'the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            # Report the BIOM samples for which id_map has no entry
            missing = biom_sample_ids - set(id_map)
            error_msg = ('Your prep information is missing samples that are '
                         'present in your BIOM table: %s' % ', '.join(missing))
            return False, None, error_msg

        # Write the corrected table to the output directory
        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observations ids of the biom table should be the same
        # as the representative sequences ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            # Tick off each sequence id; whatever is left in observation_ids
            # afterwards has no sequence in the representative set
            try:
                observation_ids.remove(rec_id)
            except ValueError:
                extra_ids.append(rec_id)

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file includes "
                             "observations not found in the BIOM table: %s"
                             % ', '.join(extra_ids))
        if observation_ids:
            error_msg.append("The representative set sequence file is missing "
                             "observation ids found in the BIOM tabe: %s"
                             % ', '.join(observation_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    # Validate the sequence specific phylogenetic tree (e.g. generated
    # by SEPP for Deblur), if it exists
    tree = None
    if 'plain_text' in files:
        phylogeny_fp = files['plain_text'][0]

        try:
            tree = TreeNode.read(phylogeny_fp)
            filepaths.append((phylogeny_fp, 'plain_text'))
        except Exception:
            return False, None, ("Phylogenetic tree cannot be parsed "
                                 "via scikit-biom")

    # Pass through every other file type unchanged
    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta', 'plain_text'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    index_fp, viz_fp, qza_fp = _generate_html_summary(
        new_biom_fp, md, join(out_dir), is_analysis, tree)

    filepaths.append((index_fp, 'html_summary'))
    filepaths.append((viz_fp, 'html_summary_dir'))
    # Only register the generated qza when the caller did not supply one
    if 'qza' not in files:
        filepaths.append((qza_fp, 'qza'))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""
def testClusterNamingConventionsWithSomeUnnamed(self):
    """Four singleton clusters named G.1 to G.4 are expected at threshold
    0.05."""
    newick = '((((A:11, B:12):10, D:9):20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    # i.e. everything is a separate cluster
    threshold = 0.05
    clusters = Tree2Tax().named_clusters(tree, threshold)
    self.assertSameClusters([['A'], ['B'], ['D'], ['F']], clusters)
    names = [c.name() for c in clusters]
    assert_equals(['G.1', 'G.2', 'G.3', 'G.4'], names)
def testClusterNamingWithBootstraps(self):
    """Cluster names come from the 'G' part of a quoted '0.7:G' label.

    With a threshold of 40 the test expects two clusters, ``F`` alone and
    ``A B D H`` together, named ``G.2`` and ``G.1`` respectively.
    """
    newick = "((F:20, ((A:11, B:12):10, (H:8, D:9):3):20)'0.7:G':30)root;"
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 40)

    expected_members = [['F'], _('A B D H')]
    self.assertSameClusters(expected_members, clusters)

    expected_names = _('G.2 G.1')
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(expected_names, observed_names)
def testNamingWithBootstraps(self):
    """A purely numeric internal label (0.091) is not used as a name.

    The test expects every tip to form its own cluster and the clusters
    to be called ``Root.1``, ``Root.2`` and ``Root.3``.
    """
    newick = '((A:0.11, B:0.12)0.091:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 0.05)

    expected_members = [[tip] for tip in 'ABD']
    self.assertSameClusters(expected_members, clusters)

    expected_names = _('Root.1 Root.2 Root.3')
    observed_names = [cluster.name() for cluster in clusters]
    assert_equals(expected_names, observed_names)
def testSimple(self):
    """At threshold 0.25, A and B share a cluster and D stands alone."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 0.25)

    expected_members = [
        ['A', 'B'],
        ['D'],
    ]
    self.assertSameClusters(expected_members, clusters)
def testSistersOneIncompleteSister(self):
    """``find_sisters`` for tip B is expected to yield only node 'g3'."""
    annotator = TreeAnnotator()
    newick = "((A:1, B:2):3, ((C:1,D:1):1, (E:1,F:5)'g3':6):10)root;"
    tree = TreeNode.read(StringIO(newick))
    # Show the topology in the test output.
    print(tree.ascii_art())

    query_tip = tree.find('B')
    sisters = annotator.find_sisters(tree, query_tip)

    # Compare sorted names so the order of the returned sisters is ignored.
    observed_names = sorted(sister.name for sister in sisters)
    assert_equals(['g3'], observed_names)
def test_missing_taxonomy(self):
    """Check ``missing_taxonomy`` between tip A and three other nodes.

    Each case is ``(second node name, expected result)``; the first node
    passed is always tip ``A``.
    """
    newick = '((((A:11, B:12)C:10, D:9)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))

    cases = (
        ('E', ['C']),
        ('A', []),
        ('G', ['E', 'C']),
    )
    for other_name, expected in cases:
        observed = TaxonomyFunctions().missing_taxonomy(
            tree, tree.find('A'), tree.find(other_name))
        assert_equals(expected, observed)
def testClusterIntoThree(self):
    """A threshold of 25 is expected to give clusters A+B, D+H and F."""
    newick = '((((A:11, B:12)C:10, (H:8, D:9)I:3)E:20, F:20)G:30)root;'
    tree = TreeNode.read(StringIO(newick))
    clusters = Tree2Tax().named_clusters(tree, 25)

    expected_members = [
        _('A B'),
        _('D H'),
        ['F'],
    ]
    self.assertSameClusters(expected_members, clusters)
def testSimpleTwice(self):
    """Passing the same threshold twice yields two identical cluster sets."""
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [0.25, 0.25])

    # One [threshold, clusters] entry is expected per requested threshold.
    expected_sets = [
        [0.25, [['A', 'B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected_sets, cluster_sets)
def nj(dm, disallow_negative_branch_length=True, result_constructor=None):
    r"""Apply neighbor joining for phylogenetic reconstruction.

    Parameters
    ----------
    dm : skbio.DistanceMatrix
        Input distance matrix containing distances between OTUs.
    disallow_negative_branch_length : bool, optional
        Neighbor joining can result in negative branch lengths, which don't
        make sense in an evolutionary context. If `True`, negative branch
        lengths will be returned as zero, a common strategy for handling this
        issue that was proposed by the original developers of the algorithm.
    result_constructor : function, optional
        Function to apply to construct the result object. This must take a
        newick-formatted string as input. The result of applying this function
        to a newick-formatted string will be returned from this function. This
        defaults to ``lambda x: TreeNode.read(StringIO(x), format='newick')``.

    Returns
    -------
    TreeNode
        By default, the result object is a `TreeNode`, though this can be
        overridden by passing `result_constructor`.

    Raises
    ------
    ValueError
        If `dm` is smaller than 3x3.

    See Also
    --------
    TreeNode.root_at_midpoint

    Notes
    -----
    Neighbor joining was initially described in Saitou and Nei (1987) [1]_. The
    example presented here is derived from the Wikipedia page on neighbor
    joining [2]_. The Phylip manual also describes the method [3]_ and Phylip
    itself provides an implementation which is useful for comparison.

    Neighbor joining, by definition, creates unrooted trees. One strategy for
    rooting the resulting trees is midpoint rooting, which is accessible as
    ``TreeNode.root_at_midpoint``.

    References
    ----------
    .. [1] Saitou N, and Nei M. (1987) "The neighbor-joining method: a new
       method for reconstructing phylogenetic trees." Molecular Biology and
       Evolution. PMID: 3447015.
    .. [2] http://en.wikipedia.org/wiki/Neighbour_joining
    .. [3] http://evolution.genetics.washington.edu/phylip/doc/neighbor.html

    Examples
    --------
    Define a new distance matrix object describing the distances between five
    OTUs: a, b, c, d, and e.

    >>> from skbio import DistanceMatrix
    >>> from skbio.tree import nj

    >>> data = [[0,  5,  9,  9,  8],
    ...         [5,  0, 10, 10,  9],
    ...         [9, 10,  0,  8,  7],
    ...         [9, 10,  8,  0,  3],
    ...         [8,  9,  7,  3,  0]]
    >>> ids = list('abcde')
    >>> dm = DistanceMatrix(data, ids)

    Construct the neighbor joining tree representing the relationship between
    those OTUs. This is returned as a TreeNode object.

    >>> tree = nj(dm)
    >>> print(tree.ascii_art())
              /-d
             |
             |          /-c
             |---------|
    ---------|         |          /-b
             |          \--------|
             |                    \-a
             |
              \-e

    Again, construct the neighbor joining tree, but instead return the newick
    string representing the tree, rather than the TreeNode object. (Note that
    in this example the string output is truncated when printed to facilitate
    rendering.)

    >>> newick_str = nj(dm, result_constructor=str)
    >>> print(newick_str[:55], "...")
    (d:2.000000, (c:4.000000, (b:3.000000, a:2.000000):3.00 ...

    """
    if dm.shape[0] < 3:
        raise ValueError(
            "Distance matrix must be at least 3x3 to "
            "generate a neighbor joining tree.")

    if result_constructor is None:
        # Default: parse the newick string into a TreeNode.
        def result_constructor(x):
            return TreeNode.read(StringIO(x), format='newick')

    # initialize variables
    node_definition = None

    # while there are still more than three distances in the distance matrix,
    # join neighboring nodes.
    while dm.shape[0] > 3:
        # compute the Q matrix
        q = _compute_q(dm)

        # identify the pair of nodes that have the lowest Q value. if multiple
        # pairs have equally low Q values, the first pair identified (closest
        # to the top-left of the matrix) will be chosen. these will be joined
        # in the current node.
        idx1, idx2 = _lowest_index(q)
        pair_member_1 = dm.ids[idx1]
        pair_member_2 = dm.ids[idx2]
        # determine the distance of each node to the new node connecting them.
        pair_member_1_len, pair_member_2_len = _pair_members_to_new_node(
            dm, idx1, idx2, disallow_negative_branch_length)
        # define the new node in newick style; this string also serves as the
        # id of the merged node in the collapsed distance matrix.
        node_definition = "(%s:%f, %s:%f)" % (pair_member_1,
                                              pair_member_1_len,
                                              pair_member_2,
                                              pair_member_2_len)
        # compute the new distance matrix, which will contain distances of all
        # other nodes to this new node
        dm = _compute_collapsed_dm(
            dm, pair_member_1, pair_member_2,
            disallow_negative_branch_length=disallow_negative_branch_length,
            new_node_id=node_definition)

    # When there are three distances left in the distance matrix, we have a
    # fully defined tree. The last node is internal, and its distances are
    # defined by these last three values.
    # First determine the distance between the last two nodes to be joined in
    # a pair...
    pair_member_1 = dm.ids[1]
    pair_member_2 = dm.ids[2]
    pair_member_1_len, pair_member_2_len = \
        _pair_members_to_new_node(dm, pair_member_1, pair_member_2,
                                  disallow_negative_branch_length)
    # ...then determine their distance to the other remaining node, but first
    # handle the trivial case where the input dm was only 3 x 3
    node_definition = node_definition or dm.ids[0]
    internal_len = _otu_to_new_node(
        dm, pair_member_1, pair_member_2, node_definition,
        disallow_negative_branch_length=disallow_negative_branch_length)
    # ...and finally create the newick string describing the whole tree.
    newick = "(%s:%f, %s:%f, %s:%f);" % (pair_member_1, pair_member_1_len,
                                         node_definition, internal_len,
                                         pair_member_2, pair_member_2_len)

    # package the result as requested by the user and return it.
    return result_constructor(newick)
def result_constructor(x):
    """Parse the newick-formatted string `x` with ``TreeNode.read``."""
    newick_handle = StringIO(x)
    return TreeNode.read(newick_handle, format='newick')
def testOppositeSorting(self):
    """Cluster sets for thresholds 0.05 and 0.25, plus names at 0.05.

    At 0.05 every tip is expected in its own cluster; at 0.25 A and B are
    expected to merge. The clusters of the first set are expected to be
    named ``C.1``, ``C.2`` and ``Root``.
    """
    newick = '((A:0.11, B:0.12)C:0.1, D:0.2)root;'
    tree = TreeNode.read(StringIO(newick))
    cluster_sets = Tree2Tax().named_clusters_for_several_thresholds(
        tree, [0.05, 0.25])

    expected_sets = [
        [0.05, [['A'], ['B'], ['D']]],
        [0.25, [['A', 'B'], ['D']]],
    ]
    self.assertSameClusterSets(expected_sets, cluster_sets)

    expected_names = _('C.1 C.2 Root')
    finest_set = cluster_sets[0]
    observed_names = [cluster.name() for cluster in finest_set.clusters]
    assert_equals(expected_names, observed_names)