def main():
    """Command-line entry point: load, process and output a protein tree forest.

    Arguments are parsed with ``myTools.checkArgs``. The low-score predicate
    is built by ``setupScoring`` and the loaded forest is passed through
    ``process``. With the ``fam`` option the resulting trees are handed to
    ``extractGeneFamilies``; otherwise each tree is printed to standard
    output.
    """
    logging.basicConfig()
    logger.setLevel(logging.INFO)

    positional = [("phylTree.conf", myTools.File),
                  ("ensemblTree", myTools.File)]
    optional = [("flatten", bool, False),
                ("rebuild", bool, False),
                ("fam", bool, False),
                ("cutoff", str, "-1"),
                ("defaultFamName", str, "FAM%08d"),
                ("scoreMethod", int, [1, 2, 3]),
                ("newNodeID", float, 1e8),
                ("recurs", bool, False),
                ("indicator", bool, False),
                ("debug", bool, False)]
    arguments = myTools.checkArgs(positional, optional, __doc__)

    if arguments['debug']:
        logger.setLevel(logging.DEBUG)

    # First identifier available for newly created nodes (rebuild step).
    myProteinTree.nextNodeID = int(arguments["newNodeID"])

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    hasLowScore = setupScoring(phylTree,
                               arguments["scoreMethod"],
                               arguments["cutoff"])

    loaded = myProteinTree.loadTree(arguments["ensemblTree"])
    prottrees = process(loaded, phylTree, hasLowScore,
                        arguments["defaultFamName"],
                        arguments["flatten"],
                        arguments["rebuild"],
                        arguments["recurs"],
                        arguments["indicator"])

    if not arguments["fam"]:
        for tree in prottrees:
            tree.printTree(sys.stdout)
        return

    # Will not work on previous versions of ToolsDyogen.
    from treeTools.ALL.extractGeneFamilies import extractGeneFamilies
    # The returned values are not used here; the call is made for its
    # effects on the trees it consumes.
    _count, _dup_count, _gene_families = extractGeneFamilies(phylTree,
                                                             prottrees)
def get_robusts_from_prottree(dataset, prottreefile, ancestors, phyltrees):
    """Tabulate, per ancestor, counts derived from `tree_extract_genecounts`.

    The forest in `prottreefile` is loaded once; a deep copy of it is given
    to `tree_extract_genecounts` for every ancestor (with `onlybasal=True`).

    Returns a tuple ``(dataset, dataframe)`` where the dataframe is indexed
    by `ancestors` and has the integer columns 'all_crown', 'implicit',
    'robusts' and 'Ngenes=Nspecies'.

    WARNING: dubious gene trees could be marked as "robust"!
    """
    column_names = ('all_crown', 'implicit', 'robusts', 'Ngenes=Nspecies')
    dataset_robusts = pd.DataFrame(0, index=ancestors, columns=column_names,
                                   dtype=int)

    forest_path = op.expanduser(prottreefile)
    prottrees = list(myProteinTree.loadTree(forest_path))

    for ancestor in ancestors:
        logger.info('%s: %s', dataset, ancestor)
        genecounts, _spgenes, branches = tree_extract_genecounts(
            deepcopy(prottrees), ancestor, phyltrees[dataset],
            onlybasal=True)

        # Number of rows returned for this ancestor.
        dataset_robusts.loc[ancestor, 'all_crown'] = genecounts.shape[0]

        # Rows in which no branch value equals the ancestor itself.
        n_implicit = (branches != ancestor).all(axis=1).sum()
        dataset_robusts.loc[ancestor, 'implicit'] = n_implicit

        # Rows in which every count is exactly one.
        n_robust = (genecounts == 1).all(axis=1).sum()
        dataset_robusts.loc[ancestor, 'robusts'] = n_robust

        # Rows whose total count equals the number of species under ancestor.
        n_species = len(phyltrees[dataset].species[ancestor])
        n_balanced = (genecounts.sum(axis=1) == n_species).sum()
        dataset_robusts.loc[ancestor, 'Ngenes=Nspecies'] = n_balanced

    return dataset, dataset_robusts
def main(forestfile, phyltreefile, speciesfile, invert=False): phyltree = myPhylTree.PhylogeneticTree(phyltreefile) with open(speciesfile) as f: badspecies = [line.rstrip() for line in f if not line.startswith('#')] subroot, subtree = phyltree.getSubTree(badspecies) for tree in myProteinTree.loadTree(forestfile):
def main(ensembl_version, forestfile, outfile=None,
         delete_distant_orthologs=False):
    """Print every tree produced by `fuse_subspecies` for the given forest.

    Args:
        ensembl_version: value substituted into `forestfile` with ``%``.
        forestfile: path template of the input forest (``~`` is expanded).
        outfile: output path; when None the trees go to standard output.
        delete_distant_orthologs: forwarded unchanged to `fuse_subspecies`.
    """
    out = stdout if outfile is None else open(outfile, 'w')
    try:
        forest = myProteinTree.loadTree(
            op.expanduser(forestfile % ensembl_version))
        for fused_tree in fuse_subspecies(forest, species2seq,
                                          delete_distant_orthologs):
            fused_tree.printTree(out)
    finally:
        # Close the file we opened even when loading/fusing/printing fails;
        # never close the shared standard output.
        if outfile is not None:
            out.close()
def extractMultipleGeneTrees(proteinTree, family_name, field='family_name',
                             toNewick=False, withAncSpeciesNames=False,
                             withAncGenesNames=False, withTags=False,
                             phyltree=None, output=None, force=False,
                             mkdirs=False, firstmatch=False):
    """Output the trees of a forest whose root matches one of several names.

    For each tree loaded from `proteinTree`, the root's `field` value is cut
    at its first '.' and looked up in `family_name`.

    Args:
        proteinTree: forest source given to `myProteinTree.loadTree`.
        family_name: iterable of names to look for.
        field: key read in the root node's info.
        toNewick: write with `printNewick` (distances on) instead of
            `printTree`.
        withAncSpeciesNames, withAncGenesNames: forwarded to `printNewick`.
        withTags: forwarded to `printNewick` as both `withTags` and `withID`.
        phyltree: when truthy, loaded with `myPhylTree.PhylogeneticTree` and
            used to rebuild each matched tree before output.
        output: format string receiving `genetree=<name>`; when falsy, trees
            are written to standard output.
        force: allow overwriting an already existing output file.
        mkdirs: on IOError while opening, create the parent directory and
            retry.
        firstmatch: stop looking for a name after its first match, and stop
            reading the forest once all names have matched.

    Names never matched are reported in a warning on standard error.
    """
    if phyltree:
        phyltree = myPhylTree.PhylogeneticTree(phyltree)

    # Number of trees already written for each requested name.
    family_names = dict.fromkeys(family_name, 0)

    for tree in myProteinTree.loadTree(proteinTree):
        # Only the part before the first '.' is compared.
        family = tree.info[tree.root][field].split('.')[0]
        if family in family_names:
            print("Found", family, end=' ', file=sys.stderr)
            wasfound = family_names[family]
            outfile = output.format(genetree=family) if output else '<stdout>'
            if os.path.isfile(outfile) and not wasfound and not firstmatch and not force:
                #if family_names[family] == 0:
                #FIXME so that you can omit the --force option but append to file
                print("%s exists. Skipping. (use --force)" % outfile, file=sys.stderr)
                # Removing the name also keeps it out of the final
                # "not found" warning.
                family_names.pop(family)
            else:
                if phyltree is not None:
                    #markLowScore(tree, hasLowScore)
                    #flattenTree
                    # NOTE(review): in the collapsed source this call directly
                    # follows comment markers; it is read here as live code
                    # since the `if` would otherwise be empty -- confirm.
                    tree.rebuildTree(phyltree)
                #TODO: start in new thread.

                # Later matches of the same name are appended to its file.
                filemode = 'a' if wasfound else 'w'
                try:
                    out = open(outfile, filemode) if output else sys.stdout
                except IOError:
                    if mkdirs:
                        os.makedirs(os.path.split(outfile)[0])
                        out = open(outfile, filemode)
                    else:
                        raise

                if toNewick:
                    print("Output to newick format", file=sys.stderr)
                    tree.printNewick(out, withDist=True, withTags=withTags,
                                     withAncSpeciesNames=withAncSpeciesNames,
                                     withAncGenesNames=withAncGenesNames,
                                     withID=withTags)
                else:
                    tree.printTree(out)

                # Standard output is left open; only files opened here are
                # closed.
                if output:
                    out.close()

                if firstmatch:
                    family_names.pop(family)
                else:
                    family_names[family] += 1

        # With firstmatch, matched names are popped: an empty dict means
        # nothing is left to search for.
        if firstmatch and not family_names:
            break

    notfound = set((fam for fam,wasfound in family_names.items() if not wasfound))
    if notfound:
        print('WARNING: %d names were not found in field %r: %s' % (
              len(notfound), field, ' '.join(notfound)), file=sys.stderr)
def main(proteinTree, outFile, sortAttr=None):
    """Pass every tree through `ProteinTree_LeafSort` and write it out.

    Args:
        proteinTree: forest source given to `myProteinTree.loadTree`.
        outFile: destination opened with `myFile.openFile` in write mode.
        sortAttr: when truthy, the key function is `ProteinTree_getnodeattr`
            called with ``attrname=sortAttr``; otherwise `ProteinTree_getId`.
    """
    if sortAttr:
        def get_attribute(*args):
            return ProteinTree_getnodeattr(*args, attrname=sortAttr)
    else:
        get_attribute = ProteinTree_getId

    with myFile.openFile(outFile, 'w') as out:
        for tree in myProteinTree.loadTree(proteinTree):
            ProteinTree_LeafSort(tree, get_attribute)
            tree.printTree(out)
def processTrees(ensemblTree, phylTree):
    """Yield each tree of the forest after flattening and rebuilding it.

    Args:
        ensemblTree: forest source given to `myProteinTree.loadTree`.
            (It used to be ignored in favour of the global
            ``arguments["ensemblTree"]``; the parameter is now honoured.)
        phylTree: species tree passed to `flattenTree` and `rebuildTree`.

    Yields:
        The modified tree objects, in loading order.

    Raises:
        Whatever the tree methods raise, with the id of the tree root
        appended to the exception arguments.
    """
    for tree in myProteinTree.loadTree(ensemblTree):
        try:
            tree.flattenTree(phylTree, rec=True)
            # Not sure this step is useful, and why this hasLowScore function
            # has no effect.
            tree.rebuildTree(phylTree, hasLowScore=alwaysFalse)
        except BaseException as err:
            # Annotate and re-raise: nothing is swallowed here.
            err.args += ("Root id '%d'" % tree.root, )
            raise
        yield tree
def main():
    """Print the path from a gene up to the root of the tree containing it.

    The forest is scanned tree by tree; the first tree in which a leaf has
    the requested ``gene_name`` ends the search. One TSV line is printed for
    the gene, then one per ancestral node, from the gene towards the root.
    """
    arguments = myTools.checkArgs(
        [("proteinTree", myTools.File), ("gene_name", str)], [], __doc__)

    # NOTE: the helpers below read `tree`, which is bound by the loop at the
    # end of this function.

    def print_ancestor(node):
        """Print the description line of an internal node."""
        info = tree.info[node]
        dup = info.pop('Duplication', None)
        if info.pop("dubious_duplication", None):
            label = "DUBIOUS_DUPLICATION"
        elif dup == 1 and "duplication_confidence_score" in info:
            label = "ROOT_DUPLICATION"
        elif dup == 2:
            label = "DUPLICATION"
        elif dup == 3:
            label = "EDITED_DUPLICATION"
        else:
            label = "SPECIATION"
        fields = [node, label]
        for key in ("taxon_name", "family_name", "Bootstrap",
                    "duplication_confidence_score"):
            fields.append(info.pop(key, None))
        print(myFile.myTSV.printLine(fields))

    def print_gene(node):
        """Print the description line of a leaf."""
        info = tree.info[node]
        fields = [node, "GENE",
                  info.pop("taxon_name", None),
                  info.pop("gene_name", None)]
        print(myFile.myTSV.printLine(fields))

    def locate(node):
        """Return True if the wanted gene is at or below `node`."""
        if node in tree.data:
            # Stop at the first child whose subtree holds the gene.
            if any(locate(child) for (child, _dist) in tree.data[node]):
                print_ancestor(node)
                return True
            return False
        if tree.info[node]["gene_name"] == arguments["gene_name"]:
            print_gene(node)
            return True
        return False

    # Stop at the first tree containing the gene.
    for tree in myProteinTree.loadTree(arguments["proteinTree"]):
        if locate(tree.root):
            break
def main(treeforestfile, outfile, dryrun=False, edited_node_id=EDITED_NODE_ID,
         infinite_dist=INFINITE_DIST):
    """Run `filterbranches` on every tree and report the deletion totals.

    Args:
        treeforestfile: forest source; '-' stands for standard input.
        outfile: open file object receiving the trees.
        dryrun: when true nothing is printed and `outfile` is closed up
            front (unless it is standard output).
        edited_node_id, infinite_dist: forwarded to `filterbranches`.
    """
    if dryrun and outfile is not stdout:
        outfile.close()

    source = stdin if treeforestfile == '-' else treeforestfile

    branch_total = 0
    leaf_total = 0
    for tree in ProteinTree.loadTree(source):
        n_branches, n_leaves = filterbranches(tree, tree.root,
                                              edited_node_id, infinite_dist)
        branch_total += n_branches
        leaf_total += n_leaves
        if dryrun:
            continue
        tree.printTree(outfile)

    print("Deleted %d branches, of which %d leaves." % (branch_total,
                                                        leaf_total))
def main(ensembltree, outputfile):
    """Remove gene-split subtrees (``Duplication == 10``) from every tree.

    Each tree is walked with `dfw_descendants_generalized`; any child flagged
    as a gene split is deleted together with all of its descendants, then the
    tree is written to `outputfile`. Counts are reported on standard error.

    Fixes over the previous version:
      * the number of trees was never incremented (always reported 0);
      * the sanity check read the key 'duplication' whereas the node info
        uses 'Duplication' everywhere else in this function;
      * children were removed from the very list being iterated, which makes
        the loop skip the sibling following each removal;
      * nodes without an entry in `tree.data` (no children) made
        `tree.data.pop` raise KeyError.

    Args:
        ensembltree: forest source given to `ProteinTree.loadTree`.
        outputfile: destination opened with `myFile.openFile`.
    """
    def get_ch(tree, node):
        """Children ids of `node` (empty when it has no entry)."""
        return [x[0] for x in tree.data.get(node, [])]

    def get_chd(tree, nodedist):
        """(child, distance) pairs of the node in a (node, distance) pair."""
        return tree.data.get(nodedist[0], [])

    count_trees = 0
    count_treenodes = []
    count_splits = 0
    count_split_desc = 0

    with myFile.openFile(outputfile, 'w') as out:
        for tree in ProteinTree.loadTree(ensembltree):
            count_trees += 1
            count_treenodes.append(0)
            for (node, dist), childrendists in dfw_descendants_generalized(
                    tree, get_chd, queue=[(None, (tree.root, 0))]):
                count_treenodes[-1] += 1
                assert tree.info[node]['Duplication'] != 10, \
                    "Unexpected. parent node is a split gene: %s: %s" % \
                    (node, tree.info[node])

                # Iterate over a snapshot: `childrendists` may be the same
                # list object as `tree.data[node]`, which is edited below.
                for child, chdist in list(childrendists):
                    if tree.info[child]['Duplication'] == 10:
                        # It's a gene split
                        # Recurse through all the descendants to remove them
                        count_splits += 1
                        to_remove = reversed(list(
                            dfw_pairs_generalized(tree, get_ch,
                                                  queue=[(None, child)],
                                                  include_root=True)))
                        for _, GS_descendant in to_remove:
                            tree.info.pop(GS_descendant)
                            # Childless nodes may have no entry in `data`.
                            tree.data.pop(GS_descendant, None)
                            count_split_desc += 1
                        tree.data[node].remove((child, chdist))
            tree.printTree(out)

    print("%d trees" % count_trees, file=stderr)
    print("treenodes:", " ".join(str(nn) for nn in count_treenodes),
          file=stderr)
    print("Splits: %d Split descendants: %d" % (count_splits,
                                                count_split_desc),
          file=stderr)
def run(process, proteinTreeFile, converted_args):
    """Apply `process` to each tree, print the tree, and tally the results.

    Args:
        process: callable invoked as ``process(tree, *converted_args)``.
        proteinTreeFile: forest source; '-' stands for standard input.
        converted_args: extra positional arguments for `process`.

    Return values of `process` are counted; values that cannot be used as
    dictionary keys are counted under "unhashable". The tally is printed on
    standard error at the end.
    """
    print("Give args: %s" % converted_args, file=stderr)

    source = stdin if proteinTreeFile == '-' else proteinTreeFile
    tally = defaultdict(int)

    for tree in myProteinTree.loadTree(source):
        outcome = process(tree, *converted_args)
        try:
            tally[outcome] += 1
        except TypeError:
            tally["unhashable"] += 1
        tree.printTree(stdout)

    print("Outputs counts:", tally, file=stderr)
def main(phyltreefile, forestfile=None):
    """Thin each tree down to the leaves whose taxon is in the species tree.

    Args:
        phyltreefile: file given to `myPhylTree.PhylogeneticTree`.
        forestfile: forest source; standard input when None.

    Trees for which `thin_prottree` returns no root are discarded with a
    warning; the others go through `fix_thinned_dups` and are printed to
    standard output from their new root.
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)
    source = stdin if forestfile is None else forestfile

    for tree in myProteinTree.loadTree(source):
        # Leaves are the nodes that have info but no children entry.
        leaves = set(tree.info).difference(tree.data)
        keptleaves = {leaf for leaf in leaves
                      if tree.info[leaf]['taxon_name'] in phyltree.allNames}

        newroot, _ = thin_prottree(tree, tree.root, 0, keptleaves)

        if newroot is None:
            logger.warning('Discard tree %d', tree.root)
            continue

        fix_thinned_dups(phyltree, tree, newroot)
        tree.printTree(stdout, newroot)
def test_convert2species(ensembl_version, default=None,
                         forestfile='~/GENOMICUS%d/tree.1.ensembl.bz2'):
    """test the `convert_prot2species` function for every modernID"""
    from LibsDyogen import myProteinTree

    def add_context(err, species, identifier):
        """Append "<species> <identifier> Not found" to the last error arg."""
        suffix = ' '.join((species, identifier, "Not found"))
        err.args = err.args[:-1] + (err.args[-1] + suffix,)

    # Strings that are not valid identifiers must be rejected.
    for wrong in ('xululul', '0000000', 'ENSXXXP', 'ENSG000'):
        predicted_sp = convert_prot2species(wrong, ensembl_version, False)
        assert predicted_sp is False, "%r predicted %r" % (wrong, predicted_sp)

    # The gene and protein tables must describe the same species.
    expected_species = set(GENE2SP[ensembl_version].values())
    expected_species_p = set(PROT2SP[ensembl_version].values())
    assert expected_species == expected_species_p

    forest_path = op.expanduser(forestfile % ensembl_version)
    for tree in myProteinTree.loadTree(forest_path):
        tips = set(tree.info) - set(tree.data)
        for tip in tips:
            tipinfo = tree.info[tip]
            sp = tipinfo['taxon_name']
            gene = tipinfo['gene_name']
            prot = tipinfo['protein_name']

            assert sp in expected_species, 'Unexpected species %r' % sp

            try:
                predicted_sp = convert_gene2species(gene, ensembl_version)
            except KeyError as err:
                add_context(err, sp, gene)
                raise
            assert sp == predicted_sp, "%s: %r ≠ %r" % (gene, sp, predicted_sp)

            try:
                predicted_sp = convert_prot2species(prot, ensembl_version,
                                                    default)
            except KeyError as err:
                add_context(err, sp, prot)
                raise
            assert sp == predicted_sp, "%s: %r ≠ %r" % (prot, sp, predicted_sp)
def main():
    """Extract gene families from a forest and optionally write them to files.

    Families are computed by `extractGeneFamilies`. When the
    ``out:ancGenesFiles`` template is not empty, one file per ancestor is
    written (the template is filled with the ancestor's file name); each line
    holds the space-separated names of one family.
    """
    # Arguments
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("proteinTree", myTools.File)],
        [("out:ancGenesFiles", str, ""), ("reuseNames", bool, False)],
        __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    proteinTrees = myProteinTree.loadTree(arguments["proteinTree"])

    count, dupCount, geneFamilies = extractGeneFamilies(
        phylTree, proteinTrees, arguments["reuseNames"])

    outTemplate = arguments["out:ancGenesFiles"]
    if not outTemplate:
        return

    for (anc, lst) in geneFamilies.items():
        print("Ecriture des familles de %s ..." % anc, end=' ',
              file=sys.stderr)
        f = myFile.openFile(outTemplate % phylTree.fileName[anc], "w")
        try:
            for gg in lst:
                print(" ".join(gg), file=f)
        finally:
            # Release the handle even if a write fails midway.
            f.close()
        print(len(lst), "OK", file=sys.stderr)
def main():
    """Cut every tree at the topmost nodes belonging to the new root species.

    For each tree, the subtrees whose root taxon satisfies
    ``phylTree.isChildOf(taxon, rootSpecies)`` are printed to standard
    output. A single subtree keeps the name of the original tree; several
    subtrees get that name followed by a suffix from
    `myProteinTree.getDupSuffix`. The number of printed subtrees is reported
    on standard error.
    """
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File), ("iniTree", myTools.File),
         ("rootSpecies", str)],
        [], __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def search(node):
        """Return the list of nodes under the new root species.

        Reads `tree`, bound by the loop below. The descent stops at the
        first matching node of each lineage.
        """
        taxon = tree.info[node]['taxon_name']
        if phylTree.isChildOf(taxon, arguments["rootSpecies"]):
            return [node]
        if node not in tree.data:
            return []
        return [hit
                for (child, _) in tree.data[node]
                for hit in search(child)]

    nb = 0
    for tree in myProteinTree.loadTree(arguments["iniTree"]):
        subroots = search(tree.root)
        nb += len(subroots)

        if len(subroots) == 1:
            only = subroots[0]
            tree.info[only]["tree_name"] = tree.info[tree.root]["tree_name"]
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, only)
            continue

        for rank, subroot in enumerate(subroots, start=1):
            suffix = myProteinTree.getDupSuffix(rank, True)
            tree.info[subroot]["tree_name"] = \
                tree.info[tree.root]["tree_name"] + suffix
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, subroot)

    print(nb, "extracted trees", file=sys.stderr)
for (s1, s2) in itertools.combinations(speciessets, 2): inters.update(s1.intersection(s2)) all = set().union(*speciessets) anc = tree.info[rnode]["taxon_name"] if arguments["scoreMethod"] == 3: inters.intersection_update(goodSpecies(anc)) all.intersection_update(goodSpecies(anc)) #print >> sys.stderr,rnode return ((len(inters) == 0) and (minDuplicationScore[anc] == 0)) or ( len(inters) < (minDuplicationScore[anc] * len(all))) nbEdit = {"dubious": 0, "toolow": 0, "good": 0} for (nb, tree) in enumerate(myProteinTree.loadTree(arguments["ensemblTree"])): assert max(tree.info) < arguments["newNodeID"] # On trie les bonnes duplications des mauvaises ################################################ for (node, inf) in tree.info.items(): print(node, inf, file=sys.stderr) if inf['Duplication'] != 0: if 'dubious_duplication' in inf: # On considere que les duplications 'dubious' ne sont pas valables pour une duplication assert inf['Duplication'] == 1 del inf['dubious_duplication'] nbEdit["dubious"] += 1
def main(badnodelistfile, forestfile, badnode_col=0, maxdist=MAXDIST,
         print_unchanged=True, dryrun=False):
    """Edit a forest from a list of bad nodes and a maximum branch length.

    Args:
        badnodelistfile: whitespace-separated table with one header line;
            node ids are read from column `badnode_col`.
        forestfile: forest source given to `myProteinTree.loadTree`.
        badnode_col: index of the column holding the node ids. (It used to
            be ignored, column 0 being hard-coded.)
        maxdist: forwarded to `edit_toolong_flagged`.
        print_unchanged: when false, only trees flagged as changed are
            printed.
        dryrun: when true, nothing is printed.

    Trees come out of `edit_toolong_flagged(edit_from_selection(...))`
    together with two change flags. Trees whose root is None are counted as
    discarded. A summary is logged at the end.
    """
    with open(badnodelistfile) as f:
        # Skip the header line; tolerate a completely empty file.
        next(f, None)
        badnodes = {int(line.rstrip().split()[badnode_col]) for line in f}

    logger.info('%d nodes to remove.', len(badnodes))

    proteintrees = myProteinTree.loadTree(forestfile)

    n_unprinted = 0

    # `output` returns 1 when the tree counts as an edited output tree.
    if dryrun:
        def output(tree, flag1, flag2):
            nonlocal n_unprinted
            n_unprinted += 1
            return int(flag1 | flag2)
    elif print_unchanged:
        def output(tree, flag1, flag2):
            tree.printTree(stdout)
            return int(flag1 | flag2)
    else:
        def output(tree, flag1, flag2):
            nonlocal n_unprinted
            if flag1 | flag2:
                tree.printTree(stdout)
                return 1
            n_unprinted += 1
            return 0

    n_edited_trees = 0
    n_discarded_roots = 0
    n_edited_trees_fromsel = 0
    n_edited_trees_toolong = 0

    edited = edit_toolong_flagged(edit_from_selection(proteintrees, badnodes),
                                  maxdist=maxdist)
    for change1, change2, tree in edited:
        n_edited_trees_fromsel += int(change1)
        n_edited_trees_toolong += int(change2)
        if tree.root is None:
            n_discarded_roots += 1
        else:
            n_edited_trees += output(tree, change1, change2)

    logger.info(
        '\n%9d edited output trees. %d unprinted trees. %d discarded roots.\n'
        ' -> %9d from node selection,\n'
        ' -> %9d because of too long branches.\n',
        n_edited_trees, n_unprinted, n_discarded_roots,
        n_edited_trees_fromsel, n_edited_trees_toolong)
pass indiv_files = False if args.outfile is None: outfile = stdout elif not '{genetree}' in args.outfile: outfile = open(args.outfile, 'w') else: indiv_files = True def get_outfile(tree): rootinfo = tree.info[tree.root] genetree = rootinfo.get('tree_name', rootinfo['family_name']) return open(args.outfile.format(genetree=genetree), 'w') for tree in ProteinTree.loadTree(args.forestfile): with get_outfile(tree) as outfile: tree.printNewick(outfile, withDist=True, withTags=True, withAncSpeciesNames=True, withAncGenesNames=True, withID=True) if not indiv_files: try: for tree in ProteinTree.loadTree(args.forestfile): tree.printNewick(outfile, withDist=True, withTags=True, withAncSpeciesNames=True,
newAnc, newLastWritten)) else: # when the node is a leaf allGenes = [tree.info[node]["gene_name"]] for a in toWrite: # 'a'= name of the ancestor to print geneFamilies[a].append( [currName] + allGenes ) # write the name of the gene of Anc followed by the names of the genes of the children (this is done for all the species in toWrite) #FIXME geneFamilies is defined in the main, it is modified whereas it is not even a parameter return allGenes # for the recurrence geneFamilies = collections.defaultdict(list) for tree in myProteinTree.loadTree( arguments["geneTreeForest"]): # for all gene trees in the forest extractGeneFamilies( tree.root, tree.info[tree.root]["tree_name"], None, None ) # FIXME this function modifies tree and geneFamilies even if tree and gene families are not parameters tree.printTree(sys.stdout) for (anc, lst) in geneFamilies.items(): print("Write %s family ..." % anc, end=' ', file=sys.stderr) f = myFile.openFile(arguments["out:ancGenes"] % speciesTree.fileName[anc], "w") for gg in lst: print(" ".join(gg), file=f) f.close() print(len(lst), "OK", file=sys.stderr)
print("NEW TREE", file=sys.stderr) if node in tree.data: t1 = tree.info[node]['taxon_name'] for (g, d) in tree.data[node]: # Une distance ne peut etre prise qu'entre deux noeuds de speciation if (tree.info[node]['Duplication'] == 0) and (tree.info[g]['Duplication'] == 0): t2 = tree.info[g]['taxon_name'] # Les deux noeuds doivent etre strictement consecutifs if (phylTree.parent[t2].name == t1) and (d != 0): lengths[(t1, t2)].append(d) print(myFile.myTSV.printLine([t1, t2, d]), file=sys.stderr) do(g) for tree in myProteinTree.loadTree(arguments["proteinTree"]): do(tree.root) # On trie les listes des longueurs for l in lengths.values(): l.sort() # Parcourt recursivement l'arbre et l'ecrit au format avec des parentheses, avec les longueurs de branche medianes def convertToFlatFile(anc): a = phylTree.fileName[anc] if anc in phylTree.listSpecies: # On est arrive sur une feuille return a else: # On est sur un noeud, on construit la liste des distances
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Extract the subtree descending from the first node validating the given condition"""

from LibsDyogen import myTools, myProteinTree


if __name__ == '__main__':
    # Positional arguments: input forest, output path, id of the wanted node.
    options = myTools.checkArgs([("forestFile", myTools.File),
                                 ("outFile", str),
                                 ("id", int)],
                                [], __doc__)
    wanted_id = options["id"]

    with open(options["outFile"], "w") as out:
        for tree in myProteinTree.loadTree(options["forestFile"]):
            if wanted_id not in tree.info:
                continue
            # Print the subtree rooted at the wanted node, then stop: only
            # the first tree containing the id is used.
            tree.printTree(out, wanted_id)
            break
def iter_from_prottree(treefile, *args, **kwargs):
    """Yield the trees produced by `myProteinTree.loadTree`.

    All arguments are forwarded unchanged. `LibsDyogen` is imported only
    when the generator is first advanced.
    """
    from LibsDyogen import myProteinTree
    yield from myProteinTree.loadTree(treefile, *args, **kwargs)
def sort_children(tree):
    """Sort, in place, the children list of every node of `tree.data`."""
    for children in tree.data.values():
        children.sort()


if __name__ == '__main__':
    cli_args = argv[1:]
    if {'-h', '--help'}.intersection(cli_args):
        print(__doc__)
        exit()
    elif len(argv) > 3:
        # Too many arguments: show the usage and fail.
        print(__doc__)
        exit(1)
    else:
        # Both paths are optional and default to the standard streams.
        outfile = argv[2] if len(argv) > 2 else stdout
        infile = argv[1] if len(argv) > 1 else stdin

        with myFile.openFile(outfile, 'w') as out:
            for tree in myProteinTree.loadTree(infile):
                sort_children(tree)
                tree.printTree(out)
def main():
    """Load every tree of the input forest without doing anything with it.

    The loop body used to be the bare name ``next``, an expression with no
    effect that reads like a (non-existent) statement; an explicit ``pass``
    states the actual behaviour. Presumably the script serves as a check
    that the file can be parsed -- confirm with its callers.
    """
    arguments = myTools.checkArgs([("iniTree", myTools.File)], [], __doc__)

    for tree in myProteinTree.loadTree(arguments["iniTree"]):
        pass
#! /usr/bin/env python

"""
Decoupe un fichier d'arbres en fichiers separes

usage:
    ./splitTrees.py GeneTreeForest.phylTree.bz2 Fam.%s
"""

# Split a forest file into one file per tree. The usage text above is handed
# to the argument parser at runtime, hence left in its original wording.

from LibsDyogen import myFile, myTools, myProteinTree

arguments = myTools.checkArgs([("proteinTree", myTools.File),
                               ("output", str)], [], __doc__)

for (i, tree) in enumerate(myProteinTree.loadTree(arguments["proteinTree"])):
    # Progress: zero-based index of the tree being written.
    print(i)
    # Output files are numbered from 1.
    f = myFile.openFile(arguments["output"] % (i + 1), "w")
    try:
        tree.printTree(f)
    finally:
        # Do not leak the handle if printing fails.
        f.close()
def forest_summary(forestfile):
    """Return a text summary of the node statistics of a forest.

    For every tree loaded from `forestfile` the function counts nodes,
    internal nodes (entries of `tree.data`), leaves (nodes absent from
    `tree.data`), nodes carrying a 'gene_name', and nodes whose
    'Duplication' value is not 0. Confidence scores and bootstraps of the
    latter are collected, missing values being recorded as NaN.

    Changes over the previous version:
      * `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed;
      * the number of trees is taken from the collected per-tree list, so an
        empty forest no longer fails on an unbound loop variable;
      * the per-tree count arrays are built with an integer dtype so that
        they stay formattable with ``d`` even when empty.

    Args:
        forestfile: forest source given to `ProteinTree.loadTree`.

    Returns:
        The formatted multi-line report (str).
    """
    # Counts
    n_dup = {}
    #n_multifurc = 0

    # Complete lists for summary stats
    taxa_set = set()
    species_set = set()
    dup_conf_scores = []
    dup_bootstraps = []

    tree_n_nodes = []
    tree_n_intnodes = []
    tree_n_leaves = []
    tree_n_speciesleaves = []
    tree_n_dup = []
    tree_n_dubious = []

    for tree in ProteinTree.loadTree(forestfile):
        tree_n_intnodes.append(len(tree.data))
        tree_n_nodes.append(len(tree.info))
        # Counter for this tree
        tree_n_leaves.append(0)
        tree_n_speciesleaves.append(0)
        tree_n_dup.append(0)
        tree_n_dubious.append(0)

        for node_id, nodeinfo in tree.info.items():
            taxa_set.add(nodeinfo['taxon_name'])

            dup_value = nodeinfo['Duplication']
            n_dup[dup_value] = n_dup.get(dup_value, 0) + 1

            #if 'dubious_duplication' in nodeinfo:
            #    all_tree_n_dubious[-1] += 1
            if dup_value != 0:
                tree_n_dup[-1] += 1
                dup_conf_scores.append(
                    nodeinfo.get('duplication_confidence_score', np.nan))
                dup_bootstraps.append(nodeinfo.get('Bootstrap', np.nan))
                # dupli without conf scores: edited nodes with 'Duplication': 3
                # dupli without bootstrap: edited nodes with 'Duplication': 2

            if 'gene_name' in nodeinfo:
                species_set.add(nodeinfo['taxon_name'])
                tree_n_speciesleaves[-1] += 1
            if node_id not in tree.data:
                tree_n_leaves[-1] += 1

        # Every node is either internal or a leaf.
        assert tree_n_nodes[-1] - tree_n_intnodes[-1] == tree_n_leaves[-1]

    n_trees = len(tree_n_nodes)

    # Separate the recorded values from the missing ones (NaN).
    dup_conf_scores = np.array(dup_conf_scores)
    dup_conf_scores_nan = np.isnan(dup_conf_scores)
    dup_conf_scores = dup_conf_scores[~dup_conf_scores_nan]

    dup_bootstraps = np.array(dup_bootstraps)
    dup_bootstraps_nan = np.isnan(dup_bootstraps)
    dup_bootstraps = dup_bootstraps[~dup_bootstraps_nan]

    tree_n_nodes = np.array(tree_n_nodes, dtype=int)
    tree_n_intnodes = np.array(tree_n_intnodes, dtype=int)
    tree_n_leaves = np.array(tree_n_leaves, dtype=int)
    tree_n_speciesleaves = np.array(tree_n_speciesleaves, dtype=int)
    tree_n_dup = np.array(tree_n_dup, dtype=int)
    tree_n_dubious = np.array(tree_n_dubious, dtype=int)

    return """ Nb of taxa : {:d} Nb of species : {:d} Nb of trees : 
{:d} tot tree average tree std n_nodes : {:-8d} {:-8.2f} {:-8.2f} n_intnodes : {:-8d} {:-8.2f} {:-8.2f} n_leaves : {:-8d} {:-8.2f} {:-8.2f} n_speciesleaves : {:-8d} {:-8.2f} {:-8.2f} n_dup : {} tree average= {:4.2f} tree std= {:4.2f} #n_dubious #n_multifurc dup_conf_scores: average= {:8.5f} std= {:8.5f} missing= {:d} dup_bootstraps: average= {:8.5f} std= {:8.5f} missing= {:d} """.format(
        len(taxa_set), len(species_set), n_trees,
        tree_n_nodes.sum(), tree_n_nodes.mean(), tree_n_nodes.std(),
        tree_n_intnodes.sum(), tree_n_intnodes.mean(), tree_n_intnodes.std(),
        tree_n_leaves.sum(), tree_n_leaves.mean(), tree_n_leaves.std(),
        tree_n_speciesleaves.sum(), tree_n_speciesleaves.mean(),
        tree_n_speciesleaves.std(),
        ', '.join('%d: %d' % item for item in n_dup.items()),
        tree_n_dup.mean(), tree_n_dup.std(),
        dup_conf_scores.mean(), dup_conf_scores.std(),
        dup_conf_scores_nan.sum(),
        dup_bootstraps.mean(), dup_bootstraps.std(),
        dup_bootstraps_nan.sum())
#!/usr/bin/env python3 # -*- coding: utf-8 -*- from __future__ import print_function from LibsDyogen import myTools, myProteinTree import logging logger = logging.getLogger(__name__) if __name__ == '__main__': logging.basicConfig(format="%(levelname)s:%(message)s") args = myTools.checkArgs([('forestfile', myTools.File)], [], __doc__) for tree in myProteinTree.loadTree(args['forestfile']): rootinfo = tree.info[tree.root] try: tree_name = rootinfo['tree_name'] except KeyError: logger.warning("No tree_name found at root: %d %s", tree.root, rootinfo) tree_name = ';'.join( (rootinfo.get('family_name', 'unnamed'), rootinfo['taxon_name'])) print(tree_name)