Esempio n. 1
0
def main():
    """Command-line entry point: load, score and edit a forest of gene trees.

    Parses the command line with ``myTools.checkArgs``, loads the species
    tree and the protein trees, passes them through ``process`` and then
    either calls ``extractGeneFamilies`` on the result (option ``fam``) or
    prints every resulting tree to standard output.
    """
    logging.basicConfig()
    logger.setLevel(logging.INFO)

    positional = [("phylTree.conf", myTools.File),
                  ("ensemblTree", myTools.File)]
    optional = [("flatten", bool, False),
                ("rebuild", bool, False),
                ("fam", bool, False),
                ("cutoff", str, "-1"),
                ("defaultFamName", str, "FAM%08d"),
                ("scoreMethod", int, [1, 2, 3]),
                ("newNodeID", float, 1e8),
                ("recurs", bool, False),
                ("indicator", bool, False),
                ("debug", bool, False)]
    arguments = myTools.checkArgs(positional, optional, __doc__)
    if arguments['debug']:
        logger.setLevel(logging.DEBUG)

    # First identifier handed out to nodes created by the rebuild step.
    myProteinTree.nextNodeID = int(arguments["newNodeID"])
    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    hasLowScore = setupScoring(phylTree,
                               arguments["scoreMethod"],
                               arguments["cutoff"])

    loaded = myProteinTree.loadTree(arguments["ensemblTree"])
    prottrees = process(loaded, phylTree, hasLowScore,
                        arguments["defaultFamName"],
                        arguments["flatten"],
                        arguments["rebuild"],
                        arguments["recurs"],
                        arguments["indicator"])

    if arguments["fam"]:
        # Will not work on previous versions of ToolsDyogen.
        from treeTools.ALL.extractGeneFamilies import extractGeneFamilies
        count, dupCount, geneFamilies = extractGeneFamilies(phylTree, prottrees)
        return

    for tree in prottrees:
        tree.printTree(sys.stdout)
Esempio n. 2
0
def get_robusts_from_prottree(dataset, prottreefile, ancestors, phyltrees):
    """Tabulate, for each ancestor, four gene-tree counts of a forest.

    The forest is loaded once from `prottreefile` (with `~` expanded); a deep
    copy of it is handed to `tree_extract_genecounts` for every ancestor,
    together with `phyltrees[dataset]`.

    Returns the tuple ``(dataset, table)`` where `table` is an integer
    DataFrame indexed by `ancestors` with the columns 'all_crown',
    'implicit', 'robusts' and 'Ngenes=Nspecies'.
    """
    # WARNING: dubious gene trees could be marked as "robust"!!
    count_columns = ('all_crown', 'implicit', 'robusts', 'Ngenes=Nspecies')
    dataset_robusts = pd.DataFrame(0, index=ancestors, columns=count_columns,
                                   dtype=int)

    forest = list(myProteinTree.loadTree(op.expanduser(prottreefile)))

    for ancestor in ancestors:
        logger.info('%s: %s', dataset, ancestor)
        genecounts, _spgenes, branches = tree_extract_genecounts(
            deepcopy(forest), ancestor, phyltrees[dataset], onlybasal=True)

        # Number of rows returned for this ancestor.
        dataset_robusts.loc[ancestor, 'all_crown'] = genecounts.shape[0]

        # Rows in which no branch value equals the ancestor itself.
        is_implicit = (branches != ancestor).all(axis=1)
        dataset_robusts.loc[ancestor, 'implicit'] = is_implicit.sum()

        # Rows in which every count is exactly one.
        is_robust = (genecounts == 1).all(axis=1)
        dataset_robusts.loc[ancestor, 'robusts'] = is_robust.sum()

        # Rows whose total count equals the number of species under the ancestor.
        same_size = (genecounts.sum(axis=1)
                     == len(phyltrees[dataset].species[ancestor]))
        dataset_robusts.loc[ancestor, 'Ngenes=Nspecies'] = same_size.sum()

    return dataset, dataset_robusts
def main(forestfile, phyltreefile, speciesfile, invert=False):
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)
    with open(speciesfile) as f:
        badspecies = [line.rstrip() for line in f if not line.startswith('#')]

    subroot, subtree = phyltree.getSubTree(badspecies)

    for tree in myProteinTree.loadTree(forestfile):
Esempio n. 4
0
def main(ensembl_version, forestfile, outfile=None, delete_distant_orthologs=False):
    """Fuse subspecies in every tree of a forest and print the fused trees.

    Args:
        ensembl_version: value substituted into `forestfile` with ``%``.
        forestfile: path template of the input forest (``~`` is expanded).
        outfile: path of the output file; standard output when None.
        delete_distant_orthologs: forwarded unchanged to `fuse_subspecies`.
    """
    out = stdout if outfile is None else open(outfile, 'w')
    try:
        for fused_tree in fuse_subspecies(
                myProteinTree.loadTree(op.expanduser(forestfile % ensembl_version)),
                species2seq,
                delete_distant_orthologs):
            fused_tree.printTree(out)
    finally:
        # Close the file we opened even when loading/fusing/printing fails;
        # never close the shared standard output.
        if outfile is not None:
            out.close()
Esempio n. 5
0
def extractMultipleGeneTrees(proteinTree, family_name, field='family_name',
         toNewick=False, withAncSpeciesNames=False, withAncGenesNames=False,
         withTags=False, phyltree=None, output=None, force=False,
         mkdirs=False, firstmatch=False):
    """Extract the trees whose root `field` matches one of the given names.

    The value compared is the root's `field` truncated at its first '.'.

    Args:
        proteinTree: forest passed to `myProteinTree.loadTree`.
        family_name: iterable of names to look for.
        field: key read in the root node's info dictionary.
        toNewick: print with `printNewick` instead of `printTree`.
        withAncSpeciesNames, withAncGenesNames: forwarded to `printNewick`.
        withTags: forwarded to `printNewick` as both `withTags` and `withID`.
        phyltree: if truthy, loaded with `myPhylTree.PhylogeneticTree` and
            used to `rebuildTree` each extracted tree.
        output: filename template formatted with ``genetree=<family>``;
            standard output when falsy.
        force: overwrite an already existing output file.
        mkdirs: create the output directory if opening the file fails.
        firstmatch: stop looking for a name after its first match.

    A warning listing the names never found is printed on stderr.
    """
    if phyltree:
        phyltree = myPhylTree.PhylogeneticTree(phyltree)

    # Maps each requested name to the number of trees written for it so far.
    family_names = dict.fromkeys(family_name, 0)

    for tree in myProteinTree.loadTree(proteinTree):
        family = tree.info[tree.root][field].split('.')[0]
        if family in family_names:
            print("Found", family, end=' ', file=sys.stderr)
            wasfound = family_names[family]
            outfile = output.format(genetree=family) if output else '<stdout>'
            if os.path.isfile(outfile) and not wasfound and not firstmatch and not force:
                #FIXME so that you can omit the --force option but append to file
                print("%s exists. Skipping. (use --force)" % outfile, file=sys.stderr)
                family_names.pop(family)
            else:
                if phyltree is not None:
                    tree.rebuildTree(phyltree)
                #TODO: start in new thread.
                # Append when a previous tree of the same family was written.
                filemode = 'a' if wasfound else 'w'
                try:
                    out = open(outfile, filemode) if output else sys.stdout
                except IOError:
                    if not mkdirs:
                        raise
                    os.makedirs(os.path.split(outfile)[0])
                    out = open(outfile, filemode)

                try:
                    if toNewick:
                        print("Output to newick format", file=sys.stderr)
                        tree.printNewick(out, withDist=True, withTags=withTags,
                                         withAncSpeciesNames=withAncSpeciesNames,
                                         withAncGenesNames=withAncGenesNames,
                                         withID=withTags)
                    else:
                        tree.printTree(out)
                finally:
                    # Release the file even if printing raised; never close
                    # the shared standard output.
                    if output:
                        out.close()

                if firstmatch:
                    family_names.pop(family)
                else:
                    family_names[family] += 1
        if firstmatch and not family_names:
            # Every requested name has been handled: no need to read further.
            break

    notfound = {fam for fam, wasfound in family_names.items() if not wasfound}
    if notfound:
        print('WARNING: %d names were not found in field %r: %s' % (
              len(notfound), field, ' '.join(notfound)), file=sys.stderr)
Esempio n. 6
0
def main(proteinTree, outFile, sortAttr=None):
    """Sort the leaves of every tree of a forest and write the trees out.

    When `sortAttr` is truthy the sort key is obtained through
    `ProteinTree_getnodeattr` with ``attrname=sortAttr``; otherwise
    `ProteinTree_getId` is used.
    """
    if not sortAttr:
        get_attribute = ProteinTree_getId
    else:
        def get_attribute(*args):
            return ProteinTree_getnodeattr(*args, attrname=sortAttr)

    with myFile.openFile(outFile, 'w') as out:
        for tree in myProteinTree.loadTree(proteinTree):
            ProteinTree_LeafSort(tree, get_attribute)
            tree.printTree(out)
def processTrees(ensemblTree, phylTree):
    """Yield every tree of `ensemblTree` after flattening and rebuilding it.

    Args:
        ensemblTree: forest passed to `myProteinTree.loadTree`.
        phylTree: species tree given to `flattenTree` and `rebuildTree`.

    Any exception raised while editing a tree is re-raised with the id of
    the tree's root appended to its arguments.
    """
    # Read the forest from the parameter: the previous code ignored it and
    # looked up a global `arguments["ensemblTree"]` instead.
    for tree in myProteinTree.loadTree(ensemblTree):
        try:
            tree.flattenTree(phylTree, rec=True)
            # Not sure this step is useful, and why this hasLowScore function has no effect.
            tree.rebuildTree(phylTree, hasLowScore=alwaysFalse)
        except BaseException as err:
            err.args += ("Root id '%d'" % tree.root, )
            raise

        yield tree
Esempio n. 8
0
def main():
    """Print one TSV line for a gene, then one per ancestral node above it.

    The forest is scanned tree by tree; scanning stops at the first tree
    that contains a leaf whose 'gene_name' equals the requested one.
    """
    arguments = myTools.checkArgs(
        [("proteinTree", myTools.File), ("gene_name", str)], [], __doc__)

    # The helpers below read `tree`, the loop variable of the final loop.

    def printAncNode(node):
        """Print the description line of an internal node."""
        info = tree.info[node]
        dup = info.pop('Duplication', None)
        if info.pop("dubious_duplication", None):
            event = "DUBIOUS_DUPLICATION"
        elif dup == 1 and "duplication_confidence_score" in info:
            event = "ROOT_DUPLICATION"
        elif dup == 2:
            event = "DUPLICATION"
        elif dup == 3:
            event = "EDITED_DUPLICATION"
        else:
            event = "SPECIATION"
        fields = [node, event]
        for key in ("taxon_name", "family_name", "Bootstrap",
                    "duplication_confidence_score"):
            fields.append(info.pop(key, None))
        print(myFile.myTSV.printLine(fields))

    def printGeneNode(node):
        """Print the description line of a leaf (gene)."""
        info = tree.info[node]
        fields = [node, "GENE"]
        fields.append(info.pop("taxon_name", None))
        fields.append(info.pop("gene_name", None))
        print(myFile.myTSV.printLine(fields))

    def do(node):
        """Return True if the requested gene is `node` or lies below it."""
        if node not in tree.data:
            if tree.info[node]["gene_name"] == arguments["gene_name"]:
                printGeneNode(node)
                return True
            return False
        for (child, _) in tree.data[node]:
            if do(child):
                # Lines are printed on the way back up: gene first, root last.
                printAncNode(node)
                return True
        return False

    # Look for the tree that contains the gene.
    for tree in myProteinTree.loadTree(arguments["proteinTree"]):
        if do(tree.root):
            break
def main(treeforestfile, outfile, dryrun=False, edited_node_id=EDITED_NODE_ID,
         infinite_dist=INFINITE_DIST):
    """Run `filterbranches` on every tree of a forest and report the totals.

    Args:
        treeforestfile: input forest; '-' means standard input.
        outfile: open file object receiving the trees (closed immediately in
            dry-run mode unless it is standard output).
        dryrun: when true, no tree is printed.
        edited_node_id, infinite_dist: forwarded to `filterbranches`.
    """
    if dryrun and outfile is not stdout:
        outfile.close()
    if treeforestfile == '-':
        treeforestfile = stdin

    n_branches = 0
    n_leaves = 0
    for tree in ProteinTree.loadTree(treeforestfile):
        removed, removed_leaves = filterbranches(tree, tree.root,
                                                 edited_node_id, infinite_dist)
        n_branches += removed
        n_leaves += removed_leaves
        if dryrun:
            continue
        tree.printTree(outfile)

    print("Deleted %d branches, of which %d leaves." % (n_branches, n_leaves))
Esempio n. 10
0
def main(ensembltree, outputfile):
    """Delete "gene split" subtrees (nodes with 'Duplication' == 10).

    Every tree of `ensembltree` is traversed; each child flagged as a gene
    split is removed together with all its descendants, and the edited tree
    is written to `outputfile`. Counts are reported on stderr.
    """
    get_ch = lambda tree, node: [x[0] for x in tree.data.get(node, [])]
    get_chd = lambda tree, nodedist: tree.data.get(nodedist[0], [])

    count_trees = 0
    count_treenodes = []
    count_splits = 0
    count_split_desc = 0
    with myFile.openFile(outputfile, 'w') as out:
        for tree in ProteinTree.loadTree(ensembltree):
            count_trees += 1  # was never incremented: the report always said 0
            count_treenodes.append(0)
            for (node, dist), childrendists in dfw_descendants_generalized(
                    tree, get_chd, queue=[(None, (tree.root, 0))]):
                count_treenodes[-1] += 1
                # Same key spelling as the test on the children below.
                assert tree.info[node]['Duplication'] != 10, \
                        "Unexpected. parent node is a split gene: %s: %s" % \
                                (node, tree.info[node])
                # Iterate over a snapshot: `childrendists` is presumably the
                # very list stored in tree.data[node], which is edited below.
                for child, chdist in list(childrendists):
                    if tree.info[child]['Duplication'] == 10:
                        # It's a gene split
                        # Recurse through all the descendants to remove them
                        count_splits += 1
                        for _, GS_descendant in reversed(
                                list(
                                    dfw_pairs_generalized(tree,
                                                          get_ch,
                                                          queue=[(None, child)
                                                                 ],
                                                          include_root=True))):
                            tree.info.pop(GS_descendant)
                            # tree.data appears to hold internal nodes only
                            # (hence the .get(..., []) above), so a leaf
                            # must not raise here.
                            tree.data.pop(GS_descendant, None)
                            count_split_desc += 1

                        tree.data[node].remove((child, chdist))
                # NOTE(review): if dfw_descendants_generalized queues children
                # before yielding, removed nodes could still be visited --
                # confirm against its implementation.

            tree.printTree(out)
    print("%d trees" % count_trees, file=stderr)
    print("treenodes:",
          " ".join(str(nn) for nn in count_treenodes),
          file=stderr)
    print("Splits: %d  Split descendants: %d" %
          (count_splits, count_split_desc),
          file=stderr)
Esempio n. 11
0
def run(process, proteinTreeFile, converted_args):
    """Apply `process` to every tree of a forest and print each tree.

    Args:
        process: callable invoked as ``process(tree, *converted_args)``.
        proteinTreeFile: input forest; '-' means standard input.
        converted_args: extra positional arguments for `process`.

    The return values of `process` are tallied and the tally is printed on
    stderr; unhashable return values are counted under "unhashable".
    """
    # Wrap in a 1-tuple so that a tuple of arguments is shown as a whole
    # instead of being consumed as %-formatting operands.
    print("Give args: %s" % (converted_args,), file=stderr)

    if proteinTreeFile == '-':
        proteinTreeFile = stdin

    count_outputs = defaultdict(int)

    for tree in myProteinTree.loadTree(proteinTreeFile):
        r = process(tree, *converted_args)
        try:
            count_outputs[r] += 1
        except TypeError:
            count_outputs["unhashable"] += 1

        tree.printTree(stdout)

    print("Outputs counts:", count_outputs, file=stderr)
def main(phyltreefile, forestfile=None):
    """Thin each gene tree down to the leaves whose taxon is in the species tree.

    Args:
        phyltreefile: species tree loaded with `myPhylTree.PhylogeneticTree`.
        forestfile: input forest; standard input when None.

    Trees for which `thin_prottree` returns no root are discarded with a
    warning; the others are fixed with `fix_thinned_dups` and printed to
    standard output from their new root.
    """
    phyltree = myPhylTree.PhylogeneticTree(phyltreefile)
    if forestfile is None:
        forestfile = stdin

    for tree in myProteinTree.loadTree(forestfile):
        # Leaves are the nodes that have info but no entry in tree.data.
        leaves = set(tree.info).difference(tree.data)
        keptleaves = {leaf for leaf in leaves
                      if tree.info[leaf]['taxon_name'] in phyltree.allNames}

        newroot, _ = thin_prottree(tree, tree.root, 0, keptleaves)
        if newroot is None:
            logger.warning('Discard tree %d', tree.root)
            continue

        fix_thinned_dups(phyltree, tree, newroot)
        tree.printTree(stdout, newroot)
Esempio n. 13
0
def test_convert2species(ensembl_version,
                         default=None,
                         forestfile='~/GENOMICUS%d/tree.1.ensembl.bz2'):
    """test the `convert_prot2species` function for every modernID"""
    from LibsDyogen import myProteinTree

    def flag_not_found(err, species, identifier):
        """Append the species and identifier to the last argument of `err`."""
        suffix = ' '.join((species, identifier, "Not found"))
        err.args = err.args[:-1] + (err.args[-1] + suffix,)

    # Malformed identifiers must be rejected (the given default is returned).
    for wrong in ('xululul', '0000000', 'ENSXXXP', 'ENSG000'):
        predicted_sp = convert_prot2species(wrong, ensembl_version, False)
        assert predicted_sp is False, "%r predicted %r" % (wrong, predicted_sp)

    # Gene and protein tables must cover the same species.
    expected_species = set(GENE2SP[ensembl_version].values())
    expected_species_p = set(PROT2SP[ensembl_version].values())
    assert expected_species == expected_species_p

    forest = myProteinTree.loadTree(op.expanduser(forestfile % ensembl_version))
    for tree in forest:
        for tip in (set(tree.info) - set(tree.data)):
            tipinfo = tree.info[tip]
            sp = tipinfo['taxon_name']
            gene = tipinfo['gene_name']
            prot = tipinfo['protein_name']

            assert sp in expected_species, 'Unexpected species %r' % sp

            try:
                predicted_sp = convert_gene2species(gene, ensembl_version)
            except KeyError as err:
                flag_not_found(err, sp, gene)
                raise
            assert sp == predicted_sp, "%s: %r ≠ %r" % (gene, sp, predicted_sp)

            try:
                predicted_sp = convert_prot2species(prot, ensembl_version,
                                                    default)
            except KeyError as err:
                flag_not_found(err, sp, prot)
                raise
            assert sp == predicted_sp, "%s: %r ≠ %r" % (prot, sp, predicted_sp)
Esempio n. 14
0
def main():
    """Extract the gene families of a forest and optionally write them out.

    When the option ``out:ancGenesFiles`` is non-empty it is used as a
    ``%``-template: one file per ancestor is written, with one family
    (space-separated names) per line.
    """
    # Arguments
    arguments = myTools.checkArgs([("phylTree.conf", myTools.File),
                                   ("proteinTree", myTools.File)],
                                  [("out:ancGenesFiles", str, ""),
                                   ("reuseNames", bool, False)],
                                  __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])
    proteinTrees = myProteinTree.loadTree(arguments["proteinTree"])

    count, dupCount, geneFamilies = extractGeneFamilies(phylTree,
                                                        proteinTrees,
                                                        arguments["reuseNames"])

    outTemplate = arguments["out:ancGenesFiles"]
    if outTemplate:
        for (anc, lst) in geneFamilies.items():
            print("Ecriture des familles de %s ..." % anc, end=' ', file=sys.stderr)
            # `with` guarantees the file is closed even if a write fails.
            with myFile.openFile(outTemplate % phylTree.fileName[anc], "w") as f:
                for gg in lst:
                    print(" ".join(gg), file=f)
            print(len(lst), "OK", file=sys.stderr)
Esempio n. 15
0
def main():
    """Cut every gene tree at the nodes that fall under a given root species.

    Each subtree found is printed to standard output; when a tree yields
    several subtrees, their 'tree_name' receives a suffix from
    `myProteinTree.getDupSuffix`. The number of subtrees is reported on
    stderr.
    """
    arguments = myTools.checkArgs(
        [("phylTree.conf", myTools.File),
         ("iniTree", myTools.File),
         ("rootSpecies", str)],
        [], __doc__)

    phylTree = myPhylTree.PhylogeneticTree(arguments["phylTree.conf"])

    def search(node):
        """Return the topmost nodes under `node` whose taxon is a child of the root species."""
        taxon = tree.info[node]['taxon_name']
        if phylTree.isChildOf(taxon, arguments["rootSpecies"]):
            return [node]
        if node not in tree.data:
            return []
        found = []
        for (child, _) in tree.data[node]:
            found.extend(search(child))
        return found

    nb = 0
    for tree in myProteinTree.loadTree(arguments["iniTree"]):
        subroots = search(tree.root)
        nb += len(subroots)
        if len(subroots) == 1:
            # A single subtree keeps the original tree name.
            only = subroots[0]
            tree.info[only]["tree_name"] = tree.info[tree.root]["tree_name"]
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, only)
            continue
        for rank, subroot in enumerate(subroots, start=1):
            base_name = tree.info[tree.root]["tree_name"]
            tree.info[subroot]["tree_name"] = (
                base_name + myProteinTree.getDupSuffix(rank, True))
            myProteinTree.printTree(sys.stdout, tree.data, tree.info, subroot)

    print(nb, "extracted trees", file=sys.stderr)
    for (s1, s2) in itertools.combinations(speciessets, 2):
        inters.update(s1.intersection(s2))
    all = set().union(*speciessets)
    anc = tree.info[rnode]["taxon_name"]

    if arguments["scoreMethod"] == 3:
        inters.intersection_update(goodSpecies(anc))
        all.intersection_update(goodSpecies(anc))
    #print >> sys.stderr,rnode
    return ((len(inters) == 0) and (minDuplicationScore[anc] == 0)) or (
        len(inters) < (minDuplicationScore[anc] * len(all)))


# Counters of edited duplication nodes, keyed by category.
nbEdit = {"dubious": 0, "toolow": 0, "good": 0}

for (nb, tree) in enumerate(myProteinTree.loadTree(arguments["ensemblTree"])):

    # Every existing node id must be below the first id reserved for new nodes.
    assert max(tree.info) < arguments["newNodeID"]

    # Separate the good duplications from the bad ones
    ################################################
    for (node, inf) in tree.info.items():
        print(node, inf, file=sys.stderr)
        if inf['Duplication'] != 0:

            if 'dubious_duplication' in inf:
                # 'dubious' duplications are not considered valid duplications
                assert inf['Duplication'] == 1
                del inf['dubious_duplication']
                nbEdit["dubious"] += 1
Esempio n. 17
0
def main(badnodelistfile,
         forestfile,
         badnode_col=0,
         maxdist=MAXDIST,
         print_unchanged=True,
         dryrun=False):
    """Edit a forest by removing listed nodes and too long branches.

    Args:
        badnodelistfile: whitespace-separated table with one header line;
            the node ids are read from column `badnode_col`.
        forestfile: forest passed to `myProteinTree.loadTree`.
        badnode_col: zero-based index of the column holding the node ids.
        maxdist: forwarded to `edit_toolong_flagged`.
        print_unchanged: also print the trees that were not edited.
        dryrun: print nothing, only count.

    Edited trees are printed to standard output and a summary is logged.
    """
    with open(badnodelistfile) as f:
        next(f, None)  # skip the header line (tolerates an empty file)
        # Honour `badnode_col`: column 0 used to be hard-coded.
        badnodes = set(int(line.rstrip().split()[badnode_col]) for line in f)

    logger.info('%d nodes to remove.', len(badnodes))

    proteintrees = myProteinTree.loadTree(forestfile)
    n_unprinted = 0

    # `output` prints the tree (or not) according to the mode and returns 1
    # when the tree counts as an edited output tree, else 0.
    if dryrun:

        def output(tree, flag1, flag2):
            nonlocal n_unprinted
            n_unprinted += 1
            return int(flag1 | flag2)
    elif print_unchanged:

        def output(tree, flag1, flag2):
            tree.printTree(stdout)
            return int(flag1 | flag2)
    else:

        def output(tree, flag1, flag2):
            if flag1 | flag2:
                tree.printTree(stdout)
                return 1
            else:
                nonlocal n_unprinted
                n_unprinted += 1
                return 0

    n_edited_trees = 0
    n_discarded_roots = 0
    n_edited_trees_fromsel = 0
    n_edited_trees_toolong = 0

    for change1, change2, tree in edit_toolong_flagged(edit_from_selection(
            proteintrees, badnodes),
                                                       maxdist=maxdist):
        n_edited_trees_fromsel += int(change1)
        n_edited_trees_toolong += int(change2)
        if tree.root is None:
            # Nothing is left of this tree.
            n_discarded_roots += 1
        else:
            n_edited_trees += output(tree, change1, change2)

    logger.info(
        '\n%9d edited output trees. %d unprinted trees. %d discarded roots.\n'
        ' -> %9d from node selection,\n'
        ' -> %9d because of too long branches.\n', n_edited_trees, n_unprinted,
        n_discarded_roots, n_edited_trees_fromsel, n_edited_trees_toolong)
Esempio n. 18
0
        pass

    indiv_files = False
    if args.outfile is None:
        outfile = stdout
    elif not '{genetree}' in args.outfile:
        outfile = open(args.outfile, 'w')
    else:
        indiv_files = True

        def get_outfile(tree):
            rootinfo = tree.info[tree.root]
            genetree = rootinfo.get('tree_name', rootinfo['family_name'])
            return open(args.outfile.format(genetree=genetree), 'w')

        for tree in ProteinTree.loadTree(args.forestfile):
            with get_outfile(tree) as outfile:
                tree.printNewick(outfile,
                                 withDist=True,
                                 withTags=True,
                                 withAncSpeciesNames=True,
                                 withAncGenesNames=True,
                                 withID=True)

    if not indiv_files:
        try:
            for tree in ProteinTree.loadTree(args.forestfile):
                tree.printNewick(outfile,
                                 withDist=True,
                                 withTags=True,
                                 withAncSpeciesNames=True,
                    newAnc, newLastWritten))

    else:  # when the node is a leaf
        allGenes = [tree.info[node]["gene_name"]]

    for a in toWrite:  # 'a'= name of the ancestor to print
        geneFamilies[a].append(
            [currName] + allGenes
        )  # write the name of the gene of Anc followed by the names of the genes of the children (this is done for all the species in toWrite)
        #FIXME geneFamilies is defined in the main, it is modified whereas it is not even a parameter

    return allGenes  # for the recurrence


# Families collected per ancestor; filled as a side effect of
# extractGeneFamilies below.
geneFamilies = collections.defaultdict(list)
for tree in myProteinTree.loadTree(
        arguments["geneTreeForest"]):  # for all gene trees in the forest
    extractGeneFamilies(
        tree.root, tree.info[tree.root]["tree_name"], None, None
    )  # FIXME this function modifies tree and geneFamilies even if tree and gene families are not parameters
    tree.printTree(sys.stdout)

# Write one file per ancestor, one family (space-separated names) per line.
for (anc, lst) in geneFamilies.items():
    print("Write %s family ..." % anc, end=' ', file=sys.stderr)
    # `with` guarantees the file is closed even if a write fails.
    with myFile.openFile(arguments["out:ancGenes"] % speciesTree.fileName[anc],
                         "w") as f:
        for gg in lst:
            print(" ".join(gg), file=f)
    print(len(lst), "OK", file=sys.stderr)
    print("NEW TREE", file=sys.stderr)
    if node in tree.data:
        t1 = tree.info[node]['taxon_name']
        for (g, d) in tree.data[node]:
            # Une distance ne peut etre prise qu'entre deux noeuds de speciation
            if (tree.info[node]['Duplication']
                    == 0) and (tree.info[g]['Duplication'] == 0):
                t2 = tree.info[g]['taxon_name']
                # Les deux noeuds doivent etre strictement consecutifs
                if (phylTree.parent[t2].name == t1) and (d != 0):
                    lengths[(t1, t2)].append(d)
                    print(myFile.myTSV.printLine([t1, t2, d]), file=sys.stderr)
            do(g)


# Run `do` from the root of every tree of the forest.
for tree in myProteinTree.loadTree(arguments["proteinTree"]):
    do(tree.root)

# Sort the lists of lengths
for l in lengths.values():
    l.sort()


# Parcourt recursivement l'arbre et l'ecrit au format avec des parentheses, avec les longueurs de branche medianes
def convertToFlatFile(anc):
    a = phylTree.fileName[anc]
    if anc in phylTree.listSpecies:
        # On est arrive sur une feuille
        return a
    else:
        # On est sur un noeud, on construit la liste des distances
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract the subtree descending from the first node validating the given condition"""
from LibsDyogen import myTools, myProteinTree

if __name__ == '__main__':
    args = myTools.checkArgs(
        [("forestFile", myTools.File), ("outFile", str), ("id", int)],
        [], __doc__)

    node = args["id"]

    # Write the subtree rooted at `node` from the first tree that contains it.
    with open(args["outFile"], "w") as out:
        for tree in myProteinTree.loadTree(args["forestFile"]):
            if node not in tree.info:
                continue
            tree.printTree(out, node)
            break
Esempio n. 22
0
def iter_from_prottree(treefile, *args, **kwargs):
    """Lazily yield the trees that `myProteinTree.loadTree` reads from `treefile`.

    Extra positional and keyword arguments are forwarded to `loadTree`.
    LibsDyogen is imported only when iteration starts.
    """
    from LibsDyogen import myProteinTree
    yield from myProteinTree.loadTree(treefile, *args, **kwargs)
Esempio n. 23
0

def sort_children(tree):
    """Sort, in place, the list of children stored for each node of `tree.data`."""
    for children in tree.data.values():
        children.sort()


if __name__ == '__main__':
    # Usage: [infile [outfile]]; missing names default to stdin / stdout.
    if {'-h', '--help'}.intersection(argv[1:]):
        print(__doc__)
        exit()
    elif len(argv) > 3:
        print(__doc__)
        exit(1)
    else:
        outfile = argv[2] if len(argv) > 2 else stdout
        infile = argv[1] if len(argv) > 1 else stdin

    with myFile.openFile(outfile, 'w') as out:
        for tree in myProteinTree.loadTree(infile):
            sort_children(tree)
            tree.printTree(out)
Esempio n. 24
0
def main():
    """Read through every tree of the forest without doing anything with it."""
    arguments = myTools.checkArgs([("iniTree", myTools.File)], [], __doc__)

    # Iterating is the whole point: each tree is loaded, then dropped.
    for _tree in myProteinTree.loadTree(arguments["iniTree"]):
        pass
Esempio n. 25
0
#! /usr/bin/env python
"""
	Decoupe un fichier d'arbres en fichiers separes

	usage:
			./splitTrees.py GeneTreeForest.phylTree.bz2 Fam.%s
"""

from LibsDyogen import myFile, myTools, myProteinTree

arguments = myTools.checkArgs([("proteinTree", myTools.File), ("output", str)],
                              [], __doc__)

# Write each tree to its own file; `output` is a %-template that receives the
# 1-based rank of the tree, while the 0-based rank is echoed on stdout.
for (i, tree) in enumerate(myProteinTree.loadTree(arguments["proteinTree"])):
    print(i)
    # `with` guarantees the file is closed even if printing fails.
    with myFile.openFile(arguments["output"] % (i + 1), "w") as f:
        tree.printTree(f)
def forest_summary(forestfile):
    """Return a plain-text summary of the trees found in `forestfile`.

    The report gives the number of distinct taxa, of distinct species (taxa
    of nodes carrying a 'gene_name') and of trees; the total, per-tree mean
    and per-tree standard deviation of node, internal-node, leaf and
    gene-leaf counts; the number of nodes per 'Duplication' value; and the
    mean, standard deviation and number of missing values of the
    'duplication_confidence_score' and 'Bootstrap' of duplication nodes.

    For an empty forest (or one without duplications) the corresponding
    means and standard deviations are NaN.
    """
    # Number of nodes per value of the 'Duplication' tag.
    n_dup = {}

    # Complete lists for summary stats
    taxa_set = set()
    species_set = set()
    dup_conf_scores = []
    dup_bootstraps = []

    # One entry per tree.
    tree_n_nodes = []
    tree_n_intnodes = []
    tree_n_leaves = []
    tree_n_speciesleaves = []
    tree_n_dup = []

    for tree in ProteinTree.loadTree(forestfile):
        tree_n_intnodes.append(len(tree.data))
        tree_n_nodes.append(len(tree.info))

        # Counters for this tree
        n_leaves = 0
        n_speciesleaves = 0
        n_tree_dup = 0

        for node_id, nodeinfo in tree.info.items():
            taxa_set.add(nodeinfo['taxon_name'])

            dup_type = nodeinfo['Duplication']
            n_dup[dup_type] = n_dup.get(dup_type, 0) + 1

            if dup_type != 0:
                n_tree_dup += 1
                # np.nan (lowercase): the np.NaN alias no longer exists in
                # NumPy 2.
                dup_conf_scores.append(
                    nodeinfo.get('duplication_confidence_score', np.nan))
                dup_bootstraps.append(nodeinfo.get('Bootstrap', np.nan))
                # dupli without conf scores: edited nodes with 'Duplication': 3
                # dupli without bootstrap: edited nodes with 'Duplication': 2

            if 'gene_name' in nodeinfo:
                species_set.add(nodeinfo['taxon_name'])
                n_speciesleaves += 1

            if node_id not in tree.data:
                n_leaves += 1

        tree_n_leaves.append(n_leaves)
        tree_n_speciesleaves.append(n_speciesleaves)
        tree_n_dup.append(n_tree_dup)

        assert tree_n_nodes[-1] - tree_n_intnodes[-1] == tree_n_leaves[-1]

    # Counted from the list so that an empty forest reports 0 trees instead
    # of failing on an unbound loop variable.
    n_trees = len(tree_n_nodes)

    dup_conf_scores = np.array(dup_conf_scores, dtype=float)
    dup_conf_scores_nan = np.isnan(dup_conf_scores)
    dup_conf_scores = dup_conf_scores[~dup_conf_scores_nan]
    dup_bootstraps = np.array(dup_bootstraps, dtype=float)
    dup_bootstraps_nan = np.isnan(dup_bootstraps)
    dup_bootstraps = dup_bootstraps[~dup_bootstraps_nan]

    # Integer dtype keeps the totals formattable with 'd' even when empty.
    tree_n_nodes = np.array(tree_n_nodes, dtype=int)
    tree_n_intnodes = np.array(tree_n_intnodes, dtype=int)
    tree_n_leaves = np.array(tree_n_leaves, dtype=int)
    tree_n_speciesleaves = np.array(tree_n_speciesleaves, dtype=int)
    tree_n_dup = np.array(tree_n_dup, dtype=int)

    return """
Nb of taxa    : {:d}
Nb of species : {:d}
Nb of trees   : {:d}

                       tot  tree average     tree std
n_nodes         : {:-8d}      {:-8.2f}     {:-8.2f}
n_intnodes      : {:-8d}      {:-8.2f}     {:-8.2f}
n_leaves        : {:-8d}      {:-8.2f}     {:-8.2f}
n_speciesleaves : {:-8d}      {:-8.2f}     {:-8.2f}

n_dup           : {}
                  tree average= {:4.2f}   tree std= {:4.2f}
#n_dubious
#n_multifurc

dup_conf_scores: average= {:8.5f}   std= {:8.5f}   missing= {:d}
dup_bootstraps:  average= {:8.5f}   std= {:8.5f}   missing= {:d}
""".format(len(taxa_set), len(species_set), n_trees,
           int(tree_n_nodes.sum()), tree_n_nodes.mean(), tree_n_nodes.std(),
           int(tree_n_intnodes.sum()), tree_n_intnodes.mean(),
           tree_n_intnodes.std(),
           int(tree_n_leaves.sum()), tree_n_leaves.mean(),
           tree_n_leaves.std(),
           int(tree_n_speciesleaves.sum()), tree_n_speciesleaves.mean(),
           tree_n_speciesleaves.std(),
           ',  '.join('%d: %d' % item for item in n_dup.items()),
           tree_n_dup.mean(), tree_n_dup.std(),
           dup_conf_scores.mean(), dup_conf_scores.std(),
           int(dup_conf_scores_nan.sum()),
           dup_bootstraps.mean(), dup_bootstraps.std(),
           int(dup_bootstraps_nan.sum()))
Esempio n. 27
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import print_function

from LibsDyogen import myTools, myProteinTree
import logging
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(format="%(levelname)s:%(message)s")
    args = myTools.checkArgs([('forestfile', myTools.File)], [], __doc__)

    # Print one name per tree: the root's 'tree_name', or a fallback built
    # from its family and taxon names when that tag is missing.
    for tree in myProteinTree.loadTree(args['forestfile']):
        rootinfo = tree.info[tree.root]
        if 'tree_name' in rootinfo:
            tree_name = rootinfo['tree_name']
        else:
            logger.warning("No tree_name found at root: %d %s", tree.root,
                           rootinfo)
            family = rootinfo.get('family_name', 'unnamed')
            tree_name = ';'.join((family, rootinfo['taxon_name']))
        print(tree_name)