Ejemplo n.º 1
0
def read_dlcoal_recon(filename, stree,
                      exts={"coal_tree": ".coal.tree",
                            "coal_recon": ".coal.recon",
                            "locus_tree": ".locus.tree",
                            "locus_recon": ".locus.recon",
                            "daughters": ".daughters"
                            },
                      filenames={}):
    """Reads a reconciled gene tree from files

    Every input file is taken from `filenames[kind]` when present and
    otherwise from `filename + exts[kind]`.

    filename  -- common path prefix of the input files
    stree     -- species tree the locus tree is reconciled against
    exts      -- mapping from file kind to file extension
    filenames -- optional mapping from file kind to an explicit path

    Returns (coal_tree, extra) where `extra` is a dict with the keys
    "locus_tree", "coal_recon", "locus_recon", "locus_events" and
    "daughters" (a set of locus tree nodes).
    """
    # NOTE: the dict defaults are only ever read here, never mutated.

    def path_for(kind):
        # an explicit filename wins over prefix + extension
        return filenames.get(kind, filename + exts[kind])

    extra = {}

    # trees
    coal_tree = treelib.read_tree(path_for("coal_tree"))
    extra["locus_tree"] = treelib.read_tree(path_for("locus_tree"))

    # recons
    extra["coal_recon"], _coal_events = phylo.read_recon_events(
        path_for("coal_recon"), coal_tree, extra["locus_tree"])
    # The result is unpacked as a (recon, events) pair, so it must be read
    # with read_recon_events, exactly like the coal recon above.
    extra["locus_recon"], extra["locus_events"] = phylo.read_recon_events(
        path_for("locus_recon"), extra["locus_tree"], stree)

    # daughters: node names in the file are looked up in the locus tree
    daughter_names = util.read_strings(path_for("daughters"))
    extra["daughters"] = set(
        extra["locus_tree"].nodes[x] for x in daughter_names)

    return coal_tree, extra
Ejemplo n.º 2
0
    def read(
            self,
            filename,
            stree,
            exts={
                "coal_tree": ".coal.tree",
                "coal_recon": ".coal.recon",
                "locus_tree": ".locus.tree",
                "locus_recon": ".locus.recon",
                "daughters": ".daughters"
            },
            filenames={},
            check=True):
        """Reads a reconciled gene tree from files

        Each file path is `filenames[kind]` if given, else
        `filename + exts[kind]`.  Fills in self.locus_tree,
        self.coal_recon, self.locus_recon, self.locus_events and
        self.daughters, then (when `check` is true) asserts that
        self.is_valid(coal_tree) holds.

        Returns (coal_tree, self.get_dict()).
        """

        def locate(kind):
            # explicit filename takes precedence over prefix + extension
            return filenames.get(kind, filename + exts[kind])

        # trees
        coal_tree = treelib.read_tree(locate("coal_tree"))
        self.locus_tree = treelib.read_tree(locate("locus_tree"))

        # recons (the events of the coal recon are not kept)
        self.coal_recon, _unused = phylo.read_recon_events(
            locate("coal_recon"), coal_tree, self.locus_tree)
        self.locus_recon, self.locus_events = phylo.read_recon_events(
            locate("locus_recon"), self.locus_tree, stree)

        # daughters are stored as locus tree node objects
        daughter_names = util.read_strings(locate("daughters"))
        self.daughters = set(
            self.locus_tree.nodes[name] for name in daughter_names)

        if check:
            assert self.is_valid(coal_tree)

        return coal_tree, self.get_dict()
Ejemplo n.º 3
0
def read_dlcoal_recon(filename, stree,
                      exts={"coal_tree": ".coal.tree",
                            "coal_recon": ".coal.recon",
                            "locus_tree": ".locus.tree",
                            "locus_recon": ".locus.recon",
                            "daughters": ".daughters"
                            },
                      filenames={}):
    """Reads a reconciled gene tree from files

    A file of a given kind is read from `filenames[kind]` when that key
    exists, otherwise from `filename + exts[kind]`.

    Returns (coal_tree, extra); `extra` maps "locus_tree", "coal_recon",
    "locus_recon", "locus_events" and "daughters" to the loaded data.
    """

    def locate(kind):
        return filenames.get(kind, filename + exts[kind])

    # trees
    coal_tree = treelib.read_tree(locate("coal_tree"))
    locus_tree = treelib.read_tree(locate("locus_tree"))

    # recons (events of the coal recon are discarded)
    coal_recon, _ignored = phylo.read_recon_events(
        locate("coal_recon"), coal_tree, locus_tree)
    locus_recon, locus_events = phylo.read_recon_events(
        locate("locus_recon"), locus_tree, stree)

    # daughters: names from the file resolved to locus tree nodes
    daughter_names = util.read_strings(locate("daughters"))
    daughters = set(locus_tree.nodes[name] for name in daughter_names)

    extra = {
        "locus_tree": locus_tree,
        "coal_recon": coal_recon,
        "locus_recon": locus_recon,
        "locus_events": locus_events,
        "daughters": daughters,
    }
    return coal_tree, extra
Ejemplo n.º 4
0
 def test_nonbinary_trees(self):
     """compute_cost on the nonBinaryAll example trees must equal 6."""
     model = MulRFModel(extra=None)
     smap = phylo.read_gene2species(
         "../../../examples/test/nonBinaryAll.smap")
     species_tree = treelib.read_tree(
         '../../../examples/test/nonBinaryAll.stree')
     gene_tree = treelib.read_tree(
         '../../../examples/test/nonBinaryAll.gtree')

     model.stree = species_tree
     model.gene2species = smap

     cost = model.compute_cost(gene_tree)
     self.assertEqual(cost, 6)
Ejemplo n.º 5
0
 def test_smap_error(self):
     """optimize_model must raise for the 24Hits trees with no gene map."""
     model = MulRFModel(extra=None)
     # NOTE(review): this mapping is loaded but never handed to the model;
     # optimize_model below receives None instead -- confirm this is intended.
     gene2species = phylo.read_gene2species(
         "../../../examples/test/nonBinaryAll.smap")
     species_tree = treelib.read_tree('../../../examples/test/24Hits.stree')
     gene_tree = treelib.read_tree('../../../examples/test/24Hits.gtree')

     with self.assertRaises(Exception):
         model.optimize_model(gene_tree, species_tree, None)
Ejemplo n.º 6
0
 def test_null_trees(self):
     """compute_cost on the EmptyTree input must raise AttributeError."""
     model = MulRFModel(extra=None)
     # both trees are read from the same EmptyTree.stree file
     species_tree = treelib.read_tree(
         '../../../examples/test/EmptyTree.stree')
     gene_tree = treelib.read_tree(
         '../../../examples/test/EmptyTree.stree')
     smap = phylo.read_gene2species("../../../examples/test/24Hits.smap")

     model.stree = species_tree
     model.gene2species = smap

     with self.assertRaises(AttributeError):
         model.compute_cost(gene_tree)
Ejemplo n.º 7
0
 def test_deep(self):
     """DeepCoalescenceModel.compute_cost on the test1 trees must equal 2."""
     model = DeepCoalescenceModel(extra=None)
     smap = phylo.read_gene2species("../../../examples/test/24Hits.smap")
     species_tree = treelib.read_tree('../../../examples/test/test1.stree')
     gene_tree = treelib.read_tree('../../../examples/test/test1.gtree')

     model.stree = species_tree
     model.gene2species = smap

     cost = model.compute_cost(gene_tree)
     self.assertEqual(cost, 2)
Ejemplo n.º 8
0
    def addFamilies(self, eventsfile, discard=[]):
        """Insert one row per gene family into the Families table.

        The table is created first if it does not exist yet.  For every
        row of the events table (read from `eventsfile`) whose "partid"
        is not in `discard`, the family's tree and FASTA file are read to
        compute the total branch length and the median sequence length,
        and one row is inserted.

        eventsfile -- path of the events table; rows need the columns
                      "partid", "famrate", "dup", "loss" and "genes"
        discard    -- iterable of family ids to skip (only read, never
                      mutated, so the shared default list is harmless)
        """

        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        familyGeneNames = self.makeFamilyGeneNames()
        discard = set(discard)

        # Values are bound as parameters instead of being spliced into the
        # SQL text, so ids/names containing quotes can no longer break (or
        # inject into) the statement.
        insert_sql = ("INSERT INTO Families VALUES "
                      "(?, ?, ?, ?, ?, ?, ?, ?, ?);")

        for row in events_tab:
            famid = row["partid"]
            if famid in discard:
                util.logger("discarding '%s'" % famid)
                continue

            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(x.dist for x in tree)
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median([len(seq) for seq in seqs.values()])

            # two name fields per family; both default to "" when unknown
            names = familyGeneNames.get(famid, ("", ""))

            # Text columns keep their old "%s" rendering; numeric columns
            # are now stored at full precision rather than "%f"-rounded.
            self.cur.execute(
                insert_sql,
                ("%s" % (famid,),
                 "%s" % (names[0],),
                 float(row["famrate"]),
                 float(treelen),
                 float(seqlen * 3),
                 int(row["dup"]),
                 int(row["loss"]),
                 int(row["genes"]),
                 "%s" % (names[1],)))
        util.toc()
Ejemplo n.º 9
0
    def test_reorder(self):
        """Test reordering of tree children."""
        first = read_tree(StringIO("((a,b),(c,d));"))
        second = read_tree(StringIO("((d,c),(b,a));"))

        # before reordering the two one-line newick strings must differ
        newick_first = first.get_one_line_newick()
        newick_second = second.get_one_line_newick()
        self.assertTrue(newick_first != newick_second)

        # after reordering the first tree against the second they must match
        reorder_tree(first, second)
        newick_first = first.get_one_line_newick()
        newick_second = second.get_one_line_newick()
        self.assertEqual(newick_first, newick_second)
Ejemplo n.º 10
0
def trainTree(conf, stree, gene2species):
    """Learn Spidir parameters from a set of tree files and write them out.

    conf["REST"] holds the arguments that are expanded with
    util.shellparser into tree file names.  Each tree is read and the
    summed length of its first two root branches is split evenly between
    them.  The learned parameters are written to conf["param"].
    """
    treefiles = []
    for pattern in conf["REST"]:
        treefiles.extend(util.shellparser(pattern))

    util.tic("reading trees")
    trees = []
    prog = progress.ProgressBar(len(treefiles))
    for treefile in treefiles:
        prog.update()
        tree = treelib.read_tree(treefile)
        trees.append(tree)

        # even out top two branches
        top_a = tree.root.children[0]
        top_b = tree.root.children[1]
        half = (top_a.dist + top_b.dist) / 2.0
        top_a.dist = half
        top_b.dist = half

    util.toc()

    params = Spidir.learnModel(trees, stree, gene2species,
                               conf["trainstats"], filenames=treefiles)

    Spidir.writeParams(conf["param"], params)
    def addFamilies(self, eventsfile, discard=[]):
        """Insert one row per gene family into the Families table.

        Creates the table when it is missing, then walks the events table
        read from `eventsfile`.  Families listed in `discard` are logged
        and skipped; for all others the tree and FASTA files are read to
        obtain the total branch length and the median sequence length.

        eventsfile -- path of the events table; rows need the columns
                      "partid", "famrate", "dup", "loss" and "genes"
        discard    -- iterable of family ids to skip (only read here, so
                      the shared default list is never modified)
        """

        if not tableExists(self.cur, "Families"):
            self.makeFamiliesTable()

        util.tic("add families")
        events_tab = tablelib.read_table(eventsfile)
        familyGeneNames = self.makeFamilyGeneNames()
        discard = set(discard)

        # Bind values as parameters rather than formatting them into the
        # SQL string: a quote inside an id or name used to corrupt the
        # statement.
        insert_sql = ("INSERT INTO Families VALUES "
                      "(?, ?, ?, ?, ?, ?, ?, ?, ?);")

        for row in events_tab:
            famid = row["partid"]
            if famid in discard:
                util.logger("discarding '%s'" % famid)
                continue

            tree = treelib.read_tree(self.getTreeFile(famid))
            treelen = sum(x.dist for x in tree)
            seqs = fasta.read_fasta(self.getFastaFile(famid))
            seqlen = stats.median([len(seq) for seq in seqs.values()])

            # pair of name fields; ("", "") when the family is unknown
            names = familyGeneNames.get(famid, ("", ""))

            # Text columns keep the previous "%s" rendering; numbers are
            # stored at full precision instead of "%f"-rounded text.
            self.cur.execute(
                insert_sql,
                ("%s" % (famid,),
                 "%s" % (names[0],),
                 float(row["famrate"]),
                 float(treelen),
                 float(seqlen * 3),
                 int(row["dup"]),
                 int(row["loss"]),
                 int(row["genes"]),
                 "%s" % (names[1],)))
        util.toc()
Ejemplo n.º 12
0
def bionj(aln=None, labels=None, distmat=None, seqtype="pep", verbose=True):
    """Build a tree by running the external `bionj` program.

    Distances come from `distmat` when it is given; otherwise they are
    computed from `aln` with phylip.protdist (seqtype == "pep") or
    phylip.dnadist (any other seqtype), which also supply the labels.

    aln     -- alignment; needed when `distmat` is None, and used for the
               labels when `distmat` is given without `labels`
    labels  -- names used to rename the leaves of the resulting tree
    distmat -- precomputed distance matrix, or None
    seqtype -- "pep" selects protdist, anything else dnadist
    verbose -- passed through to the phylip distance programs

    Returns the tree read from bionj's output file.
    """
    # make temp files
    distfile = util.tempfile(".", "bionj-in", ".dist")
    treefile = util.tempfile(".", "bionj-out", ".tree")

    try:
        # find distances and then NJ tree
        if distmat is not None:
            phylip.write_dist_matrix(distmat, out=distfile)

            if labels is None:
                labels = aln.keys()
        else:
            if seqtype == "pep":
                labels = phylip.protdist(aln, distfile, verbose=verbose)
            else:
                labels = phylip.dnadist(aln, distfile, verbose=verbose)

        # bionj reads the input and output file names from stdin
        os.system("echo -n '%s\n%s' | bionj > /dev/null" %
                  (distfile, treefile))
        tree = treelib.read_tree(treefile)
        phylip.rename_tree_with_names(tree, labels)
    finally:
        # clean up even when a step above failed, so that no temp files
        # are left behind; a missing file must not mask the real error
        for path in (distfile, treefile):
            if os.path.exists(path):
                os.remove(path)

    return tree
    def test_reorder(self):
        """Test reordering of tree children."""
        tree_a = read_tree(StringIO("((a,b),(c,d));"))
        tree_b = read_tree(StringIO("((d,c),(b,a));"))

        # child order differs, so the one-line newick strings differ too
        line_a = tree_a.get_one_line_newick()
        line_b = tree_b.get_one_line_newick()
        self.assertTrue(line_a != line_b)

        # reorder tree_a against tree_b; the strings must now be equal
        reorder_tree(tree_a, tree_b)
        line_a = tree_a.get_one_line_newick()
        line_b = tree_b.get_one_line_newick()
        self.assertEqual(line_a, line_b)
    def test2(self):
        """Draw the treefile2/breconfile2 reconciliation to tree.svg."""
        out_dir = 'test/tmp/test_vistrans/Vis_test2/'
        make_clean_dir(out_dir)

        species_tree = treelib.parse_newick(stree_newick)
        gene_tree = treelib.read_tree(treefile2)
        recon = phylo.read_brecon(breconfile2, gene_tree, species_tree)

        svg_path = out_dir + "tree.svg"
        transsvg.draw_tree(gene_tree, recon, species_tree, filename=svg_path)
Ejemplo n.º 15
0
    def test2(self):
        """Render the second example reconciliation as an SVG file."""
        target_dir = 'test/tmp/test_vistrans/Vis_test2/'
        make_clean_dir(target_dir)

        sp_tree = treelib.parse_newick(stree_newick)
        g_tree = treelib.read_tree(treefile2)
        b_recon = phylo.read_brecon(breconfile2, g_tree, sp_tree)

        transsvg.draw_tree(
            g_tree, b_recon, sp_tree,
            filename=target_dir + "tree.svg")
    def test3(self):
        """Add implied speciation nodes, write the brecon and draw the tree.

        Reads treefile3/breconfile3, augments the reconciliation with
        phylo.add_implied_spec_nodes_brecon, writes it to `brecon` in the
        output directory and renders `tree.svg` there.
        """
        outdir = 'test/tmp/test_vistrans/Vis_test3/'
        make_clean_dir(outdir)

        stree = treelib.parse_newick(stree_newick)
        tree = treelib.read_tree(treefile3)
        brecon = phylo.read_brecon(breconfile3, tree, stree)

        phylo.add_implied_spec_nodes_brecon(tree, brecon)
        # close the file explicitly so the output is flushed and the handle
        # is not leaked
        with open(outdir + 'brecon', 'w') as out:
            phylo.write_brecon(out, brecon)

        transsvg.draw_tree(tree, brecon, stree, filename=outdir + "tree.svg")
Ejemplo n.º 17
0
    def test3(self):
        """Write an augmented brecon and draw the third example tree.

        The reconciliation read from breconfile3 gets its implied
        speciation nodes added, is written to `brecon` in the output
        directory, and the tree is drawn to `tree.svg`.
        """
        outdir = 'test/tmp/test_vistrans/Vis_test3/'
        make_clean_dir(outdir)

        stree = treelib.parse_newick(stree_newick)
        tree = treelib.read_tree(treefile3)
        brecon = phylo.read_brecon(breconfile3, tree, stree)

        phylo.add_implied_spec_nodes_brecon(tree, brecon)
        # use a context manager: the original never closed this file handle
        with open(outdir + 'brecon', 'w') as out:
            phylo.write_brecon(out, brecon)

        transsvg.draw_tree(tree, brecon, stree, filename=outdir + "tree.svg")
Ejemplo n.º 18
0
def debug_test1():
    """Simulate one tree with sim_DLILS_gene_tree on the flies species tree.

    Branch lengths of the species tree are scaled by 1e7 before the
    simulation.

    Returns (stree, ltree, ex): the scaled species tree plus the two
    values returned by sim_DLILS_gene_tree.
    """
    stree = treelib.read_tree('../examples/flies.stree')
    for node in stree:
        node.dist *= 1e7 # gen per myr
    popsize = 2e7
    freq = 1e0
    dr = .0012/1e7
    lr = .0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7

    ltree, ex = sim_DLILS_gene_tree(stree, popsize, freq, dr, lr,
                                    freqdup, freqloss, forcetime)

    # the simulated tree is bound to `ltree`; returning the undefined name
    # `gtree` raised NameError
    return stree, ltree, ex
    def test_read_tree(self):
        """Test reading tree structure."""
        tree = treelib.read_tree(StringIO(fungi2))

        # map every node name to its parent's name (None for the root)
        ptree = {}
        for node in tree:
            ptree[node.name] = node.parent.name if node.parent else None

        ptree_expected = {
            # internal nodes
            1: None, 'xx': 1,
            2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6,
            8: 2, 9: 8,
            10: 'xx', 11: 10, 12: 10,
            13: 'xx', 14: 13,
            # leaves
            'scer': 7, 'spar': 7, 'smik': 6, 'sbay': 5, 'cgla': 4,
            'scas': 3, 'kwal': 8, 'agos': 9, 'klac': 9,
            'calb': 11, 'ctro': 11, 'cpar': 12, 'lelo': 12,
            'clus': 13, 'cgui': 14, 'dhan': 14,
        }
        self.assertEqual(ptree, ptree_expected)

        # writing the tree back out must reproduce the input text
        written = tree.get_one_line_newick(writeData=treelib.write_nhx_data)
        self.assertEqual(written, fungi2)
    def test_nhx_big(self):
        """Test parsing of big NHX comments."""
        text = """(CFTR_GASAC:0.028272[&&NHX:S=GASAC:O=ENSGACT00000011967.1:T=69293:G=ENSGACG00000009039],((((((((((((((((((CFTR_HUMAN:0.002013[&&NHX:S=HUMAN:O=ENST00000003084.5:T=9606:G=ENSG00000001626],CFTR_PANTR:0.001342[&&NHX:S=PANTR:O=ENSPTRT00000036339.2:T=9598:G=ENSPTRG00000019619]):0.001545,CFTR_PONPY:0.006514[&&NHX:S=PONPY:O=ENSPPYT00000020909.1:T=9600:G=ENSPPYG00000017940]):0.003539,CFTR_MACMU:0.008416[&&NHX:S=MACMU:O=ENSMMUT00000015762.2:T=9544:G=ENSMMUG00000011269]):0.022751,CFTR_TUPGB:0.110613[&&NHX:S=TUPGB:O=ENSTBET00000011046.1:T=37347:G=ENSTBEG00000010974]):0.006474,((CFTR_OTOGA:0.035577[&&NHX:S=OTOGA:O=ENSOGAT00000001759.1:T=30611:G=ENSOGAG00000001756],CFTR_MICMU:0.026588[&&NHX:S=MICMU:O=ENSMICT00000005779.1:T=30608:G=ENSMICG00000005761]):0.010514,CFTR_MYOLU:0.06919[&&NHX:S=MYOLU:O=ENSMLUT00000012267.1:T=59463:G=ENSMLUG00000012244]):0.00395):0.001879,(CFTR_ECHTE:0.065629[&&NHX:S=ECHTE:O=ENSETET00000000538.1:T=9371:G=ENSETEG00000000537],CFTR_LOXAF:0.050347[&&NHX:S=LOXAF:O=ENSLAFT00000005758.1:T=9785:G=ENSLAFG00000005753]):0.016592):0.002471,((CFTR_SORAR:0.056771[&&NHX:S=SORAR:O=ENSSART00000012124.1:T=42254:G=ENSSARG00000012121],CFTR_ERIEU:0.043527[&&NHX:S=ERIEU:O=ENSEEUT00000006570.1:T=9365:G=ENSEEUG00000006484]):0.015585,CFTR_DASNO:0.047157[&&NHX:S=DASNO:O=ENSDNOT00000016544.1:T=9361:G=ENSDNOG00000016541]):0.00431):0.005677,(CFTR_F2_HORSE:0.016035[&&NHX:S=HORSE:O=ENSECAT00000010738.1:T=9796:G=ENSECAG00000009139],((CFTR_CANFA:0.047251[&&NHX:S=CANFA:O=ENSCAFT00000005518.2:T=9615:G=ENSCAFG00000003429],Q9N1D7_FELCA:0.025264[&&NHX:S=FELCA:O=ENSFCAT00000014959.2:T=9685:G=ENSFCAG00000014955]):0.022297,CFTR_BOVIN:0.062409[&&NHX:S=BOVIN:O=ENSBTAT00000053450.1:T=9913:G=ENSBTAG00000006589]):0.00767):0.004191):0.006209,(CFTR_F2_CAVPO:0.136979[&&NHX:S=CAVPO:O=ENSCPOT00000012891.1:T=10141:G=ENSCPOG00000012767],CFTR_SPETR:0.026944[&&NHX:S=SPETR:O=ENSSTOT00000005733.1:T=43179:G=ENSSTOG00000005707]):0.009628):0.007329,(Q29399_RABIT:0.027324[&&NHX:S=RABIT:O=ENSOCUT00
000010738.1:T=9986:G=ENSOCUG00000010733],CFTR_OCHPR:0.050953[&&NHX:S=OCHPR:O=ENSOPRT00000014760.1:T=9978:G=ENSOPRG00000014721]):0.017472):0.011797,(Cftr_MOUSE:0.035769[&&NHX:S=MOUSE:O=ENSMUST00000045706.4:T=10090:G=ENSMUSG00000041301],Cftr_RAT:0.049345[&&NHX:S=RAT:O=ENSRNOT00000010981.4:T=10116:G=ENSRNOG00000008284]):0.158692):0.033423,Q2QL94_MONDO:0.08197[&&NHX:S=MONDO:O=ENSMODT00000020031.2:T=13616:G=ENSMODG00000015771]):0.026265,CFTR_ORNAN:0.094961[&&NHX:S=ORNAN:O=ENSOANT00000013974.1:T=9258:G=ENSOANG00000008767]):0.03792,A0M8U4_CHICK:0.119618[&&NHX:S=CHICK:O=ENSGALT00000015182.3:T=9031:G=ENSGALG00000009324]):0.033083,CFTR_XENTR:0.130489[&&NHX:S=XENTR:O=ENSXETT00000047145.1:T=8364:G=ENSXETG00000021796]):0.352249,si_dkey-270i2_F3_BRARE:0.203525[&&NHX:S=BRARE:O=ENSDART00000100729.1:T=7955:G=ENSDARG00000041107]):0.063334,CFTR_ORYLA:0.123603[&&NHX:S=ORYLA:O=ENSORLT00000024332.1:T=8090:G=ENSORLG00000019555]):0.034773,CFTR_TETNG:0.049086[&&NHX:S=TETNG:O=ENSTNIT00000019381.1:T=99883:G=ENSTNIG00000016063]):0.028272)[&&NHX:Loglk=-24078.827174:RatioCons=0.000000;:LoglkSpec=0.000000];"""  # nopep8
        tree = read_tree(StringIO(text))
        expected = {
            29: {},
            30: {},
            'CFTR_MACMU': {'O': 'ENSMMUT00000015762.2',
                           'S': 'MACMU',
                           'T': '9544',
                           'G': 'ENSMMUG00000011269'},
        }

        for name, data in expected.items():
            self.assertEqual(tree[name].data, data)
Ejemplo n.º 21
0
    def read(self, filename, stree,
             exts={"coal_tree": ".coal.tree",
                   "coal_recon": ".coal.recon",
                   "locus_tree": ".locus.tree",
                   "locus_recon": ".locus.recon",
                   "daughters": ".daughters"},
             filenames={},
             check=True):
        """Reads a reconciled gene tree from files

        Paths come from `filenames[kind]` when present, otherwise from
        `filename + exts[kind]`.  Sets self.locus_tree, self.coal_recon,
        self.locus_recon, self.locus_events and self.daughters; with
        `check` true it also asserts self.is_valid(coal_tree).

        Returns (coal_tree, self.get_dict()).
        """

        def source(kind):
            # an entry in `filenames` overrides prefix + extension
            return filenames.get(kind, filename + exts[kind])

        # trees
        coal_tree = treelib.read_tree(source("coal_tree"))
        self.locus_tree = treelib.read_tree(source("locus_tree"))

        # recons; the coal recon's events are read but not stored
        self.coal_recon, _dropped = phylo.read_recon_events(
            source("coal_recon"), coal_tree, self.locus_tree)
        self.locus_recon, self.locus_events = phylo.read_recon_events(
            source("locus_recon"), self.locus_tree, stree)

        # daughters: names in the file resolved to locus tree nodes
        names = util.read_strings(source("daughters"))
        self.daughters = set(self.locus_tree.nodes[n] for n in names)

        if check:
            assert self.is_valid(coal_tree)

        return coal_tree, self.get_dict()
Ejemplo n.º 22
0
def debug_test2():
    """Sample a coal tree on the flies species tree and draw it.

    The species tree's branch lengths are scaled by 1e7, then
    sample_dlcoal_no_ifix is called and the resulting tree is drawn.
    """
    # run from ../ of this directory
    stree = treelib.read_tree('examples/flies.stree')
    for branch in stree:
        branch.dist *= 1e7  # gen per myr

    # simulation settings, passed by keyword below
    settings = {
        "n": 2e7,
        "freq": 1e0,
        "duprate": .0012/1e7,
        "lossrate": .0006/1e7,
        "freqdup": .05,
        "freqloss": .05,
        "forcetime": 1e7,
    }

#    ltree, ex = sim_DLILS_gene_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime)

    coal_tree, _extras = sample_dlcoal_no_ifix(stree=stree, **settings)

    treelib.draw_tree(coal_tree, scale=.00000005)
Ejemplo n.º 23
0
    def test_nhx_big(self):
        """Test parsing of big NHX comments."""
        text = """(CFTR_GASAC:0.028272[&&NHX:S=GASAC:O=ENSGACT00000011967.1:T=69293:G=ENSGACG00000009039],((((((((((((((((((CFTR_HUMAN:0.002013[&&NHX:S=HUMAN:O=ENST00000003084.5:T=9606:G=ENSG00000001626],CFTR_PANTR:0.001342[&&NHX:S=PANTR:O=ENSPTRT00000036339.2:T=9598:G=ENSPTRG00000019619]):0.001545,CFTR_PONPY:0.006514[&&NHX:S=PONPY:O=ENSPPYT00000020909.1:T=9600:G=ENSPPYG00000017940]):0.003539,CFTR_MACMU:0.008416[&&NHX:S=MACMU:O=ENSMMUT00000015762.2:T=9544:G=ENSMMUG00000011269]):0.022751,CFTR_TUPGB:0.110613[&&NHX:S=TUPGB:O=ENSTBET00000011046.1:T=37347:G=ENSTBEG00000010974]):0.006474,((CFTR_OTOGA:0.035577[&&NHX:S=OTOGA:O=ENSOGAT00000001759.1:T=30611:G=ENSOGAG00000001756],CFTR_MICMU:0.026588[&&NHX:S=MICMU:O=ENSMICT00000005779.1:T=30608:G=ENSMICG00000005761]):0.010514,CFTR_MYOLU:0.06919[&&NHX:S=MYOLU:O=ENSMLUT00000012267.1:T=59463:G=ENSMLUG00000012244]):0.00395):0.001879,(CFTR_ECHTE:0.065629[&&NHX:S=ECHTE:O=ENSETET00000000538.1:T=9371:G=ENSETEG00000000537],CFTR_LOXAF:0.050347[&&NHX:S=LOXAF:O=ENSLAFT00000005758.1:T=9785:G=ENSLAFG00000005753]):0.016592):0.002471,((CFTR_SORAR:0.056771[&&NHX:S=SORAR:O=ENSSART00000012124.1:T=42254:G=ENSSARG00000012121],CFTR_ERIEU:0.043527[&&NHX:S=ERIEU:O=ENSEEUT00000006570.1:T=9365:G=ENSEEUG00000006484]):0.015585,CFTR_DASNO:0.047157[&&NHX:S=DASNO:O=ENSDNOT00000016544.1:T=9361:G=ENSDNOG00000016541]):0.00431):0.005677,(CFTR_F2_HORSE:0.016035[&&NHX:S=HORSE:O=ENSECAT00000010738.1:T=9796:G=ENSECAG00000009139],((CFTR_CANFA:0.047251[&&NHX:S=CANFA:O=ENSCAFT00000005518.2:T=9615:G=ENSCAFG00000003429],Q9N1D7_FELCA:0.025264[&&NHX:S=FELCA:O=ENSFCAT00000014959.2:T=9685:G=ENSFCAG00000014955]):0.022297,CFTR_BOVIN:0.062409[&&NHX:S=BOVIN:O=ENSBTAT00000053450.1:T=9913:G=ENSBTAG00000006589]):0.00767):0.004191):0.006209,(CFTR_F2_CAVPO:0.136979[&&NHX:S=CAVPO:O=ENSCPOT00000012891.1:T=10141:G=ENSCPOG00000012767],CFTR_SPETR:0.026944[&&NHX:S=SPETR:O=ENSSTOT00000005733.1:T=43179:G=ENSSTOG00000005707]):0.009628):0.007329,(Q29399_RABIT:0.027324[&&NHX:S=RABIT:O=ENSOCUT00
000010738.1:T=9986:G=ENSOCUG00000010733],CFTR_OCHPR:0.050953[&&NHX:S=OCHPR:O=ENSOPRT00000014760.1:T=9978:G=ENSOPRG00000014721]):0.017472):0.011797,(Cftr_MOUSE:0.035769[&&NHX:S=MOUSE:O=ENSMUST00000045706.4:T=10090:G=ENSMUSG00000041301],Cftr_RAT:0.049345[&&NHX:S=RAT:O=ENSRNOT00000010981.4:T=10116:G=ENSRNOG00000008284]):0.158692):0.033423,Q2QL94_MONDO:0.08197[&&NHX:S=MONDO:O=ENSMODT00000020031.2:T=13616:G=ENSMODG00000015771]):0.026265,CFTR_ORNAN:0.094961[&&NHX:S=ORNAN:O=ENSOANT00000013974.1:T=9258:G=ENSOANG00000008767]):0.03792,A0M8U4_CHICK:0.119618[&&NHX:S=CHICK:O=ENSGALT00000015182.3:T=9031:G=ENSGALG00000009324]):0.033083,CFTR_XENTR:0.130489[&&NHX:S=XENTR:O=ENSXETT00000047145.1:T=8364:G=ENSXETG00000021796]):0.352249,si_dkey-270i2_F3_BRARE:0.203525[&&NHX:S=BRARE:O=ENSDART00000100729.1:T=7955:G=ENSDARG00000041107]):0.063334,CFTR_ORYLA:0.123603[&&NHX:S=ORYLA:O=ENSORLT00000024332.1:T=8090:G=ENSORLG00000019555]):0.034773,CFTR_TETNG:0.049086[&&NHX:S=TETNG:O=ENSTNIT00000019381.1:T=99883:G=ENSTNIG00000016063]):0.028272)[&&NHX:Loglk=-24078.827174:RatioCons=0.000000;:LoglkSpec=0.000000];"""  # nopep8
        tree = read_tree(StringIO(text))
        expected = {
            29: {},
            30: {},
            'CFTR_MACMU': {
                'O': 'ENSMMUT00000015762.2',
                'S': 'MACMU',
                'T': '9544',
                'G': 'ENSMMUG00000011269'
            },
        }

        for name, data in expected.items():
            self.assertEqual(tree[name].data, data)
    def __init__(self, dbfile=None, famfile=None, smapfile=None,
                 genenamefile=None, streefile=None,
                 baseDir=None,
                 treeFileExt=None,
                 fastaFileExt=None):
        """Load the family, mapping, gene-name and tree data; open the DB.

        dbfile       -- path handed to sqlite.connect
        famfile      -- handed to genecluster.FamilyDb
        smapfile     -- handed to phylo.read_gene2species
        genenamefile -- table read with tablelib.read_table, indexed by "id"
        streefile    -- species tree read with treelib.read_tree
        baseDir, treeFileExt, fastaFileExt -- stored unchanged
        """
        # plain settings
        self.baseDir = baseDir
        self.treeFileExt = treeFileExt
        self.fastaFileExt = fastaFileExt

        # data files
        self.fams = genecluster.FamilyDb(famfile)
        self.gene2species = phylo.read_gene2species(smapfile)
        genenames = tablelib.read_table(genenamefile)
        self.genenames_tab = genenames
        self.gene2name = genenames.lookup("id")
        self.stree = treelib.read_tree(streefile)

        # open database
        connection = sqlite.connect(dbfile, isolation_level="DEFERRED")
        self.con = connection
        self.cur = connection.cursor()
Ejemplo n.º 25
0
def debug_test3():
    """Debug driver: simulate a locus tree and sample a coal tree on it.

    Reads 'examples/nbin.stree', scales its branch lengths by 1e7, runs
    sim_DLILS_gene_tree, converts the result with locus_to_logged_tree and
    samples a coal tree with dlcoal.sample_locus_coal_tree.  The coal tree
    is then checked with treelib.assert_tree; on failure, diagnostics
    about repeated node names are printed.  Nothing is returned.
    """
    stree = treelib.read_tree('examples/nbin.stree') # run from ../ of this directory
    for node in stree:
        node.dist *= 1e7 # gen per myr
    popsize = 2e7
    freq = 1e0
    dr = .0000012 / 1e7 #.0012/1e7
    lr = .0000011 / 1e7 #.0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7
    
    # dump the species tree: name, branch length, number of children
    for node in stree:
        print node.name, node.dist, len(node.children)
    print
    
    locus_tree, locus_extras = sim_DLILS_gene_tree(stree, popsize, freq, \
                                                        dr, lr, \
                                                        freqdup, freqloss, \
                                                        forcetime)
    
    # dump the simulated locus tree in the same format
    for node in locus_tree:
        print node.name, node.dist, len(node.children)
    print
    
    # logged_extras is indexed positionally: [0] daughters, [1] pops,
    # [2] a per-node lookup used to build the leaf names below
    logged_locus_tree, logged_extras = locus_to_logged_tree(locus_tree, popsize)
    daughters = logged_extras[0]
    pops = logged_extras[1]
    
    coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(logged_locus_tree,
                                    n=pops, daughters=daughters,
                                    namefunc=lambda x: logged_extras[2][x] + '_' + str(x))
    
    #begin debug
    print coal_tree.leaf_names()
    try:
#        print set(coal_tree) - set(coal_tree.postorder())
        treelib.assert_tree(coal_tree)
    except AssertionError:
        print 'assertion error thrown on coal_tree being a proper tree'
        from rasmus import util
        # count how often each node name occurs in a postorder walk
        hd= util.hist_dict(x.name for x in coal_tree.postorder())
        # print the names seen more than once (trailing comma keeps them on one line)
        for key in hd.keys():
            print key if hd[key]>1 else '',
        print
        # difference between registered nodes and nodes reached by the walk
        print len(coal_tree.nodes) - len(list(coal_tree.postorder()))
Ejemplo n.º 26
0
def consense_from_file(intrees, verbose=True, args="y"):
    """Run PHYLIP `consense` on the trees in `intrees`.

    intrees -- file name or stream accepted by util.open_stream; all of
               its lines are passed to consense
    verbose -- passed through to exec_phylip
    args    -- argument string passed through to exec_phylip

    Returns (tree, ntrees): the tree read from consense's "outtree" file
    and the number of lines read from `intrees`.
    """

    # read all trees
    trees = util.open_stream(intrees).readlines()
    ntrees = len(trees)

    # "intree"/"outtree" are relative names -- presumably create_temp_dir
    # makes the temp dir the working directory (TODO confirm).  Cleanup now
    # runs in `finally` so it also happens when consense or the reads fail.
    cwd = create_temp_dir()
    try:
        with open("intree", "w") as out:
            for tree in trees:
                out.write(tree)

        exec_phylip("consense", args, verbose)

        tree = treelib.read_tree("outtree")
    finally:
        cleanup_temp_dir(cwd)

    return tree, ntrees
Ejemplo n.º 27
0
def consense_from_file(intrees, verbose=True, args="y"):
    """Build a consensus tree by running PHYLIP `consense`.

    intrees -- file name or stream accepted by util.open_stream; every
               line read from it is written to consense's "intree" file
    verbose -- passed through to exec_phylip
    args    -- argument string passed through to exec_phylip

    Returns (tree, ntrees): the tree read from "outtree" and the number
    of lines read from `intrees`.
    """

    # read all trees
    trees = util.open_stream(intrees).readlines()
    ntrees = len(trees)

    cwd = create_temp_dir()
    try:
        # the with-block closes "intree" even if a write fails
        with open("intree", "w") as out:
            out.writelines(trees)

        exec_phylip("consense", args, verbose)

        tree = treelib.read_tree("outtree")
    finally:
        # always undo create_temp_dir, also when consense or a read failed
        cleanup_temp_dir(cwd)

    return tree, ntrees
Ejemplo n.º 28
0
    def test_read_tree(self):
        """Test reading tree structure."""
        tree = treelib.read_tree(StringIO(fungi2))

        def parent_name(node):
            # the root has no parent
            return node.parent.name if node.parent else None

        ptree = dict((node.name, parent_name(node)) for node in tree)
        ptree_expected = {
            1: None, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 2, 9: 8,
            10: 'xx', 11: 10, 12: 10, 13: 'xx', 14: 13, 'xx': 1,
            'sbay': 5, 'scer': 7, 'ctro': 11, 'scas': 3, 'agos': 9,
            'kwal': 8, 'dhan': 14, 'smik': 6, 'cgla': 4, 'spar': 7,
            'calb': 11, 'lelo': 12, 'cpar': 12, 'klac': 9, 'clus': 13,
            'cgui': 14,
        }
        self.assertEqual(ptree, ptree_expected)

        # round trip: the written newick must equal the input text
        roundtrip = tree.get_one_line_newick(
            writeData=treelib.write_nhx_data)
        self.assertEqual(roundtrip, fungi2)
Ejemplo n.º 29
0
    def __init__(self, dbfile=None, famfile=None, smapfile=None,
                 genenamefile=None, streefile=None, baseDir=None,
                 treeFileExt=None, fastaFileExt=None):
        """Read the input data files and open the sqlite database.

        dbfile       -- database path given to sqlite.connect
        famfile      -- given to genecluster.FamilyDb
        smapfile     -- given to phylo.read_gene2species
        genenamefile -- table file; also indexed by its "id" column
        streefile    -- species tree file for treelib.read_tree
        baseDir, treeFileExt, fastaFileExt -- kept as attributes as-is
        """
        # values that are simply remembered
        self.baseDir, self.treeFileExt, self.fastaFileExt = (
            baseDir, treeFileExt, fastaFileExt)

        # inputs loaded from files
        self.fams = genecluster.FamilyDb(famfile)
        self.gene2species = phylo.read_gene2species(smapfile)
        table = tablelib.read_table(genenamefile)
        self.genenames_tab = table
        self.gene2name = table.lookup("id")
        self.stree = treelib.read_tree(streefile)

        # open database
        db = sqlite.connect(dbfile, isolation_level="DEFERRED")
        self.con = db
        self.cur = db.cursor()
Ejemplo n.º 30
0
    def test_tree_namefunc(self):
        """Test reading/writing tree with namefunc."""

        seen = []

        def namefunc(name):
            # number the names in the order namefunc is called
            seen.append(name)
            return 'name%d' % len(seen)

        tree = treelib.read_tree(StringIO(fungi2), namefunc=namefunc)
        expected_newick = '(((((((name1:7.061760,name2:7.061760):4.999680,name3:12.061440):5.970600,name4:18.032040):52.682400,name5:70.714260):7.220700,name6:77.934960):23.181480,((name7:78.553260,name8:78.553260):10.434960,name9:88.988220):12.128400):78.883560,(((name10:41.275620,name11:41.275980):29.632860,(name12:52.323120,name13:52.323120):18.585720):31.149540,((name14:75.615840,name15:75.615840):14.006880,name16:89.622720):12.435660)xx:77.941620);'  # nopep8
        written = tree.get_one_line_newick()

        self.assertEqual(written, expected_newick)

        # a namefunc given at write time rewrites the names on output
        add_prefix = lambda name: 'prefix_' + name

        expected_newick2 = '(((((((prefix_name1:7.061760,prefix_name2:7.061760):4.999680,prefix_name3:12.061440):5.970600,prefix_name4:18.032040):52.682400,prefix_name5:70.714260):7.220700,prefix_name6:77.934960):23.181480,((prefix_name7:78.553260,prefix_name8:78.553260):10.434960,prefix_name9:88.988220):12.128400):78.883560,(((prefix_name10:41.275620,prefix_name11:41.275980):29.632860,(prefix_name12:52.323120,prefix_name13:52.323120):18.585720):31.149540,((prefix_name14:75.615840,prefix_name15:75.615840):14.006880,prefix_name16:89.622720):12.435660)xx:77.941620);'  # nopep8
        written2 = tree.get_one_line_newick(namefunc=add_prefix)

        self.assertEqual(written2, expected_newick2)
Ejemplo n.º 31
0
    def test_write_tree(self):
        """Test tree writing
           Test root data writing
        """

        # multi-line newick, one element per line, with a trailing newline
        expected_text = (
            "(\n"
            " (\n"
            "  a:1.000000,\n"
            "  b:2.000000\n"
            " )x:3.000000,\n"
            " (\n"
            "  c:4.000000,\n"
            "  d:5.000000\n"
            " )y:6.000000\n"
            ")rra:0.000000;\n"
        )
        tree = read_tree(StringIO(expected_text))

        # writing with rootData=True must reproduce the input exactly
        sink = StringIO()
        tree.write(sink, rootData=True)
        self.assertEqual(expected_text, sink.getvalue())
    def test_write_tree(self):
        """Test tree writing
           Test root data writing
        """

        newick = '''(
 (
  a:1.000000,
  b:2.000000
 )x:3.000000,
 (
  c:4.000000,
  d:5.000000
 )y:6.000000
)rra:0.000000;
'''
        infile = StringIO(newick)
        tree = read_tree(infile)

        out = StringIO()
        tree.write(out, rootData=True)
        self.assertEqual(newick, out.getvalue())
    def test_tree_namefunc(self):
        """Test reading/writing tree with namefunc."""

        calls = []

        def namefunc(name):
            # every call yields the next 'nameN', N counting from 1
            calls.append(None)
            return 'name%d' % len(calls)

        def namefunc2(name):
            return 'prefix_' + name

        tree = treelib.read_tree(StringIO(fungi2), namefunc=namefunc)

        # names produced at read time appear in the written newick
        newick = tree.get_one_line_newick()
        expected_newick = '(((((((name1:7.061760,name2:7.061760):4.999680,name3:12.061440):5.970600,name4:18.032040):52.682400,name5:70.714260):7.220700,name6:77.934960):23.181480,((name7:78.553260,name8:78.553260):10.434960,name9:88.988220):12.128400):78.883560,(((name10:41.275620,name11:41.275980):29.632860,(name12:52.323120,name13:52.323120):18.585720):31.149540,((name14:75.615840,name15:75.615840):14.006880,name16:89.622720):12.435660)xx:77.941620);'  # nopep8
        self.assertEqual(newick, expected_newick)

        # names can be rewritten again at write time
        newick2 = tree.get_one_line_newick(namefunc=namefunc2)
        expected_newick2 = '(((((((prefix_name1:7.061760,prefix_name2:7.061760):4.999680,prefix_name3:12.061440):5.970600,prefix_name4:18.032040):52.682400,prefix_name5:70.714260):7.220700,prefix_name6:77.934960):23.181480,((prefix_name7:78.553260,prefix_name8:78.553260):10.434960,prefix_name9:88.988220):12.128400):78.883560,(((prefix_name10:41.275620,prefix_name11:41.275980):29.632860,(prefix_name12:52.323120,prefix_name13:52.323120):18.585720):31.149540,((prefix_name14:75.615840,prefix_name15:75.615840):14.006880,prefix_name16:89.622720):12.435660)xx:77.941620);'  # nopep8
        self.assertEqual(newick2, expected_newick2)
Ejemplo n.º 34
0
    def read(self, filename, stree,
             exts={"tree" : ".tree",
                   "recon" : ".recon",
                   "order" : ".order"},
             filenames={}):
        """Read the reconciliation from a file

        Each input path is `filenames[kind]` when given, otherwise
        `filename + exts[kind]`.

        filename  -- common path prefix of the input files
        stree     -- species tree; its nodes are looked up by name
        exts      -- mapping from file kind to extension
        filenames -- optional mapping from file kind to an explicit path

        Sets self.species_map and self.locus_map (both keyed by gene tree
        node) and self.order (species node -> locus -> list of gene tree
        nodes).  Returns (gtree, self.get_dict()).
        """

        def path_for(kind):
            # explicit filename wins over prefix + extension
            return filenames.get(kind, filename + exts[kind])

        def as_name(token):
            # all-digit names are stored as ints, everything else as str
            return int(token) if token.isdigit() else token

        gtree = treelib.read_tree(path_for("tree"))

        # recon file rows: gene node name, species node name, locus
        self.species_map = {}
        self.locus_map = {}
        for name, sname, locus in util.read_delim(path_for("recon")):
            name = as_name(name)
            sname = as_name(sname)
            assert locus.isdigit()
            locus = int(locus)

            node = gtree.nodes[name]
            self.species_map[node] = stree.nodes[sname]
            self.locus_map[node] = locus

        # order file rows: species node name, locus, comma-separated names
        self.order = {}
        for toks in util.read_delim(path_for("order")):
            sname, locus, lst = toks[0], toks[1], toks[2].split(',')
            sname = as_name(sname)
            assert locus.isdigit()
            locus = int(locus)
            names = [as_name(x) for x in lst]

            snode = stree.nodes[sname]
            # build real lists: a bare map() would store a one-shot
            # iterator on Python 3
            nodes = [gtree.nodes[x] for x in names]
            self.order.setdefault(snode, {})[locus] = nodes

        return gtree, self.get_dict()
    def test_nhx(self):
        """Test parsing of NHX comments."""

        nhx_text = """(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1], ADH1:0.11[&&NHX:S=human:E=1.1.1.1]):0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100], ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],ADHX:0.12[&&NHX:S=insect:E=1.1.1.1]):0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N], (ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1], ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1]):0.1 [&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N];"""  # nopep8
        tree = read_tree(StringIO(nhx_text))

        # expected data dict per node name; the text repeats the names ADH1
        # and ADH2, and 'ADH1_1'/'ADH2_1' are expected for the repeats
        expected = {
            # numbered (internal) nodes
            1: {'E': '1.1.1.1', 'D': 'N'},
            2: {'S': 'Metazoa', 'E': '1.1.1.1', 'D': 'N'},
            3: {'S': 'Primates', 'B': '100', 'E': '1.1.1.1', 'D': 'Y'},
            4: {'S': 'Fungi'},
            # named leaves
            'ADH1': {'S': 'human', 'E': '1.1.1.1'},
            'ADH2': {'S': 'human', 'E': '1.1.1.1'},
            'ADHX': {'S': 'insect', 'E': '1.1.1.1'},
            'ADHY': {'S': 'nematode', 'E': '1.1.1.1'},
            'ADH1_1': {'S': 'yeast', 'E': '1.1.1.1'},
            'ADH2_1': {'S': 'yeast', 'E': '1.1.1.1'},
            'ADH3': {'S': 'yeast', 'E': '1.1.1.1'},
            'ADH4': {'S': 'yeast', 'E': '1.1.1.1'},
        }

        parsed = {}
        for node in tree:
            parsed[node.name] = node.data

        for key in expected:
            self.assertEqual(parsed[key], expected[key])
Ejemplo n.º 36
0
def get_branch_lens(trees, stree, gene2species=gene2species):
    """Collect gene-tree branch lengths into a table keyed by species node.

    trees        -- iterable of trees; a str entry is passed to
                    treelib.read_tree first
    stree        -- species tree the gene trees are reconciled against
    gene2species -- mapping function handed to reconcile()

    Returns a tablelib.Table with one row per tree and one column per
    non-root species-tree node name.  An AssertionError is raised for any
    tree whose labelled events contain "dup".
    """
    # determine species names (every species-tree node except the root)
    root_name = str(stree.root.name)
    species = [str(name) for name in stree.nodes.keys()]
    species.remove(root_name)

    # make rates table
    rates = tablelib.Table(headers=species)

    # loop through trees
    for tree in trees:
        if isinstance(tree, str):
            tree = treelib.read_tree(tree)
        recon = reconcile(tree, stree, gene2species)
        events = label_events(tree, recon)

        # trees with duplications are not skipped: they fail the assertion
        assert "dup" not in events.values()

        # branch length of each gene node, filed under the species node it
        # reconciles to
        rates.append(dict((str(recon[node].name), node.dist)
                          for node in tree.nodes.values()))

    return rates
Ejemplo n.º 37
0
 
 
 def gene2species(name):
     """Return the species label of a gene: its first character, upper-cased."""
     first_char = name[:1]
     return first_char.upper()
 
 
 # Per-species parameter pairs; each list is passed as the second argument
 # to stats.normalPdf below (presumably [mean, sdev] -- TODO confirm).
 params = {"A": [4, 2],
           "B": [3, 1]}
           
 # Configuration dict handed to treeLogLikelihood.
 conf = {"debug": 0,
         "dupprob": .5,
         "lossprob": 1.0}
 
 
 
 # Two-leaf species tree shared by the tests below.
 stree = treelib.read_tree(StringIO.StringIO("(A, B);"))
 
 
 # test 1
 # One gene per species with branch lengths 3 and 2; the log-likelihood is
 # compared against the log of the product of the two per-branch normal
 # densities.
 print "\n\nTest 1"
 tree  = treelib.read_tree(StringIO.StringIO("(a:3, b:2);"))
 logl = treeLogLikelihood(conf, tree, stree, gene2species, params, baserate=1)
 
 treelib.draw_tree_lens(tree,scale=5)
 floateq(logl, log(stats.normalPdf(3, params["A"]) *
                   stats.normalPdf(2, params["B"])))
 
 
 # test 2
 # Gene tree with two "a" genes (a1, a2) grouped together and one "b" gene.
 print "\n\nTest 2"    
 tree  = treelib.read_tree(StringIO.StringIO("((a1:2.5, a2:2):1, b:2);"))
Ejemplo n.º 38
0
def spidir(conf, distmat, labels, stree, gene2species, params):
    """Main function for the SPIDIR algorithm.

    Runs every search strategy named in conf["search"] in order, then scores
    the user-supplied trees (conf["tree"]) and species-tree topologies
    (conf["tops"]), and finally returns the best tree seen.

    conf keys read here: "debug", "search", "tree", "tops"; optionally
    "out" (prefix for a ".debug.tab" table), "correcttree", "correcthash",
    and "depth" (only for the "exhaustive" search).

    Returns a (tree, logl) pair: the visited tree with the largest
    tree.data["logl"] and that value.

    Raises SindirError for an unknown search name, or when no tree was
    visited at all.
    """

    setDebug(conf["debug"])

    if isDebug(DEBUG_HIGH) and pyspidir:
        pyspidir.set_log(3, "")

    if "out" in conf:
        # create debug table; the file handle is stored in conf and is
        # deliberately left open for later writers
        conf["debugtab_file"] = open(conf["out"] + ".debug.tab", "w")

        debugtab = tablelib.Table(headers=["correct",
                                           "logl", "treelen", "baserate",
                                           "error", "errorlogl",
                                           "eventlogl", "tree",
                                           "topology", "species_hash"],
                                  types={"correct": bool,
                                         "logl": float,
                                         "treelen": float,
                                         "baserate": float,
                                         "error": float,
                                         "errorlogl": float,
                                         "eventlogl": float,
                                         "tree": str,
                                         "topology": str,
                                         "species_hash": str})
        debugtab.writeHeader(conf["debugtab_file"])
        conf["debugtab"] = debugtab
    else:
        # NOTE(review): this sets "debugfile" while the branch above sets
        # "debugtab_file" -- confirm which key downstream code reads.
        conf["debugfile"] = None

    tree = None
    visited = {}   # tree hash -> [logl, tree copy, visit count]

    util.tic("SPIDIR")

    # do auto searches; each search starts from the tree left by the last
    for search in conf["search"]:
        util.tic("Search by %s" % search)

        if search == "greedy":
            tree, logl = Search.searchGreedy(conf, distmat, labels, stree,
                                             gene2species, params,
                                             visited=visited)

        elif search == "mcmc":
            tree, logl = Search.searchMCMC(conf, distmat, labels, stree,
                                           gene2species, params,
                                           initTree=tree,
                                           visited=visited)

        elif search == "regraft":
            tree, logl = Search.searchRegraft(conf, distmat, labels, stree,
                                              gene2species, params,
                                              initTree=tree,
                                              visited=visited,
                                              proposeFunc=Search.proposeTree3)

        elif search == "exhaustive":
            if tree is None:
                # no earlier search produced a start tree: build one by
                # neighbor joining and root it against the species tree
                tree = phylo.neighborjoin(distmat, labels)
                tree = phylo.recon_root(tree, stree, gene2species)

            tree, logl = Search.searchExhaustive(conf, distmat, labels,
                                                 tree, stree,
                                                 gene2species, params,
                                                 depth=conf["depth"],
                                                 visited=visited)

        elif search == "hillclimb":
            tree, logl = Search.searchHillClimb(conf, distmat, labels, stree,
                                                gene2species, params,
                                                initTree=tree,
                                                visited=visited)

        elif search == "none":
            # close this iteration's timer before leaving the loop so the
            # final util.toc() pairs with the "SPIDIR" tic
            util.toc()
            break

        else:
            raise SindirError("unknown search '%s'" % search)

        util.toc()

        Search.printMCMC(conf, "N/A", tree, stree, gene2species, visited)

        printVisitedTrees(visited)

    def evalUserTree(tree):
        """Score one user tree and record it in `visited`."""
        setTreeDistances(conf, tree, distmat, labels)
        logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

        thash = phylo.hash_tree(tree)
        if thash in visited:
            count = visited[thash][2]
        else:
            count = 0
        visited[thash] = [logl, tree.copy(), count + 1]

        if isDebug(DEBUG_LOW):
            debug("\nuser given tree:")
            recon = phylo.reconcile(tree, stree, gene2species)
            events = phylo.label_events(tree, recon)
            drawTreeLogl(tree, events=events)

    # eval the user given trees
    for treefile in conf["tree"]:
        tree = treelib.read_tree(treefile)
        evalUserTree(tree)

    # eval the user given topologies (each file may hold several trees)
    for topfile in conf["tops"]:
        strees = []

        with open(topfile) as infile:
            while True:
                try:
                    strees.append(treelib.read_tree(infile))
                except Exception:
                    # a failed read is taken to mean "no more trees"
                    break

        print(len(strees))

        for top in strees:
            tree = phylo.stree2gtree(top, labels, gene2species)
            evalUserTree(tree)

    if len(conf["tops"]) > 0:
        printVisitedTrees(visited)

    # eval correcttree for debug only; it is scored but not added to visited
    if "correcttree" in conf:
        tree = conf["correcttree"]
        setTreeDistances(conf, tree, distmat, labels)
        treeLogLikelihood(conf, tree, stree, gene2species, params)

        if isDebug(DEBUG_LOW):
            debug("\ncorrect tree:")
            recon = phylo.reconcile(tree, stree, gene2species)
            events = phylo.label_events(tree, recon)
            drawTreeLogl(tree, events=events)

    util.toc()

    if not visited:
        raise SindirError("No search or tree topologies given")

    if "correcthash" in conf:
        if conf["correcthash"] in visited:
            debug("SEARCH: visited correct tree")
        else:
            debug("SEARCH: NEVER saw correct tree")

    # return ML tree (ranked by the logl stored on each visited tree)
    trees = [x[1] for x in visited.itervalues()]
    i = util.argmax([x.data["logl"] for x in trees])
    return trees[i], trees[i].data["logl"]
Ejemplo n.º 39
0
                stree.add_child(parent, child)
                child.dist = newdist
                callagain = True
                break
        if callagain:
            remove_single_child_nodes()
    
    # main code
    sim_walk(stree.root, freq)
    remove_single_child_nodes()
    return stree # poor nomenclature; this will be fixed in v2.1



# Script entry point: run one simulation on the species tree stored in
# 'simple.stree' and draw the result.
if __name__ == "__main__":
    stree = treelib.read_tree('simple.stree')
    # Simulation settings, passed positionally to sim_tree in this order.
    # NOTE(review): meanings are inferred from the names only (population
    # size, initial frequency, duplication/loss rates, ...) -- confirm
    # against sim_tree's signature.
    popsize = 1e4
    freq = 1e0
    dr = 2.1
    lr = 2.0
    freqdup = .05
    freqloss = .05
    forcetime = 1e0
    tree = sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime)
    # only draw when sim_tree returned a truthy result
    if tree:
        treelib.draw_tree(tree, scale=1)



### VERSION 1 CODE (for reference)
#
Ejemplo n.º 40
0
def boxPlot(dataPath = '/home/muddcs15/research/work/hemiplasy/results/',
            prob1 = '0.001',
            prob2 = '0.05',
            prob3 = '0.1',
            prob4 = '0.5',
            spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree',
            eventsFile = '/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt'):
    """
    Show a boxplot of hemiplasy probabilities grouped by species pair.

    dataPath    -- directory holding the 'probabilities-<prob>.txt' files
    prob1..4    -- the four <prob> labels (strings) selecting those files
    spectree    -- species tree file; every node with exactly two leaves
                   below it defines one species pair
    eventsFile  -- tab-separated file with six columns per line, the first
                   being the family id and the third the species list

    Each line of a probabilities file is a family label followed by
    whitespace-separated 'duploss,hemiplasy' pairs.  Only the first five
    species pairs are plotted.  Nothing is returned; the figure is shown
    with plt.show().
    """

    stree = treelib.read_tree(spectree) # species tree

    # species1[i] and species2[i] are the two children of the i-th node
    # that has exactly two leaves beneath it
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    # identify the files for each of the different initial frequencies
    probsList = [os.path.join(dataPath, 'probabilities-' + prob + '.txt')
                 for prob in (prob1, prob2, prob3, prob4)]

    # one list of averages per plotted species pair
    numPairs = 5
    totalPairs = [[] for _ in range(numPairs)]

    # open each probability file
    for probFilename in probsList:
        # NOTE(review): hList is reset per file, not per family, so `ave`
        # below is the running mean over every family read so far from this
        # file -- confirm that is intended.
        hList = []      # list of probability of hemiplasy

        with open(eventsFile, 'r') as events:
            with open(probFilename, "r") as probFile:
                # look at each famid for that initial frequency
                for line in probFile:
                    sepProbs = line.split()
                    if not sepProbs:
                        continue    # tolerate blank lines
                    famid = sepProbs.pop(0)[6:]     # drop the 6-char prefix

                    # collect the hemiplasy probability of every trial
                    for pair in sepProbs:
                        duploss, hemiplasy = map(float, pair.split(','))
                        hList.append(hemiplasy)
                    ave = stats.mean(hList)

                    # Find this family's event line.  The events file is
                    # read forward only (never rewound), so families are
                    # expected in the same order in both files.
                    specPos = None
                    for evline in events:
                        ev_famid, locus, spcs, gns, snode, lca = \
                            evline.rstrip().split('\t')
                        if famid == ev_famid:
                            # pair in which exactly one species occurs in
                            # spcs (substring test); the last such pair wins
                            for pos, (sp1, sp2) in enumerate(zip(species1,
                                                                 species2)):
                                if (sp1 in spcs) != (sp2 in spcs):
                                    specPos = pos
                            break

                    # families without a matching event / pair are skipped
                    # instead of reusing the previous family's position
                    if specPos is not None and specPos < numPairs:
                        totalPairs[specPos].append(ave)

    plt.boxplot(totalPairs)
    plt.title('Hemiplasy by Pairs')
    plt.xlabel('Pair')
    plt.ylabel('Probability')

    # print the plots
    plt.show()
Ejemplo n.º 41
0
# Set up a RAxML tree object using an argv-style list built from a
# raxmlHPC command line.
tr = raxml.new_tree()
cmd = "raxmlHPC -t %s -s %s %s" % (treefile, seqfile, options.extra)
raxml.init_program(adef, tr, cmd.split(" "))

util.tic("Optimizing model...")
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

# Re-read the same tree file and strip branch lengths and "boot" data from
# every node before hashing the unrooted copy.
tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])    # hashes of every topology seen so far

for i in xrange(options.niter):
    # keep applying random SPR moves until the hash is one not seen before
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)
def boxPlot(dataPath = '/home/muddcs15/research/work/hemiplasy/results/',
            prob1 = '0.001',
            prob2 = '0.05',
            prob3 = '0.1',
            prob4 = '0.5',
            spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree',
            eventsFile = '/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt'):
    """
    Show, per species pair, boxplots of hemiplasy probability by duplication
    location.

    dataPath    -- directory holding the 'probabilities-<prob>.txt' files
    prob1..4    -- the four <prob> labels (strings) selecting those files
    spectree    -- species tree file; every node with exactly two leaves
                   below it defines one species pair
    eventsFile  -- tab-separated file with six columns per line; column 1
                   is the family id, column 3 the species list and column 5
                   an integer duplication location

    Five subplots (Pair1..Pair5) are drawn on a 2x3 grid, each with one box
    for every duplication location 1..13.  Nothing is returned; the figure
    is shown with plt.show().
    """

    stree = treelib.read_tree(spectree) # species tree

    # species1[i] and species2[i] are the two children of the i-th node
    # that has exactly two leaves beneath it
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    # define number of plots to be outputed
    fig, axes = plt.subplots(nrows=2, ncols=3)

    # identify the files for each of the different initial frequencies
    probsList = [os.path.join(dataPath, 'probabilities-' + prob + '.txt')
                 for prob in (prob1, prob2, prob3, prob4)]

    # pairList[i] collects (duplication location, average) tuples for the
    # i-th species pair
    numPairs = 5
    pairList = [[] for _ in range(numPairs)]

    # open each probability file
    for probFilename in probsList:
        # NOTE(review): hList is reset per file, not per family, so `ave`
        # below is the running mean over every family read so far from this
        # file -- confirm that is intended.
        hList = []      # list of probability of hemiplasy

        with open(eventsFile, 'r') as events:
            with open(probFilename, "r") as probFile:
                # look at each famid for that initial frequency
                for line in probFile:
                    sepProbs = line.split()
                    if not sepProbs:
                        continue    # tolerate blank lines
                    famid = sepProbs.pop(0)[6:]     # drop the 6-char prefix

                    # collect the hemiplasy probability of every trial
                    for pair in sepProbs:
                        duploss, hemiplasy = map(float, pair.split(','))
                        hList.append(hemiplasy)
                    ave = stats.mean(hList)

                    # Find this family's event line.  The events file is
                    # read forward only (never rewound), so families are
                    # expected in the same order in both files.
                    specPos = None
                    dup = None
                    for evline in events:
                        ev_famid, locus, spcs, gns, evdup, lca = \
                            evline.rstrip().split('\t')
                        if famid == ev_famid:
                            dup = evdup
                            # pair in which exactly one species occurs in
                            # spcs (substring test); the last such pair wins
                            for pos, (sp1, sp2) in enumerate(zip(species1,
                                                                 species2)):
                                if (sp1 in spcs) != (sp2 in spcs):
                                    specPos = pos
                            break

                    # families without a matching event / pair are skipped
                    # instead of reusing the previous family's values
                    if specPos is not None and specPos < numPairs:
                        pairList[specPos].append((int(dup), ave))

    # For every pair, regroup its averages by duplication location so that
    # finalPair[i] is a list of 13 lists (locations 1..13), one per box.
    finalPair = []
    for pairs in pairList:
        byDup = collections.defaultdict(list)
        for dupLoc, prob in pairs:
            byDup[dupLoc].append(prob)
        finalPair.append([byDup[dupLoc] for dupLoc in range(1, 14)])

    # one subplot per pair; the sixth grid cell (1,2) stays empty
    gridCells = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)]
    for pairIdx, (row, col) in enumerate(gridCells):
        ax = axes[row, col]
        ax.boxplot(finalPair[pairIdx])
        ax.set_title('Pair%d' % (pairIdx + 1))
        ax.set_xlabel('Duplication Location')
        ax.set_ylabel('Probability')
        ax.set_ylim(0, 0.25)

    # print the plots
    plt.show()
Ejemplo n.º 43
0
    def test_nhx(self):
        """NHX comments in a newick string are parsed into node.data."""

        text = """(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1], ADH1:0.11[&&NHX:S=human:E=1.1.1.1]):0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100], ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],ADHX:0.12[&&NHX:S=insect:E=1.1.1.1]):0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N], (ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1], ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1]):0.1 [&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N];"""  # nopep8
        tree = read_tree(StringIO(text))

        # every E= tag in the input carries the same value
        ec = '1.1.1.1'

        # expected annotations per node name; integer keys are internal
        # nodes, the "_1" keys are the second ADH1/ADH2 leaves
        expected = {
            'ADH3': dict(S='yeast', E=ec),
            1: dict(E=ec, D='N'),
            2: dict(S='Metazoa', E=ec, D='N'),
            3: dict(S='Primates', B='100', E=ec, D='Y'),
            4: dict(S='Fungi'),
            'ADH2': dict(S='human', E=ec),
            'ADHY': dict(S='nematode', E=ec),
            'ADHX': dict(S='insect', E=ec),
            'ADH1': dict(S='human', E=ec),
            'ADH1_1': dict(S='yeast', E=ec),
            'ADH4': dict(S='yeast', E=ec),
            'ADH2_1': dict(S='yeast', E=ec),
        }

        found = dict((node.name, node.data) for node in tree)

        for name in expected:
            self.assertEqual(found[name], expected[name])
Ejemplo n.º 44
0
def hemiplasyConditions(numFamilies = 5351,
                        dataPath = '/home/muddcs15/research/work/hemiplasy/data/real-fungi/',
                        outputFile = '/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt',
                        spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree'):
    """Scan gene families for loci that meet the possible-hemiplasy criteria.

    numFamilies -- family ids 0 .. numFamilies-1 are examined
    dataPath    -- directory with one sub-directory per family id
    outputFile  -- tab-separated report, rewritten on every call
    spectree    -- species tree file; every node with exactly two leaves
                   below it defines one species pair

    A locus (other than locus "1") qualifies when it contains exactly one
    species of some pair and also a species outside that pair.  For the
    first qualifying locus of a family one line is written: family id,
    locus, leaf species, leaf genes, the name recorded for the locus in the
    '.dup.rel.txt' file, and the name of the LCA of the leaf genes in the
    locus tree.  The number of families with such a locus is printed.
    """
    count = 0

    # define a list of all species and lists of each of the species pairs in separate lists
    stree = treelib.read_tree(spectree) # species tree
    species = stree.leaf_names()
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    # the context manager closes the report even if a family fails to parse
    with open(outputFile, 'w') as output:
        # loop over each fam id
        for famid in range(numFamilies):
            flag = False                                # this family met the criteria for possible hemiplasy
            locus_dict = collections.defaultdict(list)  # key = locus number, val = list of (gn, sp) in the locus
            famFilename = dataPath + '%d/%d-dup.dlcoal.dlcpar.recon' % (famid,famid)

            # nothing to do for an empty file
            if os.stat(famFilename).st_size == 0:
                continue

            # read the locus tree and the reconciliation file
            tree_filename = dataPath + '%d/%d.dlcoal.locus.tree' % (famid,famid)
            recon_filename = dataPath + '%d/%d.dlcoal.locus.recon' % (famid,famid)
            tree = treelib.read_tree(tree_filename) # locus tree
            # the result is not used below; the call is kept so that a
            # missing or unreadable reconciliation still stops the run
            phylo.read_recon_events(recon_filename, tree, stree)

            # [locus] = species tree location where the locus was created
            locus_sname = {}
            dupFilename = dataPath + '%d/%d.dlcoal.dlcpar.dup.rel.txt' % (famid,famid)
            for line in util.open_stream(dupFilename):
                locus, gns1, gns2, sname = line.rstrip().split('\t')
                locus_sname[locus] = sname

            # track the genes and species in each locus
            for line in util.open_stream(famFilename):
                gn, sp, locus = line.rstrip().split('\t')
                if locus == "1":
                    continue
                locus_dict[locus].append((gn, sp))

            # for each locus, determine if genes in locus satisfy the properties for a possible hemiplasy
            for locus, lst in locus_dict.items():
                sps = [sp for (gn, sp) in lst]

                for sp1, sp2 in zip(species1, species2):
                    # present in exactly one species of the pair ...
                    if (sp1 in sps) != (sp2 in sps):
                        # ... and also present outside the pair
                        if any(allsp in sps for allsp in species
                               if allsp != sp1 and allsp != sp2):
                            flag = True
                            break

                if not flag:
                    continue

                # keep only entries whose gene name is not all digits
                # (presumably the leaves; digit names look like internal
                # nodes -- TODO confirm)
                leaf_sps = []
                leaf_gns = []
                for gn, sp in lst:
                    if not gn.isdigit():
                        leaf_sps.append(sp)
                        leaf_gns.append(gn)

                gnodes = [tree.nodes[name] for name in leaf_gns]
                lca = treelib.lca(gnodes)

                # str() because a node name may be an int, which
                # str.join would reject
                output.write('\t'.join([str(famid), locus,
                                        ','.join(leaf_sps),
                                        ','.join(leaf_gns),
                                        locus_sname[locus],
                                        str(lca.name)]))
                output.write('\n')
                break   # only the first qualifying locus is reported

            # if it is a true case, add to count
            if flag:
                count += 1

    # print total count
    print("Total number of true cases = %d" % count)
Ejemplo n.º 45
0
sample_coal_cond_counts = coal.sample_coal_cond_counts

if __name__ == "__main__":
    #========================================
    # test cases for prob_locus_gene_species_alignment_recon
    # sim-flies, N = 1e6, g = 0.1, R = 1x, L = 100bp, mu = 5e-9

    import os
    import numpy
    from compbio import fasta
    import dlcoal
    import StringIO

    path = "/home/muddcs15/research/work/coestimation/"
    stree = treelib.read_tree(
        os.path.join(path, "simulation/config/flies.stree"))

    prob_raxml = []
    prob_treefix = []
    prob_dlca = []

    for i in range(100, 200):

        # read raxml recon
        coal_tree_raxml, extra_raxml = dlcoal.read_dlcoal_recon(
            os.path.join(path, "simulation/data/1000/5e-9/1e6-1x/", str(i),
                         str(i) + ".raxml.dlcoal"), stree)
        locus_tree_raxml = extra_raxml["locus_tree"]
        locus_recon_raxml = extra_raxml["locus_recon"]
        coal_recon_raxml = extra_raxml["coal_recon"]
        daughters_raxml = extra_raxml["daughters"]