Esempio n. 1
0
def addVisited(conf, visited, tree, gene2species, thash=None):
    """Record one visit to `tree` in `visited` and emit optional debug output.

    `visited` maps a topology hash to a three-item list
    ``[logl, tree copy, visit count]``.  A hash that is already present only
    has its count incremented; a new hash gets an entry built from
    ``tree.data["logl"]`` and ``tree.copy()``.

    conf         -- configuration mapping; the optional keys "correcthash",
                    "searchtest", "debugtab_file" and "debugtab" are read
    visited      -- dict of visited topologies, updated in place
    tree         -- tree being recorded
    gene2species -- passed to phylo.hash_tree for the species-aware hash
    thash        -- precomputed topology hash; derived from `tree` when None

    Returns None.  Calls sys.exit(0) when the tree matches
    conf["correcthash"] and conf["searchtest"] is true.
    """
    if thash is None:
        thash = phylo.hash_tree(tree)

    if thash not in visited:
        visited[thash] = [tree.data["logl"], tree.copy(), 1]
    else:
        visited[thash][2] += 1

    # a tree can only be "correct" when a reference hash was configured
    correct = "correcthash" in conf and thash == conf["correcthash"]

    if correct:
        debug("PROPOSED CORRECT TREE: visisted = ", len(visited))
        if conf["searchtest"]:
            drawTreeLogl(tree)
            sys.exit(0)

    # everything below only feeds the optional debug table
    if "debugtab_file" not in conf:
        return

    shash = phylo.hash_tree(tree, gene2species)

    table = conf["debugtab"]
    tabfile = conf["debugtab_file"]
    row = {
        "correct": correct,
        "logl": tree.data["logl"],
        "treelen": sum(x.dist for x in tree),
        "baserate": tree.data["baserate"],
        "error": tree.data["error"],
        "errorlogl": tree.data["errorlogl"],
        "eventlogl": tree.data["eventlogl"],
        "tree": tree.getOnelineNewick(),
        "topology": thash,
        "species_hash": shash,
    }
    table.writeRow(tabfile, row)
Esempio n. 2
0
    def test_search(self):
        """Run spidir.search_climb on each fly gene family and log the result.

        For every id listed under test/data/flies the reference tree and the
        alignment are read, a tree is built with spidir.search_climb, and the
        reference tree, the constructed tree and an "is_correct" topology
        comparison are written to test/output/all_terms_search/flies.txt.
        Nothing is asserted; the report file is the only output.
        """

        prep_dir("test/output/all_terms_search")
        out = open("test/output/all_terms_search/flies.txt", "w")
        #out = sys.stderr

        # try/finally: previously the report file stayed open whenever a read
        # or the search itself raised part-way through the loop
        try:
            # NOTE(review): ids are listed from test/data/flies but the inputs
            # are read from test/data/flies.nt -- confirm both directories
            # carry the same ids
            treeids = os.listdir("test/data/flies")
            #treeids = ["3"]

            for treeid in treeids:

                tree_correct = read_tree("test/data/flies.nt/%s/%s.tree" %
                                         (treeid, treeid))
                align = read_fasta("test/data/flies.nt/%s/%s.align" %
                                   (treeid, treeid))

                # put the reference tree in hash order before drawing it
                phylo.hash_order_tree(tree_correct)

                print >>out, treeid
                print >>out, "correct"
                drawTree(tree_correct, out=out)

                # search inputs; re-read for every family, as before
                stree = read_tree("test/data/flies.norm.stree")
                gene2species = phylo.read_gene2species("test/data/flies.smap")
                params = spidir.read_params("test/data/flies.nt.param")
                birth = .4
                death = .39
                pretime = 1.0
                maxdoom = 20
                bgfreq = [.258,.267,.266,.209]
                kappa = 1.59

                genes = align.keys()
                seqs = align.values()

                tree = spidir.search_climb(genes, seqs,
                                           stree, gene2species,
                                           params, birth, death, pretime,
                                           bgfreq, kappa,
                                           maxdoom=maxdoom,
                                           niter=50, quickiter=100,
                                           nsamples=100, branch_approx=True)

                phylo.hash_order_tree(tree)

                print >>out, "constructed"
                drawTree(tree, out=out)

                print >>out, "is_correct:", (phylo.hash_tree(tree) ==
                                             phylo.hash_tree(tree_correct))
        finally:
            out.close()
Esempio n. 3
0
def buildTree(conf, stree, gene2species):
    params = Spidir.readParams(conf["param"])
    
    if "correcttree" in conf:
        conf["correcthash"] = phylo.hash_tree(conf["correcttree"])
    
    
    if "dist" in conf:
        for i in range(len(conf["dist"])):
            distfile = conf["dist"][i]
            
            labels, distmat = phylip.read_dist_matrix(distfile)
        
            # read in different labels if needed
            if "labels" in conf:
                labels = Spidir.readLabels(conf["labels"][i])
                conf["aln"] = fasta.read_fasta(conf["labels"][i])
            
            tree, logl = Spidir.spidir(conf, distmat, labels, stree, 
                                          gene2species, params)
            tree.write(Spidir.outTreeFile(conf))
            
            # test for correctness
            if "correcttree" in conf:
                correctTree = conf["correcttree"]
                phylo.hash_order_tree(correctTree)
                phylo.hash_order_tree(tree)
                
                thash1 = phylo.hash_tree(tree)
                thash2 = phylo.hash_tree(correctTree)
                
                print "spidir: "
                treelib.draw_tree(tree, maxlen=5, minlen=5)
                print
                
                print "correct:"
                treelib.draw_tree(correctTree, maxlen=5, minlen=5)
                print
                
                if len(tree.leaves()) > 3:
                    rferror = Spidir.robinson_foulds_error(correctTree, tree)
                else:
                    rferror = 0.0
                
                if thash1 == thash2:
                    print "CORRECT TREE FOUND"
                else:
                    print "WRONG TREE FOUND (RF: %f)" % rferror
Esempio n. 4
0
 def propose(chain, tree):
     """Propose a successor of `tree` for `chain`; return ``(tree, logl)``.

     Closure: conf, proposeFunc, distmat, labels, stree, gene2species,
     params, visited and the state object `this` all come from the
     enclosing scope, which is not visible here.

     A topology that is already in `visited` reuses the stored tree and
     log-likelihood; a new one is scored first.  Either way the visit is
     recorded through addVisited, and `this.toplogl` / `this.toptree` are
     updated when the proposal beats the best log-likelihood so far.
     """
     candidate = proposeFunc(conf, tree,  distmat, labels,
                             stree, gene2species, params, visited)

     # reuse the cached result for a known topology, otherwise score it
     thash = phylo.hash_tree(candidate)
     if thash not in visited:
         Spidir.setTreeDistances(conf, candidate, distmat, labels)
         logl = Spidir.treeLogLikelihood(conf, candidate, stree,
                                         gene2species, params)
         this.nold = 0
     else:
         logl, candidate, _count = visited[thash]

     addVisited(conf, visited, candidate, gene2species, thash)

     # best tree so far: report it and remember a private copy
     improved = logl > this.toplogl
     if improved:
         label = "%d:%d" % (chain.name, this.iter)
         printMCMC(conf, label, candidate, stree, gene2species, visited)
         this.toplogl = logl
         this.toptree = candidate.copy()

     return candidate, logl
Esempio n. 5
0
 def evalUserTree(tree):
     """Score a user-supplied tree and record it in `visited`.

     Closure: conf, distmat, labels, stree, gene2species, params and
     visited come from the enclosing scope, which is not visible here.

     The entry for the tree's topology hash is replaced by
     ``[logl, tree copy, previous count + 1]``.  When DEBUG_LOW output is
     enabled the tree is also drawn together with its reconciliation events.
     Returns None.
     """
     setTreeDistances(conf, tree, distmat, labels)
     logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

     thash = phylo.hash_tree(tree)
     # only the visit count of an earlier entry is carried over
     previous = visited[thash] if thash in visited else (None, None, 0)
     _old_logl, _old_tree, seen = previous
     visited[thash] = [logl, tree.copy(), seen + 1]

     if not isDebug(DEBUG_LOW):
         return

     debug("\nuser given tree:")
     recon = phylo.reconcile(tree, stree, gene2species)
     events = phylo.label_events(tree, recon)
     drawTreeLogl(tree, events=events)
Esempio n. 6
0
def getProposals(conf, tree, distmat, labels, stree, 
                 gene2species, params, visited, stuck=False):
    """Score every single NNI move on `tree` and return them best first.

    Each candidate edge is tried with both values of `change`: the move is
    applied to `tree` in place, a re-rooted copy is scored (or its cached
    score is looked up in `visited`), and the move is applied a second time
    to switch the branch back.

    visited -- dict of topology hash -> [logl, tree, count]; new topologies
               are added and known ones have their count incremented
    stuck   -- when true, moves that lead to an already visited topology
               are left out of the result

    Returns a list of ``[logl, edge, change]`` sorted by descending logl.
    Also draws the tree and the edge list on sys.stderr.
    """
    # TODO: handle root edges

    # candidate edges: every internal node that is neither the root nor a
    # child of the root, paired with its parent, plus the root's children
    inner_nodes = [node for node in tree.nodes.values()
                   if not node.isLeaf()
                   and node != tree.root
                   and node not in tree.root.children]
    edges = [(node, node.parent) for node in inner_nodes]
    edges.append(tuple(tree.root.children))

    treelib.drawTreeNames(tree, minlen=5, maxlen=5, out=sys.stderr)
    util.printcols(util.map2(lambda x: x.name, edges), out=sys.stderr)

    proposals = []
    for edge in edges:
        for change in (0,1):
            proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)

            thash = phylo.hash_tree(tree2)
            if thash in visited:
                visited[thash][2] += 1
                logl = visited[thash][0]
                # revisits are only worth proposing while not stuck
                wanted = not stuck
            else:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree2, stree,
                                                gene2species, params)
                visited[thash] = [logl, tree2, 1]
                wanted = True

            if wanted:
                proposals.append([logl, edge, change])

            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)

    proposals.sort(key=lambda proposal: proposal[0], reverse=True)
    return proposals
Esempio n. 7
0
def searchExhaustive(conf, distmat, labels, tree, stree, gene2species, params,
                     depth=2, visited=None, visited2=None, topDepth=True,
                     toplogl=None, short=False):
    """Recursively score the NNI neighbourhood of `tree` up to `depth` moves.

    visited  -- dict of topology hash -> [logl, tree copy, count]; created
                when None and shared with every recursive call
    visited2 -- dict of topology hash -> depth at which that topology was
                last expanded; created when None and shared likewise
    topDepth -- True for the outermost call only
    toplogl  -- one-element list holding the best logl seen so far; it is
                shared (and mutated) across the recursion
    short    -- when true, return as soon as a tree beats toplogl[0]

    Returns ``(tree, logl)``:
      * the starting tree and its logl when depth < 1,
      * the first improving tree when `short` is set and one is found,
      * otherwise the best entry of `visited` for the outermost call and
        ``(None, None)`` for recursive calls.
    """
    # identity tests: `== None` would defer to the argument's own equality
    if visited is None:
        visited = {}
    if visited2 is None:
        visited2 = {}
    
    # NNI moves are applied in place below, so work on a private copy
    tree = tree.copy()
    
    # find initial logl
    thash = phylo.hash_tree(tree)
    if thash not in visited:
        Spidir.setTreeDistances(conf, tree, distmat, labels)
        logl = Spidir.treeLogLikelihood(conf, tree, stree, 
                                    gene2species, params)
        visited[thash] = [logl, tree.copy(), 1]
        
        drawTreeLogl(tree)
    else:
        logl = visited[thash][0]
        
    if toplogl is None:
        toplogl = [logl]
    
    
    debug(" " * (depth*2), "(%d)" % len(visited))
    sys.stdout.flush()
    
    if depth < 1:
        return tree, logl
    
    
    # try all NNI
    # find edges for NNI: internal nodes that are neither the root nor
    # attached directly to it, each paired with its parent
    nodes = tree.nodes.values()
    nodes = filter(lambda x: not x.isLeaf() and 
                             x != tree.root and
                             x.parent != tree.root, nodes)
    edges = [(node, node.parent) for node in nodes]
    
    for edge in edges:
        for change in (0,1):
            proposeNni(tree, edge[0], edge[1], change)
            
            thash = phylo.hash_tree(tree)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree, distmat, labels)
                logl = Spidir.treeLogLikelihood(conf, tree, stree, 
                                         gene2species, params)
                visited[thash] = [logl, tree.copy(), 1]
            else:
                logl = visited[thash][0]
            
            if logl > toplogl[0]:
                toplogl[0] = logl
                
                if short:
                    return tree, logl
                else:
                    printMCMC(conf, "N/A", tree, stree, gene2species, visited)
                
            
            # expand a topology only if it has not been expanded at this
            # depth or deeper, and its logl is within conf["eprune"] of
            # the best seen so far
            if (thash not in visited2 or
                depth > visited2[thash]) and \
                logl - toplogl[0] >= conf["eprune"]:
                visited2[thash] = depth
                
                # dig deeper
                if depth > 1:
                    tree2, logl2 = searchExhaustive(conf, distmat, labels, 
                                     tree, stree, gene2species, params,
                                     depth=depth-1, visited=visited,
                                     visited2=visited2,
                                     topDepth=False,
                                     toplogl=toplogl, short=short)
                    
                    # in short mode a non-None tree means an improvement
                    # was found further down; pass it straight up
                    if short and tree2 is not None:
                        return tree2, logl2
                    
            
            # switch branch back
            proposeNni(tree, edge[0], edge[1], change)
    
    # only the outermost call reports a result: the best visited entry
    if topDepth:
        items = visited.items()
        i = util.argmaxfunc(lambda x: x[1][0], items)
        
        thash, (logl, tree, count) = items[i]
        
        return tree, logl
    else:
        return None, None
Esempio n. 8
0
def searchHillClimb(conf, distmat, labels, stree, gene2species, params,
               initTree=None, visited=None):
    """Hill-climb over tree topologies with batches of NNI moves.

    Starts from `initTree`, or from a re-rooted neighbor-joining tree when
    `initTree` is None.  For conf["hilliters"] iterations the moves returned
    by getProposals are filtered down to a set that does not reuse an edge,
    then applied in batches: first all of them, then (after undoing) half
    as many, and so on, until a batch improves the log-likelihood or a
    single-move batch has been tried.

    visited -- dict of topology hash -> [logl, tree, count]; created when
               None, updated through addVisited/getProposals

    Returns ``(tree, logl)`` for the entry of `visited` with the highest
    logl.
    """

    if visited is None:
        visited = {}
    
    # init with NJ    
    if initTree is not None:
        tree = initTree
    else:
        tree = phylo.neighborjoin(distmat, labels)
        tree = phylo.reconRoot(tree, stree, gene2species)
        Spidir.setTreeDistances(conf, tree, distmat, labels)

    # init likelihood score
    logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

    # store tree in visited
    addVisited(conf, visited, tree, gene2species)
    
    stuck = False
        
    for i in range(conf["hilliters"]):
        printMCMC(conf, i, tree, stree, gene2species, visited)
        
        proposals = getProposals(conf, tree, distmat, labels, 
                                 stree, gene2species, params, visited, stuck)
        
        util.printcols([[pl, pa.name, pb.name, pc]
                        for pl, (pa, pb), pc in proposals])
        print
        
        # determine which proposals to use: take them in order and skip any
        # whose edge was already claimed by an earlier accepted proposal
        edgeset = set()
        proposals2 = []
        for logl2, edge, change in proposals:
            if edge in edgeset:
                continue
            proposals2.append([logl2, edge, change])
            
            edgeset.add((getNniUncle(tree, edge[0], edge[1]), edge[1]))
            edgeset.add((edge[0].children[change], edge[0]))
            edgeset.add((edge[0], edge[1]))
        
        util.printcols([[pl, pa.name, pb.name, pc]
                        for pl, (pa, pb), pc in proposals2])
        print
        
        heat = 1.0
        start = 0
        # defined up front so the debug output below cannot hit an unbound
        # name when there are no usable proposals and the loop never runs
        nproposals = 0
        # NOTE(review): `start` is never advanced; the loop ends through the
        # breaks below once a batch improves or shrinks to a single move
        while start < len(proposals2):
            nproposals = int(math.ceil(len(proposals2) * heat))
            
            # apply proposals
            for logl3, edge, change in proposals2[start:start+nproposals]:
                proposeNni(tree, edge[0], edge[1], change)
            tree2 = phylo.reconRoot(tree, stree, gene2species, newCopy=True)
            
            # calc likelihood
            thash = phylo.hash_tree(tree2)
            if thash not in visited:
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree, 
                                          gene2species, params)
                stuck = False
            else:
                # NOTE(review): the cached logl is not reused; the tree is
                # scored again, presumably so that tree2 carries the values
                # addVisited reads -- confirm before switching to the cache
                Spidir.setTreeDistances(conf, tree2, distmat, labels)
                logl2 = Spidir.treeLogLikelihood(conf, tree2, stree, 
                                          gene2species, params)
                
                if nproposals == 1:
                    stuck = True
            
            addVisited(conf, visited, tree2, gene2species, thash)
            
            
            debug("logl2", logl2)
            
            # accept an improving batch
            if logl2 > logl:
                logl = logl2
                tree = tree2
                break
            
            # a single-move batch is accepted even without improvement
            if nproposals == 1:
                logl = logl2
                tree = tree2
                break
            
            heat *= .5
            
            # undo the batch, in reverse order, before trying a smaller one
            for logl3, edge, change in util.reverse(proposals2[start:start+nproposals]):
                proposeNni(tree, edge[0], edge[1], change)
        
        debug("start:", start)
        debug("swaps:", nproposals)
        debug("heat:", heat)
        debug("stuck:", stuck)

    
    # report the best tree recorded in visited
    items = visited.items()
    i = util.argmaxfunc(lambda x: x[1][0], items)
    thash, (logl, tree, count) = items[i]
    return tree, logl
Esempio n. 9
0
def proposeTree3(conf, tree,  distmat, labels, 
                  stree, gene2species, params, visited):
    """Propose a tree by cutting out a random subtree and regrafting it.

    A non-root node is chosen uniformly at random, the tree is split on the
    edge above it, and the detached part is regrafted at up to
    conf["regraftloop"] randomly chosen nodes of the remaining tree.  Every
    resulting topology is recorded through addVisited.

    tree    -- starting tree; must carry tree.data["logl"]; not modified
               (all work happens on copies)
    visited -- dict of topology hash -> [logl, tree, count], updated in place

    Returns the best-scoring tree among the regrafts, or a copy of the
    starting tree when none of them beats tree.data["logl"].  A returned
    regraft is the tree object stored in `visited`.
    """
    toplogl = tree.data["logl"]
    toptree = tree.copy()
    
    # splitTree works on this private copy, never on the caller's tree
    tree = tree.copy()
    
    # pick the subtree to move: any node but the root, equal weights
    nodes = tree.nodes.values()
    nodes.remove(tree.root)
    weights = [1] * len(nodes)
    badgene = nodes[stats.sample(weights)]
    
    # The distance table that used to be built here was never read (its
    # only consumer, a distance-ordered sort of `names`, is disabled), so
    # the traversal has been dropped.
    
    tree1, tree2 = splitTree(tree, badgene, badgene.parent)
    
    # candidate regraft positions: every node of tree1 except its root,
    # in random order
    names = tree1.nodes.keys()
    names.remove(tree1.root.name)
    random.shuffle(names)
    
    # a slice already stops at the end of the list; no min() needed
    for name in names[:conf["regraftloop"]]:
        tree = tree1.copy()
        node = tree.nodes[name]
        
        regraftTree(tree, tree2.copy(), node, node.parent)
        
        thash = phylo.hash_tree(tree)
        
        # only new topologies need scoring; addVisited then stores
        # tree.data["logl"], which treeLogLikelihood presumably fills in
        # -- TODO confirm
        if thash not in visited:        
            Spidir.setTreeDistances(conf, tree, distmat, labels)
            logl = Spidir.treeLogLikelihood(conf, tree, stree, gene2species, params)
        addVisited(conf, visited, tree, gene2species, thash)
        logl, tree, _count = visited[thash]
        
        if logl > toplogl:
            toplogl = logl
            toptree = tree

    
    assert toptree is not None
    
    return toptree
Esempio n. 10
0
    def test_birth_death_single_sim(self):
        """Compare simulated single-branch birth-death trees with the prior.

        Simulates `ntrees` trees on one branch of length T, tabulates the
        topologies and the number of surviving leaves, computes the matching
        prior probabilities, and hands both tables to self.calc_fit.
        """

        duprate = 2.0
        lossrate = .5
        ntrees = 1000
        tabsize = 100
        T = 1.0

        tops = []
        survivors = []
        lookup = {}

        # define species tree
        stree = treelib.parse_newick("(A:1);")

        def gene2species(gene):
            return gene[:1].upper()

        # simulate gene trees
        util.tic("simulating %d trees" % ntrees)
        for _ in xrange(ntrees):
            tree, doom = birthdeath.sample_birth_death_tree(
                T, duprate, lossrate)

            if tree.root not in doom:
                rename_leaves(tree, stree, lambda x: "A")
                top = phylo.hash_tree(tree, gene2species)
                nsurvivors = len(tree.leaves())
            else:
                # the root is in the doomed set: recorded as an empty
                # topology with no survivors
                top = "()"
                nsurvivors = 0

            tops.append(top)
            survivors.append(nsurvivors)
            # one representative tree per topology (the latest one wins)
            lookup[top] = tree
        util.toc()

        # setup test output
        outdir = "test/output/birthdeath_sim_simple"
        prep_dir(outdir)

        # histogram of topologies and survivors (# leaves)
        hist_tops = histtab(tops)
        hist_num = histtab(survivors)

        # compute survivor prior
        probs = [birthDeathCount(row["item"], T, duprate, lossrate)
                 for row in hist_num]

        # compute topology priors
        probs_tops = []
        for row in hist_tops:
            tree = lookup[row["item"]]

            if not tree.root.is_leaf():
                nhist = numTopologyHistories(tree.root)
                s = len(tree.leaves())
                thist = factorial(s) * factorial(s-1) / 2**(s-1)
                r = numRedunantTopology(tree.root, gene2species,
                                        all_leaves=True)
                # NOTE(review): if r, nhist and thist are all ints and true
                # division is not enabled for this module, the quotient
                # below truncates -- confirm
                p = log(r * nhist / thist * birthdeath.prob_birth_death1(
                    s, T, duprate, lossrate))
            else:
                p = log(birthdeath.prob_birth_death1(
                    0, T, duprate, lossrate))

            probs_tops.append(exp(p))

        self.calc_fit(outdir + "/sim_prior_ngenes", hist_num, probs)
        self.calc_fit(outdir + "/sim_prior_top", hist_tops, probs_tops)
Esempio n. 11
0
    def do_test_birth_death_gene_sim(self, stree, gene2species,
                                     duprate, lossrate,
                                     ntrees=10000, tabsize=30):
        """Simulate gene trees on `stree` and pair them with prior values.

        Simulates `ntrees` gene trees with the given duplication and loss
        rates, tabulates their topology hashes, and computes for every
        distinct topology the birth-death prior with both the C and the
        Python implementation (checked against each other with fequal).

        Returns ``(hist, probs)``: the histogram table of topologies and a
        list of probabilities in the same row order.  `tabsize` is accepted
        but not used here.
        """

        doomtable = calcDoomTable(stree, duprate, lossrate)

        tops = []
        lookup = {}

        def rename_tree(tree, gene2species):
            # give each leaf the name "<species>.<k>", with the numbers
            # 1..count of a species handed out in random order
            if len(tree.nodes) == 0:
                return
            spcounts = util.hist_dict(map(gene2species, tree.leaf_names()))
            numbers = {}
            for species, total in spcounts.items():
                ids = range(1, total+1)
                random.shuffle(ids)
                numbers[species] = ids

            for leaf in tree.leaves():
                species = gene2species(leaf.name)
                suffix = str(numbers[species].pop())
                tree.rename(leaf.name, species + "." + suffix)

        util.tic("simulating %d trees" % ntrees)
        for _ in xrange(ntrees):
            tree, recon, events = birthdeath.sample_birth_death_gene_tree(
                stree, duprate, lossrate,
                removeloss=True)
            phylo.add_implied_spec_nodes(tree, stree, recon, events)

            # a lone node reconciled to the species root is recorded as the
            # empty topology "()" with no tree attached
            lone_root = (len(tree.nodes) == 1 and
                         recon[tree.root] == stree.root)
            if lone_root:
                top = "()"
                record = (None, None, None)
            else:
                rename_tree(tree, gene2species)
                top = phylo.hash_tree(tree)
                record = (tree, recon, events)

            tops.append(top)
            lookup[top] = record
        util.toc()

        hist = histtab(tops)

        probs = []
        for row in hist:
            tree, recon, events = lookup[row["item"]]

            if tree is not None:
                p = c_calcBirthDeathPrior(tree, stree, recon,
                                          duprate, lossrate,
                                          events=events)
                p2 = calcBirthDeathPrior(tree, stree, recon,
                                         duprate, lossrate,
                                         events=events)

                # both implementations must agree
                fequal(p, p2)
                probs.append(exp(p))
            else:
                # empty topology: its probability comes from the last
                # entry of the doom table
                probs.append(exp(doomtable[-1]))

        return hist, probs
Esempio n. 12
0
    def test_search(self):
        """Run spidir.search_climb on each fly gene family and log the result.

        For every id listed under test/data/flies the reference tree and the
        alignment are read, a tree is built with spidir.search_climb, and the
        reference tree, the constructed tree and an "is_correct" topology
        comparison are written to test/output/all_terms_search/flies.txt.
        Nothing is asserted; the report file is the only output.
        """

        prep_dir("test/output/all_terms_search")
        out = open("test/output/all_terms_search/flies.txt", "w")
        #out = sys.stderr

        # try/finally: previously the report file stayed open whenever a read
        # or the search itself raised part-way through the loop
        try:
            # NOTE(review): ids are listed from test/data/flies but the inputs
            # are read from test/data/flies.nt -- confirm both directories
            # carry the same ids
            treeids = os.listdir("test/data/flies")
            #treeids = ["3"]

            for treeid in treeids:

                tree_correct = read_tree("test/data/flies.nt/%s/%s.tree" %
                                         (treeid, treeid))
                align = read_fasta("test/data/flies.nt/%s/%s.align" %
                                   (treeid, treeid))

                # put the reference tree in hash order before drawing it
                phylo.hash_order_tree(tree_correct)

                print >> out, treeid
                print >> out, "correct"
                drawTree(tree_correct, out=out)

                # search inputs; re-read for every family, as before
                stree = read_tree("test/data/flies.norm.stree")
                gene2species = phylo.read_gene2species("test/data/flies.smap")
                params = spidir.read_params("test/data/flies.nt.param")
                birth = .4
                death = .39
                pretime = 1.0
                maxdoom = 20
                bgfreq = [.258, .267, .266, .209]
                kappa = 1.59

                genes = align.keys()
                seqs = align.values()

                tree = spidir.search_climb(genes,
                                           seqs,
                                           stree,
                                           gene2species,
                                           params,
                                           birth,
                                           death,
                                           pretime,
                                           bgfreq,
                                           kappa,
                                           maxdoom=maxdoom,
                                           niter=50,
                                           quickiter=100,
                                           nsamples=100,
                                           branch_approx=True)

                phylo.hash_order_tree(tree)

                print >> out, "constructed"
                drawTree(tree, out=out)

                print >> out, "is_correct:", (
                    phylo.hash_tree(tree) == phylo.hash_tree(tree_correct))
        finally:
            out.close()