Example #1
def processFunc():
    # remove old query tempfile if one exists
    if closure["oldtmp"] is not None:
        os.remove(closure["oldtmp"])
        elapse = util.toc()
        closure["time"] += elapse

        util.log("blasted %d of %d sequences (%.1f%%), elapse %.0f m, left %.0f m" % (
            closure["index"], len(seqs.keys()),
            100 * float(closure["index"]) / len(seqs.keys()),
            closure["time"] / 60.0,
            elapse / split * (len(seqs.keys()) - closure["index"]) / 60.0))

    util.tic()

    # find new subset of query sequences
    i = closure["index"]
    names = seqs.keys()[i:i+split]

    # if no more sequences then quit
    if len(names) == 0:
        return False

    # start blast
    tmpfile = util.tempfile(".", "blastp", ".fasta")
    seqs.write(tmpfile, names=names)
    pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" % \
        (prog, databaseFile, tmpfile, options))

    # update variables
    closure["oldtmp"] = tmpfile
    closure["index"] = i + split

    return pipe
Example #2
def processFunc():
    # remove old query tempfile if one exists
    if closure["oldtmp"] is not None:
        os.remove(closure["oldtmp"])
        elapse = util.toc()
        closure["time"] += elapse

        util.log("blasted %d of %d sequences (%.1f%%), elapse %.0f m, left %.0f m" % (
            closure["index"], len(seqs.keys()),
            100 * float(closure["index"]) / len(seqs.keys()),
            closure["time"] / 60.0,
            elapse / split * (len(seqs.keys()) - closure["index"]) / 60.0))

    util.tic()

    # find new subset of query sequences
    i = closure["index"]
    names = seqs.keys()[i:i+split]

    # if no more sequences then quit
    if len(names) == 0:
        return False

    # start blast
    tmpfile = util.tempfile(".", "blastp", ".fasta")
    seqs.write(tmpfile, names=names)
    pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" % \
        (prog, databaseFile, tmpfile, options))

    # update variables
    closure["oldtmp"] = tmpfile
    closure["index"] = i + split

    return pipe
Example #3
def update(self, stream=None, msg="progress %2.0f%%"):
    self.pos += 1
    if self.pos > self.prog:
        self.prog += self.step * self.end
        if stream is not None:
            print >> stream, msg % (100 * self.pos / self.end)
        else:
            util.log(msg % (100 * self.pos / self.end))
Example #4
def update(self, stream=None, msg="progress %2.0f%%"):
    self.pos += 1
    if self.pos > self.prog:
        self.prog += self.step * self.end
        if stream is not None:
            print >> stream, msg % (100 * self.pos / self.end)
        else:
            util.log(msg % (100 * self.pos / self.end))
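
For orientation, a minimal, hypothetical sketch of how update() above might be driven. The Progress constructor is not shown in these excerpts; the sketch assumes Progress(end) zeroes pos and prog and stores a fractional step (here 1% per log line):

import sys

class Progress(object):
    def __init__(self, end, step=.01):
        self.pos = 0
        self.prog = 0.0
        self.end = end
        self.step = step

    def update(self, stream=sys.stderr, msg="progress %2.0f%%"):
        self.pos += 1
        if self.pos > self.prog:
            # raise the threshold by one step's worth of items
            self.prog += self.step * self.end
            print >> stream, msg % (100.0 * self.pos / self.end)

prog = Progress(1000)      # 1000 items, one log line per 1%
for i in xrange(1000):
    prog.update()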
Example #5
    def __init__(self, *args, **dargs):
        Progress.__init__(self, *args)
        self.width = 60
        self.step = 1.0 / self.width  # float division (1 / width would be 0 in Python 2)
        self.bar = 0

        if "title" in dargs:
            title = dargs["title"]
        else:
            title = "progress"

        util.log("+-" + title + ("-"*(self.width-len(title)-1)) + "+")
        util.indent()
        util.logExact("|")
        self.printBar()
Example #6
    def __init__(self, *args, **dargs):
        Progress.__init__(self, *args)
        self.width = 60
        self.step = 1.0 / self.width  # float division (1 / width would be 0 in Python 2)
        self.bar = 0

        if "title" in dargs:
            title = dargs["title"]
        else:
            title = "progress"

        util.log("+-" + title + ("-" * (self.width - len(title) - 1)) + "+")
        util.indent()
        util.logExact("|")
        self.printBar()
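
printBar() is referenced above but not shown in these excerpts. A plausible sketch, assuming the bar advances by one '=' per completed step and that util.logExact() (used in __init__ above) writes without a trailing newline; the body is a guess, not the original implementation:

    def printBar(self):
        # advance the bar to match the completed fraction pos/end
        n = int(self.width * float(self.pos) / self.end)
        while self.bar < n:
            util.logExact("=")
            self.bar += 1
        if self.pos >= self.end:
            util.logExact("|\n")  # close the bar when done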
Example #7
def blast(prog, databaseFile, queryFile, options="", split=100, resume=None):
    """Executes blastp in several smaller batches"""

    if not split:
        # do blasting in one call
        pipe = os.popen("blastall -p %s -d %s -i %s -m 8 %s" % (prog, databaseFile, queryFile, options))
        return BlastReader(pipe)

    else:
        # NOTE: split the query file into batches of about 100 sequences each.
        # This is a workaround for an ncbi blastall 2.2.10 problem with output
        # in -m 8 mode; the error was "BioseqFindFunc: couldn't uncache"

        seqs = fasta.read_fasta(queryFile)
        closure = {"index": 0, "oldtmp": None, "time": 0.0}

        if resume:
            try:
                closure["index"] = seqs.keys().index(resume)
                util.log("resuming with query '%s' (%d of %d)" % ((resume, closure["index"], len(seqs.keys()))))
            except ValueError:
                raise Exception("Could not resume from last query sequence '%s'" % resume)

        def processFunc():
            # remove old query tempfile if one exists
            if closure["oldtmp"] is not None:
                os.remove(closure["oldtmp"])
                elapse = util.toc()
                closure["time"] += elapse

                util.log(
                    "blasted %d of %d sequences (%.1f%%), "
                    "elapse %.0f m, left %.0f m"
                    % (
                        closure["index"],
                        len(seqs.keys()),
                        100 * float(closure["index"]) / len(seqs.keys()),
                        closure["time"] / 60.0,
                        elapse / split * (len(seqs.keys()) - closure["index"]) / 60.0,
                    )
                )

            util.tic()

            # find new subset of query sequences
            i = closure["index"]
            names = seqs.keys()[i : i + split]

            # if no more sequences then quit
            if len(names) == 0:
                return False

            # start blast
            tmpfile = util.tempfile(".", "blastp", ".fasta")
            seqs.write(tmpfile, names=names)
            pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" % (prog, databaseFile, tmpfile, options))

            # update variables
            closure["oldtmp"] = tmpfile
            closure["index"] = i + split

            return pipe

        pipe = processFunc()
        if pipe:
            return BlastReader(pipe, processFunc)
        else:
            return BlastReader(os.popen("less"))
Example #8
def blast(prog, databaseFile, queryFile, options="", split=100, resume=None):
    """Executes blastp in several smaller batches"""

    if not split:
        # do blasting in one call
        pipe = os.popen("blastall -p %s -d %s -i %s -m 8 %s" %
                        (prog, databaseFile, queryFile, options))
        return BlastReader(pipe)

    else:
        # NOTE: split the query file into batches of about 100 sequences each.
        # This is a workaround for an ncbi blastall 2.2.10 problem with output
        # in -m 8 mode; the error was "BioseqFindFunc: couldn't uncache"

        seqs = fasta.read_fasta(queryFile)
        closure = {
            "index": 0,
            "oldtmp": None,
            "time": 0.0
        }

        if resume:
            try:
                closure["index"] = seqs.keys().index(resume)
                util.log("resuming with query '%s' (%d of %d)" % (
                    (resume, closure["index"], len(seqs.keys()))))
            except ValueError:
                raise Exception(
                    "Could not resume from last query sequence '%s'" % resume)

        def processFunc():
            # remove old query tempfile if one exists
            if closure["oldtmp"] is not None:
                os.remove(closure["oldtmp"])
                elapse = util.toc()
                closure["time"] += elapse

                util.log(
                    "blasted %d of %d sequences (%.1f%%), "
                    "elapse %.0f m, left %.0f m" % (
                        closure["index"], len(seqs.keys()),
                        100 * float(closure["index"]) / len(seqs.keys()),
                        closure["time"] / 60.0,
                        elapse / split *
                        (len(seqs.keys()) - closure["index"]) / 60.0))

            util.tic()

            # find new subset of query sequences
            i = closure["index"]
            names = seqs.keys()[i:i+split]

            # if no more sequences then quit
            if len(names) == 0:
                return False

            # start blast
            tmpfile = util.tempfile(".", "blastp", ".fasta")
            seqs.write(tmpfile, names=names)
            pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" %
                            (prog, databaseFile, tmpfile, options))

            # update variables
            closure["oldtmp"] = tmpfile
            closure["index"] = i + split

            return pipe

        pipe = processFunc()
        if pipe:
            return BlastReader(pipe, processFunc)
        else:
            return BlastReader(os.popen("less"))
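
For reference, a hypothetical call site for blast(). It assumes BlastReader is iterable over the tab-delimited -m 8 hit rows and uses its second argument to open the next batch when a pipe is exhausted; neither is shown in these excerpts:

reader = blast("blastp", "nr", "queries.fasta", options="-a 4", split=100)
for hit in reader:
    # one -m 8 row: query id, subject id, %identity, alignment length,
    # mismatches, gap openings, q.start, q.end, s.start, s.end, e-value, bit score
    print hit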
Example #9
adef = raxml.new_analdef()
raxml.init_adef(adef)
tr = raxml.new_tree()
cmd = "raxmlHPC -t %s -s %s %s" % (treefile, seqfile, options.extra)
raxml.init_program(adef, tr, cmd.split(" "))

util.tic("Optimizing model...")
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
Example #10
util.tic("Initializing RAXML and optimizing...")
module = raxml.RAxML()
module.optimize_model(treefile, seqfile, options.extra)
util.toc()

tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))

    treehashes.add(treehash)
    tree.write(out, oneline=True)
    out.write('\n')
    out.flush()

    util.tic("Computing LH...")
    p, Dlnl = module.compute_lik_test(tree)
    util.log("pvalue: %.3f, Dlnl: %.3f" % (p, Dlnl))
    util.toc()

    if Dlnl <= 0:
        util.log("worse likelihood?: %s" % False)  # better topology (higher likelihood)
    else:
        util.log("worse likelihood?: %s" % True)  # worse topology (lower likelihood)
Example #11
def prob_gene_species_alignment_recon(alnfile,
                                      partfile,
                                      stree,
                                      popsizes,
                                      duprate,
                                      lossrate,
                                      subrate,
                                      beta,
                                      pretime,
                                      premean,
                                      coal_tree,
                                      coal_recon,
                                      nsamples_coal,
                                      locus_tree,
                                      locus_recon,
                                      nsamples_locus,
                                      daughters,
                                      rates,
                                      freqs,
                                      alphas,
                                      threads=1,
                                      seed=ALIGNMENT_SEED,
                                      eps=0.1,
                                      info=None):
    """
    Evaluate terms that depend on T^G and R^G.

    That is, fix T^L, R^L, and daughters and evaluate the double integral:
    int int P(t^L | T^L, R^L, S, theta) * P(T^G, R^G, t^G | t^L, T^L, daughters, R^L, theta) * P(A | T^G, t^G) dt^L dt^G

    This is the probability used in the search process.

    alnfile           -- alignment file
    partfile          -- partition file
    stree             -- species tree
    popsizes          -- population sizes in species tree
    duprate           -- duplication rate
    lossrate          -- loss rate
    subrate           -- substitution rate
    beta              -- regularization parameter
    pretime           -- starting time before species tree
    premean           -- mean starting time before species tree

    coal_tree         -- coalescent tree
    coal_recon        -- reconciliation of coalescent tree to locus tree
    nsamples_coal     -- number of times to sample coal times t^G
    locus_tree        -- locus tree (has dup-loss)
    locus_recon       -- reconciliation of locus tree to species tree
    nsamples_locus    -- number of times to sample the locus tree times t^L
    daughters         -- daughter nodes
    
    rates, freqs, alphas  -- optimization parameters  

    """

    locus_events = phylo.label_events(locus_tree, locus_recon)

    # optimize the parameters
    # util.tic("optimize parameter")
    # rates, freqs, alphas = pllprob.optimize_parameters(alnfile, partfile, coal_tree,
    #                                                   threads=threads, seed=seed, eps=eps)
    # util.toc()
    # double integral
    double_integral_list = []
    double_integral = 0.0
    util.tic("recon prob")
    for i in xrange(nsamples_locus):

        # sample t^L, the unit should be in myr
        #util.tic("topo prob")
        locus_times = duploss.sample_dup_times(locus_tree,
                                               stree,
                                               locus_recon,
                                               duprate,
                                               lossrate,
                                               pretime,
                                               premean,
                                               events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # calculate P(T^G, R^G | T^L, t^L, daughters, theta)
        topology_prob = prob_locus_coal_recon_topology(coal_tree, coal_recon,
                                                       locus_tree, popsizes,
                                                       daughters)
        #util.toc()
        # for a fixed t^L, compute coal_prob:
        # sample t^G for the topology and compute the probability of observing
        # the alignment using Monte Carlo integration
        coal_prob = 0.0
        alignment_prob_MonteCarlo = 0.0
        alignment_prob_list = []

        # check probability of lineage counts for this locus tree
        zero_lineage_prob = False

        #util.tic("set times")
        # lineage counts depend only on the trees, not on lnode,
        # so compute them once outside the loop
        lineages = coal.count_lineages_per_branch(coal_tree, coal_recon,
                                                  locus_tree)
        for lnode in locus_tree:
            bottom_num, top_num = lineages[lnode]
            if lnode.parent:
                T = lnode.dist
            else:
                T = util.INF

            lineage_prob = prob_coal_counts(bottom_num, top_num, T, popsizes)

            # set zero_lineage_prob = True if any lineage returns zero probability
            if lineage_prob == 0.0:
                zero_lineage_prob = True

        #util.toc()
        # if any lineage probability is zero, coal_prob is log(0) = -inf
        if zero_lineage_prob:
            coal_prob = -float("inf")

        # otherwise, we calculate the coal_prob
        else:
            for j in xrange(nsamples_coal):

                # sample coal times and set the coal_tree accordingly
                # locus tree branch lengths are in myr
                # make sure the input popsizes are scaled to fit the time unit (typically myr)

                try:
                    sample_coal_times_topology(coal_tree, coal_recon,
                                               locus_tree, popsizes)
                except (ZeroDivisionError, ValueError):
                    # bad sample; skip it
                    util.log("bad sample")
                    continue

                #===============================================================================
                # (log) probability of observing the alignment
                #util.tic("alignment probability")

                # convert branch lengths from myr to sub/site
                for node in coal_tree:
                    node.dist *= subrate

                #util.tic("alignment prob")
                # set a regularization parameter beta
                print beta
                alignment_prob = beta * prob_alignment(alnfile,
                                                       partfile,
                                                       coal_tree,
                                                       rates,
                                                       freqs,
                                                       alphas,
                                                       threads=threads,
                                                       seed=seed,
                                                       eps=eps)
                #util.toc()
                ### util.log("p = %.6f" % alignment_prob)
                #util.toc()

                #===============================================================================
                ### util.log("   log p = %.6g" % alignment_prob)
                ### util.log("   p = %.6g" % exp(alignment_prob))
                alignment_prob_list.append(alignment_prob)

            ### util.log("p = %f" % alignment_prob_MonteCarlo)

            # log_sum_exp exponentiates the log probabilities, sums them, and
            # takes the log again, i.e. a numerically stable average in log space
            if len(alignment_prob_list) == 0:
                # all bad samples
                alignment_prob_MonteCarlo = -util.INF
            else:
                alignment_prob_MonteCarlo = log_sum_exp(
                    alignment_prob_list) - log(nsamples_coal)

            # P(T^G, R^G | T^L, t^L, daughters, theta) * int P(t^G | ...) * P(A | T^G, t^G) dt^G
            # coal_prob is a log probability
            coal_prob += topology_prob + alignment_prob_MonteCarlo

        # add coal probability to a list for further processing
        double_integral_list.append(coal_prob)

    # average the sampled log probabilities in log space:
    # exponentiate, sum, and take the log again
    double_integral = log_sum_exp(double_integral_list) - log(nsamples_locus)

    # logging info
    if info is not None:
        info["topology_prob"] = topology_prob  # one sample of t^L
        # one sample of t^L, averaged over t^G
        info["alignment_prob"] = alignment_prob_MonteCarlo
        info["coal_prob"] = double_integral
    util.toc()
    return double_integral
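
The code above leans on log_sum_exp() to average probabilities in log space without underflow. The helper is not shown in these excerpts; a minimal sketch of the standard trick, assuming the inputs are log probabilities:

from math import exp, log

def log_sum_exp(vals):
    # log(sum_i exp(v_i)) = m + log(sum_i exp(v_i - m)) with m = max(vals),
    # so the largest term is exp(0) = 1 and cannot underflow
    m = max(vals)
    if m == -float("inf"):
        return m  # all terms are log(0)
    return m + log(sum(exp(v - m) for v in vals))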