Exemple #1
0
    def finish(self):
        """Parse the last job's stdout for the selected model and ML tree.

        The stdout file of ``self.jobs[-1]`` is scanned line by line:

        * a line starting with ``Model selected:`` arms the parser;
        * while armed, lines starting with ``Model`` are skipped and the
          first line starting with ``partition`` provides the best model
          (the stripped text after the first ``=``);
        * a line starting with ``ML tree (NNI) for the best AIC model =``
          provides the newick text that is loaded into a ``PhyloTree``.

        The best model is written to ``self.best_model_file`` and logged.
        When ``self.ttype == "tree"``, the parsed tree is written to
        ``self.tree_file`` and ``self.model`` is set to the best model.
        ``ModelTesterTask.finish`` is called last.
        """
        tree_prefix = "ML tree (NNI) for the best AIC model ="
        best_model = None
        best_model_in_next_line = False
        t = None
        # NOTE(review): "rU" is the legacy universal-newlines mode flag;
        # confirm the interpreter version in use still accepts it.
        with open(self.jobs[-1].stdout_file, "rU") as stdout_fh:
            for line in stdout_fh:
                line = line.strip()
                if best_model_in_next_line and line.startswith("Model"):
                    # Skipped on purpose: the model name is taken from the
                    # following "partition" line instead.
                    continue
                if best_model_in_next_line and line.startswith("partition"):
                    best_model = line.split("=")[1].strip()
                    best_model_in_next_line = False
                elif line.startswith("Model selected:"):
                    best_model_in_next_line = True
                elif line.startswith(tree_prefix):
                    nw = line.replace(tree_prefix, "")
                    t = PhyloTree(nw)

        # Close the output file explicitly instead of relying on garbage
        # collection to flush it.
        with open(self.best_model_file, "w") as model_fh:
            model_fh.write(best_model)
        log.log(26, "Best model: %s" % best_model)
        if self.ttype == "tree":
            t.write(outfile=self.tree_file)
            self.model = best_model

        ModelTesterTask.finish(self)
Exemple #2
0
    def finish(self):
        """Extract the best model and ML tree from the last job's stdout.

        Reads ``self.jobs[-1].stdout_file`` one line at a time. After a
        ``Model selected:`` line has been seen, lines starting with
        ``Model`` are ignored and the first line starting with
        ``partition`` yields the best model (stripped text after the first
        ``=``). A line starting with
        ``ML tree (NNI) for the best AIC model =`` yields the newick string
        used to build a ``PhyloTree``.

        Side effects: writes the best model to ``self.best_model_file``,
        logs it, and, when ``self.ttype == "tree"``, writes the tree to
        ``self.tree_file`` and stores the model in ``self.model``. Ends by
        calling ``ModelTesterTask.finish``.
        """
        ml_tree_marker = "ML tree (NNI) for the best AIC model ="
        best_model = None
        awaiting_partition = False
        t = None
        # NOTE(review): "rU" is the legacy universal-newlines mode flag;
        # confirm the interpreter version in use still accepts it.
        with open(self.jobs[-1].stdout_file, "rU") as job_stdout:
            for raw_line in job_stdout:
                line = raw_line.strip()
                if awaiting_partition and line.startswith("Model"):
                    # Intentionally ignored; the "partition" line that
                    # follows carries the value that is kept.
                    pass
                elif awaiting_partition and line.startswith("partition"):
                    best_model = line.split("=")[1].strip()
                    awaiting_partition = False
                elif line.startswith("Model selected:"):
                    awaiting_partition = True
                elif line.startswith(ml_tree_marker):
                    t = PhyloTree(line.replace(ml_tree_marker, ""))

        # The handle is closed on exit from the block, so the model is on
        # disk before the task is marked as finished.
        with open(self.best_model_file, "w") as best_model_fh:
            best_model_fh.write(best_model)
        log.log(26, "Best model: %s" % best_model)
        if self.ttype == "tree":
            t.write(outfile=self.tree_file)
            self.model = best_model

        ModelTesterTask.finish(self)
Exemple #3
0
    def finish(self):
        """Load the PhyML result tree and its log-likelihood and store them.

        Both result files live in the job directory of ``self.jobs[0]`` and
        are named ``<self.alg_phylip_file>_phyml_tree.txt`` and
        ``<self.alg_phylip_file>_phyml_stats.txt``. The likelihood is the
        number following ``Log-likelihood:`` in the stats file. The tree
        (serialised with ``tree.write()``) and ``{"lk": <float>}`` are
        handed to ``TreeTask.store_data``.

        If the stats file has no ``Log-likelihood:`` entry, ``re.search``
        returns ``None`` and the ``m.groups()`` call raises
        ``AttributeError``.
        """
        job_dir = self.jobs[0].jobdir
        tree_file = os.path.join(job_dir,
                                 self.alg_phylip_file + "_phyml_tree.txt")
        stats_file = os.path.join(job_dir,
                                  self.alg_phylip_file + "_phyml_stats.txt")

        # Raw string keeps the regex escapes away from Python's own string
        # escape processing; the handle is closed as soon as it is read.
        with open(stats_file) as stats_fh:
            m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats_fh.read())
        lk = float(m.groups()[0])
        stats = {"lk": lk}
        tree = PhyloTree(tree_file)
        TreeTask.store_data(self, tree.write(), stats)
Exemple #4
0
    def finish(self):
        """Collect the PhyML tree and likelihood of the first job.

        Reads ``<self.alg_phylip_file>_phyml_stats.txt`` from the job
        directory of ``self.jobs[0]``, extracts the number that follows
        ``Log-likelihood:``, loads
        ``<self.alg_phylip_file>_phyml_tree.txt`` as a ``PhyloTree`` and
        passes the serialised tree plus ``{"lk": <float>}`` to
        ``TreeTask.store_data``.

        A stats file without a ``Log-likelihood:`` entry makes
        ``re.search`` return ``None``, so ``m.groups()`` raises
        ``AttributeError``.
        """
        phyml_dir = self.jobs[0].jobdir
        tree_file = os.path.join(phyml_dir,
                                 self.alg_phylip_file+"_phyml_tree.txt")
        stats_file = os.path.join(phyml_dir,
                                  self.alg_phylip_file+"_phyml_stats.txt")

        # Read the stats through a context manager so the descriptor is
        # released immediately; the pattern is a raw string so "\s" and
        # "\d" are passed to the regex engine untouched.
        with open(stats_file) as stats_handle:
            stats_text = stats_handle.read()
        m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats_text)
        lk = float(m.groups()[0])
        tree = PhyloTree(tree_file)
        TreeTask.store_data(self, tree.write(), {"lk": lk})
Exemple #5
0
    def finish(self):
        """Pick the model with the highest likelihood among finished jobs.

        Depending on ``self.lk_mode``:

        * ``"phyml"``: for every job whose ``jobcat`` is ``"bionj"``, the
          tree is loaded from ``<self.alg_phylip_file>_phyml_tree.txt`` and
          the likelihood is parsed from the ``Log-likelihood:`` entry of
          ``<self.alg_phylip_file>_phyml_stats.txt``; the model name is
          ``job.args["--model"]``.
        * ``"raxml"``: for every job whose ``jobcat`` is ``"raxml"``, the
          likelihood is the second whitespace-separated field on the first
          line of ``RAxML_log.<job.args["-n"]>``; the tree is loaded from
          ``job.args["-t"]`` and the model name is ``job.model``.

        Each tree is annotated with ``lk`` and ``model`` features. The
        ``[lk, model, tree]`` entries are sorted in ascending order, the
        last entry's model is logged as the selection, and the model plus
        the sorted list are passed to ``ModelTesterTask.store_data``.

        Raises ``IndexError`` when no job matched (empty list).
        """
        lks = []
        if self.lk_mode == "phyml":
            for job in self.jobs:
                if job.jobcat != "bionj":
                    continue
                tree_file = pjoin(job.jobdir,
                                  self.alg_phylip_file+"_phyml_tree.txt")
                stats_file = pjoin(job.jobdir,
                                   self.alg_phylip_file+"_phyml_stats.txt")
                tree = PhyloTree(tree_file)
                with open(stats_file) as stats_fh:
                    m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                                  stats_fh.read())
                lk = float(m.groups()[0])
                tree.add_feature("lk", lk)
                tree.add_feature("model", job.args["--model"])
                lks.append([float(tree.lk), tree.model, tree])
        elif self.lk_mode == "raxml":
            for job in self.jobs:
                if job.jobcat != "raxml":
                    continue
                log_path = pjoin(job.jobdir, "RAxML_log.%s" % job.args["-n"])
                with open(log_path) as log_fh:
                    lk = log_fh.readline().split()[1]
                tree = PhyloTree(job.args["-t"])
                tree.add_feature("lk", lk)
                tree.add_feature("model", job.model)
                lks.append([float(tree.lk), tree.model, tree])

        # Sort in ascending order by (likelihood, model). The key keeps the
        # tree objects out of the comparison, which a plain list sort would
        # fall back to when likelihood and model are both equal.
        lks.sort(key=lambda entry: (entry[0], entry[1]))
        # The highest likelihood is the last entry of the sorted list.
        best_model = lks[-1][1]
        log.log(22, "%s model selected from the following lk values:\n%s" %(best_model, '\n'.join(map(str, lks))))
        ModelTesterTask.store_data(self, best_model, lks)
Exemple #6
0
 def finish(self):
     """Select the highest-likelihood model and persist the result.

     Depending on ``self.lk_mode``:

     * ``"phyml"``: for every job flagged ``"phyml"``, the tree is loaded
       from ``<self.alg_basename>_phyml_tree.txt`` and the likelihood is
       parsed from the ``Log-likelihood:`` entry of
       ``<self.alg_basename>_phyml_stats.txt``, both in that job's own
       directory; the model name is ``job.args["--model"]``.
     * ``"raxml"``: for every job flagged ``"raxml"``, the likelihood is
       the second whitespace-separated field on the first line of
       ``RAxML_log.<job.args["-n"]>``; the tree is loaded from
       ``job.args["-t"]`` and the model name is ``job.model``.

     The best model name is written to ``self.best_model_file``; when
     ``self.tree_file`` is set, the best model's tree is written there.
     ``ModelTesterTask.finish`` is called last.

     Raises ``IndexError`` when no job matched (empty list).
     """
     lks = []
     if self.lk_mode == "phyml":
         for job in self.jobs:
             if job.flag != "phyml":
                 continue
             tree_file = os.path.join(job.jobdir,
                                      self.alg_basename+"_phyml_tree.txt")
             # Use this job's directory: each job has its own stats file.
             stats_file = os.path.join(job.jobdir,
                                       self.alg_basename+"_phyml_stats.txt")
             tree = PhyloTree(tree_file)
             with open(stats_file) as stats_fh:
                 m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                               stats_fh.read())
             lk = float(m.groups()[0])
             tree.add_feature("lk", lk)
             tree.add_feature("model", job.args["--model"])
             lks.append([float(tree.lk), tree.model, tree])
     elif self.lk_mode == "raxml":
         for job in self.jobs:
             if job.flag != "raxml":
                 continue
             log_path = os.path.join(job.jobdir,
                                     "RAxML_log.%s" % job.args["-n"])
             with open(log_path) as log_fh:
                 lk = log_fh.readline().split()[1]
             tree = PhyloTree(job.args["-t"])
             tree.add_feature("lk", lk)
             tree.add_feature("model", job.model)
             # Compare likelihoods numerically; as strings "-99.1" would
             # sort after "-1000.5".
             lks.append([float(lk), tree.model, tree])

     # Highest likelihood first; only the number takes part in ordering.
     lks.sort(key=lambda entry: entry[0], reverse=True)
     best_model = lks[0][1]
     best_tree = lks[0][2]
     with open(self.best_model_file, "w") as model_fh:
         model_fh.write(best_model)
     if self.tree_file:
         # Write the tree of the winning model, not whichever tree the
         # loop happened to process last.
         best_tree.write(outfile=self.tree_file)
     ModelTesterTask.finish(self)
Exemple #7
0
    def finish(self):
        """Root the task tree and attach it to the main tree.

        The task tree is loaded from the database using the data id derived
        from ``self.task_tree_file``. One of three paths is then taken:

        * main tree present, no outgroup seqs: the task-tree partition that
          best matches either child of the previous target node (the main
          tree node whose ``cladeid`` equals this node's clade id) is used
          as outgroup, and the task tree replaces that node in the main
          tree;
        * main tree present, outgroup seqs given: the task tree is rooted
          on the outgroup seqs, the outgroup is detached, and the remaining
          target clade replaces the previous target node in the main tree;
        * no main tree: the task tree is rooted at its midpoint or on the
          seqs named in ``GLOBALS["first_split_outgroup"]`` and becomes
          ``self.main_tree``.

        Attributes set on ``self``: ``out_seqs``, ``target_seqs``,
        ``outgroup_match_dist`` or ``outgroup_match`` (branch dependent),
        ``main_tree`` (third path only), ``pre_iter_task_tree``, ``rf``,
        ``pre_iter_support`` and ``task_tree``. Every node of the main tree
        also receives a regenerated ``cladeid`` feature.

        Raises:
            TaskError: when the requested outgroup is not monophyletic,
                names seqs outside ``target_seqs``, has fewer than two
                names in ``~`` mode, or resolves to the tree root.
        """
        def euc_dist(x, y):
            # Size of the symmetric difference of two sets divided by the
            # sum of their sizes (0.0 means identical sets).
            return len(x.symmetric_difference(y)) / float((len(x) + len(y)))

        dataid = db.get_dataid(*self.task_tree_file.split("."))
        ttree = PhyloTree(db.get_data(dataid))
        mtree = self.main_tree
        ttree.dist = 0
        cladeid, target_seqs, out_seqs = db.get_node_info(
            self.threadid, self.nodeid)
        self.out_seqs = out_seqs
        self.target_seqs = target_seqs

        ttree_content = ttree.get_cached_content()
        if mtree and not out_seqs:
            mtree_content = mtree.get_cached_content()
            log.log(24,
                    "Finding best scoring outgroup from previous iteration.")
            # Locate the main-tree node this task refers to.
            # NOTE(review): orig_target stays unbound if no node carries
            # this cladeid; the lines below would then fail.
            for _n in mtree_content:
                if _n.cladeid == cladeid:
                    orig_target = _n
            # Leaf names under each of the first two children of the
            # previous target node.
            target_left = set(
                [_n.name for _n in mtree_content[orig_target.children[0]]])
            target_right = set(
                [_n.name for _n in mtree_content[orig_target.children[1]]])

            # Score every non-root node of the task tree by how closely its
            # leaf set matches either side of the previous split.
            partition_pairs = []
            everything = set([_n.name for _n in ttree_content[ttree]])
            # NOTE: dict.iteritems() exists only in Python 2.
            for n, content in ttree_content.iteritems():
                if n is ttree:
                    continue
                left = set([_n.name for _n in content])
                right = everything - left
                d1 = euc_dist(left, target_left)
                d2 = euc_dist(left, target_right)
                best_match = min(d1, d2)
                partition_pairs.append([best_match, left, right, n])

            # Smallest distance first; entry 0 is the best match.
            partition_pairs.sort()

            self.outgroup_match_dist = partition_pairs[0][0]
            #self.outgroup_match = '#'.join( ['|'.join(partition_pairs[0][1]),
            #                      '|'.join(partition_pairs[0][2])] )

            outgroup = partition_pairs[0][3]
            ttree.set_outgroup(outgroup)

            # Carry over branch length and support of the replaced node.
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support

            # Merge task and main trees
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)

        elif mtree and out_seqs:
            log.log(26, "Rooting tree using %d custom seqs" % len(out_seqs))

            self.outgroup_match = '|'.join(out_seqs)

            #log.log(22, "Out seqs:    %s", len(out_seqs))
            #log.log(22, "Target seqs: %s", target_seqs)
            if len(out_seqs) > 1:
                #first root to a single seqs outside the outgroup
                #(should never fail and avoids random outgroup split
                #problems in unrooted trees)
                # presumably `tree & name` looks a node up by name — confirm
                ttree.set_outgroup(ttree & list(target_seqs)[0])
                # Now tries to get the outgroup node as a monophyletic clade
                outgroup = ttree.get_common_ancestor(out_seqs)
                # A non-empty symmetric difference means the common
                # ancestor's leaves are not exactly the outgroup seqs.
                if set(outgroup.get_leaf_names()) ^ out_seqs:
                    msg = "Monophyly of the selected outgroup could not be granted! Probably constrain tree failed."
                    #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, out_seqs)
                    raise TaskError(self, msg)
            else:
                outgroup = ttree & list(out_seqs)[0]

            ttree.set_outgroup(outgroup)
            orig_target = self.main_tree.get_common_ancestor(target_seqs)
            # NOTE(review): found_target is assigned but never read in this
            # method.
            found_target = outgroup.get_sisters()[0]

            # From here on ttree is only the target clade; the outgroup is
            # removed before grafting.
            ttree = ttree.get_common_ancestor(target_seqs)
            outgroup.detach()
            self.pre_iter_support = orig_target.support
            # Use previous dist and support
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)

        else:
            # ROOTS FIRST ITERATION
            log.log(24, "Getting outgroup for first NPR split")

            # if early split is provided in the command line, it
            # overrides config file
            mainout = GLOBALS.get("first_split_outgroup", "midpoint")

            if mainout.lower() == "midpoint":
                log.log(26, "Rooting to midpoint.")
                best_outgroup = ttree.get_midpoint_outgroup()
                if best_outgroup:
                    ttree.set_outgroup(best_outgroup)
                else:
                    log.warning("Midpoint outgroup could not be set!")
                    # Fall back to the first leaf yielded by the iterator.
                    # NOTE: iterator.next() exists only in Python 2.
                    ttree.set_outgroup(ttree.iter_leaves().next())
            else:
                if mainout.startswith("~"):
                    # Lazy defined outgroup. Will trust in the common
                    # ancestor of two or more OTUs
                    strict_common_ancestor = False
                    outs = set(mainout[1:].split())
                    if len(outs) < 2:
                        raise TaskError(
                            self, "First split outgroup error: common "
                            "ancestor calculation requires at least two OTU names"
                        )
                else:
                    strict_common_ancestor = True
                    outs = set(mainout.split())

                # assumes target_seqs is a set (set difference) — confirm
                if outs - target_seqs:
                    raise TaskError(
                        self,
                        "Unknown seqs cannot be used to set first split rooting:%s"
                        % (outs - target_seqs))

                if len(outs) > 1:
                    # Root first on any seq that is not in the outgroup.
                    anchor = list(set(target_seqs) - outs)[0]
                    ttree.set_outgroup(ttree & anchor)
                    common = ttree.get_common_ancestor(outs)
                    out_seqs = common.get_leaf_names()
                    if common is ttree:
                        msg = "First split outgroup could not be granted:%s" % out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)
                    # In strict mode the clade must contain exactly the
                    # requested seqs.
                    if strict_common_ancestor and set(out_seqs) ^ outs:
                        msg = "Monophyly of first split outgroup could not be granted:%s" % out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)

                    log.log(
                        26, "@@8:First split rooting to %d seqs@@1:: %s" %
                        (len(out_seqs), out_seqs))
                    ttree.set_outgroup(common)
                else:
                    single_out = outs.pop()
                    # NOTE(review): a name string is passed here, while the
                    # other set_outgroup calls pass a node — confirm that
                    # set_outgroup accepts names.
                    common = ttree.set_outgroup(single_out)
                    log.log(
                        26, "@@8:First split rooting to 1 seq@@1:: %s" %
                        (single_out))

            self.main_tree = ttree
            orig_target = ttree

        # Keep a copy of the previous target and compare it with the new
        # task tree (in the first-iteration path both are the same object).
        tn = orig_target.copy()
        self.pre_iter_task_tree = tn
        self.rf = orig_target.robinson_foulds(ttree)
        self.pre_iter_support = orig_target.support

        # Reloads node2content of the rooted tree and generate cladeids
        ttree_content = self.main_tree.get_cached_content()
        for n, content in ttree_content.iteritems():
            cid = generate_id([_n.name for _n in content])
            n.add_feature("cladeid", cid)

        #ttree.write(outfile=self.pruned_tree)
        self.task_tree = ttree
Exemple #8
0
 def finish(self):
     """Choose the model with the highest likelihood and save the outcome.

     ``self.lk_mode`` selects where likelihoods come from:

     * ``"phyml"``: jobs flagged ``"phyml"``. The tree is read from
       ``<self.alg_basename>_phyml_tree.txt`` and the likelihood from the
       ``Log-likelihood:`` entry of
       ``<self.alg_basename>_phyml_stats.txt`` in the job's own directory;
       the model name is ``job.args["--model"]``.
     * ``"raxml"``: jobs flagged ``"raxml"``. The likelihood is the second
       whitespace-separated field on the first line of
       ``RAxML_log.<job.args["-n"]>``; the tree is read from
       ``job.args["-t"]`` and the model name is ``job.model``.

     Writes the best model name to ``self.best_model_file`` and, when
     ``self.tree_file`` is set, the best model's tree to that path, then
     calls ``ModelTesterTask.finish``.

     Raises ``IndexError`` when no job matched (empty list).
     """
     lks = []
     if self.lk_mode == "phyml":
         phyml_jobs = [j for j in self.jobs if j.flag == "phyml"]
         for job in phyml_jobs:
             tree_file = os.path.join(job.jobdir,
                                      self.alg_basename + "_phyml_tree.txt")
             # The stats file is taken from the same job as the tree file.
             stats_file = os.path.join(
                 job.jobdir, self.alg_basename + "_phyml_stats.txt")
             tree = PhyloTree(tree_file)
             with open(stats_file) as stats_fh:
                 m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                               stats_fh.read())
             lk = float(m.groups()[0])
             tree.add_feature("lk", lk)
             tree.add_feature("model", job.args["--model"])
             lks.append([float(tree.lk), tree.model, tree])
     elif self.lk_mode == "raxml":
         raxml_jobs = [j for j in self.jobs if j.flag == "raxml"]
         for job in raxml_jobs:
             log_path = os.path.join(job.jobdir,
                                     "RAxML_log.%s" % job.args["-n"])
             with open(log_path) as log_fh:
                 lk = log_fh.readline().split()[1]
             tree = PhyloTree(job.args["-t"])
             tree.add_feature("lk", lk)
             tree.add_feature("model", job.model)
             # Store a float so the ordering below is numeric rather than
             # lexicographic.
             lks.append([float(lk), tree.model, tree])

     # Order by likelihood only, best (highest) first.
     lks.sort(key=lambda entry: entry[0], reverse=True)
     best_model = lks[0][1]
     best_tree = lks[0][2]
     with open(self.best_model_file, "w") as best_model_fh:
         best_model_fh.write(best_model)
     if self.tree_file:
         # The tree written out must be the one belonging to best_model.
         best_tree.write(outfile=self.tree_file)
     ModelTesterTask.finish(self)
Exemple #9
0
    def finish(self):
        """Root the task tree and graft it into the main tree.

        The task tree is loaded from the database through the data id built
        from ``self.task_tree_file``. Three paths exist:

        * main tree present and no outgroup seqs: every non-root node of
          the task tree is scored against the two children of the previous
          target node (matched by ``cladeid``); the best scoring node is
          used as outgroup and the task tree replaces the previous target
          in the main tree;
        * main tree present and outgroup seqs given: the task tree is
          rooted on them, the outgroup is detached and the target clade
          replaces the previous target in the main tree;
        * no main tree: the task tree is rooted at its midpoint or on the
          seqs given by ``GLOBALS["first_split_outgroup"]`` and is stored
          as ``self.main_tree``.

        Attributes set on ``self``: ``out_seqs``, ``target_seqs``,
        ``outgroup_match_dist`` or ``outgroup_match`` (branch dependent),
        ``main_tree`` (third path only), ``pre_iter_task_tree``, ``rf``,
        ``pre_iter_support`` and ``task_tree``. All nodes of the main tree
        get a freshly generated ``cladeid`` feature.

        Raises:
            TaskError: when the outgroup is not monophyletic, contains seqs
                missing from ``target_seqs``, has fewer than two names in
                ``~`` mode, or its common ancestor is the tree root.
        """
        def euc_dist(x, y):
            # Symmetric-difference size over the summed sizes of both sets;
            # identical sets give 0.0.
            return len(x.symmetric_difference(y)) / float((len(x) + len(y)))
        dataid = db.get_dataid(*self.task_tree_file.split("."))
        ttree = PhyloTree(db.get_data(dataid))
        mtree = self.main_tree
        ttree.dist = 0
        cladeid, target_seqs, out_seqs = db.get_node_info(self.threadid, self.nodeid)
        self.out_seqs = out_seqs
        self.target_seqs = target_seqs

        ttree_content = ttree.get_cached_content()
        if mtree and not out_seqs:
            mtree_content = mtree.get_cached_content()
            log.log(24, "Finding best scoring outgroup from previous iteration.")
            # Find the main-tree node with this task's clade id.
            # NOTE(review): if nothing matches, orig_target is never bound
            # and the following lines fail.
            for _n in mtree_content:
                if _n.cladeid == cladeid:
                    orig_target = _n
            # Leaf-name sets of the first two children of the old target.
            target_left = set([_n.name for _n in mtree_content[orig_target.children[0]]])
            target_right = set([_n.name for _n in mtree_content[orig_target.children[1]]])

            partition_pairs = []
            everything = set([_n.name for _n in ttree_content[ttree]])
            # NOTE: dict.iteritems() exists only in Python 2.
            for n, content in ttree_content.iteritems():
                if n is ttree:
                    continue
                left = set([_n.name for _n in content])
                right =  everything - left
                d1 = euc_dist(left, target_left)
                d2 = euc_dist(left, target_right)
                best_match = min(d1, d2)
                partition_pairs.append([best_match, left, right, n])

            # Ascending by distance, so index 0 holds the closest match.
            partition_pairs.sort()

            self.outgroup_match_dist = partition_pairs[0][0]
            #self.outgroup_match = '#'.join( ['|'.join(partition_pairs[0][1]),
            #                      '|'.join(partition_pairs[0][2])] )


            outgroup = partition_pairs[0][3]
            ttree.set_outgroup(outgroup)

            # The new subtree inherits dist and support from the node it
            # replaces.
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support

            # Merge task and main trees
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)

        elif mtree and out_seqs:
            log.log(26, "Rooting tree using %d custom seqs" %
                   len(out_seqs))

            self.outgroup_match = '|'.join(out_seqs)

            #log.log(22, "Out seqs:    %s", len(out_seqs))
            #log.log(22, "Target seqs: %s", target_seqs)
            if len(out_seqs) > 1:
                #first root to a single seqs outside the outgroup
                #(should never fail and avoids random outgroup split
                #problems in unrooted trees)
                # presumably `tree & name` looks a node up by name — confirm
                ttree.set_outgroup(ttree & list(target_seqs)[0])
                # Now tries to get the outgroup node as a monophyletic clade
                outgroup = ttree.get_common_ancestor(out_seqs)
                # Any leaf present on only one side means the clade is not
                # exactly the requested outgroup.
                if set(outgroup.get_leaf_names()) ^ out_seqs:
                    msg = "Monophyly of the selected outgroup could not be granted! Probably constrain tree failed."
                    #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, out_seqs)
                    raise TaskError(self, msg)
            else:
                outgroup = ttree & list(out_seqs)[0]

            ttree.set_outgroup(outgroup)
            orig_target = self.main_tree.get_common_ancestor(target_seqs)
            # NOTE(review): found_target is assigned but never read in this
            # method.
            found_target = outgroup.get_sisters()[0]

            # Keep only the target clade; the outgroup is dropped before
            # the clade is attached to the main tree.
            ttree = ttree.get_common_ancestor(target_seqs)
            outgroup.detach()
            self.pre_iter_support = orig_target.support
            # Use previous dist and support
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)

        else:
            # ROOTS FIRST ITERATION
            log.log(24, "Getting outgroup for first NPR split")

            # if early split is provided in the command line, it
            # overrides config file
            mainout = GLOBALS.get("first_split_outgroup", "midpoint")

            if mainout.lower() == "midpoint":
                log.log(26, "Rooting to midpoint.")
                best_outgroup = ttree.get_midpoint_outgroup()
                if best_outgroup:
                    ttree.set_outgroup(best_outgroup)
                else:
                    log.warning("Midpoint outgroup could not be set!")
                    # Fall back to the first leaf yielded by the iterator.
                    # NOTE: iterator.next() exists only in Python 2.
                    ttree.set_outgroup(ttree.iter_leaves().next())
            else:
                if mainout.startswith("~"):
                    # Lazy defined outgroup. Will trust in the common
                    # ancestor of two or more OTUs
                    strict_common_ancestor = False
                    outs = set(mainout[1:].split())
                    if len(outs) < 2:
                        raise TaskError(self, "First split outgroup error: common "
                                        "ancestor calculation requires at least two OTU names")
                else:
                    strict_common_ancestor = True
                    outs = set(mainout.split())

                # assumes target_seqs is a set (set difference) — confirm
                if outs - target_seqs:
                    raise TaskError(self, "Unknown seqs cannot be used to set first split rooting:%s" %(outs - target_seqs))

                if len(outs) > 1:
                    # Pre-root on a seq that does not belong to the outgroup.
                    anchor = list(set(target_seqs) - outs)[0]
                    ttree.set_outgroup(ttree & anchor)
                    common = ttree.get_common_ancestor(outs)
                    out_seqs = common.get_leaf_names()
                    if common is ttree:
                        msg = "First split outgroup could not be granted:%s" %out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)
                    # Strict mode demands the clade hold exactly the
                    # requested seqs.
                    if strict_common_ancestor and set(out_seqs) ^ outs:
                        msg = "Monophyly of first split outgroup could not be granted:%s" %out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)

                    log.log(26, "@@8:First split rooting to %d seqs@@1:: %s" %(len(out_seqs),out_seqs))
                    ttree.set_outgroup(common)
                else:
                    single_out = outs.pop()
                    # NOTE(review): a name string is passed here, while the
                    # other set_outgroup calls pass a node — confirm that
                    # set_outgroup accepts names.
                    common = ttree.set_outgroup(single_out)
                    log.log(26, "@@8:First split rooting to 1 seq@@1:: %s" %(single_out))

            self.main_tree = ttree
            orig_target = ttree

        # Snapshot the previous target and compare it with the new task
        # tree (in the first-iteration path both are the same object).
        tn = orig_target.copy()
        self.pre_iter_task_tree = tn
        self.rf = orig_target.robinson_foulds(ttree)
        self.pre_iter_support = orig_target.support

        # Reloads node2content of the rooted tree and generate cladeids
        ttree_content = self.main_tree.get_cached_content()
        for n, content in ttree_content.iteritems():
            cid = generate_id([_n.name for _n in content])
            n.add_feature("cladeid", cid)

        #ttree.write(outfile=self.pruned_tree)
        self.task_tree = ttree