Example #1
 def finish(self):
     # Once executed, alignment is converted into relaxed
     # interleaved phylip format.
     alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
     fasta = alg.write(format="fasta")
     phylip = alg.write(format="iphylip_relaxed")
     AlgTask.store_data(self, fasta, phylip)
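A minimal standalone sketch of the same conversion outside the task framework, assuming only that ete3's SeqGroup is importable and that a FASTA alignment exists on disk (the file names below are placeholders, not pipeline paths):

from ete3 import SeqGroup

# Load a FASTA alignment and re-serialize it; write(format=...) returns the
# serialized text when no outfile is given, and writes to disk otherwise.
alg = SeqGroup("alg.fasta", format="fasta")
fasta_text = alg.write(format="fasta")
alg.write(format="iphylip_relaxed", outfile="alg.iphylip")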
Example #2
def switch_to_codon(alg_fasta_file, kept_columns=None):
    # Check conservation of columns. If too many identities,
    # switch to codon alignment and make the tree with DNA.
    # Mixed models are another possibility.
    if kept_columns:
        kept_columns = set(map(int, kept_columns))
    else:
        kept_columns = []

    #all_nt_alg = SeqGroup(nt_seed_file)
    aa_alg = SeqGroup(alg_fasta_file)
    nt_alg = SeqGroup()

    for seqname, aaseq, comments in aa_alg.iter_entries():
        #ntseq = all_nt_alg.get_seq(seqname).upper()
        ntseq = db.get_seq(seqname, "nt").upper()
        ntalgseq = []
        nt_pos = 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos+3]
                nt_pos += 3

            if not kept_columns or pos in kept_columns:
                # We trust the sequence in the DB; consistency should have
                # been checked during startup
                ntalgseq.append(codon)

        ntalgseq = "".join(ntalgseq)
        nt_alg.set_seq(seqname, ntalgseq)

    return nt_alg
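The function above depends on the pipeline's db handle for the unaligned coding sequences. A self-contained sketch of the same back-translation idea, using a plain dict instead of the database (the names, sequences, and GAP_CHARS value are illustrative assumptions):

GAP_CHARS = set("-.")

def aa_alignment_to_codons(aa_alg, nt_seqs, kept_columns=None):
    # aa_alg: {name: aligned amino-acid sequence}
    # nt_seqs: {name: ungapped coding sequence, 3 nt per non-gap aa position}
    kept = set(kept_columns) if kept_columns else None
    codon_alg = {}
    for name, aaseq in aa_alg.items():
        ntseq = nt_seqs[name].upper()
        codons, nt_pos = [], 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos + 3]
                nt_pos += 3
            if kept is None or pos in kept:
                codons.append(codon)
        codon_alg[name] = "".join(codons)
    return codon_alg

print(aa_alignment_to_codons({"seqA": "M-K"}, {"seqA": "ATGAAA"}))
# -> {'seqA': 'ATG---AAA'}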
Example #3
 def finish(self):
     # Once executed, alignment is converted into relaxed
     # interleaved phylip format.
     alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
     fasta = alg.write(format="fasta")
     phylip = alg.write(format="iphylip_relaxed")
     AlgTask.store_data(self, fasta, phylip)
Example #4
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning(
                "Trimming was too aggressive and tried"
                " to remove one or more sequences."
                " Alignment trimming will be disabled for this dataset.")
            self.clean_alg_fasta_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(main_job.stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = list(
                        map(int,
                            line.split("\t")[1].split(",")))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
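The #ColumnsMap parsing above assumes the trimming job's stdout contains a tab-separated line such as "#ColumnsMap<TAB>0,1,4,7" (a trimAl-style column map). A small hedged helper doing just that extraction, independent of the task framework:

def parse_columns_map(stdout_path):
    # Return the kept column indices from the last '#ColumnsMap' line,
    # or None if no such line is present (assumed trimAl-like output).
    kept_columns = None
    with open(stdout_path) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith("#ColumnsMap"):
                kept_columns = [int(c) for c in line.split("\t")[1].split(",")]
    return kept_columns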
Example #5
 def finish(self):
     # Once executed, alignment is converted into relaxed
     # interleaved phylip format.
     final_job = self.jobs[2]
     alg = SeqGroup(os.path.join(final_job.jobdir, "alg.fasta"))
     alg.write(outfile=self.alg_fasta_file, format="fasta")
     alg.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")
     AlgTask.finish(self)
Example #6
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "mcoffee.fasta"))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")

        alg_list_string = '\n'.join([pjoin(GLOBALS["input_dir"],
                                           aname) for aname in self.all_alg_files])
        db.add_task_data(self.taskid, DATATYPES.alg_list, alg_list_string)

        AlgTask.store_data(self, fasta, phylip)
Example #7
 def finish(self):
     if self.conf[self.confname]["_alg_trimming"]:
         # If trimming happened after mcoffee, let's save the
         # resulting output
         trim_job = self.jobs[-1]
         alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
         fasta = alg.write(format="fasta")
         phylip = alg.write(format="iphylip_relaxed")
         AlgTask.store_data(self, fasta, phylip)
     else:
         # If no post trimming, output is just what Mcoffee
         # produced, so we can recycle its data ids.
         mc_task = self.jobs[-1]
         fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
         phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
         db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
         db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
Example #8
def switch_to_codon(alg_fasta_file, kept_columns=None):
    # Check conservation of columns. If too many identities,
    # switch to codon alignment and make the tree with DNA.
    # Mixed models are another possibility.
    if kept_columns:
        kept_columns = set(map(int, kept_columns))
    else:
        kept_columns = []

    #all_nt_alg = SeqGroup(nt_seed_file)
    aa_alg = SeqGroup(alg_fasta_file)
    nt_alg = SeqGroup()

    for seqname, aaseq, comments in aa_alg.iter_entries():
        #ntseq = all_nt_alg.get_seq(seqname).upper()
        ntseq = db.get_seq(seqname, "nt").upper()
        ntalgseq = []
        nt_pos = 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos + 3]
                nt_pos += 3

            if not kept_columns or pos in kept_columns:
                # We trust the sequence in the DB; consistency should have
                # been checked during startup
                ntalgseq.append(codon)

        ntalgseq = "".join(ntalgseq)
        nt_alg.set_seq(seqname, ntalgseq)

    return nt_alg
Example #9
def get_identity(fname):
    s = SeqGroup(fname)
    seqlen = len(next(six.itervalues(s.id2seq)))  # alignment (column) length
    ident = list()
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in six.itervalues(s.id2seq):
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
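A self-contained version of the same per-column identity statistic on a toy alignment, mirroring the loop above (gaps are ignored, and columns containing only gaps are skipped):

from collections import defaultdict

def column_identities(seqs, gap="-"):
    # seqs: list of equally long aligned strings
    idents = []
    for column in zip(*seqs):
        counts = defaultdict(int)
        for ch in column:
            if ch != gap:
                counts[ch] += 1
        if counts:
            idents.append(float(max(counts.values())) / sum(counts.values()))
    return idents

print(column_identities(["ATG-", "ATC-", "AAC-"]))
# -> [1.0, 0.666..., 0.666...]  (the all-gap fourth column is skipped)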
Example #10
 def finish(self):
     # Once executed, alignment is converted into relaxed
     # interleaved phylip format.
     final_job = self.jobs[2]
     alg = SeqGroup(os.path.join(final_job.jobdir, "alg.fasta"))
     alg.write(outfile=self.alg_fasta_file, format="fasta")
     alg.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")
     AlgTask.finish(self)
Example #11
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning("Trimming was too aggressive and tried"
                        " to remove one or more sequences."
                        " Alignment trimming will be disabled for this dataset."
                        )
            self.clean_alg_fasta_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(main_job.stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = list(map(int, line.split("\t")[1].split(",")))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
Example #12
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()

        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    # Capture Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None

    BUG = set()
    try:
        # Enters into task scheduling
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                #sgeid2jobs = db.get_sge_tasks()
                #qstat_jobs = sge.qstat()
                pass
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in six.iteritems(thread2tasks):
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started",
                          '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses

                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in six.iteritems(
                                            j.input_files):
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(
                                                GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(
                                                db.get_data(dataid))

                                    log.log(
                                        24, "  @@8:Queueing @@1: %s from %s" %
                                        (j, task))
                                    if execution:
                                        job_queue.put([
                                            j.jobid, j.cores, cmd,
                                            j.status_file
                                        ])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(
                                GLOBALS["email"],
                                "Errors found in %s!" % threadname,
                                '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append(
                            [task, e.value, e.msg])
                        continue
                else:
                    # Set temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)

                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                            GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()

                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print(task, file=CMD_LOG)
                    for c in cmd_lines:
                        print('   ' + '\t'.join(map(str, c)), file=CMD_LOG)
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(
                            task, task.target_wkname)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append(
                            [task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)

                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("Task contains errors: %s" % task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, None, "Found (E) task status"])

            #db.commit()
            #if not back_launcher:
            #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
            #                    execution, run_detached)

            # Update global task list with recently added jobs to be checked
            # during the next cycle
            pending_tasks.update(to_add_tasks)

            ## END CHECK AND UPDATE CURRENT TASKS
            ## ================================

            if wtime:
                set_logindent(0)
                log.log(28, "@@13:Waiting %s seconds@@1:" % wtime)
                sleep(wtime)
            else:
                sleep(schedule_time)

            # Dump / show ended threads
            error_lines = []
            for configid, etasks in six.iteritems(thread_errors):
                error_lines.append("Thread @@10:%s@@1: contains errors:" %\
                            (GLOBALS[configid]["_name"]))
                for error in etasks:
                    error_lines.append(" ** %s" % error[0])
                    e_obj = error[1] if error[1] else error[0]
                    error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                    if e_obj is not error[0]:
                        error_lines.append("      -> %s" % e_obj)
                    error_lines.append("      -> %s" % error_path)
                    error_lines.append("        -> %s" % error[2])
            for eline in error_lines:
                log.error(eline)

            pending_threads = set([ts.configid for ts in pending_tasks])
            finished_threads = expected_threads - (pending_threads
                                                   | set(thread_errors.keys()))
            just_finished_lines = []
            finished_lines = []
            for configid in finished_threads:
                # configid is the same as threadid in master tasks
                final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                        GLOBALS["inputname"] + ".final_tree")
                threadname = GLOBALS[configid]["_name"]

                if configid in past_threads:
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    finished_lines.append("Finished %s in %d iteration(s)" %
                                          (threadname, past_threads[configid]))
                else:

                    log.log(28, "Assembling final tree...")
                    main_tree, treeiters = assembly_tree(configid)
                    past_threads[configid] = treeiters - 1

                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])

                    log.log(
                        28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                        threadname, final_tree_file + ".nw",
                        final_tree_file + ".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file + ".nw")
                    main_tree.write(outfile=final_tree_file + ".nwx",
                                    features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(
                            28,
                            "Writing root node alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file + ".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(
                            28,
                            "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".trimmed.fa")

                        alg = SeqGroup(
                            get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file + ".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if norender == False:
                        log.log(
                            28, "Generating tree image for @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".png")
                        for lf in main_tree:
                            lf.add_feature("sequence",
                                           alg.get_seq(lf.safename))
                        try:
                            from ete3.tools.phylobuild_lib.visualize import draw_tree
                            draw_tree(main_tree, GLOBALS[configid],
                                      final_tree_file + ".png")
                        except Exception as e:
                            log.warning(
                                '@@8:something went wrong when generating the tree image. Try manually :(@@1:'
                            )
                            if DEBUG:
                                import traceback, sys
                                traceback.print_exc(file=sys.stdout)

                    just_finished_lines.append(
                        "Finished %s in %d iteration(s)" %
                        (threadname, past_threads[configid]))
            if GLOBALS["email"]:
                if not pending_tasks:
                    all_lines = finished_lines + just_finished_lines + error_lines
                    send_mail(GLOBALS["email"], "Your NPR process has ended",
                              '\n'.join(all_lines))

                elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                        GLOBALS["email_report_time"]:
                    all_lines = info_lines + error_lines + just_finished_lines
                    send_mail(GLOBALS["email"], "Your NPR report",
                              '\n'.join(all_lines))
                    last_report_time = time()

                elif just_finished_lines:
                    send_mail(GLOBALS["email"], "Finished threads!",
                              '\n'.join(just_finished_lines))

            log.log(26, "")
    except:
        raise

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
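The scheduler above hands jobs to a separate launcher process through a multiprocessing queue. A minimal sketch of that producer/consumer pattern in isolation (the consumer body is a placeholder, not the pipeline's background_job_launcher):

from multiprocessing import Process, Queue

def toy_launcher(job_queue):
    # Placeholder consumer: pulls [jobid, cores, cmd, status_file] items
    # until a None sentinel arrives, then exits.
    while True:
        item = job_queue.get()
        if item is None:
            break
        jobid, cores, cmd, status_file = item
        print("would launch %s on %s core(s): %s" % (jobid, cores, cmd))

if __name__ == "__main__":
    queue = Queue()
    launcher = Process(target=toy_launcher, args=(queue,))
    launcher.start()
    queue.put(["job1", 2, "echo hello", "job1.status"])
    queue.put(None)  # sentinel to stop the consumer
    launcher.join()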
Example #13
 def finish(self):
     alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
     fasta = alg.write(format="fasta")
     phylip = alg.write(format="iphylip_relaxed")
     AlgTask.store_data(self, fasta, phylip)
Example #14
def split_tree(task_tree_node, task_outgroups, main_tree, alg_path, npr_conf,
               threadid, target_cladeids):
    """Browses a task tree from root to leaves and yields next
    suitable nodes for NPR iterations. Each yielded node comes with
    the set of target and outgroup tips.
    """
    def processable_node(_n):
        """This is an internal function that returns True if a given node
        is suitable for an NPR iteration. It can be used as
        "is_leaf_fn" when traversing a tree.

        Note that this function uses several variables which change within the
        split_tree function, so must be kept within its namespace.

        """
        is_leaf = False
        for wkname, wkfilter in npr_conf.npr_workflows:
            # if node is not in the targets or does not meet size filters, skip
            # workflow
            if _n is master_node or \
               (_TARGET_NODES and _n not in _TARGET_NODES) or \
               (target_cladeids and _n.cladeid not in target_cladeids) or \
               len(n2content[_n]) < max(wkfilter.get("min_size", 3), 3) or \
               ("max_size" in wkfilter and len(n2content[_n]) > wkfilter["max_size"]):
                continue

            # If seq_sim filter used, calculate node stats
            if ALG and ("min_seq_sim" in wkfilter
                        or "max_seq_sim" in wkfilter):
                if not hasattr(_n, "seqs_mean_ident"):
                    log.log(20, "Calculating node sequence stats...")
                    mx, mn, avg, std = get_seqs_identity(
                        ALG, [__n.name for __n in n2content[_n]])
                    _n.add_features(seqs_max_ident=mx,
                                    seqs_min_ident=mn,
                                    seqs_mean_ident=avg,
                                    seqs_std_ident=std)
                    log.log(
                        20,
                        "mx=%s, mn=%s, avg=%s, std=%s" % (mx, mn, avg, std))

                if _n.seqs_mean_ident < wkfilter["min_seq_sim"]:
                    continue

                if _n.seqs_mean_ident > wkfilter["max_seq_sim"]:
                    continue

            else:
                _n.add_features(seqs_max_ident=None,
                                seqs_min_ident=None,
                                seqs_mean_ident=None,
                                seqs_std_ident=None)

            if "min_support" in wkfilter:
                # If we are optimizing only lowly supported nodes, and nodes are
                # optimized without an outgroup, our target node is actually the
                # parent of lowly supported nodes. Therefore, we check whether
                # support is low in the children nodes, and return this node if so.
                if not npr_conf.use_outgroup:
                    if not [
                            _ch for _ch in _n.children
                            if _ch.support <= wkfilter["min_support"]
                    ]:
                        continue
                # Otherwise, just skip the node if it is above the min support
                elif _n.support > wkfilter["min_support"]:
                    continue

            # At this point, the node has passed all the filters of this
            # workflow, so it can be optimized
            is_leaf = True
            _n._target_wkname = wkname
            break

        return is_leaf

    log.log(20, "Loading tree content...")
    n2content = main_tree.get_cached_content()
    if alg_path:
        log.log(20, "Loading associated alignment to check seq. similarity")
        raw_alg = db.get_task_data(*alg_path.split("."))
        ALG = SeqGroup(raw_alg)
    else:
        ALG = None

    log.log(20, "Finding next NPR nodes...")
    # task_tree_node is actually a node in main_tree, since it has
    # already been merged
    trees_to_browse = [task_tree_node]
    npr_nodes = 0
    # Load the current tree content, so we can avoid reconstructing exactly
    # the same tree
    tasktree_content = set([leaf.name for leaf in n2content[task_tree_node]
                            ]) | set(task_outgroups)
    while trees_to_browse:
        master_node = trees_to_browse.pop()

        # if custom taxa levels are defined as targets, find them in this
        # subtree
        # this container is used by the processable_node function
        _TARGET_NODES = defaultdict(list)
        opt_levels = GLOBALS[threadid].get('_optimized_levels', None)
        if opt_levels is not None:
            # any descendant of the already processed node is suitable for
            # selection. If the ancestor of level-species is on top of the
            # task_tree_node, it will be discarded
            avail_nodes = set(master_node.get_descendants())
            for lin in opt_levels:
                sp2lin, lin2sp = GLOBALS["lineages"]
                optimized, strict_monophyly = opt_levels[lin]
                if not optimized:
                    ancestor = main_tree.get_common_ancestor(*lin2sp[lin])
                    if ancestor in avail_nodes:
                        # check that the node satisfies level monophyly config
                        ancestor_content = set(
                            [x.name for x in n2content[ancestor]])
                        if not strict_monophyly or lin2sp[
                                lin] == ancestor_content:
                            _TARGET_NODES[ancestor].append(lin)
                        elif strict_monophyly:
                            log.log(
                                26,
                                "Discarding not monophyletic level @@11:%s@@1:"
                                % lin)
                    else:
                        log.log(26, "Discarding upper clade @@11:%s@@1:" % lin)

        for node in master_node.iter_leaves(is_leaf_fn=processable_node):
            if opt_levels:
                log.log(
                    28, "Trying to optimize custom tree level: @@11:%s@@1:" %
                    _TARGET_NODES[node])
                for lin in _TARGET_NODES[node]:
                    # Mark the level as optimized, so it is not computed again
                    opt_levels[lin][0] = True

            log.log(
                28, "Found possible target node of size %s with branch support %f" %
                (len(n2content[node]), node.support))
            log.log(28, "First suitable workflow: %s" % (node._target_wkname))

            # Finds best outgroup for the target node
            if npr_conf.use_outgroup:
                splitterconfname, _ = npr_conf.tree_splitter
                splitterconf = GLOBALS[threadid][splitterconfname]
                #seqs, outs = select_outgroups(node, n2content, splitterconf)
                #seqs, outs = select_closest_outgroup(node, n2content, splitterconf)
                seqs, outs = select_sister_outgroup(node, n2content,
                                                    splitterconf)
            else:
                seqs = set([_i.name for _i in n2content[node]])
                outs = set()

            if seqs | outs == tasktree_content:
                log.log(
                    26,
                    "Discarding target node of size %s, due to identity with its parent node"
                    % len(n2content[node]))
                #print tasktree_content
                #print seqs
                #print outs
                trees_to_browse.append(node)
            else:
                npr_nodes += 1
                yield node, seqs, outs, node._target_wkname
    log.log(28, "%s nodes will be optimized", npr_nodes)
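split_tree stops its traversal at nodes that qualify for a new NPR iteration by passing processable_node as ete3's is_leaf_fn hook. A minimal, hedged illustration of that hook on a toy tree (the newick string and the size threshold are made up):

from ete3 import Tree

t = Tree("((a,b),((c,d),(e,(f,g))));")
n2content = t.get_cached_content()  # node -> set of leaf nodes below it

def small_clade(node):
    # Treat clades with fewer than 3 leaves as "leaves" for the traversal,
    # so iter_leaves() yields the largest clades under that size.
    return len(n2content[node]) < 3

for node in t.iter_leaves(is_leaf_fn=small_clade):
    print(sorted(leaf.name for leaf in n2content[node]))
# -> ['a', 'b'] / ['c', 'd'] / ['e'] / ['f', 'g']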
Example #15
 def finish(self):
     alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
     fasta = alg.write(format="fasta")
     phylip = alg.write(format="iphylip_relaxed")
     AlgTask.store_data(self, fasta, phylip)
Example #16
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution, debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()

        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue


    # Capture Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)



    last_report_time = None

    BUG = set()
    try:
        # Enters into task scheduling
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                #sgeid2jobs = db.get_sge_tasks()
                #qstat_jobs = sge.qstat()
                pass
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in six.iteritems(thread2tasks):
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" %getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" %(
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"]  and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started", '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses

                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in six.iteritems(j.input_files):
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, "  @@8:Queueing @@1: %s from %s" %(j, task))
                                    if execution:
                                        job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError as e:
                        log.error("Errors found in %s" %task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"], "Errors found in %s!" %threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)


                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()

                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print(task, file=CMD_LOG)
                    for c in cmd_lines:
                        print('   '+'\t'.join(map(str, c)), file=CMD_LOG)
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task, task.target_wkname)
                    except TaskError as e:
                        log.error("Errors found in %s" %task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)

                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("Task contains errors: %s" % task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, None, "Found (E) task status"])

            #db.commit()
            #if not back_launcher:
            #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
            #                    execution, run_detached)

            # Update global task list with recently added jobs to be checked
            # during the next cycle
            pending_tasks.update(to_add_tasks)

            ## END CHECK AND UPDATE CURRENT TASKS
            ## ================================

            if wtime:
                set_logindent(0)
                log.log(28, "@@13:Waiting %s seconds@@1:" %wtime)
                sleep(wtime)
            else:
                sleep(schedule_time)

            # Dump / show ended threads
            error_lines = []
            for configid, etasks in six.iteritems(thread_errors):
                error_lines.append("Thread @@10:%s@@1: contains errors:" %\
                            (GLOBALS[configid]["_name"]))
                for error in etasks:
                    error_lines.append(" ** %s" %error[0])
                    e_obj = error[1] if error[1] else error[0]
                    error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                    if e_obj is not error[0]:
                        error_lines.append("      -> %s" %e_obj)
                    error_lines.append("      -> %s" %error_path)
                    error_lines.append("        -> %s" %error[2])
            for eline in error_lines:
                log.error(eline)

            pending_threads = set([ts.configid for ts in pending_tasks])
            finished_threads = expected_threads - (pending_threads | set(thread_errors.keys()))
            just_finished_lines = []
            finished_lines = []
            for configid in finished_threads:
                # configid is the same as threadid in master tasks
                final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                        GLOBALS["inputname"] + ".final_tree")
                threadname = GLOBALS[configid]["_name"]

                if configid in past_threads:
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    finished_lines.append("Finished %s in %d iteration(s)" %(
                            threadname, past_threads[configid]))
                else:

                    log.log(28, "Assembling final tree...")
                    main_tree, treeiters =  assembly_tree(configid)
                    past_threads[configid] = treeiters - 1

                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])


                    log.log(28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                            threadname, final_tree_file+".nw",
                            final_tree_file+".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file+".nw")
                    main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(28, "Writing root node alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file+".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" %(realname, seq), file=OUT)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".trimmed.fa")

                        alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file+".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" %(realname, seq), file=OUT)
                        OUT.close()

                    if norender == False:
                        log.log(28, "Generating tree image for @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".png")
                        for lf in main_tree:
                            lf.add_feature("sequence", alg.get_seq(lf.safename))
                        try:
                            from ete3.tools.phylobuild_lib.visualize import draw_tree
                            draw_tree(main_tree, GLOBALS[configid], final_tree_file+".png")
                        except Exception as e:
                            log.warning('@@8:something went wrong when generating the tree image. Try manually :(@@1:')
                            if DEBUG:
                                import traceback, sys
                                traceback.print_exc(file=sys.stdout)

                    just_finished_lines.append("Finished %s in %d iteration(s)" %(
                            threadname, past_threads[configid]))
            if GLOBALS["email"]:
                if not pending_tasks:
                    all_lines = finished_lines + just_finished_lines + error_lines
                    send_mail(GLOBALS["email"], "Your NPR process has ended", '\n'.join(all_lines))

                elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                        GLOBALS["email_report_time"]:
                    all_lines = info_lines + error_lines + just_finished_lines
                    send_mail(GLOBALS["email"], "Your NPR report", '\n'.join(all_lines))
                    last_report_time = time()

                elif just_finished_lines:
                    send_mail(GLOBALS["email"], "Finished threads!",
                              '\n'.join(just_finished_lines))

            log.log(26, "")
    except:
        raise

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
Example #17
def get_concatenated_alg(alg_filenames, models=None,
                        sp_field=0, sp_delimiter="_",
                        kill_thr=0.0,
                        keep_species=set()):
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"]*len(alg_filenames)
    else:
        if len(models) != len(alg_filenames):
            raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and gets the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)

    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in six.iteritems(alg.id2seq):
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter, field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            elif lenseq is None:
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid) # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg) # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in six.iterkeys(sp2alg) \
                         if sp in keep_species or \
                         len(sp2alg[sp])/float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d species will be kept (missing factor threshold=%g, %d species forced to be kept)" %\
                 (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    def sort_single_algs(alg):
        # Deterministic key: model matrix first, then sorted sequence names
        return (str(alg.matrix), sorted(alg.id2name.values()))

    sorted_algs = sorted(alg_objects, key=sort_single_algs)
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]
    model2win = {}
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" %sp)
        for alg in sorted_algs:
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            #current_seq = concat.id2seq.get(sp, "")
            #concat.id2seq[sp] = current_seq + seq.strip()
            concat.id2name[sp] = sp
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    current_pos = 0
    partitions = []
    for model in sorted(model2size.keys()):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model+"_genes", \
                                       current_pos + 1,\
                                       current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in list(concat.id2seq.values())]
    if len(set(seq_sizes)) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of the concatenated alg is not what was expected")
    return concat, partitions, sp2alg, valid_species, concat_alg_lengths
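A compact, self-contained sketch of the same supermatrix idea using plain dicts: per-gene alignments are concatenated species by species, missing genes are padded with gaps, and RAxML-style partition lines ("MODEL, name = start-end") are emitted. All gene, model, and species names below are illustrative:

from collections import OrderedDict

def concatenate(gene_algs):
    # gene_algs: OrderedDict of {gene_name: (model, {species: aligned seq})}
    species = sorted({sp for _, alg in gene_algs.values() for sp in alg})
    concat = {sp: [] for sp in species}
    partitions, pos = [], 0
    for gene, (model, alg) in gene_algs.items():
        length = len(next(iter(alg.values())))
        for sp in species:
            concat[sp].append(alg.get(sp, "-" * length))
        partitions.append("%s, %s = %d-%d" % (model, gene, pos + 1, pos + length))
        pos += length
    return {sp: "".join(parts) for sp, parts in concat.items()}, partitions

genes = OrderedDict([
    ("geneA", ("JTT", {"sp1": "MKV", "sp2": "MRV"})),
    ("geneB", ("WAG", {"sp1": "AEL"})),  # sp2 missing: padded with '-'
])
alg, parts = concatenate(genes)
print(alg)    # {'sp1': 'MKVAEL', 'sp2': 'MRV---'}
print(parts)  # ['JTT, geneA = 1-3', 'WAG, geneB = 4-6']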
Example #18
def get_concatenated_alg(alg_filenames,
                         models=None,
                         sp_field=0,
                         sp_delimiter="_",
                         kill_thr=0.0,
                         keep_species=set()):
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"] * len(alg_filenames)
    else:
        if len(models) != len(alg_filenames):
            raise ValueError(
                "Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and gets the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)

    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in six.iteritems(alg.id2seq):
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name,
                                     splitter=sp_delimiter,
                                     field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception(
                    "Inconsistent alignment when concatenating: Unequal length"
                )
            elif lenseq is None:
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception(
                    "Inconsistent alignment when concatenating: Repeated species"
                )
            browsed_species.add(
                taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)  # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in six.iterkeys(sp2alg) \
                         if sp in keep_species or \
                         len(sp2alg[sp])/float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d species will be kept (missing factor threshold=%g, %d species forced to be kept)" %\
                 (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    def sort_single_algs(alg):
        # Deterministic key: model matrix first, then sorted sequence names
        return (str(alg.matrix), sorted(alg.id2name.values()))

    sorted_algs = sorted(alg_objects, key=sort_single_algs)
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]
    model2win = {}
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" % sp)
        for alg in sorted_algs:
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            #current_seq = concat.id2seq.get(sp, "")
            #concat.id2seq[sp] = current_seq + seq.strip()
            concat.id2name[sp] = sp
            concat.name2id[sp] = sp
            concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    current_pos = 0
    partitions = []
    for model in sorted(model2size.keys()):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model+"_genes", \
                                       current_pos + 1,\
                                       current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in list(concat.id2seq.values())]
    if len(set(seq_sizes)) != 1:
        raise Exception(
            "Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of the concatenated alg is not what was expected")
    return concat, partitions, sp2alg, valid_species, concat_alg_lengths