Esempio n. 1
0
    def create_raw_fas_for_clusters_in_bin(self, cids, d, uc, partial_uc):
        """
        Create raw subreads fasta files for clusters in cids.
        For each cluster k in cids,
        * Collect raw subreads of zmws associated with cluster k
          in either uc or partial_uc.

        cids --- cluster ids
        d --- MetaSubreadsFastaReader
        uc --- uc[k] returns fl ccs reads associated with cluster k
        partial_uc --- partial_uc[k] returns nfl ccs reads associated with cluster k

        (Liz) for Quiver, subsample down to max 100 (with uc having priority over partial_uc)
        """
        for k in cids:  # for each cluster k

            # $root_dir/tmp/?/c{k}/in.raw_with_partial.fa
            raw_fa = self.raw_fa_of_cluster(k)

            # Copy uc[k] before augmenting: the previous code bound
            # `in_seqids = uc[k]` and then used `in_seqids += ...`, which
            # appended nfl ids into the caller's uc[k] in place.
            in_seqids = list(uc[k])
            if len(in_seqids) > 100:
                # Plenty of fl reads: subsample fl reads only.
                in_seqids = random.sample(in_seqids, 100)
            else:
                # Top up with nfl reads, never exceeding 100 total.
                n_extra = min(len(partial_uc[k]), 100 - len(in_seqids))
                in_seqids += random.sample(partial_uc[k], n_extra)

            # write cluster k's associated raw subreads to raw_fa
            write_in_raw_fasta(input_fasta_d=d,
                               in_seqids=in_seqids,
                               out_fa=raw_fa,
                               ignore_keyerror=True)
Esempio n. 2
0
    def create_raw_fas_for_clusters_in_bin(self, cids, d, uc, partial_uc):
        """
        Create raw subreads fasta files for clusters in cids.
        For each cluster k in cids,
        * Collect raw subreads of zmws associated with cluster k
          in either uc or partial_uc.

        cids --- cluster ids
        d --- MetaSubreadsFastaReader
        uc --- uc[k] returns fl ccs reads associated with cluster k
        partial_uc --- partial_uc[k] returns nfl ccs reads associated with cluster k

        (Liz) for Quiver, subsample down to max 100 (with uc having priority over partial_uc)
        """
        for k in cids:  # for each cluster k

            # $root_dir/tmp/?/c{k}/in.raw_with_partial.fa
            raw_fa = self.raw_fa_of_cluster(k)

            # Copy uc[k] before augmenting: binding `in_seqids = uc[k]` and
            # then running `in_seqids += ...` mutated the caller's uc[k]
            # in place (nfl ids leaked into the fl cluster map).
            in_seqids = list(uc[k])
            if len(in_seqids) > 100:
                # Enough fl reads on their own: subsample down to 100.
                in_seqids = random.sample(in_seqids, 100)
            else:
                # Fill the remaining quota from nfl reads.
                n_extra = min(len(partial_uc[k]), 100 - len(in_seqids))
                in_seqids += random.sample(partial_uc[k], n_extra)

            # write cluster k's associated raw subreads to raw_fa
            write_in_raw_fasta(input_fasta_d=d,
                               in_seqids=in_seqids,
                               out_fa=raw_fa,
                               ignore_keyerror=True)
Esempio n. 3
0
    def create_raw_fas_for_clusters_in_bin(self, cids, d, uc, partial_uc):
        """
        Write one raw-subread fasta file per cluster id in cids.

        For each cluster k, gather the raw subreads of zmws associated
        with k in either uc or partial_uc and dump them to the cluster's
        fasta file.

        cids --- cluster ids
        d --- MetaSubreadsFastaReader
        uc --- uc[k] returns fl ccs reads associated with cluster k
        partial_uc --- partial_uc[k] returns nfl ccs reads associated with cluster k

        """
        for cid in cids:
            # Destination: $root_dir/tmp/?/c{cid}/in.raw_with_partial.fa
            out_path = self.raw_fa_of_cluster(cid)
            # Both fl and nfl read ids of this cluster go into one fasta;
            # missing ids are tolerated (ignore_keyerror=True).
            seq_ids = uc[cid] + partial_uc[cid]
            write_in_raw_fasta(input_fasta_d=d,
                               in_seqids=seq_ids,
                               out_fa=out_path,
                               ignore_keyerror=True)
Esempio n. 4
0
    def submit_quiver_jobs(self, d, uc, partial_uc, refs, keys, start, end,
                           submitted, todo, use_sge, max_sge_jobs,
                           quiver_nproc):
        """Call quiver to polish consensus.
        (1) for each cluster k, obtain unrolled sequences of all reads (zmws)
            belonging to this cluster, and save in raw_fa_of_cluster(k)
        (2) for each cluster k, call blasr to align raw_f_of_cluster to
            consensus sequence of the cluster and create sam_of_cluster.

        (3) Put every 100 clusters into one big bin, and then
            merge all sam_of_cluster files to sam_of_quivered_bin
        (4) Prepare commands including
                samtoh5, loadPulses, cmph5tools.py ...
            in order to convert sam_of_quivered_bin to cmph5_of_quivered_bin.
                * Either write these command to script_of_quivered_bin and qsub
                  all jobs later when scripts of all quivered bins are done,
                * Or execute the commands immediately.

        Side effects: appends (job_id or "local", script_path) tuples to
        `submitted`; appends batch-script paths to `todo`.
        Raises RuntimeError if a local run or a qsub submission fails.
        """
        # Process clusters in bins of 100 (xrange => Python 2 code base).
        for i in xrange(start, end, 100):
            for k in keys[i:min(end, i + 100)]:
                #os.chdir(op.join('./tmp', str(k/10000), 'c'+str(k)))
                raw_fa = self.raw_fa_of_cluster(k)

                # write_in_raw_fa return movies of reads in partial_uc
                # logging.debug("uc[k]={0}".format(uc[k]))
                # logging.debug("partial_uc[k]={0}".format(uc[k]))
                # Dump all fl + nfl subreads of cluster k to raw_fa.
                write_in_raw_fasta(input_fasta_d=d,
                                   in_seqids=uc[k] + partial_uc[k],
                                   out_fa=raw_fa,
                                   ignore_keyerror=True)

                #TODO: use multi-processing pool, reduce nproc
                # Align raw subreads to this cluster's consensus; the SAM
                # is consumed later by the quiver batch script.
                blasr_sam_for_quiver(input_fasta=raw_fa,
                                     ref_fasta=refs[k],
                                     out_sam_filename=self.sam_of_cluster(k),
                                     run_cmd=True)

            # One quiver batch script covers this whole bin of <=100 clusters.
            fname = self.setup_quiver_for_batch(cids=keys[i:min(end, i + 100)],
                                                refs=refs,
                                                quiver_nproc=quiver_nproc,
                                                return_script=True)
            todo.append(fname)

            if use_sge is not True or \
               max_sge_jobs == 0: # don't use SGE
                # Run every pending batch script locally via bash,
                # redirecting stdout/stderr to per-job .olog/.elog files.
                for job in todo:
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    msg = "Running quiver job locally: {j} ".format(j=job) + \
                          "1>{olog} 2>{elog}".format(olog=olog, elog=elog)
                    self.add_log(msg)
                    cmd = "bash " + job + " 1>{olog} 2>{elog}".\
                          format(olog=olog, elog=elog)
                    # backticks runs the shell command, returning
                    # (stdout, exit_code, message).
                    _out, _code, _msg = backticks(cmd)
                    if _code != 0:
                        errMsg = "Failed to run quiver {j}".format(
                            j=job) + _msg
                        self.add_log(errMsg, level=logging.ERROR)
                        raise RuntimeError(errMsg)
                    submitted.append(("local", job))
                # NOTE(review): rebinds the local name only -- the caller's
                # `todo` list is NOT cleared here; confirm this is intended.
                todo = []
            else:
                # Submit pending scripts to SGE in slices of max_sge_jobs.
                # NOTE(review): there is no wait between slices, so every
                # job is submitted in one pass -- verify the intended
                # throttling semantics of max_sge_jobs.
                while len(todo) > 0:
                    n = min(max_sge_jobs, len(todo))
                    for job in todo[:n]:
                        # ex: Your job 8613116 ("c20to70.sh") has been submitted
                        elog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".elog")
                        olog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".olog")
                        qsub_cmd = "qsub " + \
                                   "-pe smp {n} ".format(n=quiver_nproc) + \
                                   "-cwd -S /bin/bash -V " + \
                                   "-e {elog} ".format(elog=elog) + \
                                   "-o {olog} ".format(olog=olog) + \
                                   "{job}".format(job=job)
                        msg = "Submitting CMD: {cmd}.\n".format(cmd=qsub_cmd)
                        self.add_log(msg)
                        _out, _code, _msg = backticks(qsub_cmd)
                        if _code != 0:
                            errMsg = "Failed to submit CMD {cmd}.".format(
                                cmd=qsub_cmd)
                            self.add_log(errMsg, level=logging.ERROR)
                            raise RuntimeError(errMsg)

                        # Parse the SGE job id (third token) out of qsub's
                        # reply line shown in the example above.
                        job_id = str(_out).split()[2]
                        submitted.append((job_id, job))
                        todo.remove(job)
Esempio n. 5
0
    def submit_quiver_jobs(self, d, uc, partial_uc, refs, keys, start, end,
                           submitted, todo,
                           use_sge, max_sge_jobs, quiver_nproc):
        """Call quiver to polish consensus.
        (1) for each cluster k, obtain unrolled sequences of all reads (zmws)
            belonging to this cluster, and save in raw_fa_of_cluster(k)
        (2) for each cluster k, call blasr to align raw_f_of_cluster to
            consensus sequence of the cluster and create sam_of_cluster.

        (3) Put every 100 clusters into one big bin, and then
            merge all sam_of_cluster files to sam_of_quivered_bin
        (4) Prepare commands including
                samtoh5, loadPulses, cmph5tools.py ...
            in order to convert sam_of_quivered_bin to cmph5_of_quivered_bin.
                * Either write these command to script_of_quivered_bin and qsub
                  all jobs later when scripts of all quivered bins are done,
                * Or execute the commands immediately.

        Side effects: appends (job_id or "local", script_path) tuples to
        `submitted`; appends batch-script paths to `todo`.
        Raises RuntimeError if a local run or a qsub submission fails.
        """
        # Process clusters in bins of 100 (xrange => Python 2 code base).
        for i in xrange(start, end, 100):
            for k in keys[i: min(end, i+100)]:
                #os.chdir(op.join('./tmp', str(k/10000), 'c'+str(k)))
                raw_fa = self.raw_fa_of_cluster(k)

                # write_in_raw_fa return movies of reads in partial_uc
                # logging.debug("uc[k]={0}".format(uc[k]))
                # logging.debug("partial_uc[k]={0}".format(uc[k]))
                # Dump all fl + nfl subreads of cluster k to raw_fa.
                write_in_raw_fasta(input_fasta_d=d,
                    in_seqids=uc[k] + partial_uc[k],
                    out_fa=raw_fa,
                    ignore_keyerror=True)

                #TODO: use multi-processing pool, reduce nproc
                # Align raw subreads to this cluster's consensus; the SAM
                # is consumed later by the quiver batch script.
                blasr_sam_for_quiver(
                    input_fasta=raw_fa,
                    ref_fasta=refs[k],
                    out_sam_filename=self.sam_of_cluster(k),
                    run_cmd=True)

            # One quiver batch script covers this whole bin of <=100 clusters.
            fname = self.setup_quiver_for_batch(cids=keys[i: min(end, i+100)],
                        refs=refs, quiver_nproc=quiver_nproc,
                        return_script=True)
            todo.append(fname)

            if use_sge is not True or \
               max_sge_jobs == 0: # don't use SGE
                # Run every pending batch script locally via bash,
                # redirecting stdout/stderr to per-job .olog/.elog files.
                for job in todo:
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    msg = "Running quiver job locally: {j} ".format(j=job) + \
                          "1>{olog} 2>{elog}".format(olog=olog, elog=elog)
                    self.add_log(msg)
                    cmd = "bash " + job + " 1>{olog} 2>{elog}".\
                          format(olog=olog, elog=elog)
                    # backticks runs the shell command, returning
                    # (stdout, exit_code, message).
                    _out, _code, _msg = backticks(cmd)
                    if _code != 0:
                        errMsg = "Failed to run quiver {j}".format(j=job) + _msg
                        self.add_log(errMsg, level=logging.ERROR)
                        raise RuntimeError(errMsg)
                    submitted.append(("local", job))
                # NOTE(review): rebinds the local name only -- the caller's
                # `todo` list is NOT cleared here; confirm this is intended.
                todo = []
            else:
                # Submit pending scripts to SGE in slices of max_sge_jobs.
                # NOTE(review): there is no wait between slices, so every
                # job is submitted in one pass -- verify the intended
                # throttling semantics of max_sge_jobs.
                while len(todo) > 0:
                    n = min(max_sge_jobs, len(todo))
                    for job in todo[:n]:
                        # ex: Your job 8613116 ("c20to70.sh") has been submitted
                        elog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".elog")
                        olog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".olog")
                        qsub_cmd = "qsub " + \
                                   "-pe smp {n} ".format(n=quiver_nproc) + \
                                   "-cwd -S /bin/bash -V " + \
                                   "-e {elog} ".format(elog=elog) + \
                                   "-o {olog} ".format(olog=olog) + \
                                   "{job}".format(job=job)
                        msg = "Submitting CMD: {cmd}.\n".format(cmd=qsub_cmd)
                        self.add_log(msg)
                        _out, _code, _msg = backticks(qsub_cmd)
                        if _code != 0:
                            errMsg = "Failed to submit CMD {cmd}.".format(
                                    cmd=qsub_cmd)
                            self.add_log(errMsg, level=logging.ERROR)
                            raise RuntimeError(errMsg)

                        # Parse the SGE job id (third token) out of qsub's
                        # reply line shown in the example above.
                        job_id = str(_out).split()[2]
                        submitted.append((job_id, job))
                        todo.remove(job)