Example #1
0
    def run(self):
        """Execute ice_partial.py all|one|split|i|merge.

        Builds the worker object matching ``self.args.subCommand``, logs the
        command string reported by that worker and then runs it.

        Returns:
            0 on success; 1 if the sub-command is unknown or if constructing
            or running the worker raised an exception (the traceback is
            written to the log).
        """
        cmd = self.args.subCommand
        logging.info("Running {f} {cmd} v{v}.".format(f=op.basename(__file__),
                                                      cmd=cmd,
                                                      v=get_version()))
        # Stays empty until a worker has been built, so the failure message
        # below is still well-formed when construction itself fails.
        cmd_str = ""
        try:
            args = self.args
            if cmd == "all":
                sge_opts = SgeOptions(unique_id=args.unique_id,
                                      use_sge=args.use_sge,
                                      max_sge_jobs=args.max_sge_jobs,
                                      blasr_nproc=args.blasr_nproc)
                obj = IceAllPartials(
                    root_dir=args.root_dir,
                    fasta_filenames=args.fasta_filenames.split(','),
                    ref_fasta=args.ref_fasta,
                    out_pickle=args.out_pickle,
                    sge_opts=sge_opts,
                    sa_file=args.sa_file,
                    ccs_fofn=args.ccs_fofn)
            elif cmd == "one":
                # Only assign nfl reads in the given input_fasta file to isoforms
                obj = IcePartialOne(input_fasta=args.input_fasta,
                                    ref_fasta=args.ref_fasta,
                                    out_pickle=args.out_pickle,
                                    sa_file=args.sa_file,
                                    ccs_fofn=args.ccs_fofn,
                                    done_filename=args.done_filename,
                                    blasr_nproc=args.blasr_nproc,
                                    use_finer_qv=args.use_finer_qv)
            elif cmd == "split":
                obj = IcePartialSplit(root_dir=args.root_dir,
                                      nfl_fa=args.nfl_fa,
                                      N=args.N)
            elif cmd == "i":
                obj = IcePartialI(root_dir=args.root_dir,
                                  i=args.i,
                                  ccs_fofn=args.ccs_fofn,
                                  blasr_nproc=args.blasr_nproc)
            elif cmd == "merge":
                obj = IcePartialMerge(root_dir=args.root_dir, N=args.N)
            else:
                raise ValueError(
                    "Unknown command passed to {f}: {cmd}.".format(
                        f=op.basename(__file__), cmd=cmd))

            cmd_str = obj.cmd_str()
            logging.info("Running CMD: {cmd_str}".format(cmd_str=cmd_str))
            obj.run()
        except Exception:
            # Catch Exception instead of using a bare ``except:`` so that
            # KeyboardInterrupt and SystemExit propagate to the caller rather
            # than being logged and converted into return code 1.
            logging.exception("Exiting {cmd_str} with return code 1.".format(
                cmd_str=cmd_str))
            return 1
        return 0
Example #2
0
    def run(self):
        """Execute ice_partial.py all|one|split|i|merge.

        Builds the worker object matching ``self.args.subCommand``, logs the
        command string reported by that worker and then runs it.

        Returns:
            0 on success; 1 if the sub-command is unknown or if constructing
            or running the worker raised an exception (the traceback is
            written to the log).
        """
        cmd = self.args.subCommand
        logging.info("Running {f} {cmd} v{v}.".format(f=op.basename(__file__),
                                                      cmd=cmd, v=get_version()))
        # Stays empty until a worker has been built, so the failure message
        # below is still well-formed when construction itself fails.
        cmd_str = ""
        try:
            args = self.args
            if cmd == "all":
                sge_opts = SgeOptions(unique_id=args.unique_id,
                                      use_sge=args.use_sge,
                                      max_sge_jobs=args.max_sge_jobs,
                                      blasr_nproc=args.blasr_nproc)
                obj = IceAllPartials(root_dir=args.root_dir,
                                     fasta_filenames=args.fasta_filenames.split(','),
                                     ref_fasta=args.ref_fasta,
                                     out_pickle=args.out_pickle,
                                     sge_opts=sge_opts,
                                     sa_file=args.sa_file,
                                     ccs_fofn=args.ccs_fofn)
            elif cmd == "one":
                # Only assign nfl reads in the given input_fastq file to isoforms
                obj = IcePartialOne(input_fastq=args.input_fastq,
                                    ref_fasta=args.ref_fasta,
                                    out_pickle=args.out_pickle,
                                    sa_file=args.sa_file,
                                    ccs_fofn=args.ccs_fofn,
                                    done_filename=args.done_filename,
                                    blasr_nproc=args.blasr_nproc,
                                    use_finer_qv=args.use_finer_qv)
            elif cmd == "split":
                obj = IcePartialSplit(root_dir=args.root_dir,
                                      nfl_fa=args.nfl_fa,
                                      N=args.N)
            elif cmd == "i":
                obj = IcePartialI(root_dir=args.root_dir, i=args.i,
                                  ccs_fofn=args.ccs_fofn,
                                  blasr_nproc=args.blasr_nproc)
            elif cmd == "merge":
                obj = IcePartialMerge(root_dir=args.root_dir,
                                      N=args.N)
            else:
                raise ValueError("Unknown command passed to {f}: {cmd}.".
                                 format(f=op.basename(__file__), cmd=cmd))

            cmd_str = obj.cmd_str()
            logging.info("Running CMD: {cmd_str}".format(cmd_str=cmd_str))
            obj.run()
        except Exception:
            # Catch Exception instead of using a bare ``except:`` so that
            # KeyboardInterrupt and SystemExit propagate to the caller rather
            # than being logged and converted into return code 1.
            logging.exception("Exiting {cmd_str} with return code 1.".
                              format(cmd_str=cmd_str))
            return 1
        return 0
Example #3
0
class Polish(IceFiles):
    """Polish isoforms clusters using Quiver."""
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 hq_isoforms_fa=None,
                 hq_isoforms_fq=None,
                 lq_isoforms_fa=None,
                 lq_isoforms_fq=None,
                 fasta_fofn=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts --- options object; run() reads ice_opts.nfl_reads_per_split
        sge_opts --- options object; run() reads sge_opts.use_sge and hands
                     the object to IceAllPartials and IceQuiver
        hq_isoforms_fa|fq  --- polished, high quality consensus isoforms in fasta|q
        lq_isoforms_fa|fq  --- polished, low quality consensus isoforms in fasta|q
        fasta_fofn --- forwarded to IceFiles.__init__

        Raises ValueError if root_dir, nfl_fa or bas_fofn does not exist.
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)
        self.nfl_fa = realpath(nfl_fa)
        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts

        # Sub-step objects; populated by run().
        self.icep = None  # IceAllPartials.
        self.iceq = None  # IceQuiver
        self.icepq = None  # IcePostQuiver
        self._nfl_splitted_fas = None

        self._validate_inputs()

    def _validate_inputs(self):
        """
        Validate that root_dir, nfl_fa and bas_fofn exist.

        Every failed check is reported; the messages are joined with a
        single space, logged at ERROR level and raised as one ValueError.
        ccs_fofn is not checked here.
        """
        self.add_log("Validating inputs.")
        problems = []
        if not op.exists(self.root_dir):
            problems.append("Root dir {d} is not an existing directory!".
                            format(d=self.root_dir))
        if not op.exists(self.nfl_fa):
            problems.append("Failed to find non-full-length reads {f}!".
                            format(f=self.nfl_fa))
        if not op.exists(self.bas_fofn):
            problems.append("Failed to find bas fofn {f}!".
                            format(f=self.bas_fofn))
        if problems:
            # Report all missing inputs at once instead of only the last one.
            errMsg = " ".join(problems)
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def get_sa_file(self):
        """
        Return the suffix array of final_consensus_fa, building it if needed.

        Calls build_sa() only when final_consensus_sa does not exist yet.
        Returns the path final_consensus_sa if that file exists afterwards,
        otherwise None.
        """
        ret = True
        if not op.exists(self.final_consensus_sa):
            ret = build_sa(self.final_consensus_fa, self.final_consensus_sa)

        if ret is True:
            self.add_log(
                "Successfully created a suffix array for {f} at {sa}.".format(
                    f=self.final_consensus_fa, sa=self.final_consensus_sa))
        else:
            self.add_log("Failed to generate suffix array for {f}".format(
                f=self.final_consensus_fa),
                         level=logging.WARNING)
        # Trust the file system rather than build_sa's return value.
        if op.exists(self.final_consensus_sa):
            return self.final_consensus_sa
        return None

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None

        The three stages are carried out by IceAllPartials, IceQuiver and
        IcePostQuiver, which are stored on self.icep, self.iceq and
        self.icepq respectively.
        """
        # Create final.consensus.fa.sa
        self.add_log("Generating suffix array for {f}".format(
            f=self.final_consensus_sa),
                     level=logging.INFO)
        # get_sa_file() returns None when no suffix array exists on disk;
        # this value is what IceAllPartials receives below, so a missing
        # suffix array is never advertised as available.
        sa_file = self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files
        # written to self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   sa_file=sa_file,
                                   ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)
Example #4
0
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None

        The three stages are carried out by IceAllPartials, IceQuiver and
        IcePostQuiver, which are stored on self.icep, self.iceq and
        self.icepq respectively.
        """
        # Create final.consensus.fa.sa. Only the side effect is needed here;
        # whether the file is usable is decided right before it is consumed.
        self.add_log("Generating suffix array for {f}".format(
            f=self.final_consensus_sa),
                     level=logging.INFO)
        self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files
        # written to self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        # Offer the suffix array only if the suffix array file itself is on
        # disk; testing the fasta instead would hand over a path that may
        # not exist when suffix array generation failed.
        sa_file = self.final_consensus_sa \
            if op.exists(self.final_consensus_sa) else None
        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   sa_file=sa_file,
                                   ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)
Example #5
0
class Polish(IceFiles):

    """Polish isoforms clusters using Quiver."""

    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts, fasta_fofn=None,
                 nfl_reads_per_split=30000):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts --- options object; ece_penalty and ece_min_len are read
                     and logged here
        sge_opts --- options object; run() reads sge_opts.blasr_nproc and
                     sge_opts.use_sge and hands the object to
                     IceAllPartials and IceQuiver

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        fasta_fofn --- forwarded to IceFiles.__init__
        nfl_reads_per_split --- number of reads per chunk when run() splits
                                nfl_fa

        Raises ValueError if root_dir, nfl_fa or bas_fofn does not exist.
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        self.nfl_fa = realpath(nfl_fa)
        self.nfl_reads_per_split = nfl_reads_per_split
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        # Sub-step objects; populated by run().
        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()

    def validate_inputs(self):
        """
        Validate that root_dir, nfl_fa and bas_fofn exist.

        Every failed check is reported; the messages are joined with a
        single space, logged at ERROR level and raised as one ValueError.
        ccs_fofn is not checked here.
        """
        self.add_log("Validating inputs.")
        problems = []
        if not op.exists(self.root_dir):
            problems.append("Root dir {d} is not an existing directory!".
                            format(d=self.root_dir))
        if not op.exists(self.nfl_fa):
            problems.append("Failed to find non-full-length reads {f}!".
                            format(f=self.nfl_fa))
        if not op.exists(self.bas_fofn):
            problems.append("Failed to find bas fofn {f}!".
                            format(f=self.bas_fofn))
        if problems:
            # Report all missing inputs at once instead of only the last one.
            errMsg = " ".join(problems)
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def run(self):
        """
        Run the polishing workflow:

        1. Create the fasta fofn from bas_fofn unless it already exists.
        2. Split the non-full-length (nfl) fasta into chunks of
           nfl_reads_per_split reads.
        3. Make a dazz DB for final_consensus_fa.
        4. Run IceAllPartials on the chunks, with out_pickle set to
           self.nfl_all_pickle_fn.
        5. Run IceQuiver.
        6. Run IceQuiverPostprocess with self.ipq_opts.

        The sub-step objects are stored on self.icep, self.iceq and
        self.icepq.
        """
        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        if op.exists(self.fasta_fofn):
            # An existing fofn is reused as-is.
            self.add_log("No need to run convert_fofn_to_fasta.")
        else:
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir,
                                  cpus=self.sge_opts.blasr_nproc)

        # Split non-full-length reads into smaller fasta files
        # written to self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(self.final_consensus_fa, False)
        DalignerRunner.make_db(ref_obj.dazz_filename)
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            # TODO: since we are switching to daligner, just give sa_file
            # as None for now; remove it completely once daligner is mature.
            sa_file=None,
            ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log("IceQuiverPostprocess log: {f}.".
                     format(f=self.icepq.log_fn), level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #6
0
    def run(self):
        """
        Run the polishing workflow from start to finish.

        Steps, in order:
          1. Make sure a fasta fofn exists (converted from bas_fofn when
             the target file is not already there).
          2. Split the non-full-length (nfl) fasta into chunks of
             nfl_reads_per_split reads.
          3. Make a dazz DB for final_consensus_fa.
          4. Run IceAllPartials over the chunks, with out_pickle set to
             self.nfl_all_pickle_fn.
          5. Run IceQuiver.
          6. Run IceQuiverPostprocess with self.ipq_opts.

        The sub-step objects are kept on self.icep, self.iceq and
        self.icepq.
        """
        info = logging.INFO

        # --- Step 1: fasta fofn -------------------------------------------
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn", level=info)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        if not op.exists(self.fasta_fofn):
            convert_fofn_to_fasta(
                fofn_filename=self.bas_fofn,
                out_filename=self.fasta_fofn,
                fasta_out_dir=self.nfl_dir,
                cpus=self.sge_opts.blasr_nproc)
        else:
            self.add_log("No need to run convert_fofn_to_fasta.")

        # --- Step 2: split the nfl reads ----------------------------------
        # Chunks go to self.nfl_dir with the prefix "input.split".
        split_notice = (
            "Splitting {nfl} into smaller files each containing {n} reads."
            .format(nfl=self.nfl_fa, n=self.nfl_reads_per_split))
        self.add_log(split_notice, level=info)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        self.add_log(
            "Splitted files are: " + "\n".join(self._nfl_splitted_fas),
            level=info)

        # --- Step 3: dazz DB for final.consensus.fasta --------------------
        dazz_ref = DazzIDHandler(self.final_consensus_fa, False)
        DalignerRunner.make_db(dazz_ref.dazz_filename)
        self.add_log("Dazz DB made for: " + dazz_ref.dazz_filename,
                     level=info)

        # --- Step 4: assign nfl reads in every chunk ----------------------
        self.add_log("Initializing IceAllPartials.", level=info)
        # sa_file is passed as None because of the switch to daligner; the
        # argument is meant to be removed once daligner is mature (ToDo).
        partials = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=None,
            ccs_fofn=self.ccs_fofn)
        self.icep = partials
        self.add_log("IceAllPartials log: {f}.".format(f=partials.log_fn),
                     level=info)
        partials.run()
        self.add_log("IceAllPartials completed.", level=info)

        # --- Step 5: quiver -----------------------------------------------
        self.add_log("Initializing IceQuiver.", level=info)
        quiver = IceQuiver(
            root_dir=self.root_dir,
            bas_fofn=self.bas_fofn,
            fasta_fofn=self.fasta_fofn,
            sge_opts=self.sge_opts)
        self.iceq = quiver
        self.add_log("IceQuiver log: {f}.".format(f=quiver.log_fn),
                     level=info)
        quiver.run()
        self.add_log("IceQuiver finished.", level=info)

        # --- Step 6: post-process quiver output ---------------------------
        self.add_log("Initializing IceQuiverPostprocess.", level=info)
        postprocess = IceQuiverPostprocess(
            root_dir=self.root_dir,
            use_sge=self.sge_opts.use_sge,
            quit_if_not_done=False,
            ipq_opts=self.ipq_opts)
        self.icepq = postprocess
        self.add_log(
            "IceQuiverPostprocess log: {f}.".format(f=postprocess.log_fn),
            level=info)
        postprocess.run()
        self.add_log("IceQuiverPostprocess finished.", level=info)
Example #7
0
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None

        The three stages are carried out by IceAllPartials, IceQuiver and
        IcePostQuiver, which are stored on self.icep, self.iceq and
        self.icepq respectively.
        """
        # Create final.consensus.fa.sa. Only the side effect is needed here;
        # whether the file is usable is decided right before it is consumed.
        self.add_log("Generating suffix array for {f}".format(
                     f=self.final_consensus_sa), level=logging.INFO)
        self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files
        # written to self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        # Offer the suffix array only if the suffix array file itself is on
        # disk; testing the fasta instead would hand over a path that may
        # not exist when suffix array generation failed.
        sa_file = self.final_consensus_sa \
            if op.exists(self.final_consensus_sa) else None
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=sa_file,
            ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)
Example #8
0
class Polish(IceFiles):
    """Polish isoform clusters using Quiver.

    run() drives three stages in order: IceAllPartials (assign
    non-full-length reads to clusters), IceQuiver and IcePostQuiver.
    """
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, hq_isoforms_fa=None, hq_isoforms_fq=None,
                 lq_isoforms_fa=None, lq_isoforms_fq=None, fasta_fofn=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts --- options object; run() reads nfl_reads_per_split from it
        sge_opts --- options object; forwarded to IceAllPartials and
                     IceQuiver, and its use_sge is passed to IcePostQuiver
        hq_isoforms_fa|fq  --- polished, high quality consensus isoforms in fasta|q
        lq_isoforms_fa|fq  --- polished, low quality consensus isoforms in fasta|q
        fasta_fofn --- forwarded to IceFiles.__init__; when self.fasta_fofn
                       is None, run() defaults it to
                       <nfl_dir>/input.fasta.fofn

        Raises ValueError (via _validate_inputs) if root_dir, nfl_fa or
        bas_fofn does not exist.
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)
        self.nfl_fa = realpath(nfl_fa)
        self.hq_isoforms_fa = hq_isoforms_fa
        self.hq_isoforms_fq = hq_isoforms_fq
        self.lq_isoforms_fa = lq_isoforms_fa
        self.lq_isoforms_fq = lq_isoforms_fq
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts

        # Stage objects, populated by run().
        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IcePostQuiver
        self._nfl_splitted_fas = None

        self._validate_inputs()

    def _validate_inputs(self):
        """
        Validate that root_dir, nfl_fa and bas_fofn exist.

        Every failed check is reported (not only the last one); the
        combined message is logged at ERROR level and raised as a
        ValueError. ccs_fofn is not checked here.
        """
        self.add_log("Validating inputs.")
        errors = []
        if not op.exists(self.root_dir):
            errors.append("Root dir {d} is not an existing directory!".
                          format(d=self.root_dir))
        if not op.exists(self.nfl_fa):
            errors.append("Failed to find non-full-length reads {f}!".
                          format(f=self.nfl_fa))
        if not op.exists(self.bas_fofn):
            errors.append("Failed to find bas fofn {f}!".
                          format(f=self.bas_fofn))
        if errors:
            errMsg = " ".join(errors)
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def get_sa_file(self):
        """
        Return the suffix array file of final_consensus_fa, building it
        with build_sa if it does not exist yet.

        Returns self.final_consensus_sa when that file exists after the
        attempt, otherwise None.
        """
        # ret stays True when the suffix array is already on disk, so the
        # success message below is also logged in that case.
        ret = True
        if not op.exists(self.final_consensus_sa):
            ret = build_sa(self.final_consensus_fa, self.final_consensus_sa)

        if ret is True:
            self.add_log("Successfully created a suffix array for {f} at {sa}.".
                         format(f=self.final_consensus_fa,
                                sa=self.final_consensus_sa))
        else:
            self.add_log("Failed to generate suffix array for {f}".format(
                f=self.final_consensus_fa), level=logging.WARNING)

        # Trust the file system rather than build_sa's return value.
        if op.exists(self.final_consensus_sa):
            return self.final_consensus_sa
        return None

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samtoh5, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        # Create final.consensus.fa.sa; sa_file is None when no suffix
        # array could be produced.
        self.add_log("Generating suffix array for {f}".format(
            f=self.final_consensus_fa), level=logging.INFO)
        sa_file = self.get_sa_file()

        # Create input.fasta.fofn from bas_fofn
        self.add_log("Creating fasta fofn from bas/bax.h5 fofn",
                     level=logging.INFO)
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir)

        # Split non-full-length reads into smaller fasta files written
        # to self.nfl_dir with the prefix "input.split".
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta. Reuse the sa_file
        # returned by get_sa_file() above, so that a path to a suffix
        # array that was never created is not handed to IceAllPartials.
        self.add_log("IceAllPartials initiated.", level=logging.INFO)
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=sa_file,
            ccs_fofn=self.ccs_fofn)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("IceQuiver initiated.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("IcePostQuiver initiated.", level=logging.INFO)
        self.icepq = IcePostQuiver(root_dir=self.root_dir,
                                   hq_isoforms_fa=self.hq_isoforms_fa,
                                   hq_isoforms_fq=self.hq_isoforms_fq,
                                   lq_isoforms_fa=self.lq_isoforms_fa,
                                   lq_isoforms_fq=self.lq_isoforms_fq,
                                   use_sge=self.sge_opts.use_sge,
                                   quit_if_not_done=False)
        self.icepq.run()
        self.add_log("IcePostQuiver finished.", level=logging.INFO)