Ejemplo n.º 1
0
    def _validate_inputs(self, root_dir, N):
        """
        Check inputs, return
        (splitted_pickles, out_pickle)

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        N --- number of chunks; chunk indices 0 .. N-1 are checked

        splitted_pickles --- list of the N per-chunk pickle paths
        out_pickle --- icef.nfl_all_pickle_fn, the merged pickle path

        Raises ValueError if any per-chunk DONE file or pickle file does
        not exist. Every missing file is named in the message (DONE files
        first, then pickles); with a single missing file the message is
        just that one sentence.
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Check if inputs exist. Accumulate the problems instead of
        # overwriting a single message, so that one missing file cannot
        # hide another one.
        errMsgs = ["DONE file {f} does not exist.".format(f=done)
                   for done in dones if not nfs_exists(done)]
        # The loop variable is deliberately not called `pickle`, to avoid
        # shadowing the standard library module of that name.
        errMsgs.extend("Pickle file {f} does not exist.".format(f=pickle_fn)
                       for pickle_fn in splitted_pickles
                       if not nfs_exists(pickle_fn))

        if errMsgs:
            raise ValueError(" ".join(errMsgs))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
Ejemplo n.º 2
0
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 hq_isoforms_fa=None,
                 hq_isoforms_fq=None,
                 lq_isoforms_fa=None,
                 lq_isoforms_fq=None,
                 fasta_fofn=None):
        """
        Store the polishing parameters and validate the inputs.

        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
                   stored as realpath(nfl_fa)
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts, sge_opts --- option objects, stored unchanged
        hq_isoforms_fa|fq --- polished, high quality consensus isoforms
                              in fasta|q
        lq_isoforms_fa|fq --- polished, low quality consensus isoforms
                              in fasta|q
        fasta_fofn --- forwarded to IceFiles.__init__
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        # Input reads.
        self.nfl_fa = realpath(nfl_fa)

        # Output isoform files, high quality then low quality.
        self.hq_isoforms_fa, self.hq_isoforms_fq = hq_isoforms_fa, hq_isoforms_fq
        self.lq_isoforms_fa, self.lq_isoforms_fq = lq_isoforms_fa, lq_isoforms_fq

        # Run options.
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts

        # Placeholders: icep is for IceAllPartials, iceq for IceQuiver and
        # icepq for IcePostQuiver.
        self.icep = self.iceq = self.icepq = None
        self._nfl_splitted_fas = None

        self._validate_inputs()
Ejemplo n.º 3
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        nfl_fa --- the non-full-length reads fasta that is to be split
        N --- number of chunks

        Note that mkdir(nfl_dir) is called before nfl_fa is checked.

        Raises ValueError if nfl_fa does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_split", root_dir=root_dir, no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(nfl_dir)

        # Check if inputs exist.
        if not nfs_exists(nfl_fa):
            raise ValueError("The input non-full-length reads fasta file " +
                             "{f} does not exists. ".format(f=nfl_fa))

        num_reads = num_reads_in_fasta(nfl_fa)
        # Integer ceiling division. With Python 2 semantics `/` on two ints
        # already floors (unless true division is enabled), which made the
        # former ceil() call a no-op and could leave reads that do not fit
        # into N chunks. -(-a // b) rounds up under both Python 2 and 3.
        reads_per_split = int(max(1, -(-num_reads // N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 4
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        nfl_fa --- the non-full-length reads fasta that is to be split
        N --- number of chunks

        Note that mkdir(nfl_dir) is called before nfl_fa is checked.

        Raises ValueError if nfl_fa does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(nfl_dir)

        # Check if inputs exist.
        if not nfs_exists(nfl_fa):
            raise ValueError("The input non-full-length reads fasta file " +
                             "{f} does not exists. ".format(f=nfl_fa))

        num_reads = num_reads_in_fasta(nfl_fa)
        # Round the chunk size up using integer arithmetic only. Under
        # Python 2 integer division, ceil(num_reads / N) received an
        # already-floored value, so the chunk size was rounded down and N
        # chunks could be too small to hold every read.
        reads_per_split = int(max(1, -(-num_reads // N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 5
0
    def _validate_inputs(self, root_dir, N):
        """
        Check inputs, return
        (splitted_pickles, out_pickle)

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        N --- number of chunks; chunk indices 0 .. N-1 are checked

        splitted_pickles --- list of the N per-chunk pickle paths
        out_pickle --- icef.nfl_all_pickle_fn, the merged pickle path

        Raises ValueError if any per-chunk DONE file or pickle file does
        not exist. All missing files are listed in the message (DONE files
        first, then pickles); if exactly one file is missing the message
        is that single sentence.
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir,
                        no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Check if inputs exist. Each problem is appended rather than
        # assigned, so a later missing file no longer overwrites the
        # report of an earlier one.
        errMsgs = []
        for done in dones:
            if not nfs_exists(done):
                errMsgs.append("DONE file {f} does not exist.".format(f=done))
        # `pickle_fn` rather than `pickle`: do not shadow the stdlib module.
        for pickle_fn in splitted_pickles:
            if not nfs_exists(pickle_fn):
                errMsgs.append(
                    "Pickle file {f} does not exist.".format(f=pickle_fn))

        if errMsgs:
            raise ValueError(" ".join(errMsgs))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
Ejemplo n.º 6
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
        """
        Check inputs, write ice_partial.py i command to script_file
        and return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
        for the i-th chunk of nfl reads.

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        i --- index of the chunk of nfl reads
        ccs_fofn, blasr_nproc --- forwarded to self._cmd_str when the
                                  command line is built

        Raises ValueError naming the first missing input, checked in the
        order: chunk fasta, consensus fasta, suffix array.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/final.consensus.fa
        ref_fasta = icef.final_consensus_fa
        sa_file = icef.final_consensus_sa

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        input_fasta = icef.nfl_fa_i(i)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)

        # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(input_fasta):
            errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                      "fasta file {f} does not exist. ".format(f=input_fasta) +
                      "Please run ice_partial_split.py first.")
        elif not nfs_exists(ref_fasta):
            errMsg = ("The unpolished consensus isoforms fasta file " +
                      "{f} does not exist. ".format(f=ref_fasta) +
                      "Please make sure ICE is successfully done.")
        elif not nfs_exists(sa_file):
            # The placeholder has to be filled in explicitly; without the
            # format() call the message showed a literal "{f}" instead of
            # the path of the missing suffix array.
            errMsg = ("The suffix array of unpolished consensus isoforms " +
                      "(i.e., final_consensus_sa) {f} does not exist."
                      .format(f=sa_file))
        if errMsg:
            raise ValueError(errMsg)

        # Save cmd to script_file.
        cmd = self._cmd_str(root_dir=root_dir, i=i,
                            ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc)
        with open(script_file, 'w') as writer:
            writer.write(cmd + "\n")

        icef.add_log("Writing CMD to: {script_file}".
                     format(script_file=script_file))
        icef.close_log()

        return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
Ejemplo n.º 7
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
        """
        Validate the inputs of the i-th chunk of nfl reads, write the
        ice_partial.py i command to the chunk's script file, and return
        (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file).

        root_dir --- ICE root directory, passed to IceFiles to derive paths
        i --- index of the chunk of nfl reads
        ccs_fofn, blasr_nproc --- forwarded to self._cmd_str

        Raises ValueError for the first missing input, checked in the
        order: chunk fasta, consensus fasta, dazz db.
        """
        files = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                         root_dir=root_dir,
                         no_log_f=False)

        # Reference side: root_dir/output/final.consensus.fa and its dazz
        # db. (The suffix array, final_consensus_sa, is not used here.)
        consensus_fa = files.final_consensus_fa
        dazz_db = files.final_dazz_db

        # Chunk side:
        #   root_dir/output/map_noFL/input.split_{0:02d}.fa
        #   root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        #   root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        #   root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
        chunk_fa = files.nfl_fa_i(i)
        chunk_pickle = files.nfl_pickle_i(i)
        chunk_done = files.nfl_done_i(i)
        chunk_script = files.nfl_script_i(i)

        # Required files, in the order they are checked, each paired with
        # the complaint raised if it is missing. Only the first missing
        # file is reported.
        required = (
            (chunk_fa,
             ("The {i}-th splitted non-full-length reads "
              "fasta file {f} does not exist. "
              "Please run ice_partial_split.py first.").format(i=i,
                                                               f=chunk_fa)),
            (consensus_fa,
             ("The unpolished consensus isoforms fasta file "
              "{f} does not exist. "
              "Please make sure ICE is successfully done.").format(
                  f=consensus_fa)),
            (dazz_db,
             ("The dazz db "
              "{f} does not exist. "
              "Please make sure it is already built.").format(f=dazz_db)),
        )
        for path, complaint in required:
            if not nfs_exists(path):
                raise ValueError(complaint)

        # Save the command line to the chunk's script file.
        command = self._cmd_str(root_dir=root_dir,
                                i=i,
                                ccs_fofn=ccs_fofn,
                                blasr_nproc=blasr_nproc)
        with open(chunk_script, 'w') as script:
            script.write(command + "\n")

        files.add_log(
            "Writing CMD to: {script_file}".format(script_file=chunk_script))
        files.close_log()

        return (chunk_fa, consensus_fa, dazz_db, chunk_pickle, chunk_done)
Ejemplo n.º 8
0
    def __init__(self, root_dir, fasta_filenames, ref_fasta,
                 out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
        """
        Validate the inputs, store the settings and create the directory
        used for mapping noFL reads.

        root_dir --- ICE root output directory
        fasta_filenames --- a list of splitted nfl fasta files.
        ref_fasta --- (unpolished) consensus isoforms
        out_pickle --- a pickle file with all nfl fasta reads
        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of gcon jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!
            blasr_nproc: blasr -nproc param, number of threads per cpu.
            gcon_nproc : number of gcon that can run at the same time
        sa_file --- passed through self._validateInputs together with
                    the other inputs
        ccs_fofn --- should be reads_of_insert.fofn or None
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

        # The validator hands back the four values to store, in this order.
        validated = self._validateInputs(fasta_filenames=fasta_filenames,
                                         ref_fasta=ref_fasta,
                                         ccs_fofn=ccs_fofn,
                                         sa_file=sa_file)
        (self.fasta_filenames,
         self.ref_fasta,
         self.ccs_fofn,
         self.sa_file) = validated

        self.out_pickle = out_pickle
        self.sge_opts = sge_opts

        # Announce the directory, then create it.
        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        # Record which files this run reads and writes.
        fasta_list = ", ".join(self.fasta_filenames)
        self.add_log("input fasta files are: " + fasta_list)
        pickle_list = ", ".join(self.pickle_filenames)
        self.add_log("temp pickle files are: " + pickle_list)
        self.add_log("out pickle file is: " + self.out_pickle)
0
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts,
                 fasta_fofn=None, nfl_reads_per_split=30000):
        """
        Store the polishing parameters and validate the inputs.

        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
                   stored as realpath(nfl_fa)
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts, sge_opts --- option objects, stored unchanged

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q

        fasta_fofn --- forwarded to IceFiles.__init__
        nfl_reads_per_split --- stored unchanged, default 30000
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        # Input reads and the split size.
        self.nfl_fa = realpath(nfl_fa)
        self.nfl_reads_per_split = nfl_reads_per_split

        # Option objects.
        self.ice_opts, self.sge_opts, self.ipq_opts = ice_opts, sge_opts, ipq_opts

        # Placeholders: icep is for IceAllPartials, iceq for IceQuiver and
        # icepq for IceQuiverPostprocess.
        self.icep = self.iceq = self.icepq = None
        self._nfl_splitted_fas = None

        self.validate_inputs()
Ejemplo n.º 10
0
    def __init__(self,
                 root_dir,
                 fastq_filenames,
                 ref_fasta,
                 out_pickle,
                 sge_opts,
                 sa_file=None,
                 ccs_fofn=None):
        """
        Validate the inputs, store the settings and create the directory
        used for mapping noFL reads.

        root_dir --- ICE root output directory
        fastq_filenames --- a list of splitted nfl fastq files.
        ref_fasta --- (unpolished) consensus isoforms
        out_pickle --- a pickle file with all nfl fasta reads
        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!
            blasr_nproc: blasr -nproc param, number of threads per cpu.
        sa_file --- passed through self._validate_inputs together with
                    the other inputs
        ccs_fofn --- should be reads_of_insert.fofn or None
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

        # Log the incoming ccs_fofn before it is validated.
        debug_note = "DEBUG: in IceAllPartials, ccs_fofn is {0}.".format(ccs_fofn)
        self.add_log(debug_note, level=logging.INFO)

        # The validator hands back the four values to store, in this order.
        validated = self._validate_inputs(fastq_filenames=fastq_filenames,
                                          ref_fasta=ref_fasta,
                                          ccs_fofn=ccs_fofn,
                                          sa_file=sa_file)
        (self.fastq_filenames,
         self.ref_fasta,
         self.ccs_fofn,
         self.sa_file) = validated

        self.out_pickle = out_pickle
        self.sge_opts = sge_opts

        # Announce the directory, then create it.
        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        # Record which files this run reads and writes.
        fastq_list = ", ".join(self.fastq_filenames)
        self.add_log("input fasta files are: " + fastq_list)
        pickle_list = ", ".join(self.pickle_filenames)
        self.add_log("temp pickle files are: " + pickle_list)
        self.add_log("out pickle file is: " + self.out_pickle)
Ejemplo n.º 11
0
    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts,
                 hq_isoforms_fa=None, hq_isoforms_fq=None,
                 lq_isoforms_fa=None, lq_isoforms_fq=None,
                 fasta_fofn=None):
        """
        Store the polishing parameters and validate the inputs.

        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
                   stored as realpath(nfl_fa)
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts, sge_opts --- option objects, stored unchanged
        hq_isoforms_fa|fq --- polished, high quality consensus isoforms
                              in fasta|q
        lq_isoforms_fa|fq --- polished, low quality consensus isoforms
                              in fasta|q
        fasta_fofn --- forwarded to IceFiles.__init__
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        # Input reads.
        self.nfl_fa = realpath(nfl_fa)

        # Output isoform files, high quality then low quality.
        self.hq_isoforms_fa, self.hq_isoforms_fq = hq_isoforms_fa, hq_isoforms_fq
        self.lq_isoforms_fa, self.lq_isoforms_fq = lq_isoforms_fa, lq_isoforms_fq

        # Option objects.
        self.ice_opts, self.sge_opts = ice_opts, sge_opts

        # Placeholders: icep is for IceAllPartials, iceq for IceQuiver and
        # icepq for IcePostQuiver.
        self.icep = self.iceq = self.icepq = None
        self._nfl_splitted_fas = None

        self._validate_inputs()
Ejemplo n.º 12
0
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 ipq_opts,
                 fasta_fofn=None,
                 nfl_reads_per_split=30000):
        """
        Store the polishing parameters, log the ece settings and validate
        the inputs.

        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
                   stored as realpath(nfl_fa)
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
        ice_opts --- option object; its ece_penalty and ece_min_len
                     attributes are written to the log
        sge_opts --- option object, stored unchanged

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q

        fasta_fofn --- forwarded to IceFiles.__init__
        nfl_reads_per_split --- stored unchanged, default 30000
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn)

        # Input reads and the split size.
        self.nfl_fa = realpath(nfl_fa)
        self.nfl_reads_per_split = nfl_reads_per_split

        # Option objects.
        self.ice_opts, self.sge_opts, self.ipq_opts = ice_opts, sge_opts, ipq_opts

        # Write the ece settings in effect to the log.
        ece_note = "ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len)
        self.add_log(ece_note)

        # Placeholders: icep is for IceAllPartials, iceq for IceQuiver and
        # icepq for IceQuiverPostprocess.
        self.icep = self.iceq = self.icepq = None
        self._nfl_splitted_fas = None

        self.validate_inputs()