Ejemplo n.º 1
0
    def _validate_inputs(self, root_dir, N):
        """
        Check that the per-chunk DONE files and pickles of all N chunks
        exist, return
        (splitted_pickles, out_pickle)

        root_dir -- root directory handed to IceFiles.
        N -- number of chunks; chunk indices are 0 ... N-1.

        splitted_pickles is the list of the N per-chunk pickle paths and
        out_pickle is icef.nfl_all_pickle_fn.

        Raises ValueError naming every missing DONE file and pickle file,
        one message per line.
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Check if inputs exist. Keep one message per missing file so that
        # the error names all of them rather than only the last one found.
        errMsgs = ["DONE file {f} does not exist.".format(f=done)
                   for done in dones if not nfs_exists(done)]
        errMsgs.extend("Pickle file {f} does not exist.".format(f=pickle_fn)
                       for pickle_fn in splitted_pickles
                       if not nfs_exists(pickle_fn))

        if errMsgs:
            raise ValueError("\n".join(errMsgs))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
Ejemplo n.º 2
0
    def _validate_inputs(self, root_dir, N):
        """
        Check that the per-chunk DONE files and pickles of all N chunks
        exist, return
        (splitted_pickles, out_pickle)

        root_dir -- root directory handed to IceFiles.
        N -- number of chunks; chunk indices are 0 ... N-1.

        splitted_pickles is the list of the N per-chunk pickle paths and
        out_pickle is icef.nfl_all_pickle_fn.

        Raises ValueError naming every missing DONE file and pickle file,
        one message per line.
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir,
                        no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Check if inputs exist. Every missing file gets its own message;
        # previously each new message overwrote the one before it, so only
        # the last missing file was ever reported.
        errMsgs = []
        for done in dones:
            if not nfs_exists(done):
                errMsgs.append(
                    "DONE file {f} does not exist.".format(f=done))
        for pickle_fn in splitted_pickles:
            if not nfs_exists(pickle_fn):
                errMsgs.append(
                    "Pickle file {f} does not exist.".format(f=pickle_fn))

        if errMsgs:
            raise ValueError("\n".join(errMsgs))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
Ejemplo n.º 3
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
        """
        Check inputs, write ice_partial.py i command to script_file
        and return (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file)
        for the i-th chunk of nfl reads.

        Raises ValueError if the i-th splitted fasta, the unpolished
        consensus fasta or the dazz db does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)
        try:
            # root_dir/output/final.consensus.fa
            ref_fasta = icef.final_consensus_fa
            ref_dazz = icef.final_dazz_db

            # root_dir/output/map_noFL/input.split_{0:02d}.fa
            input_fasta = icef.nfl_fa_i(i)

            # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
            out_pickle = icef.nfl_pickle_i(i)

            # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
            done_file = icef.nfl_done_i(i)

            # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
            script_file = icef.nfl_script_i(i)

            # Check if inputs exist; only the first problem is reported.
            errMsg = ""
            if not nfs_exists(input_fasta):
                errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                          "fasta file {f} does not exist. ".format(f=input_fasta) +
                          "Please run ice_partial_split.py first.")
            elif not nfs_exists(ref_fasta):
                errMsg = ("The unpolished consensus isoforms fasta file " +
                          "{f} does not exist. ".format(f=ref_fasta) +
                          "Please make sure ICE is successfully done.")
            elif not nfs_exists(ref_dazz):
                errMsg = ("The dazz db " +
                          "{f} does not exist. ".format(f=ref_dazz) +
                          "Please make sure it is already built.")
            if len(errMsg) != 0:
                raise ValueError(errMsg)

            # Save cmd to script_file.
            cmd = self._cmd_str(root_dir=root_dir, i=i,
                                ccs_fofn=ccs_fofn,
                                blasr_nproc=blasr_nproc)
            with open(script_file, 'w') as writer:
                writer.write(cmd + "\n")

            icef.add_log("Writing CMD to: {script_file}".
                         format(script_file=script_file))
        finally:
            # Close the log on the failure paths as well, so that a
            # validation error does not leave it open.
            icef.close_log()

        return (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file)
Ejemplo n.º 4
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
        """
        Check inputs, write ice_partial.py i command to script_file
        and return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
        for the i-th chunk of nfl reads.

        Raises ValueError if the i-th splitted fasta, the unpolished
        consensus fasta or its suffix array does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)
        try:
            # root_dir/output/final.consensus.fa
            ref_fasta = icef.final_consensus_fa
            sa_file = icef.final_consensus_sa

            # root_dir/output/map_noFL/input.split_{0:02d}.fa
            input_fasta = icef.nfl_fa_i(i)

            # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
            out_pickle = icef.nfl_pickle_i(i)

            # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
            done_file = icef.nfl_done_i(i)

            # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
            script_file = icef.nfl_script_i(i)

            # Check if inputs exist; only the first problem is reported.
            errMsg = ""
            if not nfs_exists(input_fasta):
                errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                          "fasta file {f} does not exist. ".format(f=input_fasta) +
                          "Please run ice_partial_split.py first.")
            elif not nfs_exists(ref_fasta):
                errMsg = ("The unpolished consensus isoforms fasta file " +
                          "{f} does not exist. ".format(f=ref_fasta) +
                          "Please make sure ICE is successfully done.")
            elif not nfs_exists(sa_file):
                # The {f} placeholder used to be left unformatted, so the
                # message showed a literal "{f}" instead of the path.
                errMsg = ("The suffix array of unpolished consensus isoforms " +
                          "(i.e., final_consensus_sa) {f} does not exist.".
                          format(f=sa_file))
            if len(errMsg) != 0:
                raise ValueError(errMsg)

            # Save cmd to script_file.
            cmd = self._cmd_str(root_dir=root_dir, i=i,
                                ccs_fofn=ccs_fofn,
                                blasr_nproc=blasr_nproc)
            with open(script_file, 'w') as writer:
                writer.write(cmd + "\n")

            icef.add_log("Writing CMD to: {script_file}".
                         format(script_file=script_file))
        finally:
            # Close the log on the failure paths as well, so that a
            # validation error does not leave it open.
            icef.close_log()

        return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
Ejemplo n.º 5
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        root_dir -- root directory handed to IceFiles.
        nfl_fa -- fasta file of non-full-length reads to be split.
        N -- number of chunks.

        number_reads_per_chunk is num_reads / N rounded up, and at least 1.
        Creates icef.nfl_dir as a side effect.

        Raises ValueError if nfl_fa does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(nfl_fa):
            errMsg = ("The input non-full-length reads fasta file " +
                      "{f} does not exists. ".format(f=nfl_fa))
        if len(errMsg) != 0:
            raise ValueError(errMsg)

        num_reads = num_reads_in_fasta(nfl_fa)
        # Divide as floats so that ceil() really rounds up. Under integer
        # (floor) division -- Python 2 without
        # `from __future__ import division` -- the quotient is truncated
        # before ceil() sees it, and N chunks of that size can hold fewer
        # than num_reads reads.
        reads_per_split = int(max(1, ceil(num_reads / float(N))))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 6
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        root_dir -- root directory handed to IceFiles.
        nfl_fa -- fasta file of non-full-length reads to be split.
        N -- number of chunks.

        number_reads_per_chunk is num_reads / N rounded up, and at least 1.
        Creates icef.nfl_dir as a side effect.

        Raises ValueError if nfl_fa does not exist.
        """
        icef = IceFiles(prog_name="ice_partial_split", root_dir=root_dir, no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        if not nfs_exists(nfl_fa):
            raise ValueError("The input non-full-length reads fasta file " +
                             "{f} does not exists. ".format(f=nfl_fa))

        num_reads = num_reads_in_fasta(nfl_fa)
        # float(N) forces true division: with integer (floor) division, as
        # in Python 2 without `from __future__ import division`, ceil() gets
        # an already truncated quotient and the chunks come out too small to
        # hold all reads.
        reads_per_split = int(max(1, ceil(num_reads / float(N))))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
Ejemplo n.º 7
0
    def run(self):
        """Collect the per-chunk logs of submitted quiver jobs.

        Looks up the log of every chunk 0 ... self.N - 1, raises IOError
        for the first one that does not exist, and otherwise passes all of
        them to cat_files with submitted_quiver_jobs_log as destination.
        """
        merger = IceQuiver(root_dir=self.root_dir, bas_fofn=None,
                           fasta_fofn=None, sge_opts=None,
                           prog_name="ice_quiver_merge")

        merger.add_log(self.cmd_str())
        merger.add_log("root_dir: {d}.".format(d=self.root_dir))
        merger.add_log("Total number of chunks: N = {N}.".format(N=self.N))

        # Gather all chunk logs first, in chunk order, then verify them.
        chunk_logs = []
        for idx in range(0, self.N):
            chunk_logs.append(
                merger.submitted_quiver_jobs_log_of_chunk_i(
                    i=idx, num_chunks=self.N))

        for chunk_log in chunk_logs:
            if nfs_exists(chunk_log):
                continue
            raise IOError("Log {f} ".format(f=chunk_log) +
                          "of submitted quiver jobs does not exist.")

        merged_log = merger.submitted_quiver_jobs_log

        summary = "Collecting submitted quiver jobs from:\n{src}\nto {dst}."
        merger.add_log(summary.format(src="\n".join(chunk_logs),
                                      dst=merged_log))

        cat_files(src=chunk_logs, dst=merged_log)

        merger.close_log()
Ejemplo n.º 8
0
    def run(self):
        """Gather the submitted-quiver-jobs logs of all chunks.

        Every chunk log (chunks 0 ... self.N - 1) must exist, otherwise
        IOError is raised for the first missing one. The logs are then
        handed to cat_files, with submitted_quiver_jobs_log as the
        destination.
        """
        quiver = IceQuiver(
            root_dir=self.root_dir,
            bas_fofn=None,
            fasta_fofn=None,
            sge_opts=None,
            prog_name="ice_quiver_merge",
        )

        quiver.add_log(self.cmd_str())
        quiver.add_log("root_dir: {d}.".format(d=self.root_dir))
        quiver.add_log("Total number of chunks: N = {N}.".format(N=self.N))

        # Resolve every chunk log before checking any of them.
        per_chunk_logs = []
        for chunk_idx in range(0, self.N):
            per_chunk_logs.append(
                quiver.submitted_quiver_jobs_log_of_chunk_i(
                    i=chunk_idx, num_chunks=self.N))

        for log_fn in per_chunk_logs:
            if nfs_exists(log_fn):
                continue
            raise IOError("Log {f} ".format(f=log_fn) +
                          "of submitted quiver jobs does not exist.")

        combined_log = quiver.submitted_quiver_jobs_log

        template = "Collecting submitted quiver jobs from:\n{src}\nto {dst}."
        quiver.add_log(
            template.format(src="\n".join(per_chunk_logs), dst=combined_log))

        cat_files(src=per_chunk_logs, dst=combined_log)

        quiver.close_log()
Ejemplo n.º 9
0
    def validate_inputs(self):
        """Validate if logs and pickle for non-full-length reads exist.

        Checks, in this order, self.nfl_all_pickle_fn, self.final_pickle_fn
        and self.submitted_quiver_jobs_log. The first one found missing is
        logged at ERROR level and reported by raising IOError.
        """
        errMsg = ""

        if not nfs_exists(self.nfl_all_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."
        elif not nfs_exists(self.submitted_quiver_jobs_log):
            # The second literal used to carry a stray, never formatted
            # "{f}" that ended up verbatim in the message.
            errMsg = "Log file {f}".format(f=self.submitted_quiver_jobs_log) + \
                     " of all submitted quiver jobs does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Ejemplo n.º 10
0
    def validate_inputs(self):
        """Validate if logs and pickle for non-full-length reads exist.

        Checks, in this order, self.nfl_all_pickle_fn, self.final_pickle_fn
        and self.submitted_quiver_jobs_log. The first one found missing is
        logged at ERROR level and reported by raising IOError.
        """
        errMsg = ""

        if not nfs_exists(self.nfl_all_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."
        elif not nfs_exists(self.submitted_quiver_jobs_log):
            # Only the first literal is formatted; the second one must not
            # contain a placeholder, or a literal "{f}" shows up in the
            # message.
            errMsg = "Log file {f}".format(f=self.submitted_quiver_jobs_log) + \
                     " of all submitted quiver jobs does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Ejemplo n.º 11
0
    def validate_inputs(self):
        """Validate log_dir, the input fofns and the pickles, and
        create quivered_dir and quivered_log_dir.

        Checks, in order: log_dir is an existing directory, bas_fofn is
        given and exists, fasta_fofn is given and exists, every file listed
        in fasta_fofn exists, nfl_all_pickle_fn exists, final_pickle_fn
        exists. The first failure is logged at ERROR level and reported by
        raising IOError.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify bas_fofn (e.g. input.fofn)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                     "which contains bas/bax.h5 files does not exist."
        elif self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                     format(f=self.fasta_fofn)
        else:
            # fasta_fofn exists, so the files it lists can be checked. This
            # check used to sit in the branch above, where it tried to read
            # a fofn just found missing and never ran for an existing one.
            for fasta_file in get_files_from_fofn(self.fasta_fofn):
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                             format(f=fasta_file)
                    break

        # Only look at the pickles when everything above passed, so that the
        # first problem found is the one reported.
        if errMsg == "":
            if not nfs_exists(self.nfl_all_pickle_fn):
                #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
                errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                         "which assigns all non-full-length reads to isoforms " + \
                         "does not exist. Please check 'ice_partial.py *' are " + \
                         "all done."
            elif not nfs_exists(self.final_pickle_fn):
                errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                         "which assigns full-length non-chimeric reads to " + \
                         "isoforms does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Ejemplo n.º 12
0
    def validate_inputs(self):
        """Validate log_dir, the input fofns and the pickles, and
        create quivered_dir and quivered_log_dir.

        Checks, in order: log_dir is an existing directory, bas_fofn is
        given and exists, fasta_fofn is given and exists, every file listed
        in fasta_fofn exists, nfl_all_pickle_fn exists, final_pickle_fn
        exists. The first failure is logged at ERROR level and reported by
        raising IOError.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify bas_fofn (e.g. input.fofn)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                     "which contains bas/bax.h5 files does not exist."
        elif self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                     format(f=self.fasta_fofn)
        else:
            # The fofn is known to exist here, so its entries can be read
            # and checked. Previously this loop lived in the "fofn does not
            # exist" branch and therefore never validated a readable fofn.
            for fasta_file in get_files_from_fofn(self.fasta_fofn):
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                             format(f=fasta_file)
                    break

        # The pickles are checked last, and only if nothing failed so far.
        if errMsg == "":
            if not nfs_exists(self.nfl_all_pickle_fn):
                #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
                errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                         "which assigns all non-full-length reads to isoforms " + \
                         "does not exist. Please check 'ice_partial.py *' are " + \
                         "all done."
            elif not nfs_exists(self.final_pickle_fn):
                errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                         "which assigns full-length non-chimeric reads to " + \
                         "isoforms does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Ejemplo n.º 13
0
    def check_quiver_jobs_completion(self):
        """Check whether quiver jobs are completed.

        submitted_quiver_jobs.txt should have format like:
        <job_id> \t ./quivered/<range>.sh
        where <job_id> is the literal 'local' for jobs that were not
        submitted to SGE.

        Returns one of:
          "DONE"    -- no job is listed by qstat any more and every job
                       has a non-empty fastq, possibly after a successful
                       local rerun of the failed ones.
          "RUNNING" -- some job is still listed by qstat and no finished
                       job is left without output.
          "FAILED"  -- some finished jobs still have no output after being
                       rerun locally.
        self.fq_filenames is reset to the fastq files found complete.
        """
        self.add_log("Checking if quiver jobs are completed.")
        done_flag = True
        bad_sh = []
        self.fq_filenames = []
        submitted = {}
        self.add_log("Submitted quiver jobs are at {f}:".
                     format(f=self.submitted_quiver_jobs_log))

        sge_used = False
        with open(self.submitted_quiver_jobs_log, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines in the log.
                    continue
                a, b = line.split('\t')
                if a == 'local':
                    # Local jobs have no job id; key them by their script.
                    submitted[b] = b
                else:
                    sge_used = True
                    submitted[a] = b

        # Defined unconditionally: the loop over submitted jobs consults it
        # even when qstat is not queried, which used to raise NameError.
        running_jids = set()
        if sge_used is True and self.use_sge is True:
            stuff = os.popen("qstat").read().strip().split('\n')
            # first two lines are header
            for x in stuff[2:]:
                fields = x.split()
                if not fields:
                    continue
                job_id = fields[0]
                running_jids.add(job_id)
                if job_id in submitted:
                    self.add_log("job {0} is still running.".format(job_id))
                    done_flag = False

        # items() instead of iteritems() so that this runs on Python 2 and 3.
        for job_id, sh_name in submitted.items():
            fq_filename = op.join(self.quivered_dir,
                                  op.basename(sh_name).replace('.sh', '.quivered.fq'))

            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log("job {0} is completed but {1} is still empty!".
                                 format(job_id, fq_filename))
                    bad_sh.append(sh_name)
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # Failed jobs are handled whether or not other jobs are still
        # running; previously they were ignored (and "DONE" returned) when
        # nothing was running any more.
        if bad_sh:
            self.add_log("Some Quiver jobs failed. Attempt to rerun locally.\n")
            still_bad_sh = locally_run_failed_quiver_jobs(bad_sh)
            if len(still_bad_sh) > 0:
                self.add_log("The following jobs were completed but " +
                             "no output file. Please check and resubmit: " +
                             "\n{0}\n".format('\n'.join(still_bad_sh)))
                return "FAILED"
            # NOTE(review): outputs of the jobs rerun successfully are not
            # added to self.fq_filenames here -- confirm whether callers
            # expect them.

        # Jobs still listed by qstat mean the run is not finished, even
        # when the local reruns above succeeded.
        if not done_flag:
            return "RUNNING"
        return "DONE"
Ejemplo n.º 14
0
    def check_quiver_jobs_completion(self):
        """Check whether quiver jobs are completed.

        submitted_quiver_jobs.txt should have format like:
        <job_id> \t ./quivered/<range>.sh
        where <job_id> is the literal 'local' for jobs that were not
        submitted to SGE.

        Returns one of:
          "DONE"    -- no job is listed by qstat any more and every job
                       has a non-empty fastq.
          "FAILED"  -- at least one job is no longer listed by qstat but
                       its fastq is missing or empty.
          "RUNNING" -- some job is still listed by qstat and no finished
                       job is left without output.
        self.fq_filenames is reset to the fastq files found complete.
        """
        self.add_log("Checking if quiver jobs are completed.")
        done_flag = True
        bad_sh = []
        self.fq_filenames = []
        submitted = {}
        self.add_log("Submitted quiver jobs are at {f}:".format(
            f=self.submitted_quiver_jobs_log))

        sge_used = False
        with open(self.submitted_quiver_jobs_log, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines in the log.
                    continue
                a, b = line.split('\t')
                if a == 'local':
                    # Local jobs have no job id; key them by their script.
                    submitted[b] = b
                else:
                    sge_used = True
                    submitted[a] = b

        # Defined unconditionally: the loop over submitted jobs consults it
        # even when qstat is not queried, which used to raise NameError.
        running_jids = set()
        if sge_used is True and self.use_sge is True:
            stuff = os.popen("qstat").read().strip().split('\n')
            # first two lines are header
            for x in stuff[2:]:
                fields = x.split()
                if not fields:
                    continue
                job_id = fields[0]
                running_jids.add(job_id)
                if job_id in submitted:
                    self.add_log("job {0} is still running.".format(job_id))
                    done_flag = False

        # items() instead of iteritems() so that this runs on Python 2 and 3.
        for job_id, sh_name in submitted.items():
            fq_filename = op.join(
                self.quivered_dir,
                op.basename(sh_name).replace('.sh', '.quivered.fq'))

            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log(
                        "job {0} is completed but {1} is still empty!".format(
                            job_id, fq_filename))
                    bad_sh.append(sh_name)
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # Failed jobs are reported whether or not other jobs are still
        # running; previously they were ignored (and "DONE" returned) when
        # nothing was running any more.
        if bad_sh:
            self.add_log("The following jobs were completed but " +
                         "no output file. Please check and resubmit: " +
                         "\n{0}\n".format('\n'.join(bad_sh)))
            return "FAILED"
        if not done_flag:
            return "RUNNING"
        return "DONE"