def _validate_inputs(self, root_dir, N):
    """
    Verify that all N chunk pickles and their DONE sentinels exist.

    Returns (splitted_pickles, out_pickle), where splitted_pickles are the
    N per-chunk partial_uc pickle paths and out_pickle is the path of the
    merged pickle to be produced.  Raises ValueError if any required input
    file is missing.
    """
    icef = IceFiles(prog_name="ice_partial_merge",
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    splitted_pickles = [icef.nfl_pickle_i(idx) for idx in range(N)]
    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    dones = [icef.nfl_done_i(idx) for idx in range(N)]

    # Check existence of every input; the message of the last missing file
    # wins (DONE sentinels are scanned first, then pickles).
    errMsg = ""
    for done_file in dones:
        if not nfs_exists(done_file):
            errMsg = "DONE file {f} does not exist.".format(f=done_file)
    for pickle_file in splitted_pickles:
        if not nfs_exists(pickle_file):
            errMsg = "Pickle file {f} does not exist.".format(f=pickle_file)
    if errMsg:
        raise ValueError(errMsg)

    # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
    out_pickle = icef.nfl_all_pickle_fn
    return (splitted_pickles, out_pickle)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
    """
    Validate inputs for the i-th chunk of nfl reads, write the
    ice_partial.py i command to its script file, and return
    (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file).

    Raises ValueError if any prerequisite file is missing.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fa
    ref_fasta = icef.final_consensus_fa
    ref_dazz = icef.final_dazz_db
    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    input_fasta = icef.nfl_fa_i(i)
    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)
    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)
    # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # First missing prerequisite wins; message text preserved verbatim.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                  "fasta file {f} does not exist. ".format(f=input_fasta) +
                  "Please run ice_partial_split.py first.")
    elif not nfs_exists(ref_fasta):
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done.")
    elif not nfs_exists(ref_dazz):
        errMsg = ("The dazz db " +
                  "{f} does not exist. ".format(f=ref_dazz) +
                  "Please make sure it is already built.")
    if errMsg:
        raise ValueError(errMsg)

    # Persist the command so the chunk can be (re)run as a shell script.
    cmd = self._cmd_str(root_dir=root_dir, i=i, ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc)
    with open(script_file, 'w') as script_writer:
        script_writer.write(cmd + "\n")

    icef.add_log("Writing CMD to: {script_file}".
                 format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
    """
    Validate inputs for the i-th chunk of nfl reads, write the
    ice_partial.py i command to its script file, and return
    (input_fasta, ref_fasta, sa_file, out_pickle, done_file).

    Raises ValueError if any prerequisite file is missing.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fa
    ref_fasta = icef.final_consensus_fa
    sa_file = icef.final_consensus_sa
    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    input_fasta = icef.nfl_fa_i(i)
    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)
    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)
    # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # First missing prerequisite wins; message text preserved verbatim.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                  "fasta file {f} does not exist. ".format(f=input_fasta) +
                  "Please run ice_partial_split.py first.")
    elif not nfs_exists(ref_fasta):
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done.")
    elif not nfs_exists(sa_file):
        errMsg = ("The suffix array of unpolished consensus isoforms " +
                  "(i.e., final_consensus_sa) {f} does not exist.")
    if errMsg:
        raise ValueError(errMsg)

    # Persist the command so the chunk can be (re)run as a shell script.
    cmd = self._cmd_str(root_dir=root_dir, i=i, ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc)
    with open(script_file, 'w') as script_writer:
        script_writer.write(cmd + "\n")

    icef.add_log("Writing CMD to: {script_file}".
                 format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs for splitting nfl reads into N chunks.

    Parameters:
      root_dir - ICE root directory.
      nfl_fa   - input non-full-length reads fasta file.
      N        - number of chunks to split nfl_fa into.

    Returns (num_reads, number_reads_per_chunk, nfl_dir,
    [i-th_chunk_nfl_fa for i in [0...N-1]]).
    Raises ValueError if nfl_fa does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)
    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError("The input non-full-length reads fasta file " +
                         "{f} does not exists. ".format(f=nfl_fa))

    num_reads = num_reads_in_fasta(nfl_fa)
    # BUG FIX: under Python 2 (this module uses iteritems elsewhere),
    # num_reads / N is integer floor division, which made ceil() a no-op
    # and undercounted reads per chunk, leaving trailing reads unassigned
    # to the N planned chunks.  Force true division; float(N) is safe on
    # both Python 2 and 3.
    reads_per_split = int(max(1, ceil(num_reads / float(N))))
    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs for splitting nfl reads into N chunks.

    Parameters:
      root_dir - ICE root directory.
      nfl_fa   - input non-full-length reads fasta file.
      N        - number of chunks to split nfl_fa into.

    Returns (num_reads, number_reads_per_chunk, nfl_dir,
    [i-th_chunk_nfl_fa for i in [0...N-1]]).
    Raises ValueError if nfl_fa does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)
    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError("The input non-full-length reads fasta file " +
                         "{f} does not exists. ".format(f=nfl_fa))

    num_reads = num_reads_in_fasta(nfl_fa)
    # BUG FIX: under Python 2 (this module uses iteritems elsewhere),
    # num_reads / N is integer floor division, which made ceil() a no-op
    # and undercounted reads per chunk, leaving trailing reads unassigned
    # to the N planned chunks.  Force true division; float(N) is safe on
    # both Python 2 and 3.
    reads_per_split = int(max(1, ceil(num_reads / float(N))))
    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def run(self):
    """Merge the per-chunk submitted-quiver-jobs logs into a single log."""
    iceq = IceQuiver(root_dir=self.root_dir,
                     bas_fofn=None, fasta_fofn=None, sge_opts=None,
                     prog_name="ice_quiver_merge")
    iceq.add_log(self.cmd_str())
    iceq.add_log("root_dir: {d}.".format(d=self.root_dir))
    iceq.add_log("Total number of chunks: N = {N}.".format(N=self.N))

    # Per-chunk logs of submitted quiver jobs; every one must exist
    # before merging.
    src = [iceq.submitted_quiver_jobs_log_of_chunk_i(i=idx,
                                                     num_chunks=self.N)
           for idx in range(self.N)]
    for chunk_log in src:
        if not nfs_exists(chunk_log):
            raise IOError("Log {f} ".format(f=chunk_log) +
                          "of submitted quiver jobs does not exist.")

    dst = iceq.submitted_quiver_jobs_log
    iceq.add_log(
        "Collecting submitted quiver jobs from:\n{src}\nto {dst}.".format(
            src="\n".join(src), dst=dst))
    cat_files(src=src, dst=dst)
    iceq.close_log()
def run(self):
    """Concatenate all N chunks' submitted-quiver-jobs logs into one."""
    iceq = IceQuiver(root_dir=self.root_dir,
                     bas_fofn=None, fasta_fofn=None, sge_opts=None,
                     prog_name="ice_quiver_merge")
    iceq.add_log(self.cmd_str())
    iceq.add_log("root_dir: {d}.".format(d=self.root_dir))
    iceq.add_log("Total number of chunks: N = {N}.".format(N=self.N))

    # Collect the per-chunk log paths, then verify each exists.
    chunk_logs = [iceq.submitted_quiver_jobs_log_of_chunk_i(
        i=chunk_index, num_chunks=self.N)
        for chunk_index in range(self.N)]
    for log_path in chunk_logs:
        if not nfs_exists(log_path):
            raise IOError("Log {f} ".format(f=log_path) +
                          "of submitted quiver jobs does not exist.")

    merged_log = iceq.submitted_quiver_jobs_log
    iceq.add_log("Collecting submitted quiver jobs from:\n{src}\nto {dst}.".
                 format(src="\n".join(chunk_logs), dst=merged_log))
    cat_files(src=chunk_logs, dst=merged_log)
    iceq.close_log()
def validate_inputs(self):
    """Validate if logs and pickle for non-full-length reads exist.

    Checks, in order: the merged nfl partial_uc pickle, the final
    (FLNC-to-isoform) pickle, and the submitted-quiver-jobs log.
    Logs and raises IOError on the first missing file.
    """
    errMsg = ""
    if not nfs_exists(self.nfl_all_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."
    elif not nfs_exists(self.submitted_quiver_jobs_log):
        # BUG FIX: the message used to end with a stray, never-formatted
        # "{f}" placeholder ("...quiver jobs {f} does not exist."); the
        # file name is already interpolated at the start of the message.
        errMsg = "Log file {f}".format(f=self.submitted_quiver_jobs_log) + \
                 " of all submitted quiver jobs does not exist."
    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def validate_inputs(self):
    """Validate input fofns, and root_dir, log_dir, tmp_dir,
    create quivered_dir and quivered_log_dir.

    Logs and raises IOError on the first invalid/missing input found.
    """
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""
    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify bas_fofn (e.g. input.fofn)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                 "which contains bas/bax.h5 files does not exist."
    elif self.fasta_fofn is None:
        errMsg = "Please make sure ice_make_fasta_fofn has " + \
                 "been called, and specify fasta_fofn."
    elif not nfs_exists(self.fasta_fofn):
        errMsg = "Input fasta_fofn {f} does not exists.".\
            format(f=self.fasta_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."
    else:
        # BUG FIX: get_files_from_fofn used to be called unconditionally,
        # which would fail when fasta_fofn was None or missing, and the
        # per-file loop could clobber an earlier errMsg.  Only expand the
        # fofn once all coarse checks above have passed, and stop at the
        # first missing file.
        for fasta_file in get_files_from_fofn(self.fasta_fofn):
            if not nfs_exists(fasta_file):
                errMsg = "A file {f} in fasta_fofn does not exist.".\
                    format(f=fasta_file)
                break

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def check_quiver_jobs_completion(self):
    """Check whether quiver jobs are completed.

    submitted_quiver_jobs.txt should have format like:
    <job_id> \t ./quivered/<range>.sh

    Returns:
      "DONE"    - all jobs are done and their fastq outputs exist
                  (possibly after a successful local rerun of failures);
      "RUNNING" - some jobs are still running;
      "FAILED"  - some jobs finished without output and a local rerun
                  also failed.

    self.fq_filenames collects all the finished fastq files.
    """
    self.add_log("Checking if quiver jobs are completed.")
    done_flag = True
    bad_sh = []
    self.fq_filenames = []
    submitted = {}
    self.add_log("Submitted quiver jobs are at {f}:".
                 format(f=self.submitted_quiver_jobs_log))

    # Map job id -> shell script; 'local' entries key on the script itself.
    sge_used = False
    with open(self.submitted_quiver_jobs_log, 'r') as f:
        for line in f:
            a, b = line.strip().split('\t')
            if a == 'local':
                submitted[b] = b
            else:
                sge_used = True
                submitted[a] = b

    # BUG FIX: running_jids was previously bound only inside the SGE
    # branch below, raising NameError later when all jobs ran locally.
    running_jids = []
    if sge_used is True and self.use_sge is True:
        stuff = os.popen("qstat").read().strip().split('\n')
        # first two lines are header
        for x in stuff[2:]:
            job_id = x.split()[0]
            running_jids.append(job_id)
            if job_id in submitted:
                self.add_log("job {0} is still running.".format(job_id))
                done_flag = False

    for job_id, sh_name in submitted.iteritems():
        fq_filename = op.join(
            self.quivered_dir,
            op.basename(sh_name).replace('.sh', '.quivered.fq'))
        if not nfs_exists(fq_filename) or \
                os.stat(fq_filename).st_size == 0:
            if job_id in running_jids:  # still running, not a failure yet
                done_flag = False
            else:
                self.add_log("job {0} is completed but {1} is still empty!".
                             format(job_id, fq_filename))
                bad_sh.append(submitted[job_id])
        else:
            self.add_log("job {0} is done".format(job_id))
            self.fq_filenames.append(fq_filename)

    if done_flag:
        return "DONE"
    if len(bad_sh) == 0:
        return "RUNNING"
    # Jobs finished but produced no output: try to rerun them locally
    # before declaring failure.
    self.add_log("Some Quiver jobs failed. Attempt to rerun locally.\n")
    still_bad_sh = locally_run_failed_quiver_jobs(bad_sh)
    if len(still_bad_sh) > 0:
        self.add_log("The following jobs were completed but " +
                     "no output file. Please check and resubmit: " +
                     "\n{0}\n".format('\n'.join(still_bad_sh)))
        return "FAILED"
    return "DONE"
def check_quiver_jobs_completion(self):
    """Check whether quiver jobs are completed.

    submitted_quiver_jobs.txt should have format like:
    <job_id> \t ./quivered/<range>.sh

    Returns:
      "DONE"    - all jobs are done and their fastq outputs exist;
      "RUNNING" - some jobs are still running;
      "FAILED"  - some jobs finished but produced no/empty output.

    self.fq_filenames collects all the finished fastq files.
    """
    self.add_log("Checking if quiver jobs are completed.")
    done_flag = True
    bad_sh = []
    self.fq_filenames = []
    submitted = {}
    self.add_log("Submitted quiver jobs are at {f}:".format(
        f=self.submitted_quiver_jobs_log))

    # Map job id -> shell script; 'local' entries key on the script itself.
    sge_used = False
    with open(self.submitted_quiver_jobs_log, 'r') as f:
        for line in f:
            a, b = line.strip().split('\t')
            if a == 'local':
                submitted[b] = b
            else:
                sge_used = True
                submitted[a] = b

    # BUG FIX: running_jids was previously bound only inside the SGE
    # branch below, raising NameError later when all jobs ran locally.
    running_jids = []
    if sge_used is True and self.use_sge is True:
        stuff = os.popen("qstat").read().strip().split('\n')
        # first two lines are header
        for x in stuff[2:]:
            job_id = x.split()[0]
            running_jids.append(job_id)
            if job_id in submitted:
                self.add_log("job {0} is still running.".format(job_id))
                done_flag = False

    for job_id, sh_name in submitted.iteritems():
        fq_filename = op.join(
            self.quivered_dir,
            op.basename(sh_name).replace('.sh', '.quivered.fq'))
        if not nfs_exists(fq_filename) or \
                os.stat(fq_filename).st_size == 0:
            if job_id in running_jids:  # still running, not a failure yet
                done_flag = False
            else:
                self.add_log(
                    "job {0} is completed but {1} is still empty!".format(
                        job_id, fq_filename))
                bad_sh.append(submitted[job_id])
        else:
            self.add_log("job {0} is done".format(job_id))
            self.fq_filenames.append(fq_filename)

    if done_flag:
        return "DONE"
    if len(bad_sh) == 0:
        return "RUNNING"
    self.add_log("The following jobs were completed but " +
                 "no output file. Please check and resubmit: " +
                 "\n{0}\n".format('\n'.join(bad_sh)))
    return "FAILED"