def _validate_inputs(self, root_dir, N):
    """
    Check that every per-chunk pickle and its DONE marker exist.

    Parameters:
      root_dir --- ICE root directory, handed to IceFiles.
      N --- number of chunks the non-full-length reads were split into.

    Returns:
      (splitted_pickles, out_pickle), where splitted_pickles is the list of
      the N per-chunk pickle paths and out_pickle is the path of the merged
      pickle.

    Raises:
      ValueError --- if any DONE file or pickle is missing. The message
      lists every missing file, not only the last one found.
    """
    icef = IceFiles(prog_name="ice_partial_merge", root_dir=root_dir,
                    no_log_f=False)
    try:
        # root_dir/output/map_noFL/input.split_{0:03d}.fasta.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(N)]
        dones = [icef.nfl_done_i(i) for i in range(N)]

        # Collect every missing input so one run reports all problems.
        problems = ["DONE file {f} does not exist.".format(f=done)
                    for done in dones if not nfs_exists(done)]
        problems.extend("Pickle file {f} does not exist.".format(f=pickle_fn)
                        for pickle_fn in splitted_pickles
                        if not nfs_exists(pickle_fn))
        if problems:
            raise ValueError(" ".join(problems))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
    finally:
        # Sibling validators close the IceFiles log; do the same here on
        # both the success and the error path.
        icef.close_log()
    return (splitted_pickles, out_pickle)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
    """
    Check inputs for the i-th chunk of nfl reads and save the command.

    Writes the $ICE_PARTIAL_PY i command (built by self._cmd_str) to the
    chunk's script file.

    Parameters:
      root_dir --- ICE root directory, handed to IceFiles.
      i --- index of the nfl chunk to process.
      ccs_fofn, blasr_nproc, tmp_dir --- forwarded to self._cmd_str.

    Returns:
      (input_fasta, ref_fasta, out_pickle, done_file) for the i-th chunk.

    Raises:
      ValueError --- if the chunk fasta, the consensus fasta or the dazz db
      does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)
    try:
        # root_dir/output/final.consensus.fasta
        ref_fasta = icef.final_consensus_fa
        ref_dazz = icef.final_dazz_db

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = icef.nfl_fa_i(i)

        # $input_fasta.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)

        # $input_fasta.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)

        # $input_fasta.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(input_fasta):
            errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                      "fasta file {f} does not exist. ".format(f=input_fasta) +
                      "Please run $ICE_PARTIAL_PY split first.")
        elif not nfs_exists(ref_fasta):
            errMsg = ("The unpolished consensus isoforms fasta file " +
                      "{f} does not exist. ".format(f=ref_fasta) +
                      "Please make sure ICE is successfully done.")
        elif not nfs_exists(ref_dazz):
            errMsg = ("The dazz db " +
                      "{f} does not exist. ".format(f=ref_dazz) +
                      "Please make sure it is already built.")
        if errMsg:
            raise ValueError(errMsg)

        # Save cmd to script_file.
        cmd = self._cmd_str(root_dir=root_dir, i=[i], ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc, tmp_dir=tmp_dir)
        with open(script_file, 'w') as writer:
            writer.write(cmd + "\n")

        icef.add_log("Writing CMD to: {script_file}".
                     format(script_file=script_file))
    finally:
        # Close the log on the error path too, not only on success.
        icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
    """
    Check inputs for the i-th chunk of nfl reads and save the command.

    Writes the $ICE_PARTIAL_PY i command (built by self._cmd_str) to the
    chunk's script file.

    Parameters:
      root_dir --- ICE root directory, handed to IceFiles.
      i --- index of the nfl chunk to process.
      ccs_fofn, blasr_nproc, tmp_dir --- forwarded to self._cmd_str.

    Returns:
      (input_fasta, ref_fasta, out_pickle, done_file) for the i-th chunk.

    Raises:
      ValueError --- if the chunk fasta, the consensus fasta or the dazz db
      does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)
    try:
        # root_dir/output/final.consensus.fasta
        ref_fasta = icef.final_consensus_fa
        ref_dazz = icef.final_dazz_db

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = icef.nfl_fa_i(i)

        # $input_fasta.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)

        # $input_fasta.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)

        # $input_fasta.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # Check if inputs exist; the first missing one decides the message.
        errMsg = ""
        if not nfs_exists(input_fasta):
            errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                      "fasta file {f} does not exist. ".format(f=input_fasta) +
                      "Please run $ICE_PARTIAL_PY split first.")
        elif not nfs_exists(ref_fasta):
            errMsg = ("The unpolished consensus isoforms fasta file " +
                      "{f} does not exist. ".format(f=ref_fasta) +
                      "Please make sure ICE is successfully done.")
        elif not nfs_exists(ref_dazz):
            errMsg = ("The dazz db " +
                      "{f} does not exist. ".format(f=ref_dazz) +
                      "Please make sure it is already built.")
        if errMsg:
            raise ValueError(errMsg)

        # Save cmd to script_file.
        cmd = self._cmd_str(root_dir=root_dir, i=[i], ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc, tmp_dir=tmp_dir)
        with open(script_file, 'w') as writer:
            writer.write(cmd + "\n")

        icef.add_log("Writing CMD to: {script_file}".
                     format(script_file=script_file))
    finally:
        # The log used to stay open when validation raised; always close it.
        icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def create_quiver_bins_and_submit_jobs(self, d, uc, partial_uc, refs, keys,
                                       start, end, submitted, sge_opts):
    """
    Group clusters keys[start:end] into bins of 100 and submit each bin.

    For every bin a bash script is created with self.create_a_quiver_bin
    and immediately handed to self.submit_todo_quiver_jobs.

    Returns the list of created bash scripts, or [] when [start, end) is
    empty or falls outside keys.
    """
    num_keys = len(keys)
    out_of_range = start < 0 or start > num_keys or end > num_keys
    if start >= end or out_of_range:
        return []

    # Point each cluster's ref at its file name inside self.cluster_dir(cid).
    relocated_refs = {}
    for cid in keys[start:end]:
        relocated_refs[cid] = op.join(self.cluster_dir(cid),
                                      op.basename(refs[cid]))
    refs = relocated_refs

    # Only the first cluster's ref is probed to decide on reconstruction.
    first_ref = refs[keys[start]]
    if not nfs_exists(first_ref):
        self.reconstruct_ref_fa_for_clusters_in_bin(cids=keys[start:end],
                                                    refs=refs)

    all_todo = []
    offset = start
    while offset < end:
        # Put every 100 clusters to a bin.
        bin_cids = keys[offset:min(end, offset + 100)]
        bin_sh = self.create_a_quiver_bin(cids=bin_cids, d=d, uc=uc,
                                          partial_uc=partial_uc,
                                          refs=refs, sge_opts=sge_opts)
        all_todo.append(bin_sh)
        # Submit the script of this bin right after creating it.
        self.submit_todo_quiver_jobs(todo=[bin_sh], submitted=submitted,
                                     sge_opts=sge_opts)
        offset += 100
    return all_todo
def run(self):
    """
    Merge the per-chunk logs of submitted quiver jobs into one log.

    Collects the submitted-quiver-jobs log of each of the self.N chunks and
    concatenates them into iceq.submitted_quiver_jobs_log.

    Raises:
      IOError --- if the log of any chunk does not exist.
    """
    iceq = IceQuiver(root_dir=self.root_dir, bas_fofn=None,
                     fasta_fofn=None, sge_opts=None,
                     prog_name="ice_quiver_merge")
    try:
        iceq.add_log(self.cmd_str())
        iceq.add_log("root_dir: {d}.".format(d=self.root_dir))
        iceq.add_log("Total number of chunks: N = {N}.".format(N=self.N))

        src = [iceq.submitted_quiver_jobs_log_of_chunk_i(i=i, num_chunks=self.N)
               for i in range(self.N)]
        for f in src:
            if not nfs_exists(f):
                raise IOError("Log {f} ".format(f=f) +
                              "of submitted quiver jobs does not exist.")

        dst = iceq.submitted_quiver_jobs_log

        iceq.add_log("Collecting submitted quiver jobs from:\n{src}\nto {dst}.".
                     format(src="\n".join(src), dst=dst))

        cat_files(src=src, dst=dst)
    finally:
        # Close the log even when a chunk log is missing or merging fails.
        iceq.close_log()
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the given clusters in the new tmp_dir,
    e.g., self.g_consensus_ref_fa_of_cluster(cid).

    Every record of self.final_consensus_fa whose cluster id is in cids is
    written to its own FASTA file under self.cluster_dir(cid), and
    refs[cid] is updated in place to point at that file.

    cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
    refs --- dict{int(cid): ref_fa of cluster(cid)}

    Raises:
      IOError --- if self.final_consensus_fa does not exist.
    """
    # Check existence when first time it is read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    if not cids:
        # Nothing to reconstruct; also avoids indexing an empty list below.
        return

    self.add_log("Reconstructing g consensus files for clusters "
                 "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                 level=logging.INFO)

    wanted = set(cids)  # constant-time membership tests inside the loop
    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in final_consensus_d.d.keys():
        # e.g., ref_id = c103/1/3708, cid = 103,
        #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
        cid = int(ref_id.split('/')[0].replace('c', ''))
        if cid not in wanted:
            continue
        cluster_dir = self.cluster_dir(cid)
        mkdir(cluster_dir)
        ref_fa = op.join(cluster_dir, op.basename(refs[cid]))
        refs[cid] = ref_fa
        with FastaWriter(ref_fa) as writer:
            self.add_log("Writing ref_fa %s" % refs[cid])
            writer.writeRecord(ref_id,
                               final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruct of g consensus files completed.",
                 level=logging.INFO)
def _validate_inputs(self, root_dir, nfl_fa, N): """ Check inputs, return (num_reads, number_reads_per_chunk, nfl_dir, [i-th_chunk_nfl_fa for i in [0...N-1]]) """ icef = IceFiles(prog_name="ice_partial_split", root_dir=root_dir, no_log_f=False) nfl_dir = icef.nfl_dir # root_dir/output/map_noFL/input.split_{0:03d}.fasta splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)] mkdir(icef.nfl_dir) # Check if inputs exist. errMsg = "" if not nfs_exists(nfl_fa): errMsg = ("The input non-full-length reads fasta file " + "{f} does not exists. ".format(f=nfl_fa)) if len(errMsg) != 0: raise ValueError(errMsg) num_reads = num_reads_in_fasta(nfl_fa) reads_per_split = int(max(1, ceil(num_reads * 1.0 / N))) return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def __init__(self, input_filename, converted=False, dazz_dir=None):
    """
    Set up the handler for one input sequence file.

    input_filename - input FASTA/FASTQ/ContigSet file
    converted - whether or not input file has been converted to
                daligner compatible FASTA file.
    dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
               in the same directory as inputfile.
               if a valid path, save all output files to dazz_dir.

    If converted is True but self.db_filename does not exist, a warning is
    logged and the input is converted anyway.
    """
    self.dazz_dir = dazz_dir
    self.input_filename = realpath(input_filename)
    self.validate_file_type(self.input_filename)

    # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
    self.dazz_mapping = {}

    already_converted = converted
    if already_converted and not nfs_exists(self.db_filename):
        # Caller claims a converted input, but its db file is missing.
        warning = (str(self.input_filename) +
                   " should have been converted to daligner-compatible" +
                   " format, but in fact it is not. Converting ...")
        log.warning(warning)
        already_converted = False

    if already_converted:
        self.read_dazz_pickle()
    else:
        self.convert_to_dazz_fasta()
        self.make_db()
def _validate_inputs(self, root_dir, nfl_fa, N): """ Check inputs, return (num_reads, number_reads_per_chunk, nfl_dir, [i-th_chunk_nfl_fa for i in [0...N-1]]) """ icef = IceFiles(prog_name="ice_partial_split", root_dir=root_dir, no_log_f=False) nfl_dir = icef.nfl_dir # root_dir/output/map_noFL/input.split_{0:03d}.fasta splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)] mkdir(icef.nfl_dir) # Check if inputs exist. errMsg = "" if N <= 0 or N > 100: errMsg = "Input file can not be splitted into %d chunks!" % N if not nfs_exists(nfl_fa): errMsg = ("The input non-full-length reads fasta file " + "{f} does not exists. ".format(f=nfl_fa)) if len(errMsg) != 0: raise ValueError(errMsg) num_reads = num_reads_in_fasta(nfl_fa) reads_per_split = int(max(1, ceil(num_reads * 1.0 / N))) return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def __init__(self, input_filename, converted=False, dazz_dir=None):
    """
    Prepare daligner-compatible files for one input sequence file.

    input_filename - input FASTA/FASTQ/ContigSet file
    converted - whether or not input file has been converted to
                daligner compatible FASTA file.
    dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
               in the same directory as inputfile.
               if a valid path, save all output files to dazz_dir.

    When converted is True and self.db_filename exists, the existing dazz
    pickle is read; otherwise the input is converted and the db is made.
    """
    self.dazz_dir = dazz_dir
    self.input_filename = realpath(input_filename)
    self.validate_file_type(self.input_filename)

    # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
    self.dazz_mapping = {}

    must_convert = not converted
    if not must_convert and not nfs_exists(self.db_filename):
        # The db file that a converted input should have is missing.
        log.warning(str(self.input_filename) +
                    " should have been converted to daligner-compatible" +
                    " format, but in fact it is not. Converting ...")
        must_convert = True

    if must_convert:
        self.convert_to_dazz_fasta()
        self.make_db()
        return
    self.read_dazz_pickle()
def validate_inputs(self):
    """
    Validate if logs and pickle for non-full-length reads exist.

    Checks, in order, self.nfl_all_pickle_fn, self.final_pickle_fn and
    self.submitted_quiver_jobs_log; the first missing file decides the
    error message.

    Raises:
      IOError --- if any of the files is missing; the message is also
      logged at ERROR level.
    """
    errMsg = ""

    if not nfs_exists(self.nfl_all_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."
    elif not nfs_exists(self.submitted_quiver_jobs_log):
        # The second half of this message used to carry a literal "{f}"
        # that was never formatted.
        errMsg = "Log file {f}".format(f=self.submitted_quiver_jobs_log) + \
                 " of all submitted quiver jobs does not exist."

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def validate_inputs(self):
    """
    Make sure the pickles and the arrow submission file exist.

    Checks, in order, self.nfl_all_pickle_fn, self.final_pickle_fn and
    self.arrow_submission_run_file. The first missing one is logged at
    ERROR level and reported through an IOError.
    """
    def _first_problem():
        """Return the message for the first missing file, or None."""
        if not nfs_exists(self.nfl_all_pickle_fn):
            return ("Pickle file {f} "
                    "which assigns all non-full-length reads to isoforms "
                    "does not exist. Please check 'ice_partial.py *' are "
                    "all done.").format(f=self.nfl_all_pickle_fn)
        if not nfs_exists(self.final_pickle_fn):
            return ("Pickle file {f} "
                    "which assigns full-length non-chimeric reads to "
                    "isoforms does not exist.").format(f=self.final_pickle_fn)
        if not nfs_exists(self.arrow_submission_run_file):
            return ("Log file {f}"
                    " of all submitted arrow jobs does not exist."
                    ).format(f=self.arrow_submission_run_file)
        return None

    problem = _first_problem()
    if problem is None:
        return
    self.add_log(problem, level=logging.ERROR)
    raise IOError(problem)
def validate_inputs(self):
    """
    Validate the log dir, subreads XML and pickles, and create
    arrowed_dir and arrowed_log_dir.

    The checks run in a fixed order and stop at the first failure, which
    is logged at ERROR level and raised as IOError.
    """
    self.add_log("Validating inputs.")

    # Create the arrowed output directory and its log directory.
    try:
        mkdir(self.arrowed_dir)
        mkdir(self.arrowed_log_dir)
    except OSError:
        # Multiple ice_arrow_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    def _first_problem():
        """Return the message of the first failed check, or None."""
        if not (nfs_exists(self.log_dir) and op.isdir(self.log_dir)):
            return "Log dir {l} is not an existing directory.".format(
                l=self.log_dir)
        if self.subread_xml is None:
            return ("Please specify subreads XML "
                    "(e.g., --subread_xml=<movie>.subreadset.xml).")
        if not nfs_exists(self.subread_xml):
            return ("Specified subreads file (subread_xml={f}) "
                    "does not exist.").format(f=self.subread_xml)
        if guess_file_format(self.subread_xml) is not FILE_FORMATS.BAM:
            return "Invalid subreads XML file: {0}!".format(self.subread_xml)
        if not nfs_exists(self.nfl_all_pickle_fn):
            # "output/map_noFL/noFL.ALL.partial_uc.pickle"
            return ("Pickle file {f} "
                    "which assigns all non-full-length reads to isoforms "
                    "does not exist. Please check 'run_IcePartials2.py *' are "
                    "all done.").format(f=self.nfl_all_pickle_fn)
        if not nfs_exists(self.final_pickle_fn):
            return ("Pickle file {f} "
                    "which assigns full-length non-chimeric reads to "
                    "isoforms does not exist.").format(f=self.final_pickle_fn)
        return None

    problem = _first_problem()
    if problem is not None:
        self.add_log(problem, level=logging.ERROR)
        raise IOError(problem)
def validate_inputs(self):
    """
    Validate input fofns, and root_dir, log_dir, tmp_dir,
    create quivered_dir and quivered_log_dir.

    The first failed check decides the error message. When the subreads
    input is not BAM, fasta_fofn and every file listed in it must exist
    as well.

    Raises:
      IOError --- if any check fails; the message is also logged at ERROR
      level.
    """
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""

    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
            f=self.bas_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    # Only look at fasta_fofn once everything above passed: an empty errMsg
    # guarantees bas_fofn is set and exists, so guess_file_format is never
    # run on a missing file and an earlier error is never overwritten.
    if errMsg == "" and \
            guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
        # No need to convert subreads.bam to fasta
        if self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                     format(f=self.fasta_fofn)
        else:
            # Read the fofn only when it is known to exist.
            for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                             format(f=fasta_file)
                    break

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def validate_inputs(self):
    """
    Validate input fofns, and root_dir, log_dir, tmp_dir,
    create quivered_dir and quivered_log_dir.

    The first failed check decides the error message. When the subreads
    input is not BAM, fasta_fofn and every file listed in it must exist
    as well.

    Raises:
      IOError --- if any check fails; the message is also logged at ERROR
      level.
    """
    self.add_log("Validating inputs.")

    # Create directories: root_dir/quivered and root_dir/log_dir/quivered
    try:
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)
    except OSError:
        # Multiple ice_quiver_i jobs may run at the same time and try to
        # mkdir, race condition may happen, so ignore OSError here.
        pass

    errMsg = ""

    if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
        errMsg = "Log dir {l} is not an existing directory.".\
            format(l=self.log_dir)
    elif self.bas_fofn is None:
        errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
    elif not nfs_exists(self.bas_fofn):
        errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(f=self.bas_fofn)
    elif not nfs_exists(self.nfl_all_pickle_fn):
        # "output/map_noFL/noFL.ALL.partial_uc.pickle"
        errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                 "which assigns all non-full-length reads to isoforms " + \
                 "does not exist. Please check 'ice_partial.py *' are " + \
                 "all done."
    elif not nfs_exists(self.final_pickle_fn):
        errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                 "which assigns full-length non-chimeric reads to " + \
                 "isoforms does not exist."

    # The fasta_fofn checks run only when nothing failed above. At that
    # point bas_fofn is known to be set and to exist, so the format guess
    # is safe and no earlier error message gets overwritten.
    if errMsg == "" and \
            guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
        # No need to convert subreads.bam to fasta
        if self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                     format(f=self.fasta_fofn)
        else:
            # The fofn exists, so it is safe to read the files it lists.
            for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                             format(f=fasta_file)
                    break

    if errMsg != "":
        self.add_log(errMsg, level=logging.ERROR)
        raise IOError(errMsg)
def create_arrows_bins_no_submit(self, d, uc, partial_uc, refs, cids_todo):
    """
    Create arrow bins for cids in <cids_todo>.
    Handle missing references, etc.
    Create/Write the jobs but DO NOT submit.

    Reference fasta files that do not exist are first reconstructed with
    self.reconstruct_ref_fa_for_clusters_in_bin.

    Returns whatever self.create_a_arrow_bin returns for cids_todo.
    """
    # Liz: the "refs" from the pickle should be accurate, plus the new cids
    # after ice2 collection is b<bin>_c<cid>, so refs are not re-derived
    # from self.cluster_dir(cid) here.

    # Reconstruct refs if not exist. A list (not filter()) is required
    # because filter() returns an iterator without len() on Python 3.
    cids_missing_refs = [cid for cid in cids_todo
                         if not nfs_exists(refs[cid])]
    if cids_missing_refs:
        self.reconstruct_ref_fa_for_clusters_in_bin(cids=cids_missing_refs,
                                                    refs=refs)

    return self.create_a_arrow_bin(cids_todo, d, uc, partial_uc, refs)
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the given clusters in the new tmp_dir,
    e.g., self.g_consensus_ref_fa_of_cluster(cid).

    Every record of self.final_consensus_fa whose id is in cids is written
    to its own FASTA file under
    self.cluster_dir_for_reconstructed_ref(cid), and refs[cid] is updated
    in place to point at that file.

    Liz: new cids after ice2 collection is b<bin>_c<cid>
    cids --- cluster ids to reconstruct
    refs --- dict{cid: ref_fa of cluster(cid)}

    Raises:
      IOError --- if self.final_consensus_fa does not exist.
    """
    # Check existence when first time it is read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    # Accept any iterable (e.g. a filter object) and tolerate no cids.
    cids = list(cids)
    if not cids:
        return

    msg = "Reconstructing g consensus files for clusters {0}, {1} in {2}".\
        format(cids[0], cids[-1], self.tmp_dir)
    print(msg)
    self.add_log(msg)

    wanted = set(cids)  # constant-time membership tests inside the loop
    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in final_consensus_d.d.keys():
        # Liz: parsing an int out of "c<cid>/..." is no longer valid for
        # the Ice2 cids; the record id itself is the cluster id.
        cid = ref_id
        if cid not in wanted:
            continue
        _dir = self.cluster_dir_for_reconstructed_ref(cid)
        mkdir(_dir)
        ref_fa = op.join(_dir, op.basename(refs[cid]))
        refs[cid] = ref_fa
        with FastaWriter(ref_fa) as writer:
            self.add_log("Writing ref_fa %s" % refs[cid])
            writer.writeRecord(ref_id,
                               final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruct of g consensus files completed.",
                 level=logging.INFO)
def check_quiver_jobs_completion(self):
    """Check whether quiver jobs are completed.

    submitted_quiver_jobs.txt should have format like:
    <job_id> \t ./quivered/<range>.sh

    Returns:
      "DONE"    --- all jobs are done and all output files are there
      "FAILED"  --- some job is no longer running but its output fastq is
                    missing or empty; the scripts to resubmit are logged
      "RUNNING" --- no failure seen, but some jobs are still running

    self.fq_filenames is set to all the finished fastq files.
    """
    self.add_log("Checking if quiver jobs are completed.")
    bad_sh = []
    self.fq_filenames = []
    submitted = {}
    self.add_log("Submitted quiver jobs are at {f}:".
                 format(f=self.submitted_quiver_jobs_log))

    sge_used = False
    with open(self.submitted_quiver_jobs_log, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. from concatenated logs.
                continue
            a, b = line.split('\t')
            if a == 'local':
                # Local jobs have no job id; key them by script name.
                submitted[b] = b
            else:
                sge_used = True
                submitted[a] = b

    still_running = False
    running_jids = set()
    if sge_used is True and self.use_sge is True:
        stuff = os.popen("qstat").read().strip().split('\n')
        # first two lines are header
        for x in stuff[2:]:
            job_id = x.split()[0]
            running_jids.add(job_id)
            if job_id in submitted:
                self.add_log("job {0} is still running.".format(job_id))
                still_running = True

    for job_id, sh_name in submitted.items():
        fq_filename = op.join(self.quivered_dir,
                              op.basename(sh_name).replace('.sh', '.quivered.fastq'))

        if not nfs_exists(fq_filename) or \
                os.stat(fq_filename).st_size == 0:
            if job_id in running_jids:  # still running, pass
                still_running = True
            else:
                self.add_log("job {0} is completed but {1} is still empty!".
                             format(job_id, fq_filename))
                bad_sh.append(sh_name)
        else:
            self.add_log("job {0} is done".format(job_id))
            self.fq_filenames.append(fq_filename)

    # A finished job without output is a failure even when nothing is
    # running any more; this case used to be reported as "DONE".
    if bad_sh:
        self.add_log("The following jobs were completed but " +
                     "no output file. Please check and resubmit: " +
                     "\n{0}\n".format('\n'.join(bad_sh)))
        return "FAILED"
    if still_running:
        return "RUNNING"
    return "DONE"
def check_arrow_jobs_completion(self):
    """Check whether arrow jobs are completed.

    submitted_arrow_jobs.txt should have format like:
    <job_id> \t ./arrowed/c0to10.sh

    Returns:
      "DONE"    --- all jobs are done and all output files are there
      "FAILED"  --- some job is no longer running but its output fastq is
                    missing or empty; its script is written to
                    self.unfinished_arrow_sh_files for re-running
      "RUNNING" --- no failure seen, but some jobs are still running

    self.fq_filenames is set to all the finished fastq files.
    """
    self.add_log("Checking if arrow jobs are completed.")
    bad_sh = []
    self.fq_filenames = []
    self.add_log("Submitted arrow jobs are at {f}:".
                 format(f=self.arrow_submission_run_file))

    # submitted maps each expected fastq file to
    # (SGE jobid or local, script file that is running)
    sge_jobids, submitted = self.list_of_expected_arrow_fq_files()

    still_running = False
    running_jids = set()
    # if one or more jobs were submitted through SGE,
    # go through qstat to see if anything is still running
    if len(sge_jobids):
        qstat_out = os.popen("qstat").read().strip()
        # No qstat output means nothing is queued or running; checking the
        # header of that empty output used to trip the asserts below.
        if qstat_out:
            stuff = qstat_out.split('\n')
            assert stuff[0].startswith('job-ID')
            assert stuff[1].startswith('-------')
            # first two lines are header
            for x in stuff[2:]:
                job_id = x.split()[0]
                running_jids.add(job_id)
                if job_id in sge_jobids:
                    self.add_log("job {0} is still running.".format(job_id))
                    still_running = True

    # now go through all the expected fastq files and check they exist
    for fq_filename, (job_id, sh_file) in submitted.items():
        if not nfs_exists(fq_filename) or \
                os.stat(fq_filename).st_size == 0:
            if job_id in running_jids:  # still running, pass
                still_running = True
            else:
                self.add_log("job {0} is completed but {1} is still empty!".
                             format(job_id, fq_filename))
                bad_sh.append(sh_file)
        else:
            self.add_log("job {0} is done".format(job_id))
            self.fq_filenames.append(fq_filename)

    # A finished job without output is a failure even when nothing is
    # running any more; this case used to be reported as "DONE".
    if bad_sh:
        # write the unfinished jobs to $unfinished_arrow_sh_files$
        with open(self.unfinished_arrow_sh_files, 'w') as f:
            f.write("\n".join(bad_sh) + '\n')
        self.add_log("{0} jobs were incomplete! "
                     "Please re-run all files listed in {1}.\n".format(
                         len(bad_sh), self.unfinished_arrow_sh_files))
        return "FAILED"
    if still_running:
        return "RUNNING"
    return "DONE"
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir,
                     ref_fasta=None):
    """
    Check inputs for the i-th chunk of nfl reads and save the command.

    Writes the $ICE_PARTIAL_PY i command (built by self._cmd_str) to the
    chunk's script file.

    Parameters:
      root_dir --- ICE root directory, handed to IceFiles.
      i --- index of the nfl chunk to process.
      ccs_fofn, blasr_nproc, tmp_dir --- forwarded to self._cmd_str.
      ref_fasta --- ignored: the consensus fasta under root_dir is always
                    used (see the comment on the ref_fasta check below).

    Returns:
      (input_fasta, ref_fasta, out_pickle, done_file) for the i-th chunk.

    Raises:
      IOError --- if the chunk fasta, the consensus fasta or the dazz db
      does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)
    try:
        # root_dir/output/final.consensus.fasta
        ref_fasta = icef.final_consensus_fa
        ref_dazz = icef.final_dazz_db

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = icef.nfl_fa_i(i)

        # $input_fasta.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)

        # $input_fasta.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)

        # $input_fasta.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(input_fasta):
            errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                      "fasta file {f} does not exist. ".format(f=input_fasta) +
                      "Please run $ICE_PARTIAL_PY split first.")
        elif not nfs_exists(ref_fasta):
            # ref_fasta --- root_dir/output/final.consensus.fasta
            # ref_dazz --- root_dir/output/final.consensus.dazz.fasta.db
            # ref_fasta and ref_dazz must exist if ICE has run successfully
            # in root_dir. If either one does not exist, it means ICE has
            # not successfully run in root_dir. Then we have to throw an
            # error message requiring users to copy the root_dir/output
            # directory manually, rather than providing an option to
            # overwrite ref_fasta and build ref_dazz, because a race
            # condition can happen when multiple IcePartialI tasks start to
            # run at the same time, which can corrupt fasta and dazz db
            # files and lead to unexpected runtime errors.
            errMsg = ("The unpolished consensus isoforms fasta file " +
                      "{f} does not exist. ".format(f=ref_fasta) +
                      "Please make sure ICE is successfully done in root_dir, " +
                      "or copy ICE output directory (e.g., cluster_out/output) " +
                      "to {dst}".format(dst=op.dirname(ref_fasta)))
        elif not nfs_exists(ref_dazz):
            errMsg = ("The dazz db " +
                      "{f} does not exist. ".format(f=ref_dazz) +
                      "Please make sure it is already built.")
        if errMsg:
            raise IOError(errMsg)

        # Save cmd to script_file.
        cmd = self._cmd_str(root_dir=root_dir, i=[i], ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc, tmp_dir=tmp_dir)
        with open(script_file, 'w') as writer:
            writer.write(cmd + "\n")

        icef.add_log("Writing CMD to: {script_file}".
                     format(script_file=script_file))
    finally:
        # Close the log on the error path too, not only on success.
        icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def check_arrow_jobs_completion(self):
    """Check whether arrow jobs are completed.

    submitted_arrow_jobs.txt should have format like:
    <job_id> \t ./arrowed/c0to10.sh

    Returns:
      "DONE"    --- all jobs are done and all output files are there
      "FAILED"  --- some job is no longer running but its output fastq is
                    missing or empty; its script is written to
                    self.unfinished_arrow_sh_files for re-running
      "RUNNING" --- no failure seen, but some jobs are still running

    self.fq_filenames is set to all the finished fastq files.
    """
    self.add_log("Checking if arrow jobs are completed.")
    bad_sh = []
    self.fq_filenames = []
    self.add_log("Submitted arrow jobs are at {f}:".format(
        f=self.arrow_submission_run_file))

    # submitted maps each expected fastq file to
    # (SGE jobid or local, script file that is running)
    sge_jobids, submitted = self.list_of_expected_arrow_fq_files()

    still_running = False
    running_jids = set()
    # if one or more jobs were submitted through SGE,
    # go through qstat to see if anything is still running
    if len(sge_jobids):
        qstat_out = os.popen("qstat").read().strip()
        # Empty qstat output means no job is queued or running. Without
        # this guard the header asserts fail on the empty output.
        if qstat_out:
            stuff = qstat_out.split('\n')
            assert stuff[0].startswith('job-ID')
            assert stuff[1].startswith('-------')
            # first two lines are header
            for x in stuff[2:]:
                job_id = x.split()[0]
                running_jids.add(job_id)
                if job_id in sge_jobids:
                    self.add_log("job {0} is still running.".format(job_id))
                    still_running = True

    # now go through all the expected fastq files and check they exist
    for fq_filename, (job_id, sh_file) in submitted.items():
        if not nfs_exists(fq_filename) or \
                os.stat(fq_filename).st_size == 0:
            if job_id in running_jids:  # still running, pass
                still_running = True
            else:
                self.add_log(
                    "job {0} is completed but {1} is still empty!".format(
                        job_id, fq_filename))
                bad_sh.append(sh_file)
        else:
            self.add_log("job {0} is done".format(job_id))
            self.fq_filenames.append(fq_filename)

    # Any finished job without output means failure, whether or not other
    # jobs are still running; with nothing running this used to be "DONE".
    if bad_sh:
        # write the unfinished jobs to $unfinished_arrow_sh_files$
        with open(self.unfinished_arrow_sh_files, 'w') as f:
            f.write("\n".join(bad_sh) + '\n')
        self.add_log("{0} jobs were incomplete! "
                     "Please re-run all files listed in {1}.\n".format(
                         len(bad_sh), self.unfinished_arrow_sh_files))
        return "FAILED"
    if still_running:
        return "RUNNING"
    return "DONE"