def _validate_inputs(self, root_dir, N):
    """
    Check that every per-chunk pickle and its DONE marker exist.

    root_dir --- root directory handed to IceFiles
    N --- number of chunks; chunk indices 0 .. N-1 are checked

    Return (splitted_pickles, out_pickle), where splitted_pickles is the
    list of the N per-chunk pickle paths and out_pickle is
    IceFiles.nfl_all_pickle_fn.

    Raise ValueError naming every missing DONE file and pickle file.
    """
    icef = IceFiles(prog_name="ice_partial_merge",
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    splitted_pickles = [icef.nfl_pickle_i(i) for i in range(N)]

    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    dones = [icef.nfl_done_i(i) for i in range(N)]

    # Collect every missing input instead of keeping only the last one,
    # so a single failure report shows all chunks that need re-running.
    errors = ["DONE file {f} does not exist.".format(f=done)
              for done in dones if not nfs_exists(done)]
    errors.extend("Pickle file {f} does not exist.".format(f=pickle_fn)
                  for pickle_fn in splitted_pickles
                  if not nfs_exists(pickle_fn))

    if errors:
        raise ValueError("\n".join(errors))

    # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
    out_pickle = icef.nfl_all_pickle_fn
    return (splitted_pickles, out_pickle)
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts,
             hq_isoforms_fa=None, hq_isoforms_fq=None,
             lq_isoforms_fa=None, lq_isoforms_fq=None,
             fasta_fofn=None):
    """
    Set up the polish step and validate its inputs.

    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
               stored as realpath(nfl_fa)
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files
    ice_opts, sge_opts --- option objects, stored unchanged
    hq_isoforms_fa|fq --- polished, high quality consensus isoforms
                          in fasta|q
    lq_isoforms_fa|fq --- polished, low quality consensus isoforms
                          in fasta|q
    fasta_fofn --- forwarded to IceFiles.__init__
    """
    IceFiles.__init__(self,
                      prog_name="IcePolish",
                      root_dir=root_dir,
                      bas_fofn=bas_fofn,
                      ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)

    # Input reads; resolved first so a bad path fails before anything
    # else is stored.
    self.nfl_fa = realpath(nfl_fa)

    # Output isoform files (may be None).
    self.hq_isoforms_fa = hq_isoforms_fa
    self.hq_isoforms_fq = hq_isoforms_fq
    self.lq_isoforms_fa = lq_isoforms_fa
    self.lq_isoforms_fq = lq_isoforms_fq

    # Run-time options.
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts

    # Worker objects, not created here:
    # icep = IceAllPartials, iceq = IceQuiver, icepq = IcePostQuiver.
    self.icep = self.iceq = self.icepq = None

    self._nfl_splitted_fas = None

    self._validate_inputs()
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])

    root_dir --- root directory handed to IceFiles
    nfl_fa --- non-full-length reads fasta file; must exist
    N --- number of chunks to split nfl_fa into

    Side effect: creates IceFiles.nfl_dir via mkdir.
    Raise ValueError if nfl_fa does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)

    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError("The input non-full-length reads fasta file " +
                         "{f} does not exists. ".format(f=nfl_fa))

    num_reads = num_reads_in_fasta(nfl_fa)

    # Integer ceiling division. ceil(num_reads / N) floors first when `/`
    # is integer division (Python 2), making chunks too small so that the
    # reads do not fit into N chunks.
    reads_per_split = int(max(1, -(-num_reads // N)))

    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])

    root_dir --- root directory handed to IceFiles
    nfl_fa --- non-full-length reads fasta file; must exist
    N --- number of chunks to split nfl_fa into

    Side effect: creates IceFiles.nfl_dir via mkdir.
    Raise ValueError if nfl_fa does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)

    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(N)]

    mkdir(icef.nfl_dir)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError(("The input non-full-length reads fasta file " +
                          "{f} does not exists. ".format(f=nfl_fa)))

    num_reads = num_reads_in_fasta(nfl_fa)

    # Round up with pure integer arithmetic: under Python 2 integer
    # division, ceil(num_reads / N) truncates before ceil is applied and
    # under-sizes the chunks whenever num_reads is not a multiple of N.
    reads_per_split = int(max(1, (num_reads + N - 1) // N))

    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
    """
    Check inputs, write ice_partial.py i command to script_file
    and return
    (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
    for the i-th chunk of nfl reads.

    root_dir --- root directory handed to IceFiles
    i --- index of the nfl chunk
    ccs_fofn, blasr_nproc --- forwarded unchanged to self._cmd_str

    Raise ValueError if the chunk fasta, the consensus fasta or the
    suffix array file does not exist; only the first missing file, in
    that order, is reported.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fa
    ref_fasta = icef.final_consensus_fa
    sa_file = icef.final_consensus_sa

    # root_dir/output/map_noFL/input.split_{0:02d}.fa
    input_fasta = icef.nfl_fa_i(i)

    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)

    # root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)

    # root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # Check if inputs exist.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                  "fasta file {f} does not exist. ".format(f=input_fasta) +
                  "Please run ice_partial_split.py first.")
    elif not nfs_exists(ref_fasta):
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done.")
    elif not nfs_exists(sa_file):
        # The placeholder must be filled in; previously the literal
        # "{f}" was emitted instead of the missing path.
        errMsg = ("The suffix array of unpolished consensus isoforms " +
                  "(i.e., final_consensus_sa) {f} does not exist."
                  .format(f=sa_file))

    if errMsg:
        raise ValueError(errMsg)

    # Save cmd to script_file.
    cmd = self._cmd_str(root_dir=root_dir, i=i,
                        ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc)
    with open(script_file, 'w') as writer:
        writer.write(cmd + "\n")

    icef.add_log("Writing CMD to: {script_file}".
                 format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, sa_file, out_pickle, done_file)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc):
    """
    Verify the inputs of the i-th nfl chunk, save the ice_partial.py
    command for that chunk to its script file, and return
    (input_fasta, ref_fasta, ref_dazz, out_pickle, done_file).

    root_dir --- root directory handed to IceFiles
    i --- index of the nfl chunk
    ccs_fofn, blasr_nproc --- forwarded unchanged to self._cmd_str

    Raise ValueError for the first of chunk fasta, consensus fasta and
    dazz db (checked in that order) that does not exist.
    """
    ice_files = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                         root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fa and its dazz db
    consensus_fa = ice_files.final_consensus_fa
    dazz_db = ice_files.final_dazz_db

    # Per-chunk paths:
    #   root_dir/output/map_noFL/input.split_{0:02d}.fa
    #   root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle
    #   root_dir/output/map_noFL/input.split_{0:02d}.fa.partial_uc.pickle.DONE
    #   root_dir/scripts/input.split_{0:02d}.fa.partial_uc.sh
    chunk_fa = ice_files.nfl_fa_i(i)
    chunk_pickle = ice_files.nfl_pickle_i(i)
    chunk_done = ice_files.nfl_done_i(i)
    chunk_script = ice_files.nfl_script_i(i)

    # Fail on the first missing input.
    if not nfs_exists(chunk_fa):
        raise ValueError(
            "The {i}-th splitted non-full-length reads ".format(i=i) +
            "fasta file {f} does not exist. ".format(f=chunk_fa) +
            "Please run ice_partial_split.py first.")
    if not nfs_exists(consensus_fa):
        raise ValueError(
            "The unpolished consensus isoforms fasta file " +
            "{f} does not exist. ".format(f=consensus_fa) +
            "Please make sure ICE is successfully done.")
    if not nfs_exists(dazz_db):
        raise ValueError(
            "The dazz db " +
            "{f} does not exist. ".format(f=dazz_db) +
            "Please make sure it is already built.")

    # Persist the command line for this chunk.
    command = self._cmd_str(root_dir=root_dir,
                            i=i,
                            ccs_fofn=ccs_fofn,
                            blasr_nproc=blasr_nproc)
    with open(chunk_script, 'w') as script:
        script.write(command + "\n")

    log_line = "Writing CMD to: {script_file}".format(
        script_file=chunk_script)
    ice_files.add_log(log_line)
    ice_files.close_log()

    return (chunk_fa, consensus_fa, dazz_db, chunk_pickle, chunk_done)
def __init__(self, root_dir, fasta_filenames, ref_fasta,
             out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
    """
    Validate the inputs, create the nfl mapping directory and log the
    files this object will work on.

    root_dir --- ICE root output directory
    fasta_filenames --- a list of splitted nfl fasta files.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- a pickle file with all nfl fasta reads
    sge_opts --- params for SGE environment, including
        use_sge    : use SGE or not
        max_sge_jobs: maximum number of gcon jobs submitted
        unique_id  : unique qsub job id, important that this
                     DOES NOT CONFLICT!
        blasr_nproc: blasr -nproc param, number of threads per cpu.
        gcon_nproc : number of gcon that can run at the same time
    sa_file --- optional, passed through self._validateInputs
    ccs_fofn --- should be reads_of_insert.fofn or None
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

    # _validateInputs returns the four values in this order.
    validated = self._validateInputs(fasta_filenames=fasta_filenames,
                                     ref_fasta=ref_fasta,
                                     ccs_fofn=ccs_fofn,
                                     sa_file=sa_file)
    (self.fasta_filenames,
     self.ref_fasta,
     self.ccs_fofn,
     self.sa_file) = validated

    self.out_pickle = out_pickle
    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    fasta_list = ", ".join(self.fasta_filenames)
    self.add_log("input fasta files are: " + fasta_list)

    pickle_list = ", ".join(self.pickle_filenames)
    self.add_log("temp pickle files are: " + pickle_list)

    self.add_log("out pickle file is: " + self.out_pickle)
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts,
             fasta_fofn=None, nfl_reads_per_split=30000):
    """
    Set up the polish step and validate its inputs.

    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
               stored as realpath(nfl_fa)
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
    ice_opts, sge_opts --- option objects, stored unchanged
    ipq_opts --- IceQuiverHQLQOptions
        qv_trim_5: ignore QV of n bases in the 5' end
        qv_trim_3: ignore QV of n bases in the 3' end
        hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                to mark an isoform as high quality
        hq_isoforms_fa|fq: polished, high quality consensus
                           isoforms in fasta|q
        lq_isoforms_fa|fq: polished, low quality consensus
                           isoforms in fasta|q
    fasta_fofn --- forwarded to IceFiles.__init__
    nfl_reads_per_split --- stored unchanged; defaults to 30000
    """
    IceFiles.__init__(self,
                      prog_name="IcePolish",
                      root_dir=root_dir,
                      bas_fofn=bas_fofn,
                      ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)

    # Input reads; resolved first so a bad path fails before anything
    # else is stored.
    self.nfl_fa = realpath(nfl_fa)
    self.nfl_reads_per_split = nfl_reads_per_split

    # Option bundles.
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    # Worker objects, not created here:
    # icep = IceAllPartials, iceq = IceQuiver,
    # icepq = IceQuiverPostprocess.
    self.icep = self.iceq = self.icepq = None

    self._nfl_splitted_fas = None

    self.validate_inputs()
def __init__(self, root_dir, fastq_filenames, ref_fasta,
             out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
    """
    Validate the inputs, create the nfl mapping directory and log the
    files this object will work on.

    root_dir --- ICE root output directory
    fastq_filenames --- a list of splitted nfl fastq files.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- path of the output pickle file
    sge_opts --- params for SGE environment, including
        use_sge    : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id  : unique qsub job id, important that this
                     DOES NOT CONFLICT!
        blasr_nproc: blasr -nproc param, number of threads per cpu.
    sa_file --- optional, passed through self._validate_inputs
    ccs_fofn --- should be reads_of_insert.fofn or None
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

    self.add_log("DEBUG: in IceAllPartials, ccs_fofn is {0}.".format(ccs_fofn),
                 level=logging.INFO)

    self.fastq_filenames, self.ref_fasta, self.ccs_fofn, self.sa_file = \
        self._validate_inputs(fastq_filenames=fastq_filenames,
                              ref_fasta=ref_fasta,
                              ccs_fofn=ccs_fofn,
                              sa_file=sa_file)

    self.out_pickle = out_pickle
    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    # The inputs here are fastq files; the log line used to call them
    # fasta files.
    self.add_log("input fastq files are: " +
                 ", ".join(self.fastq_filenames))
    self.add_log("temp pickle files are: " +
                 ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts,
             fasta_fofn=None, nfl_reads_per_split=30000):
    """
    Set up the polish step, log the ece settings and validate inputs.

    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa;
               stored as realpath(nfl_fa)
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
    ice_opts --- option object; its ece_penalty and ece_min_len
                 attributes are logged here
    sge_opts --- option object, stored unchanged
    ipq_opts --- IceQuiverHQLQOptions
        qv_trim_5: ignore QV of n bases in the 5' end
        qv_trim_3: ignore QV of n bases in the 3' end
        hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                to mark an isoform as high quality
        hq_isoforms_fa|fq: polished, high quality consensus
                           isoforms in fasta|q
        lq_isoforms_fa|fq: polished, low quality consensus
                           isoforms in fasta|q
    fasta_fofn --- forwarded to IceFiles.__init__
    nfl_reads_per_split --- stored unchanged; defaults to 30000
    """
    IceFiles.__init__(self,
                      prog_name="IcePolish",
                      root_dir=root_dir,
                      bas_fofn=bas_fofn,
                      ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)

    # Input reads; resolved first so a bad path fails before anything
    # else is stored.
    self.nfl_fa = realpath(nfl_fa)
    self.nfl_reads_per_split = nfl_reads_per_split

    # Option bundles.
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    ece_summary = "ece_penalty: {0}, ece_min_len: {1}".format(
        self.ice_opts.ece_penalty, self.ice_opts.ece_min_len)
    self.add_log(ece_summary)

    # Worker objects, not created here:
    # icep = IceAllPartials, iceq = IceQuiver,
    # icepq = IceQuiverPostprocess.
    self.icep = self.iceq = self.icepq = None

    self._nfl_splitted_fas = None

    self.validate_inputs()