def combinePickles(self, pickle_filenames, out_pickle):
    """Combine all *.pickle files to one and dump to self.out_pickle."""
    self.add_log("Combining pickles: {ps} to a big pickle {p}".format(
                 ps=", ".join(pickle_filenames), p=out_pickle),
                 level=logging.INFO)

    if len(pickle_filenames) == 1:
        src = pickle_filenames[0]
        dst = out_pickle
        if realpath(src) != realpath(dst):
            self.add_log("Copying {src} to {dst}.".format(src=src, dst=dst))
            shutil.copyfile(src, dst)
        else:
            self.add_log("{dst} has been created, no need to merge.".
                         format(dst=out_pickle))
    else:
        # Combine all partial outputs
        self.add_log("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in pickle_filenames:
            self.add_log("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        self.add_log("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
def combine_nfl_pickles(splitted_pickles, out_pickle):
    """Combine splitted nfl pickles to a big pickle."""
    logging.debug("Combining {N} nfl pickles: {ps} ".
                  format(N=len(splitted_pickles),
                         ps=",".join(splitted_pickles)) +
                  " into a big pickle {p}.".format(p=out_pickle))

    if len(splitted_pickles) == 1:
        logging.debug("Copying the only given pickle to out_pickle.")
        if realpath(splitted_pickles[0]) != realpath(out_pickle):
            shutil.copyfile(splitted_pickles[0], out_pickle)
    else:
        # Combine all partial outputs
        logging.debug("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in splitted_pickles:
            logging.debug("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        logging.debug("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
        logging.debug("{f} created.".format(f=out_pickle))
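# A minimal usage sketch of combine_nfl_pickles; the filenames below are
# hypothetical and assume the inputs were written by earlier ice_partial
# runs. The combined pickle keeps the same {'partial_uc': dict,
# 'nohit': set} layout as each input.
def _example_combine_nfl_pickles():
    combine_nfl_pickles(["nfl.split.0.pickle", "nfl.split.1.pickle"],
                        "nfl.all.pickle")
    d = load(open("nfl.all.pickle"))
    # d['partial_uc'] maps a cluster id to the list of nfl read ids
    # assigned to it; d['nohit'] is the set of reads hitting no isoform.
    return d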
def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn,
                     fasta_fofn=None, quiver=False):
    """Validate input files and return absolute expanded paths."""
    flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
    self.add_log("Checking input files.", level=logging.INFO)
    if flnc_fa is None:
        raise ClusterException(
            "Input full-length non-chimeric reads "
            "file (i.e., flnc_fa) needs to be specified.")
    else:
        flnc_fa = realpath(flnc_fa)
        if not op.exists(flnc_fa):
            raise ClusterException("Unable to find full-length " +
                                   "non-chimeric reads: {fn}".format(fn=flnc_fa))

    if nfl_fa is None:
        if quiver is True:
            raise ClusterException(
                "Input non-full-length reads file "
                "(i.e., nfl_fa) needs to be specified for isoform polish.")
    else:
        nfl_fa = realpath(nfl_fa)
        if not op.exists(nfl_fa):
            raise ClusterException("Unable to find non-full-length " +
                                   "non-chimeric reads: {fn}".format(fn=nfl_fa))

    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise ClusterException("Unable to find CCS FOFN file: " +
                               "{fn}".format(fn=ccs_fofn))

    if fasta_fofn is not None and quiver:
        if not os.path.exists(fasta_fofn):
            raise ClusterException(
                "Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
        for line in open(fasta_fofn):
            if not os.path.exists(line.strip()):
                raise ClusterException(
                    "Unable to locate {0} in {1}".format(line.strip(),
                                                         fasta_fofn))
    return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
def _validate_inputs(self, _flnc_fa, _nfl_fa, _ccs_fofn,
                     fasta_fofn=None, quiver=False):
    """Validate input files and return absolute expanded paths."""
    flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
    self.add_log("Checking input files.", level=logging.INFO)
    if flnc_fa is None:
        raise ClusterTestException(
            "Input full-length non-chimeric reads "
            "file (i.e., flnc_fa) needs to be specified.")
    else:
        flnc_fa = realpath(flnc_fa)
        if not op.exists(flnc_fa):
            raise ClusterTestException("Unable to find full-length " +
                                       "non-chimeric reads: {fn}".format(fn=flnc_fa))

    if nfl_fa is None:
        if quiver is True:
            raise ClusterTestException(
                "Input non-full-length reads file "
                "(i.e., nfl_fa) needs to be specified for isoform polish.")
    else:
        nfl_fa = realpath(nfl_fa)
        if not op.exists(nfl_fa):
            raise ClusterTestException("Unable to find non-full-length " +
                                       "non-chimeric reads: {fn}".format(fn=nfl_fa))

    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise ClusterTestException("Unable to find CCS FOFN file: " +
                                   "{fn}".format(fn=ccs_fofn))

    if fasta_fofn is not None and quiver:
        if not os.path.exists(fasta_fofn):
            raise ClusterTestException(
                "Unable to find FASTA_FOFN file: {0}".format(fasta_fofn))
        for line in open(fasta_fofn):
            if not os.path.exists(line.strip()):
                raise ClusterTestException(
                    "Unable to locate {0} in {1}".format(line.strip(),
                                                         fasta_fofn))
    return (flnc_fa, nfl_fa, ccs_fofn, fasta_fofn)
def combinePickles(self, pickle_filenames, out_pickle):
    """Combine all *.pickle files to one and dump to self.out_pickle."""
    self.add_log("Combining pickles: {ps} to a big pickle {p}".format(
                 ps=", ".join(pickle_filenames), p=out_pickle),
                 level=logging.INFO)

    if len(pickle_filenames) == 1:
        src = pickle_filenames[0]
        dst = out_pickle
        if realpath(src) != realpath(dst):
            self.add_log("Copying {src} to {dst}.".format(src=src, dst=dst))
            shutil.copyfile(src, dst)
        else:
            self.add_log(
                "{dst} has been created, no need to merge.".format(
                    dst=out_pickle))
    else:
        # Combine all partial outputs
        self.add_log("Merging all pickles.")
        partial_uc = defaultdict(lambda: [])
        nohit = set()
        for pf in pickle_filenames:
            self.add_log("Merging {pf}.".format(pf=pf))
            a = load(open(pf))
            nohit.update(a['nohit'])
            for k, v in a['partial_uc'].iteritems():
                partial_uc[k] += v

        self.add_log("Dumping all to {f}".format(f=out_pickle))
        # Dump to one file
        partial_uc = dict(partial_uc)
        with open(out_pickle, 'w') as f:
            dump({'nohit': nohit, 'partial_uc': partial_uc}, f)
def __init__(self, root_dir, flnc_fa, nfl_fa,
             bas_fofn, ccs_fofn, out_fa,
             sge_opts, ice_opts, ipq_opts,
             report_fn=None, summary_fn=None,
             fasta_fofn=None, nfl_reads_per_split=30000):
    super(Cluster, self).__init__(prog_name="Cluster",
                                  root_dir=root_dir, bas_fofn=bas_fofn,
                                  ccs_fofn=ccs_fofn, fasta_fofn=fasta_fofn)

    self.sge_opts = sge_opts  # SGE, CPU arguments and etc
    self.ice_opts = ice_opts  # ICE clustering algorithm arguments
    self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments

    self.nfl_reads_per_split = int(nfl_reads_per_split)  # ToDo: set sanity check here?

    self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = \
        self._validate_inputs(flnc_fa, nfl_fa, ccs_fofn,
                              fasta_fofn=fasta_fofn,
                              quiver=self.ice_opts.quiver)
    self.add_log("DEBUG: self.fasta_fofn is {0}".format(self.fasta_fofn))

    self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

    self.sanity_check()

    self._probqv = None  # probability & quality value

    self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
    self._nflncSplittedFas = []   # split nfl_fa into smaller files.
    self._logConfigs()  # Log configurations

    self.iceinit = None
    self.icec = None
    self.iceq = None
    self.pol = None

    self.report_fn = realpath(report_fn) if report_fn is not None \
        else op.join(self.root_dir, "cluster_report.csv")
    self.summary_fn = realpath(summary_fn) if summary_fn is not None \
        else op.join(self.root_dir, "cluster_summary.txt")

    self.add_log("A Cluster Object created.", level=logging.INFO)
def __init__(
    self,
    root_dir,
    flnc_fa,
    nfl_fa,
    bas_fofn,
    ccs_fofn,
    out_fa,
    sge_opts,
    ice_opts,
    ipq_opts,
    report_fn=None,
    summary_fn=None,
    fasta_fofn=None,
    nfl_reads_per_split=30000,
):
    super(Cluster, self).__init__(
        prog_name="Cluster", root_dir=root_dir, bas_fofn=bas_fofn, ccs_fofn=ccs_fofn, fasta_fofn=fasta_fofn
    )

    self.sge_opts = sge_opts  # SGE, CPU arguments and etc
    self.ice_opts = ice_opts  # ICE clustering algorithm arguments
    self.ipq_opts = ipq_opts  # IceQuiver HQ/LQ isoform arguments

    self.nfl_reads_per_split = int(nfl_reads_per_split)  # ToDo: set sanity check here?

    self.flnc_fa, self.nfl_fa, self.ccs_fofn, self.fasta_fofn = self._validate_inputs(
        flnc_fa, nfl_fa, ccs_fofn, fasta_fofn=fasta_fofn, quiver=self.ice_opts.quiver
    )

    self.root_dir, self.out_fa = self._validate_outputs(root_dir, out_fa)

    self.sanity_check()

    self._probqv = None  # probability & quality value

    self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
    self._nflncSplittedFas = []  # split nfl_fa into smaller files.
    self._logConfigs()  # Log configurations

    self.iceinit = None
    self.icec = None
    self.iceq = None
    self.pol = None

    self.add_log(
        "Setting ece_penalty: {0} ece_min_len: {1}".format(ice_opts.ece_penalty, ice_opts.ece_min_len),
        level=logging.INFO,
    )

    self.report_fn = realpath(report_fn) if report_fn is not None else op.join(self.root_dir, "cluster_report.csv")
    self.summary_fn = (
        realpath(summary_fn) if summary_fn is not None else op.join(self.root_dir, "cluster_summary.txt")
    )

    self.add_log("A Cluster Object created.", level=logging.INFO)
def __init__(self, root_dir, flnc_fa, nfl_fa,
             bas_fofn, ccs_fofn, out_fa,
             sge_opts, ice_opts,
             hq_isoforms_fa=None, hq_isoforms_fq=None,
             lq_isoforms_fa=None, lq_isoforms_fq=None,
             report_fn=None, summary_fn=None):
    super(Cluster, self).__init__(prog_name="Cluster",
                                  root_dir=root_dir, bas_fofn=bas_fofn,
                                  ccs_fofn=ccs_fofn)

    self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
        flnc_fa, nfl_fa, ccs_fofn)

    self.root_dir, self.out_fa = self._validateOutputs(root_dir, out_fa)

    self.hq_isoforms_fa = hq_isoforms_fa
    self.hq_isoforms_fq = hq_isoforms_fq
    self.lq_isoforms_fa = lq_isoforms_fa
    self.lq_isoforms_fq = lq_isoforms_fq

    self.sge_opts = sge_opts  # SGE, CPU options and etc
    self.ice_opts = ice_opts  # The ICE clustering algorithm options

    self.sanity_check()

    self._probqv = None  # probability & quality value

    self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
    self._nflncSplittedFas = []   # split nfl_fa into smaller files.
    self._logConfigs()  # Log configurations

    self.iceinit = None
    self.icec = None
    self.iceq = None
    self.pol = None

    self.report_fn = realpath(report_fn) if report_fn is not None \
        else op.join(self.root_dir, "cluster_report.csv")
    self.summary_fn = realpath(summary_fn) if summary_fn is not None \
        else op.join(self.root_dir, "cluster_summary.txt")
    self.summary = ClusterSummary()

    self.add_log("Finished creating a Cluster object.", level=logging.INFO)
def _validateInputs(self, fasta_filenames, ref_fasta, ccs_fofn, sa_file):
    """Validate input files."""
    for f in fasta_filenames:
        if not op.exists(f):
            raise IOError("Input fasta {f} does not exist.".format(f=f))
    if ref_fasta is None or not op.exists(ref_fasta):
        raise IOError("Reference {r} does not exist.".format(r=ref_fasta))
    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise IOError("ccs_fofn file {ccs_fofn} does not exist.".format(
            ccs_fofn=ccs_fofn))
    if sa_file is not None and not op.exists(sa_file):
        raise IOError("suffix array {s} does not exist.".format(s=sa_file))
    # ccs_fofn may legitimately be None; guard it like sa_file so
    # realpath is never called on None.
    return ([realpath(f) for f in fasta_filenames],
            realpath(ref_fasta),
            realpath(ccs_fofn) if ccs_fofn is not None else None,
            realpath(sa_file) if sa_file is not None else None)
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"): """ Run daligner on gcon_in.fa, but don't care about results. Just make sure it runs. """ scriptDir = realpath(scriptDir) testDir = op.join(scriptDir, testDirName) if not op.exists(scriptDir): os.makedirs(scriptDir) if not op.exists(testDir): os.makedirs(testDir) testInFa = op.join(testDir, "gcon_in.fa") if op.exists(testInFa): os.remove(testInFa) shutil.copy(GCON_IN_FA, testInFa) assert(op.exists(testInFa)) obj = DazzIDHandler(testInFa) DalignerRunner.make_db(obj.dazz_filename) runner = DalignerRunner(testInFa, testInFa, is_FL=True, same_strand_only=True, \ query_converted=True, db_converted=True, query_made=True, \ db_made=True, use_sge=False, cpus=4, sge_opts=None) runner.runHPC(min_match_len=300, output_dir=testDir, sensitive_mode=False) shutil.rmtree(testDir) logging.info("daligner check passed.") return True
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts,
             hq_isoforms_fa=None, hq_isoforms_fq=None,
             lq_isoforms_fa=None, lq_isoforms_fq=None,
             fasta_fofn=None):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
    hq_isoforms_fa|fq --- polished, high quality consensus isoforms in fasta|q
    lq_isoforms_fa|fq --- polished, low quality consensus isoforms in fasta|q
    """
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)
    self.nfl_fa = realpath(nfl_fa)
    self.hq_isoforms_fa = hq_isoforms_fa
    self.hq_isoforms_fq = hq_isoforms_fq
    self.lq_isoforms_fa = lq_isoforms_fa
    self.lq_isoforms_fq = lq_isoforms_fq
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts

    self.icep = None   # IceAllPartials.
    self.iceq = None   # IceQuiver
    self.icepq = None  # IcePostQuiver
    self._nfl_splitted_fas = None

    self._validate_inputs()
def _validate_inputs(self, fastq_filenames, ref_fasta, ccs_fofn, sa_file):
    """Validate input files."""
    for f in fastq_filenames:
        if not op.exists(f):
            raise IOError("Input fastq {f} does not exist.".format(f=f))
    if ref_fasta is None or not op.exists(ref_fasta):
        raise IOError("Reference {r} does not exist.".format(r=ref_fasta))
    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise IOError("ccs_fofn file {ccs_fofn} does not exist.".format(
            ccs_fofn=ccs_fofn))
    if sa_file is not None and not op.exists(sa_file):
        raise IOError("suffix array {s} does not exist.".format(s=sa_file))
    # ccs_fofn may legitimately be None; guard it like sa_file so
    # realpath is never called on None.
    return ([realpath(f) for f in fastq_filenames],
            realpath(ref_fasta),
            realpath(ccs_fofn) if ccs_fofn is not None else None,
            realpath(sa_file) if sa_file is not None else None)
def __init__(self, root_dir, flnc_fa, nfl_fa,
             bas_fofn, ccs_fofn, out_fa,
             sge_opts, ice_opts,
             hq_isoforms_fa=None, hq_isoforms_fq=None,
             lq_isoforms_fa=None, lq_isoforms_fq=None,
             report_fn=None, summary_fn=None):
    super(Cluster, self).__init__(prog_name="Cluster",
                                  root_dir=root_dir, bas_fofn=bas_fofn,
                                  ccs_fofn=ccs_fofn)

    self.flnc_fa, self.nfl_fa, self.ccs_fofn = self._validateInputs(
        flnc_fa, nfl_fa, ccs_fofn)

    self.root_dir, self.out_fa = self._validateOutputs(
        root_dir, out_fa)

    self.hq_isoforms_fa = hq_isoforms_fa
    self.hq_isoforms_fq = hq_isoforms_fq
    self.lq_isoforms_fa = lq_isoforms_fa
    self.lq_isoforms_fq = lq_isoforms_fq

    self.sge_opts = sge_opts  # SGE, CPU options and etc
    self.ice_opts = ice_opts  # The ICE clustering algorithm options

    self.sanity_check()

    self._probqv = None  # probability & quality value

    self._flnc_splitted_fas = []  # split flnc_fa into smaller files.
    self._nflncSplittedFas = []   # split nfl_fa into smaller files.
    self._logConfigs()  # Log configurations

    self.iceinit = None
    self.icec = None
    self.iceq = None
    self.pol = None

    self.report_fn = realpath(report_fn) if report_fn is not None \
        else op.join(self.root_dir, "cluster_report.csv")
    self.summary_fn = realpath(summary_fn) if summary_fn is not None \
        else op.join(self.root_dir, "cluster_summary.txt")
    self.summary = ClusterSummary()

    self.add_log("Finished creating a Cluster object.", level=logging.INFO)
def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
    """Validate input files and return absolute expanded paths."""
    flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
    self.add_log("Checking input files.", level=logging.INFO)
    if flnc_fa is None or nfl_fa is None:
        raise ClusterException(
            "Input non-chimeric reads files need to be specified.")
    else:
        flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
    if not op.exists(flnc_fa):
        raise ClusterException("Unable to find full-length " +
                               "non-chimeric reads: {fn}".format(fn=flnc_fa))
    if not op.exists(nfl_fa):
        raise ClusterException("Unable to find non-full-length " +
                               "non-chimeric reads: {fn}".format(fn=nfl_fa))
    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise ClusterException("Unable to find FOFN file: " +
                               "{fn}".format(fn=ccs_fofn))
    return (flnc_fa, nfl_fa, ccs_fofn)
def run(self):
    """Assign nfl reads to consensus isoforms and merge the pickles."""
    # Call ice_partial.py to create a pickle for each splitted nfl fasta
    self.createPickles()
    # Wait for pickles to be created, if SGE is used.
    self.waitForPickles(pickle_filenames=self.pickle_filenames,
                        done_filenames=self.done_filenames)
    # Combine all pickles to a big pickle file: nfl_all_pickle_fn.
    self.combinePickles(pickle_filenames=self.pickle_filenames,
                        out_pickle=self.nfl_all_pickle_fn)
    # Create symbolic link if necessary
    if realpath(self.nfl_all_pickle_fn) != realpath(self.out_pickle):
        self.add_log("Creating a symbolic link for {f}".format(
                     f=self.out_pickle), level=logging.INFO)
        if op.exists(self.out_pickle):
            os.remove(self.out_pickle)
        os.symlink(self.nfl_all_pickle_fn, self.out_pickle)

    # Close log
    self.close_log()
def _validate_outputs(self, _root_dir, _out_fa):
    """Validate outputs, create root_dir if it does not exist."""
    self.add_log("Checking outputs.", level=logging.INFO)
    root_dir, out_fa = _root_dir, _out_fa
    if root_dir is None:
        self.add_log("Output directory needs to be specified.",
                     level=logging.ERROR)
    if out_fa is None:
        self.add_log("Output consensus fasta needs to be specified.",
                     level=logging.ERROR)
    root_dir = realpath(root_dir)
    out_fa = realpath(out_fa)
    if op.exists(root_dir):
        self.add_log("Output directory {d} already exists.".format(
                     d=root_dir))
    else:
        self.add_log("Creating output directory {d}.".format(d=root_dir))
        os.mkdir(root_dir)
    if op.exists(out_fa):
        raise ClusterException(
            "Consensus FASTA file {f} already exists.".format(f=out_fa))
    return root_dir, out_fa
def run(self):
    """Assign nfl reads to consensus isoforms and merge the pickles."""
    # Call ice_partial.py to create a pickle for each splitted nfl fasta
    self.createPickles()
    # Wait for pickles to be created, if SGE is used.
    self.waitForPickles(pickle_filenames=self.pickle_filenames,
                        done_filenames=self.done_filenames)
    # Combine all pickles to a big pickle file: nfl_all_pickle_fn.
    self.combinePickles(pickle_filenames=self.pickle_filenames,
                        out_pickle=self.nfl_all_pickle_fn)
    # Create symbolic link if necessary
    if realpath(self.nfl_all_pickle_fn) != realpath(self.out_pickle):
        self.add_log(
            "Creating a symbolic link for {f}".format(f=self.out_pickle),
            level=logging.INFO)
        if op.exists(self.out_pickle):
            os.remove(self.out_pickle)
        os.symlink(self.nfl_all_pickle_fn, self.out_pickle)

    # Close log
    self.close_log()
def _validateInputs(self, _flnc_fa, _nfl_fa, _ccs_fofn):
    """Validate input files and return absolute expanded paths."""
    flnc_fa, nfl_fa, ccs_fofn = _flnc_fa, _nfl_fa, _ccs_fofn
    self.add_log("Checking input files.", level=logging.INFO)
    if flnc_fa is None or nfl_fa is None:
        raise ClusterException(
            "Input non-chimeric reads files need to be specified.")
    else:
        flnc_fa, nfl_fa = realpath(flnc_fa), realpath(nfl_fa)
    if not op.exists(flnc_fa):
        raise ClusterException("Unable to find full-length " +
                               "non-chimeric reads: {fn}".format(
                                   fn=flnc_fa))
    if not op.exists(nfl_fa):
        raise ClusterException("Unable to find non-full-length " +
                               "non-chimeric reads: {fn}".format(
                                   fn=nfl_fa))
    if ccs_fofn is not None and not op.exists(ccs_fofn):
        raise ClusterException("Unable to find FOFN file: " +
                               "{fn}".format(fn=ccs_fofn))
    return (flnc_fa, nfl_fa, ccs_fofn)
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"): """Sanity check if sge can work.""" scriptDir = realpath(scriptDir) testDir = op.join(scriptDir, testDirName) if not op.exists(scriptDir): os.makedirs(scriptDir) if not op.exists(testDir): os.makedirs(testDir) testSh = op.join(scriptDir, 'test.sh') consensusFa = op.join(testDir, "g_consensus.fasta") testInFa = op.join(testDir, "gcon_in.fa") if op.exists(testInFa): os.remove(testInFa) shutil.copy(GCON_IN_FA, testInFa) assert(op.exists(testInFa)) with open(testSh, 'w') as f: f.write("#!/bin/bash\n") f.write("{gcon}".format(gcon=gcon_py) + " {inFa} ".format(inFa=real_upath(testInFa)) + " {testDir}/g_consensus".format(testDir=real_upath(testDir)) + " c1\n") assert(op.exists(testSh)) cmd = "qsub" if sge_opts.sge_queue is not None: cmd += " -q " + sge_opts.sge_queue cmd += " -sync y -pe {env} 1 -cwd -S /bin/bash -V -e /dev/null -o /dev/null {t}".\ format(t=real_upath(testSh), env=sge_opts.sge_env_name) logging.info("Submitting cmd: " + cmd) _out, _code, _msg = backticks(cmd) # answer = FastaReader(GCON_OUT_FA).__iter__().next() # tester = FastaReader(consensusFa).__iter__().next() # # if answer.name != tester.name or \ # answer.sequence != tester.sequence: if not filecmp.cmp(consensusFa, GCON_OUT_FA): errMsg = "Trouble running qsub or output is not as " + \ "expected ({0} and {1} must agree). Abort!".format( consensusFa, GCON_OUT_FA) logging.error(errMsg) return False else: shutil.rmtree(testDir) logging.info("sge and gcon check passed.") return True
def _validate_outputs(self, _root_dir, _out_fa):
    """Validate outputs, create root_dir if it does not exist."""
    self.add_log("Checking outputs.", level=logging.INFO)
    root_dir, out_fa = _root_dir, _out_fa
    if root_dir is None:
        self.add_log("Output directory needs to be specified.",
                     level=logging.ERROR)
    if out_fa is None:
        self.add_log("Output consensus fasta needs to be specified.",
                     level=logging.ERROR)
    root_dir = realpath(root_dir)
    out_fa = realpath(out_fa)
    if op.exists(root_dir):
        self.add_log(
            "Output directory {d} already exists.".format(d=root_dir))
    else:
        self.add_log("Creating output directory {d}.".format(d=root_dir))
        os.mkdir(root_dir)
    if op.exists(out_fa):
        raise ClusterException(
            "Consensus FASTA file {f} already exists.".format(f=out_fa))
    return root_dir, out_fa
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts, fasta_fofn=None,
             nfl_reads_per_split=30000):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
    ipq_opts --- IceQuiverHQLQOptions
                 qv_trim_5: ignore QV of n bases in the 5' end
                 qv_trim_3: ignore QV of n bases in the 3' end
                 hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                         to mark an isoform as high quality
                 hq_isoforms_fa|fq: polished, high quality consensus
                                    isoforms in fasta|q
                 lq_isoforms_fa|fq: polished, low quality consensus
                                    isoforms in fasta|q
    """
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)
    self.nfl_fa = realpath(nfl_fa)
    self.nfl_reads_per_split = nfl_reads_per_split
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    self.icep = None   # IceAllPartials.
    self.iceq = None   # IceQuiver
    self.icepq = None  # IceQuiverPostprocess
    self._nfl_splitted_fas = None

    self.validate_inputs()
def __init__(self, prog_name, root_dir,
             bas_fofn=None, ccs_fofn=None, fasta_fofn=None):
    """
    prog_name --- name of a sub-class
    root_dir --- root directory of the whole project. There will be
                 sub-directories under it, including:
                 tmp/ --- 0/ c0, c1, ..., c9999
                      --- 1/ c10000, c10001, ..., c19999
                      ...
                      each c? folder contains data for a cluster id=c?
                 script/ --- 0/ gcon_job_?.sh, gcon jobs in the first iteration
                         --- 1/ gcon_job_?.sh, gcon jobs in the second iteration
                         ...
                 log/ --- ICE.log  Log of the ICE algorithm
                      --- 0/ log for jobs in the first iteration
                      ...
                 output/ output files go here.
    bas_fofn --- input.fofn which contains movie.bas|bax.h5 files.
    ccs_fofn --- a fofn containing movie.ccs.h5 files.
    fasta_fofn --- a fofn containing movie.bax.h5.fasta files.
    """
    self.prog_name = str(prog_name)
    self.root_dir = realpath(root_dir)
    self.bas_fofn = bas_fofn
    self.ccs_fofn = ccs_fofn
    self.fasta_fofn = fasta_fofn

    mkdir(self.root_dir)
    mkdir(self.tmp_dir)
    mkdir(self.log_dir)
    mkdir(self.script_dir)
    mkdir(self.out_dir)

    self.log_f = open(self.log_fn, 'w', 0)
    self.add_log(msg="{p} initiated".format(p=self.prog_name))
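# Sketch (assumed, not from the source) of how a sub-class plugs into
# IceFiles: the base __init__ above creates tmp/, log/, script/ and
# output/ under root_dir and opens the unbuffered log file, so a
# sub-class only supplies its prog_name and then logs through add_log.
class _ExampleIceStep(IceFiles):
    def __init__(self, root_dir):
        IceFiles.__init__(self, prog_name="ExampleIceStep",
                          root_dir=root_dir)
        self.add_log("root dir layout ready under {d}".format(
                     d=self.root_dir))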
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts, fasta_fofn=None,
             nfl_reads_per_split=30000):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fa
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. reads_of_insert.fofn of ccs files.
    ipq_opts --- IceQuiverHQLQOptions
                 qv_trim_5: ignore QV of n bases in the 5' end
                 qv_trim_3: ignore QV of n bases in the 3' end
                 hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                         to mark an isoform as high quality
                 hq_isoforms_fa|fq: polished, high quality consensus
                                    isoforms in fasta|q
                 lq_isoforms_fa|fq: polished, low quality consensus
                                    isoforms in fasta|q
    """
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn)
    #self.add_log("DEBUG: in Polish ccs_fofn is {0}".format(self.ccs_fofn))
    #self.add_log("DEBUG: in Polish fasta_fofn is {0}".format(self.fasta_fofn))
    #self.add_log("DEBUG: in Polish bas_fofn is {0}".format(self.bas_fofn))
    self.nfl_fa = realpath(nfl_fa)
    self.nfl_reads_per_split = nfl_reads_per_split
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
                 self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

    self.icep = None   # IceAllPartials.
    self.iceq = None   # IceQuiver
    self.icepq = None  # IceQuiverPostprocess
    self._nfl_splitted_fas = None

    self.validate_inputs()
def __init__(self, reads_fn="test.fa", out_dir="output/", out_reads_fn="testout.fa", primer_fn=None, primer_report_fn=None, summary_fn=None, cpus=1, change_read_id=True, opts=ChimeraDetectionOptions(50, 10, 100, 50, 100, False), out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False, reuse_dom=False): self.reads_fn = realpath(reads_fn) self.out_dir = realpath(out_dir) self.cpus = cpus self.change_read_id = change_read_id self.chimera_detection_opts = opts self.ignore_polyA = ignore_polyA self.reuse_dom = reuse_dom # The input primer file: primers.fa self.primer_fn = primer_fn if primer_fn is not None else \ op.join(self.data_dir, PRIMERFN) # The output fasta file. self.out_all_reads_fn = realpath(out_reads_fn) # Intermediate output fasta file before chimera detection. # trimmed full-length reads: fl.trimmed.fasta # and # trimmed non-full-length reads: nfl.trimmed.fasta self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta") self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta") self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN) self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN) # The output primer file: primer_info.csv self.primer_report_fn = primer_report_fn \ if primer_report_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN # primer reports for nfl reads before chimera detection. Note that # chimera detection is not necessary for nfl reads. self._primer_report_nfl_fn = op.join(self.out_dir, "primer_report.nfl.csv") # primer reports for fl reads after chimera detection. Note that # chimera detection is required for fl reads. self._primer_report_fl_fn = op.join(self.out_dir, "primer_report.fl.csv") # The matrix file: PBMATRIX.txt self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN) # The output phmmer Dom file for trimming primers: hmmer.front_end.dom self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN) # The output phmmer Dom file for chimera detection: # hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN) self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN) self.chunked_front_back_reads_fns = None self.chunked_front_back_dom_fns = None #self.chunked_trimmed_reads_fns = None #self.chunked_trimmed_reads_dom_fns = None # The summary file: *.classify_summary.txt self.summary = ClassifySummary() self.summary_fn = summary_fn if summary_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + \ "." + CLASSIFYSUMMARY self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \ else op.join(self.out_dir, "nfl.fasta") self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta") self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta") self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \ else op.join(self.out_dir, "flnc.fasta") self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
def tofu_wrap_main():
    parser = argparse.ArgumentParser(prog='tofu_wrap')
    add_cluster_arguments(parser, show_sge_env_name=True, show_sge_queue=True)

    parser.add_argument("--bin_size_kb", default=1, type=int,
                        help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None,
                        help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true",
                        help="Instead of binning by size, bin by primer "
                             "(overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int,
                        help="Maximum number of bases per partitioned bin, "
                             "in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19",
                        help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db",
                        default="/home/UNIXHOME/etseng/share/gmap_db_new/",
                        help="GMAP DB location "
                             "(default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None,
                        help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("--max_fuzzy_junction", default=5, type=int,
                        help="Max fuzzy junction (default: 5 bp)")
    parser.add_argument("--version", action='version',
                        version='%(prog)s ' + str(get_version()))
    args = parser.parse_args()

    # PRINT VERSION AND EXIT
    # if args.version:
    #     print >> sys.stderr, get_version()
    #     sys.exit(0)

    # DEBUG
    if args.mem_debug:
        from memory_profiler import memory_usage

    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    tofu_prefix = binascii.b2a_hex(os.urandom(3)) \
        if args.output_seqid_prefix is None else args.output_seqid_prefix

    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                    qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
                                    hq_isoforms_fa=args.hq_isoforms_fa,
                                    hq_isoforms_fq=args.hq_isoforms_fq,
                                    lq_isoforms_fa=args.lq_isoforms_fa,
                                    lq_isoforms_fq=args.lq_isoforms_fq)

    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(
        args.qv_trim_5, args.qv_trim_3, args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa,
                                         os.path.abspath(args.root_dir))
    else:
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir,
                                       bin_size_kb=args.bin_size_kb,
                                       bin_manual=bin_manual,
                                       max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(
        args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                              out_filename=args.fasta_fofn,
                              fasta_out_dir=nfl_dir,
                              cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        for line in open(args.fasta_fofn):
            fn = line.strip()
            if len(fn) > 0 and not os.path.exists(fn):
                raise Exception("File {0} does not exist in {1}".format(
                    fn, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.abspath(os.path.dirname(cur_file))
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)

        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()
        obj = Cluster(root_dir=cur_dir,
                      flnc_fa=cur_file,
                      nfl_fa=realpath(args.nfl_fa),
                      bas_fofn=realpath(args.bas_fofn),
                      ccs_fofn=realpath(args.ccs_fofn),
                      fasta_fofn=realpath(args.fasta_fofn),
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts,
                      report_fn=args.report_fn,
                      summary_fn=args.summary_fn,
                      nfl_reads_per_split=args.nfl_reads_per_split)
        # DEBUG
        if args.mem_debug:
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    cur_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)

    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
        combine_quiver_results(split_dirs, combined_dir,
                               quiver_hq_filename, quiver_lq_filename,
                               tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined',
                           'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)

    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db,
                                          args.gmap_name,
                                          cpus=args.blasr_nproc,
                                          max_fuzzy_junction=args.max_fuzzy_junction,
                                          dun_merge_5_shorter=True)

    # (6) make abundance
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)

    # (7) run filtering & removing subsets in no5merge
    if args.targeted_isoseq:
        run_filtering_by_count(collapse_prefix_hq,
                               collapse_prefix_hq + '.min_fl_5', min_count=5)
        run_filtering_away_subsets(collapse_prefix_hq + '.min_fl_5',
                                   collapse_prefix_hq + '.min_fl_5.filtered',
                                   args.max_fuzzy_junction)
    else:
        run_filtering_by_count(collapse_prefix_hq,
                               collapse_prefix_hq + '.min_fl_2', min_count=2)
        run_filtering_away_subsets(collapse_prefix_hq + '.min_fl_2',
                                   collapse_prefix_hq + '.min_fl_2.filtered',
                                   args.max_fuzzy_junction)
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None, use_finer_qv=False,
                                   cpus=24, no_qv_or_aln_checking=True):
    """
    Given an input_fastq file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using DALIGNER, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fastq = realpath(input_fastq)
    input_fasta = input_fastq[:input_fastq.rfind('.')] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
        get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False,
                            same_strand_only=False,
                            query_converted=True, db_converted=True,
                            query_made=False, db_made=True,
                            use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=_low, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # if ccs_fofn is None:
        #     logging.info("Loading probability from model (0.01,0.07,0.06)")
        #     probqv = ProbFromModel(.01, .07, .06)
        # else:
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(
            fq=input_fastq, s=time.time() - start_t))
        # --------- comment out below since we are just using FASTQ / BAM
        # if use_finer_qv:
        #     probqv = ProbFromQV(input_fofn=ccs_fofn,
        #                         fasta_filename=input_fasta)
        #     logging.info("Loading QVs from {i} + {f} took {s} secs".format(
        #         f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        # else:
        #     input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
        #     logging.info("Converting {i} + {f} --> {fq}".format(
        #         i=input_fasta, f=ccs_fofn, fq=input_fastq))
        #     ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
        #     probqv = ProbFromFastq(input_fastq)
        #     logging.info("Loading QVs from {fq} took {s} secs".format(
        #         fq=input_fastq, s=time.time() - start_t))
        #     print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(
        #         fq=input_fastq, s=time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False,
                                      sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1,
                                      ece_min_len=_ece_min_len,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking,
                                      max_missed_start=_ignore5,
                                      max_missed_end=_ignore3)

        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for fn in las_filenames:
        os.remove(fn)
    for fn in las_out_filenames:
        os.remove(fn)
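# Sketch of consuming the pickle written above; the pickle filename is
# hypothetical, but the {'partial_uc', 'nohit'} layout matches what the
# function dumps.
def _example_read_partial_uc(pickle_fn="nfl.split.0.pickle"):
    d = load(open(pickle_fn))
    for cid, read_ids in d['partial_uc'].iteritems():
        logging.debug("cluster {0}: {1} nfl reads".format(cid, len(read_ids)))
    return d['nohit']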
def tofu_wrap_main():
    parser = argparse.ArgumentParser(prog='tofu_wrap')
    add_cluster_arguments(parser, show_sge_env_name=True, show_sge_queue=True)

    parser.add_argument("--bin_size_kb", default=1, type=int,
                        help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None,
                        help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true",
                        help="Instead of binning by size, bin by primer "
                             "(overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int,
                        help="Maximum number of bases per partitioned bin, "
                             "in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19",
                        help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db",
                        default="/home/UNIXHOME/etseng/share/gmap_db_new/",
                        help="GMAP DB location "
                             "(default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None,
                        help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("--max_fuzzy_junction", default=5, type=int,
                        help="Max fuzzy junction (default: 5 bp)")
    parser.add_argument("--version", action='version',
                        version='%(prog)s ' + str(get_version()))
    args = parser.parse_args()

    # PRINT VERSION AND EXIT
    # if args.version:
    #     print >> sys.stderr, get_version()
    #     sys.exit(0)

    # DEBUG
    if args.mem_debug:
        from memory_profiler import memory_usage

    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    tofu_prefix = binascii.b2a_hex(os.urandom(3)) \
        if args.output_seqid_prefix is None else args.output_seqid_prefix

    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                    qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
                                    hq_isoforms_fa=args.hq_isoforms_fa,
                                    hq_isoforms_fq=args.hq_isoforms_fq,
                                    lq_isoforms_fa=args.lq_isoforms_fa,
                                    lq_isoforms_fq=args.lq_isoforms_fq)

    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(
        args.qv_trim_5, args.qv_trim_3, args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa,
                                         os.path.abspath(args.root_dir))
    else:
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir,
                                       bin_size_kb=args.bin_size_kb,
                                       bin_manual=bin_manual,
                                       max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(
        args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                              out_filename=args.fasta_fofn,
                              fasta_out_dir=nfl_dir,
                              cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        for line in open(args.fasta_fofn):
            fn = line.strip()
            if len(fn) > 0 and not os.path.exists(fn):
                raise Exception("File {0} does not exist in {1}".format(
                    fn, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.abspath(os.path.dirname(cur_file))
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)

        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()
        obj = Cluster(root_dir=cur_dir,
                      flnc_fa=cur_file,
                      nfl_fa=realpath(args.nfl_fa),
                      bas_fofn=realpath(args.bas_fofn),
                      ccs_fofn=realpath(args.ccs_fofn),
                      fasta_fofn=realpath(args.fasta_fofn),
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts,
                      report_fn=args.report_fn,
                      summary_fn=args.summary_fn,
                      nfl_reads_per_split=args.nfl_reads_per_split)
        # DEBUG
        if args.mem_debug:
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    cur_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)

    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
        combine_quiver_results(split_dirs, combined_dir,
                               quiver_hq_filename, quiver_lq_filename,
                               tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined',
                           'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)

    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db,
                                          args.gmap_name,
                                          cpus=args.blasr_nproc,
                                          max_fuzzy_junction=args.max_fuzzy_junction)

    # (6) make abundance
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)

    # (7) run filtering
    run_filtering_by_count(collapse_prefix_hq,
                           collapse_prefix_hq + '.min_fl_2', min_count=2)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using BLASR, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        # Load QVs first, then log the elapsed time.
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
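# Minimal sketch of a direct call (filenames hypothetical). With
# ccs_fofn=None the function falls back to the fixed error model
# ProbFromModel(.01, .07, .06) instead of real QVs.
def _example_build_uc():
    build_uc_from_partial(input_fasta="nfl.split.0.fasta",
                          ref_fasta="all_consensus.fasta",
                          out_pickle="nfl.split.0.pickle",
                          ccs_fofn=None,
                          blasr_nproc=8)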
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None, use_finer_qv=False,
                                   cpus=24, no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads
    to consensus isoforms using DALIGNER, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = \
        get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False,
                            same_strand_only=False,
                            query_converted=True, db_converted=True,
                            query_made=False, db_made=True,
                            use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=300, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                    f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting {i} + {f} --> {fq}".format(
                    i=input_fasta, f=ccs_fofn, fq=input_fastq))
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from {fq} took {s} secs".format(
                    fq=input_fastq, s=time.time() - start_t))
                print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(
                    fq=input_fastq, s=time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False,
                                      sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1,
                                      ece_min_len=20,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking)

        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for fn in las_filenames:
        os.remove(fn)
    for fn in las_out_filenames:
        os.remove(fn)
def __init__(self, reads_fn="test.fa", out_dir="output/", out_reads_fn="testout.fa", primer_fn=None, primer_report_fn=None, summary_fn=None, cpus=1, change_read_id=True, opts=ChimeraDetectionOptions(50, 10, 100, 50, 100), out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False): self.reads_fn = realpath(reads_fn) self.out_dir = realpath(out_dir) self.cpus = cpus self.change_read_id = change_read_id self.chimera_detection_opts = opts self.ignore_polyA = ignore_polyA # The input primer file: primers.fa self.primer_fn = primer_fn if primer_fn is not None else \ op.join(self.data_dir, PRIMERFN) # The output fasta file. self.out_all_reads_fn = realpath(out_reads_fn) # Intermediate output fasta file before chimera detection. # trimmed full-length reads before chimera detection self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta") self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN) self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN) # The output primer file: primer_info.csv self.primer_report_fn = primer_report_fn \ if primer_report_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN # primer reports for nfl reads before chimera detection. Note that # chimera detection is not necessary for nfl reads. self._primer_report_nfl_fn = op.join(self.out_dir, "primer_report.nfl.csv") # primer reports for fl reads after chimera detection. Note that # chimera detection is required for fl reads. self._primer_report_fl_fn = op.join(self.out_dir, "primer_report.fl.csv") # The matrix file: PBMATRIX.txt self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN) # The output phmmer Dom file: hmmer.front_end.dom and hmmer.chimera.dom self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN) self.out_trimmed_reads_dom_fn = op.join(self.out_dir, CHIMERADOMFN) self.chunked_front_back_reads_fns = None self.chunked_front_back_dom_fns = None self.chunked_trimmed_reads_fns = None self.chunked_trimmed_reads_dom_fns = None # The summary file: *.classify_summary.txt self.summary = ClassifySummary() self.summary_fn = summary_fn if summary_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + \ "." + CLASSIFYSUMMARY self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \ else op.join(self.out_dir, "nfl.fasta") self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \ else op.join(self.out_dir, "flnc.fasta") self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12):
    """Align consensus isoforms in ref_fasta and reads in input_fasta,
    and save mappings between isoforms and reads to out_pickle.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=input_fasta) + \
          "{r} -bestn 5 ".format(r=ref_fasta) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 -out {o} ".format(o=m5_file)
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=sa_file)

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        logging.info("Loading probability from QV in {f}".format(f=ccs_fofn))
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = []
            partial_uc[h.cID].append(h.qID)
            seen.add(h.qID)

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
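# For reference, the BLASR command assembled above expands to roughly the
# following (paths hypothetical):
#
#   blasr nfl.split.0.fasta all_consensus.fasta -bestn 5 -nproc 12 -m 5 \
#         -maxScore -1000 -minPctIdentity 85 -out nfl.split.0.fasta.blasr \
#         -sa nfl.split.0.fasta.sa
#
# The -sa argument is only appended when a suffix array file exists next
# to the input fasta (or was passed in explicitly).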
def __init__(self, reads_fn="ccs.fasta", out_dir="classifyOut/", out_reads_fn="isoseq_draft.fasta", primer_fn_forward=None, primer_fn_reverse=None, primer_report_fn=None, summary_fn=None, cpus=1, change_read_id=True, opts=ChimeraDetectionOptions(50, 10, 100, 50, 150, False), out_nfl_fn=None, out_flnc_fn=None, ignore_polyA=False, keep_primer=False, reuse_dom=False): self.reads_fn = realpath(reads_fn) self.out_dir = realpath(out_dir) self.cpus = cpus self.change_read_id = change_read_id self.chimera_detection_opts = opts self.ignore_polyA = ignore_polyA self.keep_primer = keep_primer # if True, primers are not removed (useful for targeted) self.reuse_dom = reuse_dom # for now, the barcoded primer files must be given! assert primer_fn_forward is not None assert primer_fn_reverse is not None self.primer_fn_forward = primer_fn_forward self.primer_fn_reverse = primer_fn_reverse # The output fasta file. self.out_all_reads_fn = realpath(out_reads_fn) # Intermediate output fasta file before chimera detection. # trimmed full-length reads: fl.trimmed.fasta # and # trimmed non-full-length reads: nfl.trimmed.fasta self._trimmed_fl_reads_fn = op.join(self.out_dir, "fl.trimmed.fasta") self._trimmed_nfl_reads_fn = op.join(self.out_dir, "nfl.trimmed.fasta") self.primer_front_back_fn = op.join(self.out_dir, PRIMERFRONTENDFN) self.primer_chimera_fn = op.join(self.out_dir, PRIMERCHIMERAFN) # The output primer file: primer_info.csv self.primer_report_fn = primer_report_fn \ if primer_report_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + "." + PRIMERREPORTFN # primer reports for nfl reads before chimera detection. Note that # chimera detection is not necessary for nfl reads. self._primer_report_nfl_fn = op.join(self.out_dir, "primer_report.nfl.csv") # primer reports for fl reads after chimera detection. Note that # chimera detection is required for fl reads. self._primer_report_fl_fn = op.join(self.out_dir, "primer_report.fl.csv") # The matrix file: PBMATRIX.txt self.pbmatrix_fn = op.join(self.data_dir, PBMATRIXFN) # The output phmmer Dom file for trimming primers: hmmer.front_end.dom self.out_front_back_dom_fn = op.join(self.out_dir, FRONTENDDOMFN) # The output phmmer Dom file for chimera detection: # hmmer.fl.chimera.dom and hmmer.nfl.chimera.dom self.out_trimmed_fl_dom_fn = op.join(self.out_dir, FLCHIMERADOMFN) self.out_trimmed_nfl_dom_fn = op.join(self.out_dir, NFLCHIMERADOMFN) self.chunked_front_back_reads_fns = None self.chunked_front_back_dom_fns = None #self.chunked_trimmed_reads_fns = None #self.chunked_trimmed_reads_dom_fns = None # The summary file: *.classify_summary.txt self.summary = ClassifySummary() self.summary_fn = summary_fn if summary_fn is not None else \ ".".join(out_reads_fn.split('.')[:-1]) + \ "." + CLASSIFYSUMMARY self.out_nfl_fn = realpath(out_nfl_fn) if out_nfl_fn is not None \ else op.join(self.out_dir, "nfl.fasta") self.out_nflnc_fn = op.join(self.out_dir, "nflnc.fasta") self.out_nflc_fn = op.join(self.out_dir, "nflc.fasta") self.out_flnc_fn = realpath(out_flnc_fn) if out_flnc_fn is not None \ else op.join(self.out_dir, "flnc.fasta") self.out_flc_fn = op.join(self.out_dir, "flc.fasta")
def build_uc_from_partial(
    input_fasta,
    ref_fasta,
    out_pickle,
    sa_file=None,
    ccs_fofn=None,
    done_filename=None,
    blasr_nproc=12,
):
    """Align consensus isoforms in ref_fasta and reads in input_fasta,
    and save mappings between isoforms and reads to out_pickle.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = (
        "blasr {i} ".format(i=input_fasta)
        + "{r} -bestn 5 ".format(r=ref_fasta)
        + "-nproc {n} -m 5 ".format(n=blasr_nproc)
        + "-maxScore -1000 -minPctIdentity 85 -out {o} ".format(o=m5_file)
    )
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=sa_file)

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(0.01, 0.07, 0.06)
    else:
        logging.info("Loading probability from QV in {f}".format(f=ccs_fofn))
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(
        output_filename=m5_file,
        is_FL=False,
        sID_starts_with_c=True,
        qver_get_func=probqv.get_smoothed,
        ece_penalty=1,
        ece_min_len=10,
        same_strand_only=False,
    )

    # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform.
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = []
            partial_uc[h.cID].append(h.qID)
            seen.add(h.qID)

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, "w") as f:
        dump({"partial_uc": partial_uc, "nohit": nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None else out_pickle + ".DONE"
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
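# The ".DONE" sentinel touched at the end of build_uc_from_partial lets a
# driver process poll for completion of a chunk. A hypothetical helper
# sketching that pattern (wait_for_done is not part of the original module):
import os.path as op
import time

def wait_for_done(out_pickle, timeout=3600, poll=10):
    """Block until out_pickle + '.DONE' appears, or raise after timeout secs."""
    sentinel = out_pickle + ".DONE"
    waited = 0
    while not op.exists(sentinel):
        if waited >= timeout:
            raise RuntimeError("Timed out waiting for {0}".format(sentinel))
        time.sleep(poll)
        waited += poll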
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads
    to consensus isoforms using BLASR, and then build up a mapping
    between consensus isoforms and reads (i.e., assign reads to
    isoforms). Finally, save
        {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            # Load per-QV-type values directly from ccs_fofn (slower).
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {f} + {i} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            # Convert FASTA + QVs in ccs_fofn to FASTQ, then load
            # single-value QVs from the FASTQ (faster).
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    # Maps each isoform (cluster) id to the set of reads which can map
    # to the isoform; converted to lists before pickling.
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    # All query read ids, whether or not they hit an isoform.
    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
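# A hedged invocation sketch for the variant above; all paths are
# illustrative, not taken from this module.
def _example_build_uc():
    # With use_finer_qv=False (the default), QVs come from a FASTQ that
    # ice_fa2fq produces from input_fasta + ccs_fofn; with
    # use_finer_qv=True, per-QV-type values are loaded directly via
    # ProbFromQV, which is slower.
    build_uc_from_partial(input_fasta="nfl.fasta",
                          ref_fasta="consensus_isoforms.fasta",
                          out_pickle="nfl.partial_uc.pickle",
                          ccs_fofn="ccs.fofn",
                          blasr_nproc=8,
                          use_finer_qv=False)
    # On success, nfl.partial_uc.pickle holds
    # {'partial_uc': {cluster_id: [read_ids]}, 'nohit': set(...)},
    # and nfl.partial_uc.pickle.DONE is touched as a completion marker.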