def run(self):
    """Execute ice_fa2fq.py."""
    logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                            v=get_version()))
    cmd_str = ""
    try:
        args = self.args
        in_fa, ccs_fofn, out_fq = args.in_fa, args.ccs_fofn, args.out_fq
        self.validate_inputs(in_fa=in_fa, ccs_fofn=ccs_fofn)
        cmd_str = self.cmd_str(in_fa=in_fa, ccs_fofn=ccs_fofn,
                               out_fq=out_fq)
        ice_fa2fq(in_fa=in_fa, ccs_fofn=ccs_fofn, out_fq=out_fq)
    except Exception:
        logging.exception(
            "Exiting {cmd} with return code 1.".format(cmd=cmd_str))
        return 1
    return 0
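# --- Usage sketch (illustrative only) ---
# A minimal standalone driver for the runner above, assuming only the three
# arguments it validates (in_fa, ccs_fofn, out_fq). The argparse wiring and
# help strings below are hypothetical, not the project's actual entry point.
import argparse

def _demo_main():
    parser = argparse.ArgumentParser(
        description="Convert a FASTA plus CCS QVs (via a FOFN) into FASTQ.")
    parser.add_argument("in_fa", help="input FASTA of reads")
    parser.add_argument("ccs_fofn", help="FOFN listing CCS files carrying QVs")
    parser.add_argument("out_fq", help="output FASTQ with QVs attached")
    args = parser.parse_args()
    # Same conversion the runner delegates to.
    ice_fa2fq(in_fa=args.in_fa, ccs_fofn=args.ccs_fofn, out_fq=args.out_fq)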
def pickup_icec_job(pickle_filename, ccs_fofn, flnc_filename,
                    fasta_files_to_add, root_dir):
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(
        pickle_filename, root_dir, fasta_files_to_add)
    make_current_fasta(icec_obj, flnc_filename, root_dir)
    print >> sys.stderr, "Reading QV information...."
    # first need to convert to fastq
    ice_fa2fq('current.fasta', ccs_fofn, 'current.fastq')
    probqv = pm.ProbFromFastq(os.path.join(root_dir, 'current.fastq'))
    icec = ice.IceIterative.from_pickle(icec_pickle_filename, probqv)
    # first must RE-RUN gcon to get all the proper refs
    icec.changes = set()
    icec.refs = {}
    todo = icec.uc.keys()
    print >> sys.stderr, "Re-run gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
def pickup_icec_job(pickle_filename, ccs_fofn, flnc_filename,
                    fasta_files_to_add, root_dir):
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(
        pickle_filename, root_dir, fasta_files_to_add)
    make_current_fasta(icec_obj, flnc_filename, root_dir)
    print >> sys.stderr, "Reading QV information...."
    # first need to convert to fastq
    ice_fa2fq('current.fasta', ccs_fofn, 'current.fastq')
    probqv = pm.ProbFromFastq(os.path.join(root_dir, 'current.fastq'))
    icec = ice.IceIterative.from_pickle(icec_pickle_filename, probqv)
    # first must RE-RUN gcon to get all the proper refs
    icec.changes = set()
    icec.refs = {}
    icec.ccs_fofn = ccs_fofn
    icec.all_fasta_filename = flnc_filename
    todo = icec.uc.keys()
    print >> sys.stderr, "Re-run gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
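# --- Usage sketch (illustrative only) ---
# A hypothetical invocation of pickup_icec_job; every argument below is a
# placeholder. The call resumes a previously pickled IceIterative run,
# re-running gcon and the sanity checks before continuing the job.
pickup_icec_job(pickle_filename="root_dir/output/input.split.pickle",
                ccs_fofn="ccs.fofn",
                flnc_filename="isoseq_flnc.fasta",
                fasta_files_to_add=[],  # assumed: empty list = no new splits
                root_dir="root_dir")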
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle, sa_file=None,
                          ccs_fofn=None, done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                i=input_fasta, f=ccs_fofn, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1, ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
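# --- Inspection sketch (illustrative only) ---
# Load the pickle written by build_uc_from_partial above and report cluster
# sizes. The 'partial_uc' and 'nohit' keys match the dump() call in that
# function; the pickle path here is a placeholder.
import sys
import cPickle

with open("partial_uc.pickle") as handle:
    result = cPickle.load(handle)
for cid in sorted(result['partial_uc']):
    print >> sys.stderr, "cluster %s: %d partial reads" % \
        (cid, len(result['partial_uc'][cid]))
print >> sys.stderr, "reads with no hit: %d" % len(result['nohit'])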
def run(self):
    """Call ICE to cluster consensus isoforms."""
    self.add_log("Start to run cluster.", level=logging.INFO)

    if self.ice_opts.targeted_isoseq:
        first_split = 1000
        self.ice_opts.flnc_reads_per_split = 10000
        self.add_log("targeted_isoseq: further splitting JUST first split "
                     "to 1000. Changing flnc_reads_per_split=10000.")
    else:
        first_split = None

    # Split flnc_fa into smaller files and save files to
    # _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split",
        first_split=first_split)
    self.add_log("Split files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    firstSplit_fq = firstSplit[:firstSplit.rfind('.')] + '.fastq'
    self.add_log("Converting first split file {0} + {1} into fastq\n".format(
        firstSplit, self.ccs_fofn), level=logging.INFO)
    # Convert this into FASTQ.
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up the probability and quality value model.
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize clusters by clique, reusing an existing pickle if present.
    if os.path.exists(self.initPickleFN):
        self.add_log("Reading existing uc pickle: {0}".format(
            self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN) as f:
            uc = cPickle.load(f)
    else:
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=firstSplit,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts,
                               qvmean_get_func=self._probqv.get_mean)
        uc = self.iceinit.uc

        # Dump uc to a file.
        self.add_log("Dumping initial clusters to {f}".format(
            f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done, write predicted (unpolished) consensus isoforms
    # to an output FASTA.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.",
                     level=logging.INFO)
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          fasta_fofn=self.fasta_fofn,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts,
                          ipq_opts=self.ipq_opts,
                          nfl_reads_per_split=self.nfl_reads_per_split)
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                     level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # cluster report
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa,
                           hq_fa=self.pol.icepq.quivered_good_fa,
                           lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0
def run(self):
    """Call ICE to cluster consensus isoforms."""
    self.add_log("Start to run cluster.", level=logging.INFO)

    # Split flnc_fa into smaller files and save files to
    # _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    self.add_log("Split files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    firstSplit_fq = firstSplit[:firstSplit.rfind('.')] + '.fastq'
    self.add_log("Converting first split file {0} + {1} into fastq\n".format(
        firstSplit, self.ccs_fofn), level=logging.INFO)
    # Convert this into FASTQ.
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up the probability and quality value model.
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize clusters by clique.
    self.add_log("Finding maximal cliques: initializing IceInit.",
                 level=logging.INFO)
    self.iceinit = IceInit(readsFa=firstSplit,
                           qver_get_func=self._probqv.get_smoothed,
                           ice_opts=self.ice_opts,
                           sge_opts=self.sge_opts)
    uc = self.iceinit.uc

    # Dump uc to a file.
    self.add_log("Dumping initial clusters to {f}".format(
        f=self.initPickleFN), level=logging.INFO)
    with open(self.initPickleFN, 'w') as f:
        cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done, write predicted (unpolished) consensus isoforms
    # to an output FASTA.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.",
                     level=logging.INFO)
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          fasta_fofn=self.fasta_fofn,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts,
                          ipq_opts=self.ipq_opts,
                          nfl_reads_per_split=self.nfl_reads_per_split)
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                     level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # cluster report
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa,
                           hq_fa=self.pol.icepq.quivered_good_fa,
                           lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0
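# --- Inspection sketch (illustrative only) ---
# Peek at the initial clusters dumped to self.initPickleFN above. This
# assumes uc is the {cluster_id: member_read_ids} mapping implied by
# icec.uc.keys() elsewhere in this module; the path is a placeholder.
import cPickle

with open("root_dir/init.uc.pickle") as handle:
    uc = cPickle.load(handle)
print "number of initial clusters:", len(uc)
sizes = sorted((len(members) for members in uc.itervalues()), reverse=True)
if sizes:
    print "largest cluster has", sizes[0], "reads"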
def run(self):
    """Call ICE to cluster consensus isoforms."""
    self.add_log("Start to run cluster.", level=logging.INFO)

    # Split flnc_fa into smaller files and save files to
    # _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    self.add_log("Split files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    firstSplit_fq = firstSplit[:firstSplit.rfind('.')] + '.fastq'
    self.add_log("Converting first split file {0} + {1} into fastq\n".format(
        firstSplit, self.ccs_fofn), level=logging.INFO)
    # Convert this into FASTQ.
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up the probability and quality value model.
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize clusters by clique, reusing an existing pickle if present.
    if os.path.exists(self.initPickleFN):
        self.add_log("Reading existing uc pickle: {0}".format(
            self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN) as f:
            uc = cPickle.load(f)
    else:
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=firstSplit,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file.
        self.add_log("Dumping initial clusters to {f}".format(
            f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            cPickle.dump(uc, f)

    # Set up IceIterative; this variant returns the object to the caller
    # rather than running it.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    return self.icec
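# --- Usage sketch (illustrative only) ---
# Unlike the run() variants above, this one returns the configured
# IceIterative object without starting it, so a hypothetical caller decides
# when to iterate:
#
#     icec = cluster_obj.run()   # cluster_obj: a placeholder instance
#     icec.run()                 # start iterative clustering explicitly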
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = \
        get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted.
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything
    # locally!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False,
                            same_strand_only=False, query_converted=True,
                            db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus,
                            sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=300, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking! This probqv is just a DUMMY
        # to pass to dalign_against_ref; it won't be used.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                    i=input_fasta, f=ccs_fofn, s=time.time() - start_t))
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting {i} + {f} --> {fq}".format(
                    i=input_fasta, f=ccs_fofn, fq=input_fastq))
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from {fq} took {s} secs".format(
                    fq=input_fastq, s=time.time() - start_t))
                print >> sys.stderr, \
                    "Loading QVs from {fq} took {s} secs".format(
                        fq=input_fastq, s=time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(
            input_obj, ref_obj, las_out_filename,
            is_FL=False,
            sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1, ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # Remove all the .las and .las.out files.
    for las_fn in las_filenames:
        os.remove(las_fn)
    for las_out_fn in las_out_filenames:
        os.remove(las_out_fn)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle, sa_file=None,
                          ccs_fofn=None, done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                i=input_fasta, f=ccs_fofn, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1, ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
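# --- Command illustration ---
# For concreteness: with placeholder inputs input_fasta='nfl.fasta' and
# ref_fasta='ref.fasta', the default blasr_nproc=12, and ignoring the
# real_upath() path normalization, the command string assembled above
# expands to (the -sa flag is appended only when nfl.fasta.sa exists):
#
#     blasr nfl.fasta ref.fasta -bestn 5 -nproc 12 -m 5 \
#         -maxScore -1000 -minPctIdentity 85 -out nfl.fasta.blasr -sa nfl.fasta.sa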