def _align(self, queryFa, output_dir, ice_opts, sge_opts):
    """
    Self-align queryFa (query and target are the same file) with DALIGNER,
    running on the local machine rather than through SGE.

    queryFa --- fasta filename, used as both query and database
    output_dir --- directory handed to DalignerRunner.runHPC for its output
    ice_opts, sge_opts --- accepted for interface compatibility; not read here

    Returns (DazzIDHandler for queryFa, list of .las.out filenames)
    """
    sensitive_mode, min_match_len, _unused_high = get_daligner_sensitivity_setting(queryFa)

    dazz_handler = DazzIDHandler(queryFa, False)
    DalignerRunner.make_db(dazz_handler.dazz_filename)

    # run this locally
    runner_kwargs = dict(is_FL=True, same_strand_only=True,
                         query_converted=True, db_converted=True,
                         query_made=True, db_made=True,
                         use_sge=False, cpus=4, sge_opts=None)
    runner = DalignerRunner(queryFa, queryFa, **runner_kwargs)

    # runHPC yields (.las filenames, .las.out filenames); only the latter is returned
    _unused_las, las_out_filenames = runner.runHPC(min_match_len=min_match_len,
                                                   output_dir=output_dir,
                                                   sensitive_mode=sensitive_mode)
    return dazz_handler, las_out_filenames
def _align(self, queryFa, output_dir, ice_opts, sge_opts):
    """
    Run DALIGNER locally with queryFa aligned against itself.

    The sensitivity mode and minimum match length come from
    get_daligner_sensitivity_setting(queryFa). ice_opts and sge_opts are
    part of the signature but are not used by this implementation.

    Returns a 2-tuple: the DazzIDHandler built for queryFa, and the list of
    .las.out filenames produced by DalignerRunner.runHPC.
    """
    settings = get_daligner_sensitivity_setting(queryFa)
    mode, low_cutoff, _high_cutoff = settings

    id_handler = DazzIDHandler(queryFa, False)
    DalignerRunner.make_db(id_handler.dazz_filename)

    # run this locally
    local_runner = DalignerRunner(
        queryFa, queryFa,
        is_FL=True,
        same_strand_only=True,
        query_converted=True,
        db_converted=True,
        query_made=True,
        db_made=True,
        use_sge=False,
        cpus=4,
        sge_opts=None)

    hpc_outputs = local_runner.runHPC(
        min_match_len=low_cutoff,
        output_dir=output_dir,
        sensitive_mode=mode)
    _las_files, las_out_files = hpc_outputs

    return id_handler, las_out_files
def init_cluster_by_clique(self, readsFa, ice_opts, sge_opts):
    """
    Only called once and in the very beginning, when (probably a subset)
    of sequences are given to generate the initial cluster.

    readsFa --- initial fasta filename, probably called *_split00.fa
    ice_opts --- ICE options object; its ece_min_len attribute is
                 overwritten with the value chosen for this input
    sge_opts --- SGE options, forwarded to the aligners

    bestn --- parameter in BLASR, higher helps in finding perfect
        cliques but bigger output
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate
        to input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-align the input (DALIGNER first, BLASR as a fallback), then
    iteratively find all mutually exclusive cliques (in decreasing size).

    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by IceIterative
    """
    # Local import so this method does not depend on a module-level import.
    import logging

    output_dir = os.path.dirname(readsFa)

    # NOTE(review): qver_get_func and qvmean_get_func are not parameters of
    # this method; they must resolve as module-level names or both branches
    # below raise NameError -- confirm against the rest of the file.
    try:
        dazz_obj, las_out_filenames, _ignore5, _ignore3, _ece_min_len = self._align(
            queryFa=readsFa, output_dir=output_dir,
            ice_opts=ice_opts, sge_opts=sge_opts)
        ice_opts.ece_min_len = _ece_min_len  # overwrite
        alignGraph = self._makeGraphFromLasOut(
            las_out_filenames, dazz_obj, qver_get_func, ice_opts,
            max_missed_start=_ignore5, max_missed_end=_ignore3,
            qvmean_get_func=qvmean_get_func)
    except Exception:
        # daligner probably crashed, fall back to blasr.
        # Only Exception is caught so that Ctrl-C / SystemExit still abort
        # the run, and the traceback is logged so the real cause is visible.
        logging.warning("DALIGNER self-alignment of %s failed; "
                        "falling back to BLASR.", readsFa, exc_info=True)
        outFN = readsFa + '.self.blasr'
        daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
            get_daligner_sensitivity_setting(readsFa, is_fasta=True)
        ice_opts.ece_min_len = _ece_min_len  # overwrite
        self._align_withBLASR(queryFa=readsFa, targetFa=readsFa, outFN=outFN,
                              ice_opts=ice_opts, sge_opts=sge_opts)
        alignGraph = self._makeGraphFromM5(
            outFN, qver_get_func, qvmean_get_func, ice_opts,
            max_missed_start=_ignore5, max_missed_end=_ignore3)

    uc = self._findCliques(alignGraph=alignGraph, readsFa=readsFa)
    return uc
def init_cluster_by_clique(self, readsFa, qver_get_func, ice_opts, sge_opts, qvmean_get_func):
    """
    Only called once and in the very beginning, when (probably a subset)
    of sequences are given to generate the initial cluster.

    readsFa --- initial fasta filename, probably called *_split00.fa
    qver_get_func --- function that returns QVs on reads
    ice_opts --- ICE options object; its ece_min_len attribute is
                 overwritten with the value chosen for this input
    sge_opts --- SGE options, forwarded to the aligners
    qvmean_get_func --- forwarded to the graph builders alongside
                        qver_get_func

    bestn --- parameter in BLASR, higher helps in finding perfect
        cliques but bigger output
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate
        to input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-align the input (DALIGNER first, BLASR as a fallback), then
    iteratively find all mutually exclusive cliques (in decreasing size).

    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by IceIterative
    """
    # Local import so this method does not depend on a module-level import.
    import logging

    output_dir = os.path.dirname(readsFa)

    try:
        dazz_obj, las_out_filenames, _ignore5, _ignore3, _ece_min_len = self._align(
            queryFa=readsFa, output_dir=output_dir,
            ice_opts=ice_opts, sge_opts=sge_opts)
        ice_opts.ece_min_len = _ece_min_len  # overwrite
        alignGraph = self._makeGraphFromLasOut(
            las_out_filenames, dazz_obj, qver_get_func, ice_opts,
            max_missed_start=_ignore5, max_missed_end=_ignore3,
            qvmean_get_func=qvmean_get_func)
    except Exception:
        # daligner probably crashed, fall back to blasr.
        # Only Exception is caught so that Ctrl-C / SystemExit still abort
        # the run, and the traceback is logged so the real cause is visible.
        logging.warning("DALIGNER self-alignment of %s failed; "
                        "falling back to BLASR.", readsFa, exc_info=True)
        outFN = readsFa + '.self.blasr'
        daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
            get_daligner_sensitivity_setting(readsFa, is_fasta=True)
        ice_opts.ece_min_len = _ece_min_len  # overwrite
        self._align_withBLASR(queryFa=readsFa, targetFa=readsFa, outFN=outFN,
                              ice_opts=ice_opts, sge_opts=sge_opts)
        alignGraph = self._makeGraphFromM5(
            outFN, qver_get_func, qvmean_get_func, ice_opts,
            max_missed_start=_ignore5, max_missed_end=_ignore3)

    uc = self._findCliques(alignGraph=alignGraph, readsFa=readsFa)
    return uc
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fastq file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': set(no_hit_read_ids)}
    to an output pickle file, touch a "done" marker file and delete the
    intermediate .las / .las.out files.

    input_fastq --- partial reads; a sibling .fasta is written next to it
    ref_fasta --- consensus isoforms (its DAZZ DB is expected to exist already)
    out_pickle --- output pickle filename; its directory is also used as the
                   DALIGNER output directory
    ccs_fofn, use_finer_qv --- kept for interface compatibility; not used
                   because QVs are read from input_fastq directly
    done_filename --- marker file to touch when finished
                   (default: out_pickle + '.DONE')
    cpus --- number of CPUs handed to DalignerRunner
    no_qv_or_aln_checking --- if True, a dummy QV model is loaded instead of
                   the QVs in input_fastq
    """
    input_fastq = realpath(input_fastq)
    # splitext only strips an extension of the final path component, so an
    # extension-less name or a dot in a directory name is handled correctly.
    input_fasta = os.path.splitext(input_fastq)[0] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
        get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(min_match_len=_low,
                                                     output_dir=output_dir,
                                                     sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False, sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1, ece_min_len=_ece_min_len,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking,
                                      max_missed_start=_ignore5,
                                      max_missed_end=_ignore3)
        for h in hitItems:
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        msg = "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)
        logging.info(msg)
        # same text as the log line, echoed to stderr
        sys.stderr.write(msg + "\n")

    # the pickle stores plain lists rather than sets
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # pickles are byte streams: always open the file in binary mode
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_fn in las_filenames:
        os.remove(las_fn)
    for las_out_fn in las_out_filenames:
        os.remove(las_out_fn)
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': set(no_hit_read_ids)}
    to an output pickle file, touch a "done" marker file and delete the
    intermediate .las / .las.out files.

    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    use_finer_qv --- with ccs_fofn given, load QVs via ProbFromQV instead of
                 converting to a .fastq next to input_fasta first
    done_filename --- marker file to touch when finished
                 (default: out_pickle + '.DONE')
    cpus --- number of CPUs handed to DalignerRunner
    no_qv_or_aln_checking --- if True, a dummy QV model is loaded and
                 ccs_fofn / use_finer_qv are ignored
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    # NOTE(review): _low/_high are computed but the alignment below uses a
    # fixed min_match_len=300 (and ece_min_len=20) -- confirm that is intended.
    daligner_sensitive_mode, _low, _high = get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(min_match_len=300,
                                                     output_dir=output_dir,
                                                     sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    elif use_finer_qv:
        start_t = time.time()
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
        logging.info("Loading QVs from {i} + {f} took {s} secs".format(f=ccs_fofn, i=input_fasta,
                                                                        s=time.time()-start_t))
    else:
        start_t = time.time()
        # splitext only strips an extension of the final path component, so an
        # extension-less name or a dot in a directory name is handled correctly.
        input_fastq = os.path.splitext(input_fasta)[0] + '.fastq'
        logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
        ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
        probqv = ProbFromFastq(input_fastq)
        msg = "Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t)
        logging.info(msg)
        # same text as the log line, echoed to stderr
        sys.stderr.write(msg + "\n")

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False, sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1, ece_min_len=20,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        msg = "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)
        logging.info(msg)
        # same text as the log line, echoed to stderr
        sys.stderr.write(msg + "\n")

    # the pickle stores plain lists rather than sets
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # pickles are byte streams: always open the file in binary mode
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_fn in las_filenames:
        os.remove(las_fn)
    for las_out_fn in las_out_filenames:
        os.remove(las_out_fn)