def run(self):
    """Split the nFL FASTA into chunks and touch any expected chunk
    file that the splitter did not produce."""
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # validate_inputs() yields a 4-tuple: read count, reads per chunk,
    # output directory and the list of chunk files callers expect.
    validated = self.validate_inputs()
    total_reads, chunk_size, chunk_dir, expected_fas = validated

    logging.info("Total number of reads is {n}.".format(n=total_reads))

    split_msg = ("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=chunk_size))
    logging.info(split_msg)

    produced_fas = splitFasta(input_fasta=real_ppath(self.nfl_fa),
                              reads_per_split=chunk_size,
                              out_dir=chunk_dir,
                              out_prefix="input.split")

    logging.info("Splitted files are: " + "\n".join(produced_fas))

    # Every expected chunk must exist, even if the splitter wrote
    # fewer files than expected.
    absent = [path for path in expected_fas if path not in produced_fas]
    for path in absent:
        logging.info("touching {f}".format(f=path))
        touch(path)
def run(self):
    """Split the nFL FASTA and FASTQ into matching chunks, then touch
    any expected chunk file that the splitter did not produce."""
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("nfl_fq: {f}.".format(f=self.nfl_fq))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # validate_inputs() yields a 5-tuple: read count, reads per chunk,
    # output directory, expected FASTA chunks, expected FASTQ chunks.
    validated = self.validate_inputs()
    total_reads, chunk_size, chunk_dir, expected_fas, expected_fqs = validated

    logging.info("Total number of reads is {n}.".format(n=total_reads))

    split_msg = ("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=chunk_size))
    logging.info(split_msg)

    # Arguments common to the FASTA and the FASTQ split.
    shared_kwargs = dict(reads_per_split=chunk_size,
                         out_dir=chunk_dir,
                         reads_in_first_split=None)

    produced_fas = splitFaFq(input_fa_or_fq=self.nfl_fa,
                             out_format=IceFiles2.nfl_fa_format,
                             is_fq=False,
                             **shared_kwargs)
    produced_fqs = splitFaFq(input_fa_or_fq=self.nfl_fq,
                             out_format=IceFiles2.nfl_fq_format,
                             is_fq=True,
                             **shared_kwargs)

    logging.info("Splitted fastas are: " + "\n".join(produced_fas))
    logging.info("Splitted fastqs are: " + "\n".join(produced_fqs))

    # Every expected chunk must exist, even if the splitter wrote
    # fewer files than expected (the original author questioned
    # whether this step is needed).
    absent_fas = [path for path in expected_fas if path not in produced_fas]
    for path in absent_fas:
        logging.info("touching {f}".format(f=path))
        touch(path)

    absent_fqs = [path for path in expected_fqs if path not in produced_fqs]
    for path in absent_fqs:
        logging.info("touching {f}".format(f=path))
        touch(path)
def run(self):
    """Chunk the nFL FASTA and FASTQ inputs, and touch every expected
    chunk file that was not written by the splitter."""
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("nfl_fq: {f}.".format(f=self.nfl_fq))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # Validation returns, in order: number of reads, reads per chunk,
    # chunk directory, expected FASTA chunks, expected FASTQ chunks.
    n_reads, per_chunk, out_dir, fas_todo, fqs_todo = self.validate_inputs()

    logging.info("Total number of reads is {n}.".format(n=n_reads))
    logging.info("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=per_chunk))

    fas_done = splitFaFq(
        input_fa_or_fq=self.nfl_fa,
        reads_per_split=per_chunk,
        out_dir=out_dir,
        out_format=IceFiles2.nfl_fa_format,
        is_fq=False,
        reads_in_first_split=None,
    )
    fqs_done = splitFaFq(
        input_fa_or_fq=self.nfl_fq,
        reads_per_split=per_chunk,
        out_dir=out_dir,
        out_format=IceFiles2.nfl_fq_format,
        is_fq=True,
        reads_in_first_split=None,
    )

    fasta_report = "Splitted fastas are: " + "\n".join(fas_done)
    logging.info(fasta_report)
    fastq_report = "Splitted fastqs are: " + "\n".join(fqs_done)
    logging.info(fastq_report)

    # FASTA placeholders first, then FASTQ ones; the original author
    # left a note questioning whether this step is required at all.
    for expected, produced in ((fas_todo, fas_done), (fqs_todo, fqs_done)):
        for path in expected:
            if path in produced:
                continue
            logging.info("touching {f}".format(f=path))
            touch(path)
def run(self):
    """Chunk the nFL FASTA input and touch every expected chunk file
    that was not written by the splitter."""
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # Validation returns, in order: number of reads, reads per chunk,
    # chunk directory and the chunk files that are expected to exist.
    n_reads, per_chunk, out_dir, pending_fas = self.validate_inputs()

    logging.info("Total number of reads is {n}.".format(n=n_reads))
    logging.info("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=per_chunk))

    source_fasta = real_ppath(self.nfl_fa)
    finished_fas = splitFasta(
        input_fasta=source_fasta,
        reads_per_split=per_chunk,
        out_dir=out_dir,
        out_prefix="input.split",
    )

    report = "Splitted files are: " + "\n".join(finished_fas)
    logging.info(report)

    for candidate in pending_fas:
        if candidate in finished_fas:
            # Already written by the splitter, nothing to do.
            continue
        logging.info("touching {f}".format(f=candidate))
        touch(candidate)
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename,
                                   ice_opts,
                                   probqv,
                                   qv_prob_threshold=0.3,
                                   cpus=4,
                                   no_qv_or_aln_checking=False,
                                   tmp_dir=None,
                                   sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (DALIGNER query).
    ref_fasta --- FASTA of consensus isoforms (DALIGNER target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    ice_opts --- options object; this function reads min_match_len,
                 sensitive_mode, ece_penalty, ece_min_len,
                 max_missed_start/end and full_missed_start/end from it
                 and calls its detect_cDNA_size(ref_fasta).
    probqv --- object providing get_smoothed and get_mean, handed to
               daligner_against_ref2 as QV accessors.
    qv_prob_threshold --- forwarded to daligner_against_ref2.
    cpus --- number of CPUs given to DalignerRunner.
    no_qv_or_aln_checking --- forwarded to daligner_against_ref2.
    tmp_dir --- where to save intermediate files such as dazz files.
                if None, write dazz files to the same directory as
                query/target.
    sID_starts_with_c --- forwarded to daligner_against_ref2.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()

        # not providing full_missed_start/end since aligning nFLs,
        # ok to partially align only
        # NOTE(review): the runner above is created with
        # same_strand_only=False while hits are parsed with
        # same_strand_only=True -- confirm the asymmetry is intended.
        hitItems = daligner_against_ref2(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=sID_starts_with_c,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ice_opts.ece_penalty,
            ece_min_len=ice_opts.ece_min_len,
            same_strand_only=True,
            no_qv_or_aln_checking=no_qv_or_aln_checking,
            max_missed_start=ice_opts.max_missed_start,
            max_missed_end=ice_opts.max_missed_end,
            full_missed_start=ice_opts.full_missed_start,
            full_missed_end=ice_opts.full_missed_end)

        for h in hitItems:
            # Only hits carrying an ece_arr are assigned to an isoform.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename,
                                ice_opts,
                                probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (BLASR query).
    ref_fasta --- FASTA of consensus isoforms (BLASR target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    ice_opts --- options object; this function reads min_match_len,
                 ece_penalty, ece_min_len, max_missed_start/end and
                 full_missed_start/end from it.
    probqv --- object providing get_smoothed and get_mean, handed to
               blasr_against_ref2 as QV accessors.
    qv_prob_threshold --- forwarded to blasr_against_ref2.
    cpus --- value passed to blasr --nproc.
    no_qv_or_aln_checking --- accepted for signature parity with the
                              DALIGNER variant; not used in this function.
    tmp_dir --- directory for the intermediate BLASR m5 file; if None the
                file name is relative to the current working directory.
    sID_starts_with_c --- forwarded to blasr_against_ref2.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    # BLASR's own stdout/stderr are discarded by the redirections.
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are assigned to an isoform.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    # The m5 alignment file is only an intermediate product.
    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None, use_finer_qv=False,
                                   cpus=24, no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (DALIGNER query).
    ref_fasta --- FASTA of consensus isoforms (DALIGNER target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn. Only consulted when
                 no_qv_or_aln_checking is False.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    use_finer_qv --- with ccs_fofn given, load QVs through ProbFromQV
                     instead of converting to FASTQ and using
                     ProbFromFastq.
    cpus --- number of CPUs given to DalignerRunner.
    no_qv_or_aln_checking --- if True, a model-based dummy probqv is
                              created and the flag is forwarded to
                              daligner_against_ref.
    tmp_dir --- where to save intermediate files such as dazz files.
                if None, write dazz files to the same directory as
                query/target.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs",
                             ccs_fofn, input_fasta, time.time()-start_t)
            else:
                # Derive the FASTQ name by swapping the FASTA extension.
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s",
                             input_fasta, ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs",
                             input_fastq, time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1, ece_min_len=20, same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Only hits carrying an ece_arr are assigned to an isoform.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (BLASR query).
    ref_fasta --- FASTA of consensus isoforms (BLASR target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    tmp_dir --- directory for the intermediate BLASR m5 file; if None the
                file name is relative to the current working directory.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    # BLASR's own stdout/stderr are discarded by the redirections.
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are assigned to an isoform.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    # The m5 alignment file is only an intermediate product.
    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None, use_finer_qv=False,
                                   cpus=24, no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (DALIGNER query).
    ref_fasta --- FASTA of consensus isoforms (DALIGNER target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn. Only consulted when
                 no_qv_or_aln_checking is False.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    use_finer_qv --- with ccs_fofn given, load QVs through ProbFromQV
                     instead of converting to FASTQ and using
                     ProbFromFastq.
    cpus --- number of CPUs given to DalignerRunner.
    no_qv_or_aln_checking --- if True, a model-based dummy probqv is
                              created and the flag is forwarded to
                              daligner_against_ref.
    tmp_dir --- where to save intermediate files such as dazz files.
                if None, write dazz files to the same directory as
                query/target.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs",
                             ccs_fofn, input_fasta, time.time() - start_t)
            else:
                # Derive the FASTQ name by swapping the FASTA extension.
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s",
                             input_fasta, ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs",
                             input_fastq, time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1, ece_min_len=20, same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Only hits carrying an ece_arr are assigned to an isoform.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time() - start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]}, 'nohit': no_hit_read_ids}
    to out_pickle.

    input_fasta --- FASTA of partial reads (BLASR query).
    ref_fasta --- FASTA of consensus isoforms (BLASR target).
    out_pickle --- output path; must end with ".pickle" or ".json".
                   In the pickle 'nohit' is a set, in the JSON it is a
                   sorted list because JSON has no set type.
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    done_filename --- sentinel file touched on success; if None,
                      out_pickle + '.DONE' is used.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    tmp_dir --- directory for the intermediate BLASR m5 file; if None the
                file name is relative to the current working directory.

    Raises IOError if out_pickle has an unrecognized extension. The
    check happens before the file is opened, so no empty output file
    is left behind.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    # BLASR's own stdout/stderr are discarded by the redirections.
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits carrying an ece_arr are assigned to an isoform.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Validate the extension before opening so that a bad name does not
    # create/truncate the output file.
    is_pickle = out_pickle.endswith(".pickle")
    if not is_pickle and not out_pickle.endswith(".json"):
        raise IOError("Unrecognized extension: %s" % out_pickle)
    if is_pickle:
        # Binary mode is what pickle expects (assumes `dump` is the
        # pickle/cPickle dump imported at file level).
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # json cannot serialize a set, so emit the no-hit ids as a list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    # The m5 alignment file is only an intermediate product.
    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)