def run_main(chunk_json, contigset_output, chunk_key):
    """Run main."""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Allow looseness in the chunk key.
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warn("Prepending chunk key with '$chunk.' to '%s'", str(chunk_key))

    fasta_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked consensus isoforms files are %s.", ', '.join(fasta_files))

    out_fa = CombinedFiles(
        combined_dir=op.dirname(contigset_output)).all_consensus_isoforms_fa
    log.info("Combining files to %s.", out_fa)
    combine_consensus_isoforms(split_indices=range(0, len(fasta_files)),
                               split_files=fasta_files,
                               combined_consensus_isoforms_fa=out_fa)

    log.info("Writing contigset %s", contigset_output)
    assert contigset_output.endswith('xml')
    as_contigset(out_fa, contigset_output)
    return 0
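# A hypothetical invocation of run_main (paths and the chunk key are
# illustrative, not from the source): gather the chunked FASTA files listed
# in a pipeline chunk JSON and write one combined ContigSet.
#
#   run_main(chunk_json="tasks/gather_chunks.json",
#            contigset_output="combined/all.contigset.xml",
#            chunk_key="fasta_id")   # '$chunk.' prefix is added automatically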
def make_flnc(in_flnc, root_dir):
    """Copy the FASTA behind in_flnc into root_dir/<bin_name>/ and build a
    matching ContigSet XML next to it."""
    bin_name = op.basename(op.dirname(in_flnc))
    flnc_name = op.basename(in_flnc)
    assert in_flnc.endswith(".contigset.xml")
    in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
    new_flnc = op.join(root_dir, bin_name, flnc_name)
    new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")
    print "new_flnc = %s" % new_flnc
    shutil.copy(in_flnc_fa, new_flnc_fa)
    as_contigset(new_flnc_fa, new_flnc)
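# as_contigset is used throughout this module but defined elsewhere
# (pbtranscript.Utils). A minimal sketch of the assumed behavior, built on
# pbcore.io.ContigSet: wrap the FASTA in a dataset, refresh its UUID, and
# write the XML. Constructing the ContigSet is also what creates the .fai
# index that test_as_contigset below checks for. This is a sketch under those
# assumptions, not the actual implementation.
from pbcore.io import ContigSet

def as_contigset_sketch(fasta_file, xml_file):
    """Write xml_file as a ContigSet wrapping fasta_file; return the dataset."""
    ds = ContigSet(fasta_file)  # indexes the FASTA as a side effect
    if xml_file != fasta_file:  # passing the FASTA twice just builds the index
        ds.newUuid()
        ds.write(xml_file)
    return ds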
def __exit__(self, exc_type, exc_value, traceback):
    """
    Close all FASTA file handles. If create_contigset is True, convert
    out_fasta_files to out_contigset_files.
    """
    # Close FASTA file handles.
    for f in self.handles.itervalues():
        f.close()
    if self.create_contigset is True:
        for fasta_fn, xml_fn in zip(self.out_fasta_files,
                                    self.out_contigset_files):
            as_contigset(fasta_fn, xml_fn)
    # Write out_pickle.
    self.write_pickle()
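# The class above is a context manager (its name is not shown in this
# excerpt). A hypothetical usage sketch, assuming it splits records across
# several output FASTA files and optionally emits matching ContigSets on
# exit:
#
#   with SomeFastaSplitter(out_fasta_files, create_contigset=True) as splitter:
#       ...  # write records through splitter.handles
#   # on exit: handles closed, contigsets written, pickle saved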
def run(self): """Classify/annotate reads according to 5' primer seen, 3' primer seen, polyA seen, chimera (concatenation of two or multiple transcripts with primers seen in the middle of a read) (1) Create and validate input/output (2) Check phmmer is runnable (3) Find primers using phmmer and trim away primers and polyAs (4) Detect chimeras from trimmed reads """ # Validate input files and required data files. self._validate_inputs(self.reads_fn, self.primer_fn, self.pbmatrix_fn) # Validate and create output dir. self._validate_outputs(self.out_dir, self.out_all_reads_fn_fasta) # Sanity check phmmer can be called successfully. self._checkPhmmer() # Find and trim primers and polyAs. self.runPrimerTrimmer() # Check whether no fl reads detected. no_flnc_errMsg = "No full-length non-chimeric reads detected." if self.summary.num_fl == 0: logging.error(no_flnc_errMsg) if not self.ignore_empty_output: raise ClassifierException(no_flnc_errMsg) else: # Detect chimeras and generate primer reports. self.runChimeraDetector() dataset_uuids = [] for file_attr in ["out_nfl_fn", "out_nflnc_fn", "out_nflc_fn", "out_flnc_fn", "out_flc_fn", "out_all_reads_fn"]: file_name = getattr(self, file_attr) fasta_file_name = getattr(self, "%s_fasta" % file_attr) ds = as_contigset( fasta_file=fasta_file_name, xml_file=file_name) if file_attr in ["out_all_reads_fn", "out_nfl_fn", "out_flnc_fn"]: dataset_uuids.append(ds.uuid) try: # Write summary. logging.info("Writing report to {f}".format(f=self.summary_fn)) self.summary.write(self.summary_fn, dataset_uuids=dataset_uuids) except ZeroDivisionError: logging.error(no_flnc_errMsg) raise ClassifierException(no_flnc_errMsg) return 0
def run(self): """Classify/annotate reads according to 5' primer seen, 3' primer seen, polyA seen, chimera (concatenation of two or multiple transcripts with primers seen in the middle of a read) (1) Create and validate input/output (2) Check phmmer is runnable (3) Find primers using phmmer and trim away primers and polyAs (4) Detect chimeras from trimmed reads """ # Validate input files and required data files. self._validate_inputs(self.reads_fn, self.primer_fn, self.pbmatrix_fn) # Validate and create output dir. self._validate_outputs(self.out_dir, self.out_all_reads_fn_fasta) # Sanity check phmmer can be called successfully. self._checkPhmmer() # Find and trim primers and polyAs. self.runPrimerTrimmer() # Check whether no fl reads detected. no_flnc_errMsg = "No full-length non-chimeric reads detected." if self.summary.num_fl == 0: logging.error(no_flnc_errMsg) if not self.ignore_empty_output: raise ClassifierException(no_flnc_errMsg) else: # Detect chimeras and generate primer reports. self.runChimeraDetector() dataset_uuids = [] for file_attr in [ "out_nfl_fn", "out_nflnc_fn", "out_nflc_fn", "out_flnc_fn", "out_flc_fn", "out_all_reads_fn" ]: file_name = getattr(self, file_attr) fasta_file_name = getattr(self, "%s_fasta" % file_attr) ds = as_contigset(fasta_file=fasta_file_name, xml_file=file_name) if file_attr in ["out_all_reads_fn", "out_nfl_fn", "out_flnc_fn"]: dataset_uuids.append(ds.uuid) try: # Write summary. logging.info("Writing report to {f}".format(f=self.summary_fn)) self.summary.write(self.summary_fn, dataset_uuids=dataset_uuids) except ZeroDivisionError: logging.error(no_flnc_errMsg) raise ClassifierException(no_flnc_errMsg) return 0
def args_runner(args):
    """Run given input args."""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == "contigset.xml":
                # Make a contigset from the FASTA representative.
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" %
                              args.collapsed_isoforms)
    return 0
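# Assumed behavior of parse_ds_filename (defined elsewhere): split a dataset
# path into (prefix, suffix), where the suffix carries no leading dot -- the
# convention that the contigset branch above and pick_rep below both rely on.
#
#   parse_ds_filename("out/rep.contigset.xml")  # -> ("out/rep", "contigset.xml")
#   parse_ds_filename("out/rep.fastq")          # -> ("out/rep", "fastq")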
def test_as_contigset(self):
    """Test as_contigset."""
    out_dir = op.join(OUT_DIR, 'test_Utils')
    mknewdir(out_dir)
    fa = op.join(out_dir, "empty.fasta")
    xml = op.join(out_dir, "empty.contigset.xml")
    fai = fa + ".fai"

    # An empty FASTA should still yield a contigset XML and a .fai index.
    execute("touch %s" % fa)
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fai))

    fn = 'reads_of_insert.fasta'
    shutil.copy(src=op.join(DATA_DIR, fn), dst=op.join(out_dir, fn))
    fa = op.join(out_dir, fn)
    # Calling with xml_file == fasta_file should only index the FASTA.
    as_contigset(fa, fa)
    fai = fa + ".fai"

    xml = op.join(out_dir, 'reads_of_insert.contigset.xml')
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fai))
def run(self): """ Check all arrow jobs are running, failed or done. Write high-quality consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq """ self.validate_inputs() job_stats = self.check_arrow_jobs_completion() self.add_log("Arrow job status: {s}".format(s=job_stats)) if job_stats == 'DONE': pass # continue on below to process data elif job_stats == 'FAILED': self.add_log("Has incomplete jobs. Please re-run them.", level=logging.ERROR) return -1 elif job_stats == 'RUNNING': if self.quit_if_not_done: self.add_log( "Jobs are still running. Please wait before running this script." ) return 1 else: while job_stats != "DONE": self.add_log( "Jobs are still running. Wait. Sleeping for 180 seconds." ) sleep(180) job_stats = self.check_arrow_jobs_completion() if job_stats == "DONE": break elif job_stats == "FAILED": self.add_log( "There are some failed jobs. Please check.", level=logging.ERROR) return 1 elif job_stats == "RUNNING": self.add_log( "Jobs are still running. Wait. Sleeping for 180 seconds.", level=logging.INFO) else: msg = "Unable to recognize job_stats {s}".format(s=job_stats) self.add_log(msg, logging.ERROR) raise ValueError(msg) # at this point, all jobs must be done and all fastq files present. self.pickup_best_clusters() self.add_log("Creating polished high quality consensus isoforms.") if self.hq_isoforms_fa is not None: ln(self.arrowed_good_fa, self.hq_isoforms_fa) if self.hq_isoforms_fq is not None: ln(self.arrowed_good_fq, self.hq_isoforms_fq) self.add_log("Creating polished low quality consensus isoforms.") if self.lq_isoforms_fa is not None: ln(self.arrowed_bad_fa, self.lq_isoforms_fa) if self.lq_isoforms_fq is not None: ln(self.arrowed_bad_fq, self.lq_isoforms_fq) if self.hq_isoforms_dataset is not None: ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset) if self.lq_isoforms_dataset is not None: ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset) if self.summary_fn is not None: self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.final_consensus_fa, hq_fa=self.hq_isoforms_fa, lq_fa=self.lq_isoforms_fa) self.close_log()
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq and lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # Sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms
    ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms
    ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms
    ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa,
       out_consensus_isoforms_fa)  # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # cluster report
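# 'ln' above is a small helper from pbtranscript.Utils; a stand-in sketch of
# the assumed behavior (point dst at src, replacing any existing file):
import os

def ln_sketch(src, dst):
    """Symlink dst to the absolute path of src (assumed behavior of ln)."""
    if os.path.lexists(dst):
        os.remove(dst)
    os.symlink(os.path.abspath(src), dst)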
def pick_rep(isoform_filename, gff_filename, group_filename,
             output_filename, pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative
    record.

    If the input is a FASTA file, always pick the longest sequence.
    If the input is a FASTQ file and pick_least_err_instead is True, pick
    the sequence with the smallest expected number of base errors;
    otherwise, pick the longest sequence.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or
                               _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or contain exactly one FASTQ file.
                raise IOError("%s must contain either indexed FASTA files or "
                              "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None
    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." %
                      output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors: a Phred QV of q implies an
                # error probability of 10**(-q/10).
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and
                    len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
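# Worked example of the expected-error score used in pick_rep: a Phred QV of
# q corresponds to an error probability of 10**(-q/10.), so summing over all
# bases gives the expected number of base errors in the read.
#
#   quality = [30, 20, 10]
#   sum(10 ** -(q / 10.) for q in quality)  # 0.001 + 0.01 + 0.1 = 0.111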
def run(self): """Call ICE to cluster consensus isoforms.""" self.add_log("Start to run cluster.", level=logging.INFO) if self.ice_opts.targeted_isoseq: reads_in_first_split = 1000 self.ice_opts.flnc_reads_per_split = 10000 self.add_log("targeted_isoseq: further splitting JUST first " + "split to 1000. Changing flnc_reads_per_split=10000.") else: reads_in_first_split = None # Split flnc_fa into smaller files and save files to _flnc_splitted_fas. self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) + "smaller files each containing {n} reads.".format( n=self.ice_opts.flnc_reads_per_split), level=logging.INFO) self._flnc_splitted_fas = splitFasta( input_fasta=self.flnc_fa, reads_per_split=self.ice_opts.flnc_reads_per_split, out_dir=self.root_dir, out_prefix="input.split", reads_in_first_split=reads_in_first_split) self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=logging.INFO) # This is the first piece of reads to work on first_split_fa = self._flnc_splitted_fas[0] first_split_fq = fafn2fqfn(first_split_fa) # Set up probability and quality value model if self.ice_opts.use_finer_qv: # default off # Use multi-Qvs from ccs.h5, no need to write FASTQ self._probqv, msg = set_probqv_from_ccs( ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa) else: # use a single Qv from FASTQ if self.ccs_fofn is not None: self.add_log("Converting {fa} + {ccs} into {fq}\n".format( fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq), level=logging.INFO) ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn, out_fq=first_split_fq) # Set probqv from the first splitted FASTQ file. self._probqv, msg = set_probqv_from_fq(fastq_filename=first_split_fq) else: # use predefined model self._probqv, msg = set_probqv_from_model() self.add_log(msg, level=logging.INFO) # Initialize cluster by clique self.add_log("Finding maximal cliques: initializing IceInit.", level=logging.INFO) self.iceinit = IceInit(readsFa=first_split_fa, qver_get_func=self._probqv.get_smoothed, qvmean_get_func=self._probqv.get_mean, ice_opts=self.ice_opts, sge_opts=self.sge_opts) uc = self.iceinit.uc # Dump uc to a file self.add_log("Dumping initial clusters to {f}" .format(f=self.initPickleFN), level=logging.INFO) with open(self.initPickleFN, 'w') as f: if self.initPickleFN.endswith(".json"): f.write(json.dumps(uc)) else: cPickle.dump(uc, f) # Run IceIterative. self.add_log("Iterative clustering: initializing IceIterative.", level=logging.INFO) self.icec = IceIterative( fasta_filename=first_split_fa, fasta_filenames_to_add=self._flnc_splitted_fas[1:], all_fasta_filename=self.flnc_fa, ccs_fofn=self.ccs_fofn, root_dir=self.root_dir, ice_opts=self.ice_opts, sge_opts=self.sge_opts, uc=uc, probQV=self._probqv, fastq_filename=first_split_fq, output_pickle_file=self.output_pickle_file, tmp_dir=self.tmp_dir) self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn)) self.icec.run() self.add_log("IceIterative completed.", level=logging.INFO) # IceIterative done, write predicted (unplished) consensus isoforms # to an output fasta self.add_log("Creating a link to unpolished consensus isoforms.") ln(self.icec.final_consensus_fa, self.out_fa) if self.out_fa_dataset is not None: dummy_ds = as_contigset( fasta_file=self.icec.final_consensus_fa, xml_file=self.out_fa_dataset) # Call quiver to polish predicted consensus isoforms. 
if self.ice_opts.quiver is not True: self.add_log("Creating a link to cluster report.", level=logging.INFO) ln(src=self.icec.report_fn, dst=self.report_fn) # Summarize cluster and write to summary_fn. self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa) else: # self.ice_opts.quiver is True self.add_log("Polishing clusters: initializing IcePolish.", level=logging.INFO) self.pol = Polish(root_dir=self.root_dir, nfl_fa=self.nfl_fa, bas_fofn=self.bas_fofn, ccs_fofn=self.ccs_fofn, fasta_fofn=self.fasta_fofn, ice_opts=self.ice_opts, sge_opts=self.sge_opts, ipq_opts=self.ipq_opts, tmp_dir=self.tmp_dir) self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=logging.INFO) self.pol.run() self.add_log("IcePolish completed.", level=logging.INFO) # cluster report self.add_log("Creating a link to cluster report.", level=logging.INFO) ln(src=self.pol.iceq.report_fn, dst=self.report_fn) # Summarize cluster & polish and write to summary_fn. self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa, hq_fa=self.pol.icepq.quivered_good_fa, lq_fa=self.pol.icepq.quivered_bad_fa) # Create log file. self.close_log() return 0
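# Reading back the initial-cluster pickle that Cluster.run dumps above; the
# JSON/cPickle duality mirrors the dump logic. The structure of uc is an
# assumption here: a dict mapping cluster index to member read ids.
import cPickle
import json

def load_uc_sketch(pickle_fn):
    """Load uc from JSON or cPickle, matching how Cluster.run wrote it."""
    with open(pickle_fn) as f:
        if pickle_fn.endswith(".json"):
            return json.load(f)
        return cPickle.load(f)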
def run(self): """ Check all arrow jobs are running, failed or done. Write high-quality consensus and low-quality consensus to all_arrowed.hq|lq fasta|fastq """ self.validate_inputs() job_stats = self.check_arrow_jobs_completion() self.add_log("Arrow job status: {s}".format(s=job_stats)) if job_stats == 'DONE': pass # continue on below to process data elif job_stats == 'FAILED': self.add_log("Has incomplete jobs. Please re-run them.", level=logging.ERROR) return -1 elif job_stats == 'RUNNING': if self.quit_if_not_done: self.add_log("Jobs are still running. Please wait before running this script.") return 1 else: while job_stats != "DONE": self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.") sleep(180) job_stats = self.check_arrow_jobs_completion() if job_stats == "DONE": break elif job_stats == "FAILED": self.add_log("There are some failed jobs. Please check.", level=logging.ERROR) return 1 elif job_stats == "RUNNING": self.add_log("Jobs are still running. Wait. Sleeping for 180 seconds.", level=logging.INFO) else: msg = "Unable to recognize job_stats {s}".format(s=job_stats) self.add_log(msg, logging.ERROR) raise ValueError(msg) # at this point, all jobs must be done and all fastq files present. self.pickup_best_clusters() self.add_log("Creating polished high quality consensus isoforms.") if self.hq_isoforms_fa is not None: ln(self.arrowed_good_fa, self.hq_isoforms_fa) if self.hq_isoforms_fq is not None: ln(self.arrowed_good_fq, self.hq_isoforms_fq) self.add_log("Creating polished low quality consensus isoforms.") if self.lq_isoforms_fa is not None: ln(self.arrowed_bad_fa, self.lq_isoforms_fa) if self.lq_isoforms_fq is not None: ln(self.arrowed_bad_fq, self.lq_isoforms_fq) if self.hq_isoforms_dataset is not None: ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset) if self.lq_isoforms_dataset is not None: ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset) if self.summary_fn is not None: self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.final_consensus_fa, hq_fa=self.hq_isoforms_fa, lq_fa=self.lq_isoforms_fa) self.close_log()
def run(self): """Check all quiver jobs are running, failed or done. Write high-quality consensus and low-quality consensus to all_quivered.good|bad.fasta|fastq. """ self.validate_inputs() job_stats = self.check_quiver_jobs_completion() self.add_log("quiver job status: {s}".format(s=job_stats)) if self.use_sge is not True and job_stats != "DONE": self.add_log("quiver jobs were not submitted via sge, " + "however are still incomplete. Please check.", level=logging.ERROR) return -1 elif self.use_sge is True: while job_stats != "DONE": self.add_log("Sleeping for 180 seconds.") sleep(180) job_stats = self.check_quiver_jobs_completion() if job_stats == "DONE": break elif job_stats == "FAILED": self.add_log("There are some failed jobs. Please check.", level=logging.ERROR) return 1 elif job_stats == "RUNNING": self.add_log("There are jobs still running, waiting...", level=logging.INFO) if self.quit_if_not_done is True: return 0 else: msg = "Unable to recognize job_stats {s}".format(job_stats) self.add_log(msg, logging.ERROR) raise ValueError(msg) self.pickup_best_clusters(self.fq_filenames) self.add_log("Creating polished high quality consensus isoforms.") if self.hq_isoforms_fa is not None: ln(self.quivered_good_fa, self.hq_isoforms_fa) if self.hq_isoforms_fq is not None: ln(self.quivered_good_fq, self.hq_isoforms_fq) self.add_log("Creating polished low quality consensus isoforms.") if self.lq_isoforms_fa is not None: ln(self.quivered_bad_fa, self.lq_isoforms_fa) if self.lq_isoforms_fq is not None: ln(self.quivered_bad_fq, self.lq_isoforms_fq) hq_fa = self.hq_isoforms_fa lq_fa = self.lq_isoforms_fa if self.hq_isoforms_dataset is not None: ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset) if self.lq_isoforms_dataset is not None: ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset) if self.summary_fn is not None: self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.final_consensus_fa, hq_fa=self.hq_isoforms_fa, lq_fa=self.lq_isoforms_fa) self.close_log()
def run(self): """Call ICE to cluster consensus isoforms.""" self.add_log("Start to run cluster.", level=logging.INFO) if self.ice_opts.targeted_isoseq: reads_in_first_split = 1000 self.ice_opts.flnc_reads_per_split = 10000 self.add_log("targeted_isoseq: further splitting JUST first " + "split to 1000. Changing flnc_reads_per_split=10000.") else: reads_in_first_split = None # Split flnc_fa into smaller files and save files to _flnc_splitted_fas. self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) + "smaller files each containing {n} reads.".format( n=self.ice_opts.flnc_reads_per_split), level=logging.INFO) self._flnc_splitted_fas = splitFasta( input_fasta=self.flnc_fa, reads_per_split=self.ice_opts.flnc_reads_per_split, out_dir=self.root_dir, out_prefix="input.split", reads_in_first_split=reads_in_first_split) self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=logging.INFO) # This is the first piece of reads to work on first_split_fa = self._flnc_splitted_fas[0] first_split_fq = fafn2fqfn(first_split_fa) # Set up probability and quality value model if self.ice_opts.use_finer_qv: # default off # Use multi-Qvs from ccs.h5, no need to write FASTQ self._probqv, msg = set_probqv_from_ccs( ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa) else: # use a single Qv from FASTQ if self.ccs_fofn is not None: self.add_log("Converting {fa} + {ccs} into {fq}\n".format( fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq), level=logging.INFO) ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn, out_fq=first_split_fq) # Set probqv from the first splitted FASTQ file. self._probqv, msg = set_probqv_from_fq( fastq_filename=first_split_fq) else: # use predefined model self._probqv, msg = set_probqv_from_model() self.add_log(msg, level=logging.INFO) # Initialize cluster by clique self.add_log("Finding maximal cliques: initializing IceInit.", level=logging.INFO) self.iceinit = IceInit(readsFa=first_split_fa, qver_get_func=self._probqv.get_smoothed, qvmean_get_func=self._probqv.get_mean, ice_opts=self.ice_opts, sge_opts=self.sge_opts) uc = self.iceinit.uc # Dump uc to a file self.add_log( "Dumping initial clusters to {f}".format(f=self.initPickleFN), level=logging.INFO) with open(self.initPickleFN, 'w') as f: if self.initPickleFN.endswith(".json"): f.write(json.dumps(uc)) else: cPickle.dump(uc, f) # Run IceIterative. self.add_log("Iterative clustering: initializing IceIterative.", level=logging.INFO) self.icec = IceIterative( fasta_filename=first_split_fa, fasta_filenames_to_add=self._flnc_splitted_fas[1:], all_fasta_filename=self.flnc_fa, ccs_fofn=self.ccs_fofn, root_dir=self.root_dir, ice_opts=self.ice_opts, sge_opts=self.sge_opts, uc=uc, probQV=self._probqv, fastq_filename=first_split_fq, output_pickle_file=self.output_pickle_file, tmp_dir=self.tmp_dir) self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn)) self.icec.run() self.add_log("IceIterative completed.", level=logging.INFO) # IceIterative done, write predicted (unplished) consensus isoforms # to an output fasta self.add_log("Creating a link to unpolished consensus isoforms.") ln(self.icec.final_consensus_fa, self.out_fa) if self.out_fa_dataset is not None: dummy_ds = as_contigset(fasta_file=self.icec.final_consensus_fa, xml_file=self.out_fa_dataset) # Call quiver to polish predicted consensus isoforms. 
if self.ice_opts.quiver is not True: self.add_log("Creating a link to cluster report.", level=logging.INFO) ln(src=self.icec.report_fn, dst=self.report_fn) # Summarize cluster and write to summary_fn. self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa) else: # self.ice_opts.quiver is True self.add_log("Polishing clusters: initializing IcePolish.", level=logging.INFO) self.pol = Polish(root_dir=self.root_dir, nfl_fa=self.nfl_fa, bas_fofn=self.bas_fofn, ccs_fofn=self.ccs_fofn, fasta_fofn=self.fasta_fofn, ice_opts=self.ice_opts, sge_opts=self.sge_opts, ipq_opts=self.ipq_opts, tmp_dir=self.tmp_dir) self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=logging.INFO) self.pol.run() self.add_log("IcePolish completed.", level=logging.INFO) # cluster report self.add_log("Creating a link to cluster report.", level=logging.INFO) ln(src=self.pol.iceq.report_fn, dst=self.report_fn) # Summarize cluster & polish and write to summary_fn. self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa, hq_fa=self.pol.icepq.quivered_good_fa, lq_fa=self.pol.icepq.quivered_bad_fa) # Create log file. self.close_log() return 0