def run_main(separate_flnc_pickle_file, nfl_contigset,
             cluster_chunk_pickle, partial_chunk_pickle,
             polish_chunk_pickle, max_nchunks):
    """
    Create chunk tasks for ICE, ice_partial and ice_polish, and write
    each set of chunk tasks to its output pickle.
    """
    log.info("Getting all binned flnc files from %s", separate_flnc_pickle_file)
    flnc_fns = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        separate_flnc_pickle_file)
    log.debug("Binned flnc files are: %s", ", ".join(flnc_fns))

    # The number of ICE chunk tasks equals the number of bins.
    n_bins = len(flnc_fns)
    assert n_bins > 0

    log.info("max_nchunks: %s", max_nchunks)
    n_nfl_chunks = max(1, int(max_nchunks))

    out_dir = op.dirname(cluster_chunk_pickle)
    nfl_chunk_json = op.join(out_dir, 'nfl_chunk.json')
    chunked_nfl_files = chunk_contigset(in_file=nfl_contigset,
                                        n_chunks=n_nfl_chunks,
                                        out_dir=out_dir,
                                        out_chunk_json=nfl_chunk_json)

    create_cluster_pickle(flnc_files=flnc_fns, out_pickle=cluster_chunk_pickle)
    create_partial_pickle(flnc_files=flnc_fns,
                          chunked_nfl_files=chunked_nfl_files,
                          out_pickle=partial_chunk_pickle)

    # Total number of flnc reads in all bins.
    n_reads_in_bins = n_reads_in_contigsets(flnc_fns)
    sum_n_flnc_reads = sum(n_reads_in_bins)
    # Allocate polish chunks to each bin proportionally to its flnc read
    # count, with a minimum of one chunk per bin.
    n_polish_chunks_in_bins = [max(1, int(n * max_nchunks / (1.0 * sum_n_flnc_reads)))
                               for n in n_reads_in_bins]
    create_polish_pickle(n_polish_chunks_in_bins=n_polish_chunks_in_bins,
                         flnc_files=flnc_fns,
                         out_pickle=polish_chunk_pickle)

    # Make a soft link of nfl_contigset in the same directory as
    # separate_flnc.pickle for users' convenience.
    dst_nfl_contigset = op.join(op.dirname(separate_flnc_pickle_file),
                                "isoseq_nfl.contigset.xml")
    log.info("Making a soft link of %s to %s.", nfl_contigset, dst_nfl_contigset)
    ln(nfl_contigset, dst_nfl_contigset)
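# The proportional polish-chunk allocation above is worth seeing in
# isolation. A minimal sketch (the standalone helper name is illustrative,
# not part of this module):
def allocate_polish_chunks(n_reads_in_bins, max_nchunks):
    """Distribute polish chunks across bins proportionally to each bin's
    flnc read count, with a floor of one chunk per bin."""
    total = 1.0 * sum(n_reads_in_bins)
    return [max(1, int(n * max_nchunks / total)) for n in n_reads_in_bins]

# For example, bins holding 100, 300 and 600 flnc reads with max_nchunks=10
# receive 1, 3 and 6 polish chunks respectively.
assert allocate_polish_chunks([100, 300, 600], 10) == [1, 3, 6]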
def run(self):
    """Assign nfl reads to consensus isoforms and merge the results."""
    # Call $ICE_PARTIAL_PY to create a pickle for each split nfl fasta.
    self.createPickles()
    # Wait for pickles to be created, if SGE is used.
    self.waitForPickles(pickle_filenames=self.pickle_filenames,
                        done_filenames=self.done_filenames)
    # Combine all pickles into one big pickle file: nfl_all_pickle_fn.
    self.combinePickles(pickle_filenames=self.pickle_filenames,
                        out_pickle=self.nfl_all_pickle_fn)
    # Create a symbolic link if necessary.
    ln(self.nfl_all_pickle_fn, self.out_pickle)
    # Close the log.
    self.close_log()
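# waitForPickles above blocks until per-chunk results exist. A minimal
# sketch of that polling pattern, assuming each job touches a .done
# sentinel file when it finishes (helper name and interval illustrative):
import os
import time

def wait_for_done_files(done_filenames, interval=10):
    """Block until every expected .done sentinel file exists."""
    pending = set(done_filenames)
    while pending:
        pending = set(f for f in pending if not os.path.exists(f))
        if pending:
            time.sleep(interval)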
def link_files(smrtlink_job_dir, out_dir, more_files):
    """
    Make soft links of selected smrtlink isoseq job output files, plus
    more_files, in {out_dir}.
    """
    log.info("Making soft link of files")
    hq_fq = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                          basename="hq_isoforms.fastq")
    cluster_report = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                                   basename="cluster_report.csv")
    hq_lq_prefix_pickle = smrtlink_file(smrtlink_job_dir=smrtlink_job_dir,
                                        basename="hq_lq_prefix_dict.pickle")
    assert isinstance(more_files, list)
    fs = more_files + [hq_fq, cluster_report, hq_lq_prefix_pickle]
    for f in fs:
        dst = op.join(out_dir, op.basename(f))
        log.debug("%s --> %s", f, dst)
        ln(f, dst)
    return hq_fq, hq_lq_prefix_pickle
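# ln() is used throughout this section to make soft links. A minimal
# sketch of the behavior the call sites imply (force-create a symlink,
# replacing any existing destination); the project's real helper may
# differ, e.g. by falling back to a copy across filesystems:
import os

def ln(src, dst):
    """Force-create a soft link dst -> src."""
    src, dst = os.path.abspath(src), os.path.abspath(dst)
    if src != dst:
        if os.path.lexists(dst):
            os.remove(dst)
        os.symlink(src, dst)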
def args_runner(args):
    """Run given input args"""
    c = CollapseIsoformsRunner(isoform_filename=args.input_isoforms,
                               sam_filename=args.sam_filename,
                               output_prefix=args.output_prefix,
                               min_aln_coverage=args.min_aln_coverage,
                               min_aln_identity=args.min_aln_identity,
                               min_flnc_coverage=args.min_flnc_coverage,
                               max_fuzzy_junction=args.max_fuzzy_junction,
                               allow_extra_5exon=args.allow_extra_5exon,
                               skip_5_exon_alt=args.skip_5_exon_alt)
    c.run()

    if args.collapsed_isoforms is not None:
        suffix = parse_ds_filename(args.collapsed_isoforms)[1]
        if op.exists(c.rep_fn(suffix)):
            ln(c.rep_fn(suffix), args.collapsed_isoforms)
        else:
            if suffix == ".contigset.xml":  # make contigset from fasta
                as_contigset(c.rep_fn("fasta"), args.collapsed_isoforms)
            else:
                raise IOError("Could not make collapsed isoform file %s" %
                              args.collapsed_isoforms)
    return 0
def run(self):
    """Call ICE to cluster consensus isoforms."""
    self.add_log("Start to run cluster.", level=logging.INFO)

    if self.ice_opts.targeted_isoseq:
        reads_in_first_split = 1000
        self.ice_opts.flnc_reads_per_split = 10000
        self.add_log("targeted_isoseq: further splitting JUST first " +
                     "split to 1000. Changing flnc_reads_per_split=10000.")
    else:
        reads_in_first_split = None

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split",
        reads_in_first_split=reads_in_first_split)
    self.add_log("Split files are: " + "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    # This is the first chunk of reads to work on.
    first_split_fa = self._flnc_splitted_fas[0]
    first_split_fq = fafn2fqfn(first_split_fa)

    # Set up the probability and quality value model.
    if self.ice_opts.use_finer_qv:  # default off
        # Use multi-QVs from ccs.h5; no need to write FASTQ.
        self._probqv, msg = set_probqv_from_ccs(
            ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
    else:  # use a single QV from FASTQ
        if self.ccs_fofn is not None:
            self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                fa=first_split_fa, ccs=self.ccs_fofn, fq=first_split_fq),
                level=logging.INFO)
            ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn,
                      out_fq=first_split_fq)
            # Set probqv from the first split FASTQ file.
            self._probqv, msg = set_probqv_from_fq(
                fastq_filename=first_split_fq)
        else:  # use predefined model
            self._probqv, msg = set_probqv_from_model()
    self.add_log(msg, level=logging.INFO)

    # Initialize clusters by finding maximal cliques.
    self.add_log("Finding maximal cliques: initializing IceInit.",
                 level=logging.INFO)
    self.iceinit = IceInit(readsFa=first_split_fa,
                           qver_get_func=self._probqv.get_smoothed,
                           qvmean_get_func=self._probqv.get_mean,
                           ice_opts=self.ice_opts,
                           sge_opts=self.sge_opts)
    uc = self.iceinit.uc

    # Dump uc to a file.
    self.add_log("Dumping initial clusters to {f}".format(f=self.initPickleFN),
                 level=logging.INFO)
    with open(self.initPickleFN, 'w') as f:
        if self.initPickleFN.endswith(".json"):
            f.write(json.dumps(uc))
        else:
            cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=first_split_fa,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=first_split_fq,
        output_pickle_file=self.output_pickle_file,
        tmp_dir=self.tmp_dir)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done; write predicted (unpolished) consensus isoforms
    # to an output fasta.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)
    if self.out_fa_dataset is not None:
        dummy_ds = as_contigset(fasta_file=self.icec.final_consensus_fa,
                                xml_file=self.out_fa_dataset)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.", level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.",
                     level=logging.INFO)
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          fasta_fofn=self.fasta_fofn,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts,
                          ipq_opts=self.ipq_opts,
                          tmp_dir=self.tmp_dir)
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                     level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # Cluster report.
        self.add_log("Creating a link to cluster report.", level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa,
                           hq_fa=self.pol.icepq.quivered_good_fa,
                           lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # Sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)

    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms
    ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms
    ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms
    ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa,
       out_consensus_isoforms_fa)  # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # cluster report
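# combine_consensus_isoforms must keep record ids unique when merging
# per-bin FASTA files. A minimal sketch of that idea, assuming a simple
# id-prefixing scheme (the real function's renaming may differ):
def combine_fasta_bins(split_indices, split_files, combined_fa, sample_name):
    """Concatenate per-bin FASTA files, prefixing each record id with the
    sample name and cluster-bin index so ids stay unique across bins."""
    with open(combined_fa, 'w') as out:
        for bin_idx, fn in zip(split_indices, split_files):
            with open(fn) as reader:
                for line in reader:
                    if line.startswith('>'):
                        out.write('>%s|bin%d|%s' % (sample_name, bin_idx,
                                                    line[1:]))
                    else:
                        out.write(line)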
def post_mapping_to_genome_runner(
        in_isoforms, in_sam, in_pickle,
        out_isoforms, out_gff, out_abundance, out_group, out_read_stat,
        min_aln_coverage=cmi.Constants.MIN_ALN_COVERAGE_DEFAULT,
        min_aln_identity=cmi.Constants.MIN_ALN_IDENTITY_DEFAULT,
        min_flnc_coverage=cmi.Constants.MIN_FLNC_COVERAGE_DEFAULT,
        max_fuzzy_junction=cmi.Constants.MAX_FUZZY_JUNCTION_DEFAULT,
        allow_extra_5exon=cmi.Constants.ALLOW_EXTRA_5EXON_DEFAULT,
        skip_5_exon_alt=cmi.Constants.SKIP_5_EXON_ALT_DEFAULT,
        min_count=fci.Constants.MIN_COUNT_DEFAULT,
        to_filter_out_subsets=True):
    """
    (1) Collapse isoforms and merge fuzzy junctions if needed.
    (2) Generate read stat file and abundance file.
    (3) Based on abundance file, filter collapsed isoforms by min FL count.
    """
    log.info('args: {!r}'.format(locals()))

    # Check input and output formats.
    in_suffix = parse_ds_filename(in_isoforms)[1]
    out_prefix, out_suffix = parse_ds_filename(out_isoforms)
    if in_suffix != out_suffix:
        raise ValueError("Format of input and output isoforms %s, %s "
                         "must be the same." % (in_isoforms, out_isoforms))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input and output isoforms %s, %s "
                         "must be FASTA or FASTQ." % (in_isoforms, out_isoforms))

    # (1) Collapse isoforms and merge fuzzy junctions if needed.
    cf = CollapsedFiles(prefix=out_prefix, allow_extra_5exon=allow_extra_5exon)
    cir = CollapseIsoformsRunner(isoform_filename=in_isoforms,
                                 sam_filename=in_sam,
                                 output_prefix=out_prefix,
                                 min_aln_coverage=min_aln_coverage,
                                 min_aln_identity=min_aln_identity,
                                 min_flnc_coverage=min_flnc_coverage,
                                 max_fuzzy_junction=max_fuzzy_junction,
                                 allow_extra_5exon=allow_extra_5exon,
                                 skip_5_exon_alt=skip_5_exon_alt)
    cir.run()

    # (2) Generate read stat file and abundance file.
    cr = CountRunner(group_filename=cf.group_fn,
                     pickle_filename=in_pickle,
                     output_read_stat_filename=cf.read_stat_fn,
                     output_abundance_filename=cf.abundance_fn)
    cr.run()

    # (3) Filter collapsed isoforms by min FL count based on abundance file.
    fff = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=False)
    filter_by_count(in_group_filename=cf.group_fn,
                    in_abundance_filename=cf.abundance_fn,
                    in_gff_filename=cf.good_gff_fn,
                    in_rep_filename=cf.rep_fn(out_suffix),
                    out_abundance_filename=fff.filtered_abundance_fn,
                    out_gff_filename=fff.filtered_gff_fn,
                    out_rep_filename=fff.filtered_rep_fn(out_suffix),
                    min_count=min_count)

    # (4) Remove collapsed isoforms which are a subset of another isoform.
    fft = FilteredFiles(prefix=out_prefix,
                        allow_extra_5exon=allow_extra_5exon,
                        min_count=min_count,
                        filter_out_subsets=True)
    if to_filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=fff.filtered_abundance_fn,
                           in_gff_filename=fff.filtered_gff_fn,
                           in_rep_filename=fff.filtered_rep_fn(out_suffix),
                           out_abundance_filename=fft.filtered_abundance_fn,
                           out_gff_filename=fft.filtered_gff_fn,
                           out_rep_filename=fft.filtered_rep_fn(out_suffix),
                           max_fuzzy_junction=max_fuzzy_junction)
        fff = fft

    # (5) Link output files.
    ln_pairs = [
        (fff.filtered_rep_fn(out_suffix), out_isoforms),  # rep isoforms
        (fff.filtered_gff_fn, out_gff),                   # gff annotation
        (fff.filtered_abundance_fn, out_abundance),       # abundance info
        (fff.group_fn, out_group),                        # groups
        (fff.read_stat_fn, out_read_stat),                # read stat info
    ]
    for src, dst in ln_pairs:
        if dst is not None:
            ln(src, dst)

    logging.info("Filter arguments: min_count = %s, to_filter_out_subsets=%s",
                 min_count, to_filter_out_subsets)
    logging.info("Collapsed and filtered isoform sequences written to %s",
                 realpath(out_isoforms) if out_isoforms is not None
                 else realpath(fff.filtered_rep_fn(out_suffix)))
    logging.info("Collapsed and filtered isoform annotations written to %s",
                 realpath(out_gff) if out_gff is not None
                 else realpath(fff.filtered_gff_fn))
    logging.info("Collapsed and filtered isoform abundance info written to %s",
                 realpath(out_abundance) if out_abundance is not None
                 else realpath(fff.filtered_abundance_fn))
    logging.info("Collapsed isoform groups written to %s",
                 realpath(out_group) if out_group is not None
                 else realpath(fff.group_fn))
    logging.info("Read status of FL and nFL reads written to %s",
                 realpath(out_read_stat) if out_read_stat is not None
                 else realpath(fff.read_stat_fn))
def run(self):
    """
    Check whether all arrow jobs are running, failed or done. Write
    high-quality and low-quality consensus isoforms to
    all_arrowed.hq|lq.fasta|fastq.
    """
    self.validate_inputs()

    job_stats = self.check_arrow_jobs_completion()
    self.add_log("Arrow job status: {s}".format(s=job_stats))

    if job_stats == 'DONE':
        pass  # continue on below to process data
    elif job_stats == 'FAILED':
        self.add_log("Has incomplete jobs. Please re-run them.",
                     level=logging.ERROR)
        return -1
    elif job_stats == 'RUNNING':
        if self.quit_if_not_done:
            self.add_log("Jobs are still running. Please wait before "
                         "running this script.")
            return 1
        else:
            while job_stats != "DONE":
                self.add_log("Jobs are still running. "
                             "Sleeping for 180 seconds.")
                sleep(180)
                job_stats = self.check_arrow_jobs_completion()
                if job_stats == "DONE":
                    break
                elif job_stats == "FAILED":
                    self.add_log("There are some failed jobs. Please check.",
                                 level=logging.ERROR)
                    return 1
                elif job_stats == "RUNNING":
                    self.add_log("Jobs are still running. "
                                 "Sleeping for 180 seconds.",
                                 level=logging.INFO)
    else:
        msg = "Unable to recognize job_stats {s}".format(s=job_stats)
        self.add_log(msg, logging.ERROR)
        raise ValueError(msg)

    # At this point, all jobs must be done and all fastq files present.
    self.pickup_best_clusters()

    self.add_log("Creating polished high quality consensus isoforms.")
    if self.hq_isoforms_fa is not None:
        ln(self.arrowed_good_fa, self.hq_isoforms_fa)
    if self.hq_isoforms_fq is not None:
        ln(self.arrowed_good_fq, self.hq_isoforms_fq)

    self.add_log("Creating polished low quality consensus isoforms.")
    if self.lq_isoforms_fa is not None:
        ln(self.arrowed_bad_fa, self.lq_isoforms_fa)
    if self.lq_isoforms_fq is not None:
        ln(self.arrowed_bad_fq, self.lq_isoforms_fq)

    if self.hq_isoforms_dataset is not None:
        ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
    if self.lq_isoforms_dataset is not None:
        ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)

    if self.summary_fn is not None:
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.final_consensus_fa,
                           hq_fa=self.hq_isoforms_fa,
                           lq_fa=self.lq_isoforms_fa)

    self.close_log()
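# The status-polling loop above (and its quiver counterpart below) follows
# a pattern that could be factored into one helper. A sketch, assuming a
# check function that returns 'DONE', 'FAILED' or 'RUNNING' (helper name
# and exceptions illustrative):
import time

def poll_until_done(check_status, interval=180):
    """Poll check_status() until it returns 'DONE'; raise on failure."""
    status = check_status()
    while status != 'DONE':
        if status == 'FAILED':
            raise RuntimeError("Some jobs failed; please check.")
        elif status != 'RUNNING':
            raise ValueError("Unable to recognize job status %s" % status)
        time.sleep(interval)
        status = check_status()
    return status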
def run(self):
    """
    First, collapse input isoforms by calling Branch.run().
    Then collapse fuzzy junctions by calling collapse_fuzzy_junctions.
    Finally, pick a representative gff record for each group of
    collapsed isoforms.
    """
    self.validate_inputs()

    logging.info("Collapsing isoforms into transcripts.")
    b = Branch(isoform_filename=self.isoform_filename,
               sam_filename=self.sam_filename,
               cov_threshold=self.min_flnc_coverage,
               min_aln_coverage=self.min_aln_coverage,
               min_aln_identity=self.min_aln_identity)

    b.run(allow_extra_5exon=self.allow_extra_5exon,
          skip_5_exon_alt=self.skip_5_exon_alt,
          ignored_ids_fn=self.ignored_ids_txt_fn,
          good_gff_fn=self.good_unfuzzy_gff_fn,
          bad_gff_fn=self.bad_unfuzzy_gff_fn,
          group_fn=self.unfuzzy_group_fn)

    logging.info("Good unfuzzy isoforms written to: %s",
                 realpath(self.good_unfuzzy_gff_fn))
    logging.info("Bad unfuzzy isoforms written to: %s",
                 realpath(self.bad_unfuzzy_gff_fn))
    logging.info("Unfuzzy isoform groups written to: %s",
                 realpath(self.unfuzzy_group_fn))

    if self.shall_collapse_fuzzy_junctions:
        logging.info("Further collapsing fuzzy junctions.")
        # Need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(
            gff_filename=self.good_unfuzzy_gff_fn,
            group_filename=self.unfuzzy_group_fn,
            fuzzy_gff_filename=self.good_fuzzy_gff_fn,
            fuzzy_group_filename=self.fuzzy_group_fn,
            allow_extra_5exon=self.allow_extra_5exon,
            max_fuzzy_junction=self.max_fuzzy_junction)

        logging.info("Good fuzzy isoforms written to: %s",
                     realpath(self.good_fuzzy_gff_fn))
        logging.info("Bad fuzzy isoforms written to: %s",
                     realpath(self.bad_fuzzy_gff_fn))
        logging.info("Fuzzy isoform groups written to: %s",
                     realpath(self.fuzzy_group_fn))
        ln(self.good_fuzzy_gff_fn, self.good_gff_fn)
        ln(self.good_fuzzy_gff_fn, self.gff_fn)
        ln(self.fuzzy_group_fn, self.group_fn)
    else:
        logging.info("No need to further collapse fuzzy junctions.")
        ln(self.good_unfuzzy_gff_fn, self.good_gff_fn)
        ln(self.good_unfuzzy_gff_fn, self.gff_fn)
        ln(self.unfuzzy_group_fn, self.group_fn)

    # Pick up representative records.
    logging.info("Picking up representative record.")
    pick_least_err_instead = not self.allow_extra_5exon  # 5merge, pick longest
    pick_rep(isoform_filename=self.isoform_filename,
             gff_filename=self.good_gff_fn,
             group_filename=self.group_fn,
             output_filename=self.rep_fn(self.suffix),
             pick_least_err_instead=pick_least_err_instead,
             bad_gff_filename=self.bad_gff_fn)

    logging.info("Ignored IDs written to: %s",
                 realpath(self.ignored_ids_txt_fn))
    logging.info("Output GFF written to: %s", realpath(self.gff_fn))
    logging.info("Output Group TXT written to: %s", realpath(self.group_fn))
    logging.info("Output collapsed isoforms written to: %s",
                 realpath(self.rep_fn(self.suffix)))
    logging.info("CollapseIsoforms Arguments: %s", self.arg_str())
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # Sanity check arguments.
    _sanity_check_args(args)

    # Make option objects.
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) Separate flnc reads into bins.
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa,
                           root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) Apply 'pbtranscript cluster' to each bin: run ICE/Quiver
    # (the whole thing), providing the fasta_fofn.
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir,
                      flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', ipq_f.quivered_dir])

    # (3) Merge polished isoform clusters from all bins.
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs,
                      ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) Map HQ isoforms to the GMAP reference genome.
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) Post-mapping-to-genome analysis, including:
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms,
        in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
        out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn,
        out_abundance=args.abundance_fn,
        out_group=args.group_fn,
        out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage,
        min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage,
        max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon,
        min_count=args.min_count)

    return 0
def runChimeraDetector(self):
    """Call chimera detection on full-length reads, and on non-full-length
    reads if required."""
    # Create forward/reverse primers for chimera detection.
    self._processPrimers(
        primer_fn=self.primer_fn,
        window_size=self.chimera_detection_opts.primer_search_window,
        primer_out_fn=self.primer_chimera_fn,
        revcmp_primers=True)

    # Detect chimeras among full-length reads; separate flnc reads
    # from flc reads.
    logging.info("Detect chimeric reads from trimmed full-length reads.")
    (self.summary.num_flnc, self.summary.num_flc,
     self.summary.num_flnc_bases, _x) = \
        self._detect_chimera(in_fasta=self._trimmed_fl_reads_fn,
                             out_nc_fasta=self.out_flnc_fn_fasta,
                             out_c_fasta=self.out_flc_fn,
                             primer_report_fn=self._primer_report_fl_fn,
                             out_dom=self.out_trimmed_fl_dom_fn,
                             num_reads=self.summary.num_fl,
                             job_name="fl")
    assert (self.summary.num_fl ==
            self.summary.num_flnc + self.summary.num_flc)
    logging.info("Done with chimera detection on trimmed full-length reads.")

    # Detect chimeras among non-full-length reads if required; separate
    # nflnc reads from nflc reads, rewrite self.primer_report_nfl_fn.
    if self.chimera_detection_opts.detect_chimera_nfl is True:
        logging.info("Detect chimeric reads from trimmed non-full-length "
                     "reads.")
        (self.summary.num_nflnc, self.summary.num_nflc, _x, _y) = \
            self._detect_chimera(in_fasta=self._trimmed_nfl_reads_fn,
                                 out_nc_fasta=self.out_nflnc_fn,
                                 out_c_fasta=self.out_nflc_fn,
                                 primer_report_fn=self._primer_report_nfl_fn,
                                 out_dom=self.out_trimmed_nfl_dom_fn,
                                 num_reads=self.summary.num_nfl,
                                 job_name="nfl")
        assert (self.summary.num_nfl ==
                self.summary.num_nflnc + self.summary.num_nflc)
        logging.info("Done with chimera detection on trimmed "
                     "non-full-length reads.")

        # Concatenate out_nflnc_fn and out_nflc_fn as out_nfl_fn.
        cat_files(src=[self.out_nflnc_fn_fasta, self.out_nflc_fn_fasta],
                  dst=self.out_nfl_fn_fasta)
        # Concatenate out_flnc and out_nflnc to make out_all_reads_fn.
        cat_files(src=[self.out_flnc_fn_fasta, self.out_nflnc_fn_fasta],
                  dst=self.out_all_reads_fn_fasta)
    else:
        # Soft link _trimmed_nfl_reads_fn as out_nfl_fn.
        ln(self._trimmed_nfl_reads_fn, self.out_nfl_fn_fasta)
        # Concatenate out_flnc and out_nfl to make out_all_reads_fn.
        cat_files(src=[self.out_flnc_fn_fasta, self.out_nfl_fn_fasta],
                  dst=self.out_all_reads_fn_fasta)

    # Primer info of fl/nfl reads is reported to _primer_report_fl_fn and
    # _primer_report_nfl_fn; concatenate them to make a full report:
    # primer_report_fn.
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)

    # Delete intermediate files.
    self._cleanup([self._primer_report_nfl_fn, self._primer_report_fl_fn])
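# cat_files is used above to concatenate intermediate FASTA and report
# files. A minimal sketch of the behavior the call sites imply (the real
# helper may stream differently or validate its inputs):
def cat_files(src, dst):
    """Concatenate the files listed in src into dst, in order."""
    with open(dst, 'w') as out:
        for fn in src:
            with open(fn) as reader:
                # Copy in 1 MB chunks to avoid loading whole files.
                for chunk in iter(lambda: reader.read(1 << 20), ''):
                    out.write(chunk)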
def run(self):
    """
    Check whether all quiver jobs are running, failed or done. Write
    high-quality and low-quality consensus isoforms to
    all_quivered.good|bad.fasta|fastq.
    """
    self.validate_inputs()

    job_stats = self.check_quiver_jobs_completion()
    self.add_log("quiver job status: {s}".format(s=job_stats))

    if self.use_sge is not True and job_stats != "DONE":
        self.add_log("quiver jobs were not submitted via SGE, "
                     "yet they are still incomplete. Please check.",
                     level=logging.ERROR)
        return -1
    elif self.use_sge is True:
        while job_stats != "DONE":
            self.add_log("Sleeping for 180 seconds.")
            sleep(180)
            job_stats = self.check_quiver_jobs_completion()
            if job_stats == "DONE":
                break
            elif job_stats == "FAILED":
                self.add_log("There are some failed jobs. Please check.",
                             level=logging.ERROR)
                return 1
            elif job_stats == "RUNNING":
                self.add_log("There are jobs still running, waiting...",
                             level=logging.INFO)
                if self.quit_if_not_done is True:
                    return 0
            else:
                msg = "Unable to recognize job_stats {s}".format(s=job_stats)
                self.add_log(msg, logging.ERROR)
                raise ValueError(msg)

    self.pickup_best_clusters(self.fq_filenames)

    self.add_log("Creating polished high quality consensus isoforms.")
    if self.hq_isoforms_fa is not None:
        ln(self.quivered_good_fa, self.hq_isoforms_fa)
    if self.hq_isoforms_fq is not None:
        ln(self.quivered_good_fq, self.hq_isoforms_fq)

    self.add_log("Creating polished low quality consensus isoforms.")
    if self.lq_isoforms_fa is not None:
        ln(self.quivered_bad_fa, self.lq_isoforms_fa)
    if self.lq_isoforms_fq is not None:
        ln(self.quivered_bad_fq, self.lq_isoforms_fq)

    if self.hq_isoforms_dataset is not None:
        ds = as_contigset(self.hq_isoforms_fa, self.hq_isoforms_dataset)
    if self.lq_isoforms_dataset is not None:
        ds = as_contigset(self.lq_isoforms_fa, self.lq_isoforms_dataset)

    if self.summary_fn is not None:
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.final_consensus_fa,
                           hq_fa=self.hq_isoforms_fa,
                           lq_fa=self.lq_isoforms_fa)

    self.close_log()
def run(self):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                            v=self.getVersion()))
    args = self.args

    # Get cluster bin directories as input.
    cluster_bin_dirs = self.get_cluster_bin_dirs(
        separate_flnc_pickle=args.separate_flnc_pickle,
        cluster_bin_dirs=args.cluster_bin_dirs)
    cluster_bin_indices = range(0, len(cluster_bin_dirs))

    # Create the output dir.
    combined_dir = args.combined_dir
    mkdir(combined_dir)

    # Get combined output filenames: use the user-provided name if given,
    # otherwise fall back to a default under combined_dir.
    def f(input_fn, default_fn):
        if input_fn is None:
            return op.join(combined_dir, default_fn)
        return input_fn

    out_consensus_isoforms_fa = f(args.consensus_isoforms_fa,
                                  "all.consensus_isoforms.fasta")
    out_summary = f(args.summary_fn, "all.cluster_summary.json")
    out_report = f(args.report_fn, "all.cluster_report.csv")
    out_hq_fa = f(args.hq_isoforms_fa, "all.polished_hq.fasta")
    out_lq_fa = f(args.lq_isoforms_fa, "all.polished_lq.fasta")
    out_hq_fq = f(args.hq_isoforms_fq, "all.polished_hq.fastq")
    out_lq_fq = f(args.lq_isoforms_fq, "all.polished_lq.fastq")

    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
    sample_name = get_sample_name(input_sample_name=args.sample_name)

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []
    for cluster_bin_dir in cluster_bin_dirs:
        ice_pq = IceQuiverPostprocess(root_dir=cluster_bin_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)

    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms
    ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms
    ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms
    ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_fa,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # cluster report