def __init__(self, input_fasta, input_fastq, ref_fasta, out_pickle,
             done_filename, ice_opts, cpus=4, tmp_dir=None):
    self.input_fasta = input_fasta
    self.input_fastq = input_fastq  # could be None
    self.ref_fasta = ref_fasta
    self.out_pickle = out_pickle
    self.done_filename = done_filename
    self.cpus = cpus
    self.ice_opts = ice_opts
    self.tmp_dir = tmp_dir

    # Read QVs from the FASTQ if one was given; otherwise fall back to
    # the predefined QV model.
    start_t = time.time()
    if self.input_fastq is not None:
        self.probqv, msg = set_probqv_from_fq(self.input_fastq)
        src = self.input_fastq
    else:
        self.probqv, msg = set_probqv_from_model()
        src = "the predefined model"
    logging.info("Reading probQV from {0} took {1:.1f} sec.".format(
        src, time.time() - start_t))
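# Usage sketch for the constructor above. The owning class is not shown in
# this excerpt, so `IceWorker` below is a hypothetical stand-in; the file
# names are assumptions as well.
#
#   worker = IceWorker(input_fasta="reads.fasta",
#                      input_fastq=None,          # None -> QVs from the model
#                      ref_fasta="refs.fasta",
#                      out_pickle="out.pickle",
#                      done_filename="out.done",
#                      ice_opts=ice_opts, cpus=8)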
def run_IceInit2(readsFa, out_pickle, ice_opts, sge_opts, readsFq=None):
    # Use QVs from the FASTQ if given, otherwise the predefined model.
    if readsFq is None:
        _probqv, msg = set_probqv_from_model()
    else:
        _probqv, msg = set_probqv_from_fq(readsFq)
    qver_get_func = _probqv.get_smoothed
    qvmean_get_func = _probqv.get_mean
    i = IceInit2(readsFa, qver_get_func, qvmean_get_func,
                 ice_opts, sge_opts)
    with open(out_pickle, 'w') as f:
        dump(i.uc, f)
    return i.uc
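def _example_run_IceInit2(ice_opts, sge_opts):
    """Usage sketch for run_IceInit2 (a hypothetical helper, never called by
    the pipeline; the split file names below are assumptions). Initializes
    clusters from the first split and persists them to a pickle."""
    uc = run_IceInit2(readsFa="input.split_00.fasta",
                      out_pickle="init.uc.pickle",
                      ice_opts=ice_opts, sge_opts=sge_opts,
                      readsFq="input.split_00.fastq")  # None -> QV model
    return uc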
def pickup_icec_job(pickle_filename, root_dir, flnc_filename,
                    fasta_files_to_add, fastq_files_to_add):
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(
        pickle_filename, root_dir, fasta_files_to_add, fastq_files_to_add)
    probqv, msg = set_probqv_from_model()
    icec = ice.IceIterative2.from_pickle(icec_pickle_filename, probqv)

    # First, re-run gcon to regenerate all the proper refs.
    icec.changes = set()
    icec.refs = {}
    icec.all_fasta_filename = flnc_filename
    todo = icec.uc.keys()
    print >> sys.stderr, "Re-running gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
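def _example_pickup_icec_job():
    """Usage sketch for pickup_icec_job (a hypothetical helper, never called
    by the pipeline; the paths and file names below are assumptions). Resumes
    an interrupted ICE run from its last saved pickle."""
    pickup_icec_job(pickle_filename="root_dir/output/input.split_00.pickle",
                    root_dir="root_dir",
                    flnc_filename="isoseq_flnc.fasta",
                    fasta_files_to_add=["input.split_01.fasta"],
                    fastq_files_to_add=["input.split_01.fastq"])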
def run(self):
    """Call ICE to cluster consensus isoforms."""
    self.add_log("Starting to run cluster.", level=logging.INFO)

    if self.ice_opts.targeted_isoseq:
        reads_in_first_split = 1000
        self.ice_opts.flnc_reads_per_split = 10000
        self.add_log("targeted_isoseq: further splitting JUST the first "
                     "split to 1000 reads. Changing "
                     "flnc_reads_per_split=10000.")
    else:
        reads_in_first_split = None

    # Split flnc_fa into smaller files and save paths to _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split",
        reads_in_first_split=reads_in_first_split)
    self.add_log("Split files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    # This is the first chunk of reads to work on.
    first_split_fa = self._flnc_splitted_fas[0]
    first_split_fq = fafn2fqfn(first_split_fa)

    # Set up the probability and quality value model.
    if self.ice_opts.use_finer_qv:  # default: off
        # Use multi-QVs from ccs.h5; no need to write a FASTQ.
        self._probqv, msg = set_probqv_from_ccs(
            ccs_fofn=self.ccs_fofn, fasta_filename=first_split_fa)
    else:  # use a single QV from FASTQ
        if self.ccs_fofn is not None:
            self.add_log("Converting {fa} + {ccs} into {fq}\n".format(
                fa=first_split_fa, ccs=self.ccs_fofn,
                fq=first_split_fq), level=logging.INFO)
            ice_fa2fq(in_fa=first_split_fa, ccs_fofn=self.ccs_fofn,
                      out_fq=first_split_fq)
            # Set probqv from the first split FASTQ file.
            self._probqv, msg = set_probqv_from_fq(
                fastq_filename=first_split_fq)
        else:  # use the predefined model
            self._probqv, msg = set_probqv_from_model()
    self.add_log(msg, level=logging.INFO)

    # Initialize clusters by maximal cliques.
    self.add_log("Finding maximal cliques: initializing IceInit.",
                 level=logging.INFO)
    self.iceinit = IceInit(readsFa=first_split_fa,
                           qver_get_func=self._probqv.get_smoothed,
                           qvmean_get_func=self._probqv.get_mean,
                           ice_opts=self.ice_opts,
                           sge_opts=self.sge_opts)
    uc = self.iceinit.uc

    # Dump uc to a file.
    self.add_log("Dumping initial clusters to {f}".format(
        f=self.initPickleFN), level=logging.INFO)
    with open(self.initPickleFN, 'w') as f:
        if self.initPickleFN.endswith(".json"):
            f.write(json.dumps(uc))
        else:
            cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=first_split_fa,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=first_split_fq,
        output_pickle_file=self.output_pickle_file,
        tmp_dir=self.tmp_dir)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done; write predicted (unpolished) consensus isoforms
    # to an output FASTA.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)
    if self.out_fa_dataset is not None:
        dummy_ds = as_contigset(
            fasta_file=self.icec.final_consensus_fa,
            xml_file=self.out_fa_dataset)

    # Call Quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.",
                     level=logging.INFO)
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          fasta_fofn=self.fasta_fofn,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts,
                          ipq_opts=self.ipq_opts,
                          tmp_dir=self.tmp_dir)
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                     level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # Cluster report.
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa,
                           hq_fa=self.pol.icepq.quivered_good_fa,
                           lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0
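def _example_load_init_clusters(init_pickle_fn):
    """Usage sketch (a hypothetical helper, not part of the pipeline;
    assumes the module-level json/cPickle imports used by run() above):
    reload the initial clusters that run() dumps to initPickleFN. `uc` maps
    a cluster index to the list of member read ids. Note that a JSON
    round-trip stringifies the integer cluster keys, while cPickle
    preserves them."""
    with open(init_pickle_fn) as f:
        if init_pickle_fn.endswith(".json"):
            uc = json.loads(f.read())
        else:
            uc = cPickle.load(f)
    return uc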