Example #1
0
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True):
    """
    Assign non-full-length (partial) reads to consensus isoforms.

    The reads in input_fastq are converted to FASTA, aligned against the
    (unpolished) consensus isoform sequences in ref_fasta using DALIGNER,
    and every accepted hit is recorded as "read belongs to isoform".
    Finally,
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    is saved to the output pickle file and a "done" marker file is touched.

    input_fastq --- FASTQ file of partial reads. A FASTA file with the same
                    name but a '.fasta' extension is written next to it.
    ref_fasta --- FASTA file of consensus isoforms (the DALIGNER database).
    out_pickle --- output pickle path; DALIGNER output is written to the
                   directory containing it.
    ccs_fofn --- unused; kept only for backward compatibility.
    done_filename --- marker file to touch on completion. If None,
                      out_pickle + '.DONE' is used.
    use_finer_qv --- unused; kept only for backward compatibility.
    cpus --- number of CPUs handed to DalignerRunner.
    no_qv_or_aln_checking --- if True, a dummy fixed-probability QV model is
                   passed to dalign_against_ref; otherwise QVs are loaded
                   from input_fastq.

    The .las and .las.out files produced by DALIGNER are removed at the end.
    """
    input_fastq = realpath(input_fastq)
    # splitext only looks at the basename, so a path without an extension
    # (or with dots only in its directory part) is not truncated.
    input_fasta = os.path.splitext(input_fastq)[0] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    # _high is part of the returned tuple but is not needed here.
    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(min_match_len=_low, output_dir=output_dir, sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # QVs always come straight from the input FASTQ; ccs_fofn and
        # use_finer_qv no longer influence this choice.
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the set of reads
    # which can map to the isoform
    seen = set()  # reads with at least one accepted hit
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=_ece_min_len,
                                 same_strand_only=False,
                                 no_qv_or_aln_checking=no_qv_or_aln_checking,
                                 max_missed_start=_ignore5,
                                 max_missed_end=_ignore3)
        for h in hitItems:
            # Only hits that carry an ece_arr are counted.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        # Format once and send the same message to the log and to stderr.
        msg = "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)
        logging.info(msg)
        sys.stderr.write(msg + "\n")

    # The pickle stores plain lists, not sets, for each isoform.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    all_read_ids = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = all_read_ids.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # Pickle data is binary; a text-mode handle can corrupt it.
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_fn in las_filenames:
        os.remove(las_fn)
    for las_out_fn in las_out_filenames:
        os.remove(las_out_fn)
Example #2
0
    def run(self):
        """Call ICE to cluster consensus isoforms.

        Steps, in order:
          1. Split the full-length non-chimeric reads (self.flnc_fa) into
             smaller FASTQ files under self.root_dir.
          2. Convert the first split to FASTA and load the QV model from the
             first split FASTQ.
          3. Load the initial clusters from self.initPickleFN if that file
             exists; otherwise build them with IceInit and pickle them there.
          4. Run IceIterative and link its final consensus FASTA to
             self.out_fa.
          5. If self.ice_opts.quiver is True, run Polish and summarize with
             the polished HQ/LQ outputs; otherwise link the IceIterative
             cluster report and summarize the unpolished isoforms.

        Side effect: when self.ice_opts.targeted_isoseq is set,
        self.ice_opts.flnc_reads_per_split is overwritten with 10000.

        Returns 0 on completion.
        """
        self.add_log("Start to run cluster.", level=logging.INFO)

        if self.ice_opts.targeted_isoseq:
            first_split = 1000
            self.ice_opts.flnc_reads_per_split = 10000
            self.add_log("targeted_isoseq: further splitting JUST first split to 1000. Changing flnc_reads_per_split=10000.")
        else:
            first_split = None

        # Split flnc_fa into smaller files and save files to _flnc_splitted_fqs.
        self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                     level=logging.INFO)

        self._flnc_splitted_fqs = splitFastq(
            input_fastq=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            first_split=first_split)

        self.add_log("Splitted files are: " +
                     "\n".join(self._flnc_splitted_fqs),
                     level=logging.INFO)

        firstSplit_fq = self._flnc_splitted_fqs[0]
        # splitext only inspects the basename, so dots in directory names
        # cannot cause the path to be truncated in the wrong place.
        firstSplit_fa = os.path.splitext(firstSplit_fq)[0] + '.fasta'
        ice_fq2fa(firstSplit_fq, firstSplit_fa)

        # The splits are already FASTQ, so the probability / quality value
        # model is loaded directly from the first split.
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

        # Initialize cluster by clique
        if os.path.exists(self.initPickleFN):
            self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=logging.INFO)
            # NOTE(review): unpickling can execute arbitrary code; this
            # presumably only ever reads a file written by an earlier run of
            # this method -- confirm initPickleFN is never user-supplied.
            # Pickle data is binary, hence 'rb'.
            with open(self.initPickleFN, 'rb') as f:
                uc = cPickle.load(f)
        else:
            self.add_log("Finding maximal cliques: initializing IceInit.",
                         level=logging.INFO)
            self.iceinit = IceInit(readsFa=firstSplit_fa,
                                   qver_get_func=self._probqv.get_smoothed,
                                   ice_opts=self.ice_opts,
                                   sge_opts=self.sge_opts,
                                   qvmean_get_func=self._probqv.get_mean)
            uc = self.iceinit.uc

            # Dump uc to a file so that a rerun can skip IceInit.
            self.add_log("Dumping initial clusters to {f}".format(
                         f=self.initPickleFN), level=logging.INFO)
            with open(self.initPickleFN, 'wb') as f:
                cPickle.dump(uc, f)

        # Run IceIterative.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=logging.INFO)

        self.icec = IceIterative(
            fasta_filename=firstSplit_fa,
            fastq_filename=firstSplit_fq,
            fastq_filenames_to_add=self._flnc_splitted_fqs[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=logging.INFO)

        # IceIterative done, write predicted (unpolished) consensus isoforms
        # to an output fasta
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        # Call quiver to polish predicted consensus isoforms.
        if self.ice_opts.quiver is not True:
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)
        else:  # self.ice_opts.quiver is True
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=logging.INFO)
            self.pol = Polish(root_dir=self.root_dir,
                              nfl_fa=self.nfl_fa,
                              bas_fofn=self.bas_fofn,
                              ccs_fofn=self.ccs_fofn,
                              fasta_fofn=self.fasta_fofn,
                              ice_opts=self.ice_opts,
                              sge_opts=self.sge_opts,
                              ipq_opts=self.ipq_opts,
                              nfl_reads_per_split=self.nfl_reads_per_split)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=logging.INFO)
            self.pol.run()
            self.add_log("IcePolish completed.", level=logging.INFO)

            # cluster report
            self.add_log("Creating a link to cluster report.",
                         level=logging.INFO)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa,
                               hq_fa=self.pol.icepq.quivered_good_fa,
                               lq_fa=self.pol.icepq.quivered_bad_fa)

        # Close the log file.
        self.close_log()
        return 0