Example #1
0
    def run(self):
        """Convert the input fasta to fastq and report a process exit code.

        Reads in_fa, ccs_fofn and out_fq from self.args, validates the
        inputs, builds the command string used in the failure message and
        then calls ice_fa2fq.

        Returns 0 on success, or 1 if anything raised (the traceback is
        logged via logging.exception).
        """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=get_version()))
        # Defined before the try block so the failure message can always be
        # formatted, even if the error happens before the command is built.
        cmd_str = ""
        try:
            opts = self.args
            inputs = dict(in_fa=opts.in_fa, ccs_fofn=opts.ccs_fofn)
            out_fq = opts.out_fq

            self.validate_inputs(**inputs)
            cmd_str = self.cmd_str(out_fq=out_fq, **inputs)
            ice_fa2fq(out_fq=out_fq, **inputs)
        except:
            failure_msg = "Exiting {cmd} with return code 1.".format(
                cmd=cmd_str)
            logging.exception(failure_msg)
            return 1
        else:
            return 0
Example #2
0
    def run(self):
        """Execute ice_fa2fq.py.

        Validates the fasta / ccs fofn inputs taken from self.args, builds
        the command string and performs the fasta-to-fastq conversion.

        Returns 0 when the conversion completes and 1 when any step raises;
        in the latter case the exception is logged together with the
        command string (empty if it had not been built yet).
        """
        banner = "Running {f} v{v}.".format(f=op.basename(__file__),
                                            v=get_version())
        logging.info(banner)

        cmd_str = ""
        exit_code = 0
        try:
            parsed = self.args
            in_fa = parsed.in_fa
            ccs_fofn = parsed.ccs_fofn
            out_fq = parsed.out_fq

            self.validate_inputs(in_fa=in_fa, ccs_fofn=ccs_fofn)
            cmd_str = self.cmd_str(in_fa=in_fa, ccs_fofn=ccs_fofn,
                                   out_fq=out_fq)
            ice_fa2fq(in_fa=in_fa, ccs_fofn=ccs_fofn, out_fq=out_fq)
        except:
            logging.exception(
                "Exiting {cmd} with return code 1.".format(cmd=cmd_str))
            exit_code = 1
        return exit_code
Example #3
0
def pickup_icec_job(pickle_filename, ccs_fofn, flnc_filename,
                    fasta_files_to_add, root_dir):
    """
    Resume an ICE job from a pickle.

    Steps, in order:
      1. Check/prepare the pickle via ensure_pickle_goodness.
      2. Build the current fasta via make_current_fasta and convert it to
         fastq with ice_fa2fq so QVs can be loaded from it.
      3. Restore the IceIterative object from the pickle, reset its
         `changes` and `refs`, and re-run gcon on every cluster in `uc`.
      4. Re-calculate cluster probabilities, run the sanity checks and
         finally call run() on the restored object.

    pickle_filename --- pickle of the ICE job to pick up
    ccs_fofn --- passed to ice_fa2fq as the QV source
    flnc_filename --- passed to make_current_fasta
    fasta_files_to_add --- passed to ensure_pickle_goodness
    root_dir --- directory holding current.fasta / current.fastq
    """
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(
        pickle_filename, root_dir, fasta_files_to_add)
    make_current_fasta(icec_obj, flnc_filename, root_dir)
    print >> sys.stderr, "Reading QV information...."
    # first need to convert to fastq
    # The fastq is read back from root_dir below, so it must be written
    # there as well instead of relative to the current working directory.
    # NOTE(review): assumes make_current_fasta writes current.fasta under
    # root_dir -- confirm against its definition.
    current_fa = os.path.join(root_dir, 'current.fasta')
    current_fq = os.path.join(root_dir, 'current.fastq')
    ice_fa2fq(current_fa, ccs_fofn, current_fq)
    probqv = pm.ProbFromFastq(current_fq)
    icec = ice.IceIterative.from_pickle(icec_pickle_filename, probqv)
    # first must RE-RUN gcon to get all the proper refs
    icec.changes = set()
    icec.refs = {}
    todo = icec.uc.keys()
    print >> sys.stderr, "Re-run gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
Example #4
0
def pickup_icec_job(pickle_filename, ccs_fofn, flnc_filename, fasta_files_to_add, root_dir):
    """
    Resume an ICE job from a pickle.

    Prepares the pickle (ensure_pickle_goodness), builds the current fasta
    (make_current_fasta), converts it to fastq to load QVs, restores the
    IceIterative object, points it at the given ccs_fofn / flnc_filename,
    re-runs gcon for every cluster in `uc`, re-calculates cluster
    probabilities, runs the sanity checks and then calls run().

    pickle_filename --- pickle of the ICE job to pick up
    ccs_fofn --- QV source for ice_fa2fq; also stored on the restored object
    flnc_filename --- passed to make_current_fasta; also stored on the
                      restored object as all_fasta_filename
    fasta_files_to_add --- passed to ensure_pickle_goodness
    root_dir --- directory holding current.fasta / current.fastq
    """
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(pickle_filename, root_dir, fasta_files_to_add)
    make_current_fasta(icec_obj, flnc_filename, root_dir)
    print >> sys.stderr, "Reading QV information...."
    # first need to convert to fastq
    # The fastq is read back from root_dir below, so write it there too
    # rather than relative to the current working directory.
    # NOTE(review): assumes make_current_fasta writes current.fasta under
    # root_dir -- confirm against its definition.
    current_fa = os.path.join(root_dir, 'current.fasta')
    current_fq = os.path.join(root_dir, 'current.fastq')
    ice_fa2fq(current_fa, ccs_fofn, current_fq)
    probqv = pm.ProbFromFastq(current_fq)
    icec = ice.IceIterative.from_pickle(icec_pickle_filename, probqv)
    # first must RE-RUN gcon to get all the proper refs
    icec.changes = set()
    icec.refs = {}
    # Override the input locations stored on the restored object with the
    # ones supplied for this run.
    icec.ccs_fofn = ccs_fofn
    icec.all_fasta_filename = flnc_filename
    todo = icec.uc.keys()
    print >> sys.stderr, "Re-run gcon for proper refs...."
    icec.run_gcon_parallel(todo)
    print >> sys.stderr, "Re-calculating cluster prob, just to be safe...."
    icec.calc_cluster_prob(True)
    print >> sys.stderr, "Sanity checking now...."
    icec.sanity_check_uc_refs()
    icec.ensure_probQV_newid_consistency()
    print >> sys.stderr, "Sanity check done. Resuming ICE job."
    icec.run()
Example #5
0
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file, remove the intermediate BLASR output and
    touch a "done" marker file.

    sa_file --- suffix array passed to blasr -sa; if None,
    input_fasta + ".sa" is used when that file exists.
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    done_filename --- marker file to touch at the end; defaults to
    out_pickle + '.DONE'.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    use_finer_qv --- only used when ccs_fofn is given: if True load QVs
    with ProbFromQV, otherwise convert input_fasta to fastq and load QVs
    with ProbFromFastq.

    Raises RuntimeError if the blasr command exits with a non-zero code.
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None and op.exists(input_fasta + ".sa"):
        sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            # Log only after loading so the reported time covers the load.
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time()-start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            # Log only after loading so the reported time covers the load
            # (it also includes the fasta --> fastq conversion above).
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time()-start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits that carry an ece_arr are counted as assignments.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    # Sets were used above to de-duplicate; the pickle stores lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
Example #6
0
    def run(self):
        """Cluster consensus isoforms with ICE and optionally polish them.

        Splits the full-length non-chimeric fasta, converts the first split
        to fastq, sets up the QV model, obtains initial clusters (from an
        existing pickle if present, otherwise via IceInit), runs
        IceIterative, links the unpolished consensus output and then either
        links the cluster report directly or runs Polish first, depending
        on ice_opts.quiver. Always returns 0.
        """
        info = logging.INFO
        opts = self.ice_opts
        self.add_log("Start to run cluster.", level=info)

        # In targeted mode only the first split is made smaller; the
        # remaining splits use a larger per-file read count.
        first_split = None
        if opts.targeted_isoseq:
            first_split = 1000
            opts.flnc_reads_per_split = 10000
            self.add_log(
                "targeted_isoseq: further splitting JUST first split to 1000. Changing flnc_reads_per_split=10000."
            )

        # Split flnc_fa into smaller files, remembered in _flnc_splitted_fas.
        self.add_log(
            "Splitting {flnc} into smaller files each containing {n} reads.".format(
                flnc=self.flnc_fa, n=opts.flnc_reads_per_split
            ),
            level=info,
        )
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split",
            first_split=first_split,
        )
        self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=info)

        # The first split seeds the clustering; it needs a fastq companion.
        seed_fa = self._flnc_splitted_fas[0]
        seed_fq = seed_fa[: seed_fa.rfind(".")] + ".fastq"
        self.add_log(
            "Converting first split file {0} + {1} into fastq\n".format(seed_fa, self.ccs_fofn), level=info
        )
        ice_fa2fq(seed_fa, self.ccs_fofn, seed_fq)

        # Set up the probability / quality value model.
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, seed_fa)
        else:
            self._setProbQV_fq(firstSplitFq=seed_fq)

        # Initial clusters: compute by clique finding unless a pickle from
        # an earlier run already exists.
        if not os.path.exists(self.initPickleFN):
            self.add_log("Finding maximal cliques: initializing IceInit.", level=info)
            self.iceinit = IceInit(
                readsFa=seed_fa,
                qver_get_func=self._probqv.get_smoothed,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                qvmean_get_func=self._probqv.get_mean,
            )
            uc = self.iceinit.uc

            self.add_log("Dumping initial clusters to {f}".format(f=self.initPickleFN), level=info)
            with open(self.initPickleFN, "w") as handle:
                cPickle.dump(uc, handle)
        else:
            self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=info)
            with open(self.initPickleFN) as handle:
                uc = cPickle.load(handle)

        # Iterative clustering.
        self.add_log("Iterative clustering: initializing IceIterative.", level=info)
        self.icec = IceIterative(
            fasta_filename=seed_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=seed_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv,
        )
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=info)

        # Expose the predicted (unpolished) consensus isoforms as out_fa.
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        if self.ice_opts.quiver is True:
            # Polish the predicted consensus isoforms.
            self.add_log("Polishing clusters: initializing IcePolish.", level=info)
            self.pol = Polish(
                root_dir=self.root_dir,
                nfl_fa=self.nfl_fa,
                bas_fofn=self.bas_fofn,
                ccs_fofn=self.ccs_fofn,
                fasta_fofn=self.fasta_fofn,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                ipq_opts=self.ipq_opts,
                nfl_reads_per_split=self.nfl_reads_per_split,
            )
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=info)
            self.pol.run()
            self.add_log("IcePolish completed.", level=info)

            # Cluster report comes from the polish step.
            self.add_log("Creating a link to cluster report.", level=info)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(
                summary_fn=self.summary_fn,
                isoforms_fa=self.out_fa,
                hq_fa=self.pol.icepq.quivered_good_fa,
                lq_fa=self.pol.icepq.quivered_bad_fa,
            )
        else:
            # No polishing: the report comes straight from IceIterative.
            self.add_log("Creating a link to cluster report.", level=info)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa)

        # Finish the log.
        self.close_log()
        return 0
Example #7
0
    def run(self):
        """Cluster consensus isoforms with ICE and optionally polish them.

        Splits the full-length non-chimeric fasta, converts the first split
        to fastq, sets up the QV model, computes initial clusters with
        IceInit and pickles them, runs IceIterative, links the unpolished
        consensus output, and then either links the cluster report directly
        or runs Polish first, depending on ice_opts.quiver.
        Always returns 0.
        """
        info = logging.INFO
        self.add_log("Start to run cluster.", level=info)

        # Split flnc_fa into smaller files, remembered in _flnc_splitted_fas.
        reads_per_split = self.ice_opts.flnc_reads_per_split
        split_msg = (
            "Splitting {flnc} into smaller files each containing {n} reads."
            .format(flnc=self.flnc_fa, n=reads_per_split))
        self.add_log(split_msg, level=info)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log(
            "Splitted files are: " + "\n".join(self._flnc_splitted_fas),
            level=info)

        # The first split seeds the clustering; it needs a fastq companion.
        seed_fa = self._flnc_splitted_fas[0]
        seed_fq = seed_fa[:seed_fa.rfind('.')] + '.fastq'
        self.add_log(
            "Converting first split file {0} + {1} into fastq\n".format(
                seed_fa, self.ccs_fofn),
            level=info)
        ice_fa2fq(seed_fa, self.ccs_fofn, seed_fq)

        # Set up the probability / quality value model.
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, seed_fa)
        else:
            self._setProbQV_fq(firstSplitFq=seed_fq)

        # Initial clusters by clique finding.
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=info)
        self.iceinit = IceInit(
            readsFa=seed_fa,
            qver_get_func=self._probqv.get_smoothed,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Persist the initial clusters.
        self.add_log(
            "Dumping initial clusters to {f}".format(f=self.initPickleFN),
            level=info)
        with open(self.initPickleFN, 'w') as handle:
            cPickle.dump(uc, handle)

        # Iterative clustering.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=info)
        self.icec = IceIterative(
            fasta_filename=seed_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=seed_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
        self.icec.run()
        self.add_log("IceIterative completed.", level=info)

        # Expose the predicted (unpolished) consensus isoforms as out_fa.
        self.add_log("Creating a link to unpolished consensus isoforms.")
        ln(self.icec.final_consensus_fa, self.out_fa)

        if self.ice_opts.quiver is True:
            # Polish the predicted consensus isoforms.
            self.add_log("Polishing clusters: initializing IcePolish.",
                         level=info)
            self.pol = Polish(
                root_dir=self.root_dir,
                nfl_fa=self.nfl_fa,
                bas_fofn=self.bas_fofn,
                ccs_fofn=self.ccs_fofn,
                fasta_fofn=self.fasta_fofn,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts,
                ipq_opts=self.ipq_opts,
                nfl_reads_per_split=self.nfl_reads_per_split)
            self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                         level=info)
            self.pol.run()
            self.add_log("IcePolish completed.", level=info)

            # Cluster report comes from the polish step.
            self.add_log("Creating a link to cluster report.", level=info)
            ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

            # Summarize cluster & polish and write to summary_fn.
            self.write_summary(
                summary_fn=self.summary_fn,
                isoforms_fa=self.out_fa,
                hq_fa=self.pol.icepq.quivered_good_fa,
                lq_fa=self.pol.icepq.quivered_bad_fa)
        else:
            # No polishing: the report comes straight from IceIterative.
            self.add_log("Creating a link to cluster report.", level=info)
            ln(src=self.icec.report_fn, dst=self.report_fn)

            # Summarize cluster and write to summary_fn.
            self.write_summary(summary_fn=self.summary_fn,
                               isoforms_fa=self.out_fa)

        # Finish the log.
        self.close_log()
        return 0
Example #8
0
    def run(self):
        """Prepare ICE clustering and return the IceIterative object.

        Splits the full-length non-chimeric fasta, converts the first split
        to fastq, sets up the QV model and obtains initial clusters (loaded
        from initPickleFN if that file exists, otherwise computed with
        IceInit and pickled). The IceIterative object is then constructed,
        stored on self.icec and returned; its run() is not called here.
        """
        info = logging.INFO
        self.add_log("Start to run cluster.", level=info)

        # Split flnc_fa into smaller files, remembered in _flnc_splitted_fas.
        reads_per_split = self.ice_opts.flnc_reads_per_split
        split_msg = (
            "Splitting {flnc} into smaller files each containing {n} reads."
            .format(flnc=self.flnc_fa, n=reads_per_split))
        self.add_log(split_msg, level=info)
        self._flnc_splitted_fas = splitFasta(
            input_fasta=self.flnc_fa,
            reads_per_split=self.ice_opts.flnc_reads_per_split,
            out_dir=self.root_dir,
            out_prefix="input.split")
        self.add_log(
            "Splitted files are: " + "\n".join(self._flnc_splitted_fas),
            level=info)

        # The first split seeds the clustering; it needs a fastq companion.
        seed_fa = self._flnc_splitted_fas[0]
        seed_fq = seed_fa[:seed_fa.rfind('.')] + '.fastq'
        self.add_log(
            "Converting first split file {0} + {1} into fastq\n".format(
                seed_fa, self.ccs_fofn),
            level=info)
        ice_fa2fq(seed_fa, self.ccs_fofn, seed_fq)

        # Set up the probability / quality value model.
        if self.ice_opts.use_finer_qv:
            self._setProbQV_ccs(self.ccs_fofn, seed_fa)
        else:
            self._setProbQV_fq(firstSplitFq=seed_fq)

        # Initial clusters: compute by clique finding unless a pickle from
        # an earlier run already exists.
        if not os.path.exists(self.initPickleFN):
            self.add_log("Finding maximal cliques: initializing IceInit.",
                         level=info)
            self.iceinit = IceInit(
                readsFa=seed_fa,
                qver_get_func=self._probqv.get_smoothed,
                ice_opts=self.ice_opts,
                sge_opts=self.sge_opts)
            uc = self.iceinit.uc

            # Persist the initial clusters.
            self.add_log(
                "Dumping initial clusters to {f}".format(f=self.initPickleFN),
                level=info)
            with open(self.initPickleFN, 'w') as handle:
                cPickle.dump(uc, handle)
        else:
            self.add_log(
                "Reading existing uc pickle: {0}".format(self.initPickleFN),
                level=info)
            with open(self.initPickleFN) as handle:
                uc = cPickle.load(handle)

        # Build (but do not run) the iterative clustering object.
        self.add_log("Iterative clustering: initializing IceIterative.",
                     level=info)
        self.icec = IceIterative(
            fasta_filename=seed_fa,
            fasta_filenames_to_add=self._flnc_splitted_fas[1:],
            all_fasta_filename=self.flnc_fa,
            ccs_fofn=self.ccs_fofn,
            root_dir=self.root_dir,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            uc=uc,
            probQV=self._probqv,
            fastq_filename=seed_fq,
            use_ccs_qv=self.ice_opts.use_finer_qv)
        self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))

        return self.icec
Example #9
0
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True):
    """
    Assign non-full-length (partial) reads in input_fasta to the
    (unpolished) consensus isoforms in ref_fasta using DALIGNER (run
    locally through DalignerRunner), then save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to out_pickle, touch a "done" marker file and delete the intermediate
    .las / .las.out files.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn (only consulted when
    no_qv_or_aln_checking is False).
    done_filename --- marker file to touch; defaults to
    out_pickle + '.DONE'.
    use_finer_qv --- with ccs_fofn given, load QVs through ProbFromQV
    instead of converting to fastq and using ProbFromFastq.
    cpus --- passed to DalignerRunner.
    no_qv_or_aln_checking --- if True a dummy model-based QV object is used
    and the flag is forwarded to dalign_against_ref.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True,
                            query_made=False, db_made=True,
                            use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=300, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    # Pick the QV provider.
    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking: this object is only a
        # placeholder handed to dalign_against_ref.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    elif use_finer_qv:
        start_t = time.time()
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
        logging.info("Loading QVs from {i} + {f} took {s} secs".format(
            f=ccs_fofn, i=input_fasta, s=time.time()-start_t))
    else:
        start_t = time.time()
        input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
        logging.info("Converting {i} + {f} --> {fq}".format(
            i=input_fasta, f=ccs_fofn, fq=input_fastq))
        ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))
        print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    # isoform (cluster) id --> reads that can map to that isoform
    partial_uc = {}
    # ids of reads that got at least one accepted hit
    seen = set()
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hits = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                  is_FL=False,
                                  sID_starts_with_c=True,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  ece_penalty=1,
                                  ece_min_len=20,
                                  same_strand_only=False,
                                  no_qv_or_aln_checking=no_qv_or_aln_checking)
        for hit in hits:
            # Only hits that carry an ece_arr are counted as assignments.
            if hit.ece_arr is None:
                continue
            if hit.cID not in partial_uc:
                partial_uc[hit.cID] = set()
            partial_uc[hit.cID].add(hit.qID)
            seen.add(hit.qID)
        logging.info("processing {0} took {1} sec".format(las_out_filename, time.time()-start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)

    # Sets were used above to de-duplicate; the pickle stores lists.
    for cid in partial_uc:
        partial_uc[cid] = list(partial_uc[cid])

    allhits = set(rec.name.split()[0] for rec in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as handle:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, handle)

    if done_filename is not None:
        done_filename = realpath(done_filename)
    else:
        done_filename = out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_fn in las_filenames:
        os.remove(las_fn)
    for las_out_fn in las_out_filenames:
        os.remove(las_out_fn)
Example #10
0
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file, remove the intermediate BLASR output and
    touch a "done" marker file.

    sa_file --- suffix array passed to blasr -sa; if None,
    input_fasta + ".sa" is used when that file exists.
    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    done_filename --- marker file to touch at the end; defaults to
    out_pickle + '.DONE'.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    use_finer_qv --- only used when ccs_fofn is given: if True load QVs
    with ProbFromQV, otherwise convert input_fasta to fastq and load QVs
    with ProbFromFastq.

    Raises RuntimeError if the blasr command exits with a non-zero code.
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None and op.exists(input_fasta + ".sa"):
        sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            # Log only after loading so the reported time covers the load.
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time()-start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            # Log only after loading so the reported time covers the load
            # (it also includes the fasta --> fastq conversion above).
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time()-start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Only hits that carry an ece_arr are counted as assignments.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    # Sets were used above to de-duplicate; the pickle stores lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)