Esempio n. 1
0
def set_probqv_from_model():
    """Create the probability model from fixed, predefined parameters.

    Returns a ``(probqv, log_info)`` tuple: the ``ProbFromModel`` instance
    built with the constants (0.01, 0.07, 0.06), and a message string the
    caller can log to record where the probabilities came from.
    """
    return (ProbFromModel(0.01, 0.07, 0.06),
            "Loading predefined probabilities model.")
Esempio n. 2
0
def pickup_icec_job(pickle_filename, ccs_fofn, flnc_filename,
                    fasta_files_to_add, root_dir):
    """
    Restore an ICE object from a pickle file and resume the ICE job.

    pickle_filename --- pickle to restore from; it is first passed through
        ensure_pickle_goodness, and the pickle path that call returns is
        the one actually loaded.
    ccs_fofn --- If None, probabilities come from the fixed model
        (0.01, 0.07, 0.06); otherwise the current fasta is converted to
        fastq with ice_fa2fq and probabilities are loaded from that fastq.
    flnc_filename --- forwarded to make_current_fasta and stored on the
        restored object as all_fasta_filename.
    fasta_files_to_add --- forwarded to ensure_pickle_goodness.
    root_dir --- directory handed to ensure_pickle_goodness,
        current_fasta, current_fastq and make_current_fasta.
    """
    log.info("Reading ICE pickle %s ....", pickle_filename)
    icec_obj, icec_pickle_filename = ensure_pickle_goodness(
        pickle_filename=pickle_filename,
        root_dir=root_dir,
        fasta_files_to_add=fasta_files_to_add)

    cur_fasta = current_fasta(root_dir)
    cur_fastq = current_fastq(root_dir)
    log.info("Making current.fasta %s for ICE ....", cur_fasta)
    make_current_fasta(icec_obj=icec_obj,
                       flnc_filename=flnc_filename,
                       root_dir=root_dir)

    log.info("Loading prob QV information....")
    if ccs_fofn is not None:
        # QVs are available: materialise them as a fastq next to the
        # current fasta, then load the probabilities from that fastq.
        logging.info("Converting %s to %s", cur_fasta, cur_fastq)
        ice_fa2fq(cur_fasta, ccs_fofn, cur_fastq)

        logging.info("Loading prob QVs from %s", cur_fastq)
        probqv = ProbFromFastq(cur_fastq)
    else:
        # No QV source given: fall back to the fixed model.
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)

    log.info("Starting ICE from pickle %s....", icec_pickle_filename)
    icec = ice.IceIterative.from_pickle(icec_pickle_filename, probqv)

    # The restored object's refs are discarded and rebuilt: gcon must be
    # RE-RUN over every cluster to get all the proper refs.
    icec.changes = set()
    icec.refs = {}
    icec.ccs_fofn = ccs_fofn
    icec.all_fasta_filename = flnc_filename
    cluster_ids = icec.uc.keys()
    log.info("Re-run gcon for proper refs....")
    icec.run_gcon_parallel(cluster_ids)

    log.info("Re-calculating cluster prob, just to be safe....")
    icec.calc_cluster_prob(True)

    log.info("Sanity checking uc_refs now....")
    icec.sanity_check_uc_refs()

    log.info("Ensuring prob QV of new ids are consistent....")
    icec.ensure_probQV_newid_consistency()

    log.info("Sanity check done. Resuming ICE job.")
    icec.run()
Esempio n. 3
0
def build_uc_from_partial_daligner(input_fasta,
                                   ref_fasta,
                                   out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': no_hit_read_ids}
    to out_pickle. The format is chosen by the file extension:
        ".pickle" --- pickled, 'nohit' is a set
        ".json"   --- JSON text, 'nohit' is a sorted list (JSON has no
                      set type)

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.

    done_filename --- sentinel file touched after the output is written;
                      defaults to out_pickle + '.DONE'.

    use_finer_qv --- only consulted when QVs are used and ccs_fofn is
                     given: if True load QVs with ProbFromQV, otherwise
                     convert input_fasta to fastq and use ProbFromFastq.

    cpus --- passed to DalignerRunner.

    no_qv_or_aln_checking --- if True, a dummy fixed-model probqv is
                     created and the flag is forwarded to
                     daligner_against_ref.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.

    Raises IOError if out_pickle ends with neither ".pickle" nor ".json".
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    # Fail fast: reject an unusable output name before doing any alignment
    # work and before creating/truncating the output file.
    if not out_pickle.endswith((".pickle", ".json")):
        raise IOError("Unrecognized extension: %s" % out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False,
                            same_strand_only=False,
                            query_converted=False,
                            target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False,
                            sge_opts=None,
                            cpus=cpus)
    runner.run(min_match_len=300,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from %s + %s took %s secs", ccs_fofn,
                         input_fasta,
                         time.time() - start_t)
        else:
            # splitext only strips an extension of the file name itself, so
            # a name without any '.' no longer loses its last character (or
            # gets cut at a dot inside a directory name).
            input_fastq = op.splitext(input_fasta)[0] + '.fastq'
            logging.info("Converting %s + %s --> %s", input_fasta,
                         ccs_fofn, input_fastq)
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from %s took %s secs", input_fastq,
                         time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform (sets here, converted to lists below)
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False,
            sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1,
            ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Hits without an ece_arr are skipped and do not count as seen.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec", la4ice_filename,
                     str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    if out_pickle.endswith(".pickle"):
        # Pickle data is bytes, so the handle must be binary.
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # A set is not JSON serializable; emit the ids as a sorted list so
        # the output is also deterministic.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
Esempio n. 4
0
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12,
                          tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': no_hit_read_ids}
    to out_pickle. The format is chosen by the file extension:
        ".pickle" --- pickled, 'nohit' is a set
        ".json"   --- JSON text, 'nohit' is a sorted list (JSON has no
                      set type)

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    done_filename --- sentinel file touched at the very end; defaults to
    out_pickle + '.DONE'.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    tmp_dir --- directory for the intermediate BLASR m5 output; if None
    the m5 file name is left relative (no directory is prepended). The m5
    file is removed once the output has been written.

    Raises IOError if out_pickle ends with neither ".pickle" nor ".json".
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    # Fail fast: reject an unusable output name before running BLASR and
    # before creating/truncating the output file.
    if not out_pickle.endswith((".pickle", ".json")):
        raise IOError("Unrecognized extension: %s" % out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform (sets here, converted to lists below)
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        # Hits without an ece_arr are skipped and do not count as seen.
        if h.ece_arr is not None:
            partial_uc.setdefault(h.cID, set()).add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    if out_pickle.endswith(".pickle"):
        # Pickle data is bytes, so the handle must be binary.
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    else:
        # A set is not JSON serializable; emit the ids as a sorted list so
        # the output is also deterministic.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
Esempio n. 5
0
    def _test_daligner_against_ref(self,
                                   test_name,
                                   use_sge,
                                   sge_opts,
                                   prob_model_from="fake"):
        """Test daligner_against_ref with and without using sge.

        test_name --- name of the sub-directory of self.outDir that is
            (re)created and used as the working/output directory.
        use_sge, sge_opts --- forwarded to DalignerRunner.
        prob_model_from --- "fake" uses the fixed ProbFromModel,
            "fastq" loads ProbFromFastq from the test data directory;
            any other value fails the test.

        The process working directory is changed to the output directory
        while DALIGNER runs and is restored afterwards, even on failure.
        """
        copy_dir = op.join(self.dataDir, "test_daligner_against_ref")
        output_dir = op.join(self.outDir, test_name)
        mknewdir(output_dir)

        qname, tname = "test_daligner_query.fasta", "test_daligner_target.fasta"
        query_filename = op.join(output_dir, qname)
        target_filename = op.join(output_dir, tname)

        if prob_model_from == "fake":
            prob_model = ProbFromModel(0.01, 0.07, 0.06)
        elif prob_model_from == "fastq":
            fastq_fn = op.join(copy_dir, "test_daligner_reads.fastq")
            prob_model = ProbFromFastq(fastq_fn)
        else:
            self.fail("Unknown prob_model_from: %r" % (prob_model_from,))

        qver_get_func = prob_model.get_smoothed
        qvmean_get_func = prob_model.get_mean

        # Copy the query/target fixtures into the fresh output directory.
        dummy_o, c, dummy_m = backticks(
            "cp %s %s" % (op.join(copy_dir, qname), query_filename))
        self.assertEqual(c, 0)

        dummy_o, c, dummy_m = backticks(
            "cp %s %s" % (op.join(copy_dir, tname), target_filename))
        self.assertEqual(c, 0)

        old_dir = os.getcwd()
        os.chdir(output_dir)
        try:
            runner = DalignerRunner(query_filename=query_filename,
                                    target_filename=target_filename,
                                    is_FL=True,
                                    same_strand_only=True,
                                    use_sge=use_sge,
                                    sge_opts=sge_opts)
            runner.run(output_dir=output_dir)

            hits = []

            for la4ice_filename in runner.la4ice_filenames:
                hits.extend(
                    daligner_against_ref(
                        query_dazz_handler=runner.query_dazz_handler,
                        target_dazz_handler=runner.target_dazz_handler,
                        la4ice_filename=la4ice_filename,
                        is_FL=True,
                        sID_starts_with_c=False,
                        qver_get_func=qver_get_func,
                        qvmean_get_func=qvmean_get_func))
            # Num of hits may change when daligner or parameters change
            # (706 when this test was written), so the exact count is not
            # pinned; only require that some hits were found.
            self.assertGreater(len(hits), 0)
            self.assertEqual(
                str(hits[0]),
                "m54007_160109_025449/27984844/29_646_CCS/0_617 aligns to m54007_160109_025449/28836279/631_54_CCS"
            )
        finally:
            # Restore the caller's working directory so later tests are not
            # affected by this one.
            os.chdir(old_dir)