Exemple #1
0
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Smoke test for daligner: copy the file named by GCON_IN_FA into a
    scratch directory, call DalignerRunner.make_db for it and run
    DalignerRunner with that file as both inputs.
    The alignment output is never inspected.

    scriptDir --- directory under which the scratch directory lives;
                  created if it does not exist.
    testDirName --- name of the scratch directory; it is removed again
                    once the run has completed.

    Return True when every step has completed. Nothing is caught here,
    so a failing step propagates its own exception.
    """
    scriptDir = realpath(scriptDir)
    scratch_dir = op.join(scriptDir, testDirName)

    for directory in (scriptDir, scratch_dir):
        if not op.exists(directory):
            os.makedirs(directory)

    # Always work on a fresh copy of the bundled input file.
    scratch_fa = op.join(scratch_dir, "gcon_in.fa")
    if op.exists(scratch_fa):
        os.remove(scratch_fa)
    shutil.copy(GCON_IN_FA, scratch_fa)
    assert op.exists(scratch_fa)

    dazz_handler = DazzIDHandler(scratch_fa)
    DalignerRunner.make_db(dazz_handler.dazz_filename)

    runner_options = dict(is_FL=True, same_strand_only=True,
                          query_converted=True, db_converted=True,
                          query_made=True, db_made=True,
                          use_sge=False, cpus=4, sge_opts=None)
    runner = DalignerRunner(scratch_fa, scratch_fa, **runner_options)
    runner.runHPC(min_match_len=300, output_dir=scratch_dir,
                  sensitive_mode=False)

    shutil.rmtree(scratch_dir)
    logging.info("daligner check passed.")
    return True
Exemple #2
0
    def _align(self, queryFa, output_dir, ice_opts, sge_opts):
        """
        Run daligner locally with queryFa as both inputs.

        queryFa --- fasta file handed to DalignerRunner twice.
        output_dir --- directory passed to runHPC as output_dir.
        ice_opts, sge_opts --- accepted but not used by this method.

        Return (DazzIDHandler for queryFa, second value returned by
        runHPC).
        """
        sensitive_mode, min_match_len, _unused_high = \
            get_daligner_sensitivity_setting(queryFa)

        query_handler = DazzIDHandler(queryFa, False)
        DalignerRunner.make_db(query_handler.dazz_filename)

        # Run on this machine: use_sge is off and no sge_opts are passed.
        local_runner = DalignerRunner(
            queryFa,
            queryFa,
            is_FL=True,
            same_strand_only=True,
            query_converted=True,
            db_converted=True,
            query_made=True,
            db_made=True,
            use_sge=False,
            cpus=4,
            sge_opts=None)
        _unused_las, las_out_filenames = local_runner.runHPC(
            min_match_len=min_match_len,
            output_dir=output_dir,
            sensitive_mode=sensitive_mode)
        return query_handler, las_out_filenames
Exemple #3
0
    def _align(self, queryFa, output_dir, ice_opts, sge_opts):
        """
        Build the dazz DB for queryFa and run a local DalignerRunner
        with queryFa passed as both inputs.

        queryFa --- input fasta file.
        output_dir --- forwarded to runHPC as output_dir.
        ice_opts, sge_opts --- not read anywhere in this method.

        Return a tuple of the DazzIDHandler created for queryFa and the
        second element of the runHPC result.
        """
        settings = get_daligner_sensitivity_setting(queryFa)
        # Exactly three values are expected; only the first two are used.
        use_sensitive_mode, low_cutoff, _high_cutoff = settings

        dazz_input = DazzIDHandler(queryFa, False)
        DalignerRunner.make_db(dazz_input.dazz_filename)

        # Everything runs locally, hence use_sge=False and sge_opts=None.
        runner_kwargs = dict(is_FL=True, same_strand_only=True,
                             query_converted=True, db_converted=True,
                             query_made=True, db_made=True,
                             use_sge=False, cpus=4, sge_opts=None)
        runner = DalignerRunner(queryFa, queryFa, **runner_kwargs)

        hpc_result = runner.runHPC(min_match_len=low_cutoff,
                                   output_dir=output_dir,
                                   sensitive_mode=use_sensitive_mode)
        _las_files, las_out_files = hpc_result
        return dazz_input, las_out_files
Exemple #4
0
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Check that daligner can be run at all.

    The file named by GCON_IN_FA is copied to
    <scriptDir>/<testDirName>/gcon_in.fa, DalignerRunner.make_db is
    called for it, and DalignerRunner.runHPC is run with that file as
    both inputs. The results are ignored.

    The test directory is deleted after a completed run and True is
    returned. Exceptions raised by any step are not handled here.
    """
    def _ensure_dir(path):
        # Create path (and missing parents) unless it already exists.
        if not op.exists(path):
            os.makedirs(path)

    script_root = realpath(scriptDir)
    work_dir = op.join(script_root, testDirName)
    _ensure_dir(script_root)
    _ensure_dir(work_dir)

    # Replace any stale copy of the input left from an earlier run.
    work_fa = op.join(work_dir, "gcon_in.fa")
    if op.exists(work_fa):
        os.remove(work_fa)
    shutil.copy(GCON_IN_FA, work_fa)
    assert op.exists(work_fa)

    DalignerRunner.make_db(DazzIDHandler(work_fa).dazz_filename)
    runner = DalignerRunner(
        work_fa, work_fa,
        is_FL=True, same_strand_only=True,
        query_converted=True, db_converted=True,
        query_made=True, db_made=True,
        use_sge=False, cpus=4, sge_opts=None)
    runner.runHPC(min_match_len=300,
                  output_dir=work_dir,
                  sensitive_mode=False)

    shutil.rmtree(work_dir)
    logging.info("daligner check passed.")
    return True
Exemple #5
0
    def run(self):
        """
        Run the polishing steps in this order:

        1. Make sure a fasta fofn exists. self.fasta_fofn defaults to
           <nfl_dir>/input.fasta.fofn; convert_fofn_to_fasta is called
           on self.bas_fofn only if that file is not there yet.
        2. Split self.nfl_fa with splitFasta into files of
           self.nfl_reads_per_split reads under self.nfl_dir.
        3. Call DalignerRunner.make_db for self.final_consensus_fa.
        4. Run IceAllPartials on the split files against
           self.final_consensus_fa, with self.nfl_all_pickle_fn as its
           output pickle.
        5. Run IceQuiver.
        6. Run IceQuiverPostprocess.

        The worker objects are kept on self as icep, iceq and icepq.
        """
        def log_info(message):
            # Shorthand for an INFO-level entry in this object's log.
            self.add_log(message, level=logging.INFO)

        # Step 1: fasta fofn derived from the bas/bax.h5 fofn.
        log_info("Creating fasta fofn from bas/bax.h5 fofn")
        if self.fasta_fofn is None:
            self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
        self.add_log("bas fofn={f}".format(f=self.bas_fofn))
        self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
        if not op.exists(self.fasta_fofn):
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir,
                                  cpus=self.sge_opts.blasr_nproc)
        else:
            self.add_log("No need to run convert_fofn_to_fasta.")

        # Step 2: split the non-full-length reads into smaller fasta files.
        splitting_msg = ("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                         "smaller files each containing {n} reads.".format(
                             n=self.nfl_reads_per_split))
        log_info(splitting_msg)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        log_info("Splitted files are: " + "\n".join(self._nfl_splitted_fas))

        # Step 3: dazz DB for the consensus fasta.
        consensus_db = DazzIDHandler(self.final_consensus_fa, False)
        DalignerRunner.make_db(consensus_db.dazz_filename)
        log_info("Dazz DB made for: " + consensus_db.dazz_filename)

        # Step 4: process the reads of every split fasta.
        log_info("Initializing IceAllPartials.")
        # sa_file is passed as None since we are switching to daligner;
        # remove sa_file completely later when daligner is mature (ToDo).
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            sa_file=None,
            ccs_fofn=self.ccs_fofn)
        log_info("IceAllPartials log: {f}.".format(f=self.icep.log_fn))
        self.icep.run()
        log_info("IceAllPartials completed.")

        # Step 5: IceQuiver.
        log_info("Initializing IceQuiver.")
        self.iceq = IceQuiver(
            root_dir=self.root_dir,
            bas_fofn=self.bas_fofn,
            fasta_fofn=self.fasta_fofn,
            sge_opts=self.sge_opts)
        log_info("IceQuiver log: {f}.".format(f=self.iceq.log_fn))
        self.iceq.run()
        log_info("IceQuiver finished.")

        # Step 6: IceQuiverPostprocess.
        log_info("Initializing IceQuiverPostprocess.")
        self.icepq = IceQuiverPostprocess(
            root_dir=self.root_dir,
            use_sge=self.sge_opts.use_sge,
            quit_if_not_done=False,
            ipq_opts=self.ipq_opts)
        log_info("IceQuiverPostprocess log: {f}.".format(f=self.icepq.log_fn))
        self.icepq.run()
        log_info("IceQuiverPostprocess finished.")
Exemple #6
0
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True):
    """
    Given an input_fastq file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file, touch a done file and delete the
    intermediate files named by the two lists runHPC returns.

    input_fastq --- reads; converted with ice_fq2fa to a FASTA file at the
                    same path with the extension replaced by '.fasta'.
    ref_fasta --- consensus isoforms; the runner is told its DB is already
                  made (db_made=True).
    out_pickle --- output pickle; its directory is also used as the
                   output_dir of the daligner run.
    ccs_fofn, use_finer_qv --- not used by this function; kept so existing
                               callers keep working.
    done_filename --- file to touch at the end; defaults to
                      out_pickle + '.DONE'.
    cpus --- passed to DalignerRunner.
    no_qv_or_aln_checking --- if True, a dummy ProbFromModel is passed on;
                              otherwise QVs are loaded from input_fastq.
    """
    input_fastq = realpath(input_fastq)
    # splitext only considers the last path component, so a name without
    # an extension (or a dot in a parent directory) is handled correctly;
    # slicing at rfind('.') would truncate the path in those cases.
    input_fasta = os.path.splitext(input_fastq)[0] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
        get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=_low, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t))

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=_ece_min_len,
                                 same_strand_only=False,
                                 no_qv_or_aln_checking=no_qv_or_aln_checking,
                                 max_missed_start=_ignore5,
                                 max_missed_end=_ignore3)
        for h in hitItems:
            # Only hits that carry an ece_arr are counted as assignments.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        timing_msg = "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)
        logging.info(timing_msg)
        # The same line also goes to stderr.
        sys.stderr.write(timing_msg + "\n")

    # The pickle stores plain lists rather than sets for each isoform.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    all_read_ids = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = all_read_ids.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # Pickle data belongs in a binary-mode file.
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_file in las_filenames:
        os.remove(las_file)
    for las_out_file in las_out_filenames:
        os.remove(las_out_file)
Exemple #7
0
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None,
                          done_filename=None, use_finer_qv=False, cpus=24, no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file, touch a done file and delete the
    intermediate files named by the two lists runHPC returns.

    input_fasta --- partial reads.
    ref_fasta --- consensus isoforms; the runner is told its DB is already
                  made (db_made=True).
    out_pickle --- output pickle; its directory is also used as the
                   output_dir of the daligner run.
    done_filename --- file to touch at the end; defaults to
                      out_pickle + '.DONE'.
    cpus --- passed to DalignerRunner.
    no_qv_or_aln_checking --- if True, a dummy ProbFromModel is passed on
                              and ccs_fofn / use_finer_qv are ignored.
    ccs_fofn --- only consulted when no_qv_or_aln_checking is False.
                 If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    use_finer_qv --- with a ccs_fofn: if True use ProbFromQV, otherwise
                     convert to FASTQ with ice_fa2fq and use ProbFromFastq.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    # NOTE(review): min_match_len is fixed at 300 here and the _low value
    # computed above is ignored -- confirm this is intended.
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=300, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to dalign_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(f=ccs_fofn, i=input_fasta,
                s=time.time()-start_t))
        else:
            # splitext only considers the last path component, so a name
            # without an extension (or a dot in a parent directory) is
            # handled correctly; slicing at rfind('.') would truncate the
            # path in those cases.
            input_fastq = os.path.splitext(input_fasta)[0] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            qv_msg = "Loading QVs from {fq} took {s} secs".format(fq=input_fastq, s=time.time()-start_t)
            logging.info(qv_msg)
            # The same line also goes to stderr.
            sys.stderr.write(qv_msg + "\n")

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Only hits that carry an ece_arr are counted as assignments.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        timing_msg = "processing {0} took {1} sec".format(las_out_filename, time.time()-start_t)
        logging.info(timing_msg)
        # The same line also goes to stderr.
        sys.stderr.write(timing_msg + "\n")

    # The pickle stores plain lists rather than sets for each isoform.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    all_read_ids = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = all_read_ids.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # Pickle data belongs in a binary-mode file.
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # remove all the .las and .las.out filenames
    for las_file in las_filenames:
        os.remove(las_file)
    for las_out_file in las_out_filenames:
        os.remove(las_out_file)