Example #1
0
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Smoke-test daligner: align a copy of GCON_IN_FA against itself.

    A scratch directory named `testDirName` is created under `scriptDir`,
    the input fasta is copied into it as "daligner.fasta", and a
    DalignerRunner (use_sge=False) is run with that file as both query and
    target. The alignment output is never inspected; the check passes if
    every step completes. The scratch directory is removed once the run has
    finished.

    Returns True.
    """
    script_root = realpath(scriptDir)
    work_dir = op.join(script_root, testDirName)
    for directory in (script_root, work_dir):
        mkdir(directory)

    fasta_copy = op.join(work_dir, "daligner.fasta")
    # Always work on a fresh copy of the input, replacing any stale one.
    if op.exists(fasta_copy):
        os.remove(fasta_copy)
    shutil.copy(GCON_IN_FA, fasta_copy)
    assert op.exists(fasta_copy)

    runner_options = dict(
        query_filename=fasta_copy,
        target_filename=fasta_copy,
        is_FL=True,
        same_strand_only=True,
        query_converted=False,
        target_converted=False,
        use_sge=False,
        cpus=4,
        sge_opts=None,
    )
    daligner = DalignerRunner(**runner_options)
    daligner.run(output_dir=work_dir, min_match_len=300, sensitive_mode=False)
    daligner.clean_run()

    shutil.rmtree(work_dir)
    logging.info("daligner check passed.")
    return True
Example #2
0
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Verify that daligner can be executed at all.

    GCON_IN_FA is copied to <scriptDir>/<testDirName>/daligner.fasta and
    aligned against itself through DalignerRunner with use_sge=False.
    Nothing about the alignment result is checked. The test directory is
    deleted after the run completes and True is returned.
    """
    scriptDir = realpath(scriptDir)
    mkdir(scriptDir)

    scratch = op.join(scriptDir, testDirName)
    mkdir(scratch)

    query = op.join(scratch, "daligner.fasta")
    stale_copy_present = op.exists(query)
    if stale_copy_present:
        # Drop a leftover from a previous run before copying a new one.
        os.remove(query)
    shutil.copy(GCON_IN_FA, query)
    assert op.exists(query)

    # The same fasta serves as both query and target.
    checker = DalignerRunner(query_filename=query, target_filename=query,
                             is_FL=True, same_strand_only=True,
                             query_converted=False, target_converted=False,
                             use_sge=False, cpus=4, sge_opts=None)
    checker.run(min_match_len=300, sensitive_mode=False, output_dir=scratch)
    checker.clean_run()

    shutil.rmtree(scratch)
    logging.info("daligner check passed.")
    return True
Example #3
0
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename,
                                   ice_opts,
                                   probqv,
                                   qv_prob_threshold=0.3,
                                   cpus=4,
                                   no_qv_or_aln_checking=False,
                                   tmp_dir=None,
                                   sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': no_hit_read_ids}
    to out_pickle. A path ending in ".pickle" is pickled (nohit stays a
    set); a path ending in ".json" is written as JSON (nohit becomes a
    sorted list, because JSON cannot represent a set). Any other extension
    raises IOError before the output file is created.

    done_filename - sentinel file touched after the output is written;
              if None, out_pickle + '.DONE' is used.
    ice_opts - options object; detect_cDNA_size(ref_fasta) is called on it
              and its min_match_len, sensitive_mode, ece_penalty,
              ece_min_len, max_missed_start/end and full_missed_start/end
              are read.
    probqv - object whose get_smoothed and get_mean are handed to
              daligner_against_ref2 as the QV lookup functions.
    qv_prob_threshold, no_qv_or_aln_checking, sID_starts_with_c - passed
              through to daligner_against_ref2 unchanged.
    cpus - passed to DalignerRunner.
    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()

        # NOTE(review): an earlier comment here said full_missed_start/end
        # are not provided when aligning nFL reads, yet both are passed
        # below -- confirm which behaviour is intended.
        hitItems = daligner_against_ref2(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=sID_starts_with_c,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ice_opts.ece_penalty,
            ece_min_len=ice_opts.ece_min_len,
            same_strand_only=True,
            no_qv_or_aln_checking=no_qv_or_aln_checking,
            max_missed_start=ice_opts.max_missed_start,
            max_missed_end=ice_opts.max_missed_end,
            full_missed_start=ice_opts.full_missed_start,
            full_missed_end=ice_opts.full_missed_end)

        for h in hitItems:
            # Only hits that carry an ece_arr are counted.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Pick the format before opening the file so that an unsupported
    # extension does not leave an empty output file behind.
    if out_pickle.endswith(".pickle"):
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    elif out_pickle.endswith(".json"):
        # json cannot serialize a set, so nohit is written as a sorted list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))
    else:
        raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
Example #4
0
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': no_hit_read_ids}
    to out_pickle. A path ending in ".pickle" is pickled (nohit stays a
    set); a path ending in ".json" is written as JSON (nohit becomes a
    sorted list, because JSON cannot represent a set). Any other extension
    raises IOError before the output file is created.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn. Ignored when no_qv_or_aln_checking
    is True.

    done_filename - sentinel file touched after the output is written;
              if None, out_pickle + '.DONE' is used.
    use_finer_qv - with ccs_fofn given, load QVs through ProbFromQV
              instead of converting input_fasta to fastq and using
              ProbFromFastq.
    cpus - passed to DalignerRunner.
    no_qv_or_aln_checking - passed to daligner_against_ref; when True a
              dummy ProbFromModel is supplied as the QV source.
    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from %s + %s took %s secs",
                         ccs_fofn, input_fasta, time.time()-start_t)
        else:
            # The fastq is written next to input_fasta, same base name.
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting %s + %s --> %s",
                         input_fasta, ccs_fofn, input_fastq)
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from %s took %s secs",
                         input_fastq, time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(query_dazz_handler=runner.query_dazz_handler,
                                        target_dazz_handler=runner.target_dazz_handler,
                                        la4ice_filename=la4ice_filename,
                                        is_FL=False,
                                        sID_starts_with_c=True,
                                        qver_get_func=probqv.get_smoothed,
                                        qvmean_get_func=probqv.get_mean,
                                        ece_penalty=1,
                                        ece_min_len=20,
                                        same_strand_only=False,
                                        no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Only hits that carry an ece_arr are counted.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Pick the format before opening the file so that an unsupported
    # extension does not leave an empty output file behind.
    if out_pickle.endswith(".pickle"):
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    elif out_pickle.endswith(".json"):
        # json cannot serialize a set, so nohit is written as a sorted list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))
    else:
        raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
Example #5
0
class TestDalignerRunner(unittest.TestCase):
    """Test pbtranscript.ice_daligner.DalignerRunner"""
    def setUp(self):
        """Create the dazz/output directories and the runner under test."""
        self.data_dir = op.join(DATA_DIR, "test_daligner_against_ref")
        self.script_dir = op.join(OUT_DIR, "test_ice_daligner_script")
        self.dazz_dir = op.join(OUT_DIR, "test_ice_daligner_dazz")
        self.out_dir = op.join(OUT_DIR, "test_ice_daligner_out")
        mkdir(self.dazz_dir)
        mkdir(self.out_dir)
        self.stdout_dir = STD_DIR
        self.sivDataDir = SIV_DATA_DIR
        self.query_filename = "test_daligner_query.fasta"
        self.target_filename = "test_daligner_target.fasta"
        self.runner = DalignerRunner(
            query_filename=op.join(self.data_dir, self.query_filename),
            target_filename=op.join(self.data_dir, self.target_filename),
            is_FL=False,
            same_strand_only=True,
            dazz_dir=self.dazz_dir,
            script_dir=self.script_dir)
        self.runner.output_dir = self.out_dir

    def _assert_outputs_exist(self):
        """Assert that every .las and .las.out file of the runner exists."""
        for las_filename in self.runner.las_filenames:
            print("Checking existance of " + las_filename)
            self.assertTrue(op.exists(las_filename))

        for la4ice_filename in self.runner.la4ice_filenames:
            print("Checking existance of " + la4ice_filename)
            self.assertTrue(op.exists(la4ice_filename))

    def _assert_outputs_removed(self):
        """Assert that no .las or .las.out file of the runner is left."""
        for las_filename in self.runner.las_filenames:
            print("Checking %s has been removed.\n" % las_filename)
            self.assertFalse(op.exists(las_filename))

        for la4ice_filename in self.runner.la4ice_filenames:
            print("Checking %s has been removed.\n" % la4ice_filename)
            self.assertFalse(op.exists(la4ice_filename))

    def test_query_prefix(self):
        """Test query_prefix."""
        # [0:-6] drops the trailing ".fasta" from the file name.
        self.assertEqual(
            self.runner.query_prefix(1),
            op.join(self.dazz_dir, self.query_filename[0:-6] + ".dazz.fasta"))

    def test_target_prefix(self):
        """Test target_prefix."""
        self.assertEqual(
            self.runner.target_prefix(1),
            op.join(self.dazz_dir, self.target_filename[0:-6] + ".dazz.fasta"))

    def test_thread_prefix(self):
        """Test thread_prefix."""
        self.assertEqual(self.runner.thread_prefix(2, is_forward=True), 'N2')
        self.assertEqual(self.runner.thread_prefix(2, is_forward=False), 'C2')

    def test_las_filenames(self):
        """Test las_filenames."""
        expected = [
            op.join(
                self.out_dir, "{q}.{t}.{k}.las".format(
                    q=self.query_filename[0:-6] + ".dazz.fasta",
                    t=self.target_filename[0:-6] + ".dazz.fasta",
                    k=k)) for k in ('N0', 'N1', 'N2', 'N3')
        ]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('las_filenames\n')
        print(expected)
        self.assertEqual(self.runner.las_filenames, expected)

    def test_la4ice_filenames(self):
        """Test la4ice_filenames."""
        expected = [
            op.join(
                self.out_dir, "{q}.{t}.{k}.las.out".format(
                    q=self.query_filename[0:-6] + ".dazz.fasta",
                    t=self.target_filename[0:-6] + ".dazz.fasta",
                    k=k)) for k in ('N0', 'N1', 'N2', 'N3')
        ]
        self.assertEqual(self.runner.la4ice_filenames, expected)

    def test_run(self):
        """Test run(output_dir, min_match_len, sensitive_mode).
        running on sge and locally.
        """
        # The SGE pass is only attempted when `qstat` reports status 0.
        run_on_sge = (backticks('qstat')[1] == 0)

        if run_on_sge:
            self.runner.use_sge = True
            self.runner.sge_opts = SgeOptions(100)
            mknewdir(self.out_dir)
            self.runner.run(output_dir=self.out_dir)
            self._assert_outputs_exist()

        # Run locally
        self.runner.use_sge = False
        mknewdir(self.out_dir)
        self.runner.run(output_dir=self.out_dir)
        self._assert_outputs_exist()

        # clean all output
        self.runner.clean_run()
        self._assert_outputs_removed()
class TestDalignerRunner(unittest.TestCase):
    """Test pbtranscript.ice_daligner.DalignerRunner"""
    def setUp(self):
        """Create the dazz/output directories and the runner under test."""
        self.data_dir = op.join(DATA_DIR, "test_daligner_against_ref")
        self.script_dir = op.join(OUT_DIR, "test_ice_daligner_script")
        self.dazz_dir = op.join(OUT_DIR, "test_ice_daligner_dazz")
        self.out_dir = op.join(OUT_DIR, "test_ice_daligner_out")
        mkdir(self.dazz_dir)
        mkdir(self.out_dir)
        self.stdout_dir = STD_DIR
        self.sivDataDir = SIV_DATA_DIR
        self.query_filename = "test_daligner_query.fasta"
        self.target_filename = "test_daligner_target.fasta"
        self.runner = DalignerRunner(query_filename=op.join(self.data_dir, self.query_filename),
                                     target_filename=op.join(self.data_dir, self.target_filename),
                                     is_FL=False, same_strand_only=True,
                                     dazz_dir=self.dazz_dir, script_dir=self.script_dir)
        self.runner.output_dir = self.out_dir

    def test_query_prefix(self):
        """Test query_prefix."""
        # [0:-6] drops the trailing ".fasta" from the file name.
        self.assertEqual(self.runner.query_prefix(1),
                         op.join(self.dazz_dir, self.query_filename[0:-6] + ".dazz.fasta"))

    def test_target_prefix(self):
        """Test target_prefix."""
        self.assertEqual(self.runner.target_prefix(1),
                         op.join(self.dazz_dir, self.target_filename[0:-6] + ".dazz.fasta"))

    def test_thread_prefix(self):
        """Test thread_prefix."""
        self.assertEqual(self.runner.thread_prefix(2, is_forward=True), 'N2')
        self.assertEqual(self.runner.thread_prefix(2, is_forward=False), 'C2')

    def test_las_filenames(self):
        """Test las_filenames."""
        expected = [op.join(self.out_dir,
                            "{q}.{t}.{k}.las".format(
                                q=self.query_filename[0:-6] + ".dazz.fasta",
                                t=self.target_filename[0:-6] + ".dazz.fasta",
                                k=k))
                    for k in ('N0', 'N1', 'N2', 'N3')]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('las_filenames\n')
        print(expected)
        self.assertEqual(self.runner.las_filenames, expected)

    def test_la4ice_filenames(self):
        """Test la4ice_filenames."""
        expected = [op.join(self.out_dir,
                            "{q}.{t}.{k}.las.out".format(
                                q=self.query_filename[0:-6] + ".dazz.fasta",
                                t=self.target_filename[0:-6] + ".dazz.fasta",
                                k=k))
                    for k in ('N0', 'N1', 'N2', 'N3')]
        self.assertEqual(self.runner.la4ice_filenames, expected)

    def test_run(self):
        """Test run(output_dir, min_match_len, sensitive_mode).
        running on sge and locally.
        """
        # The SGE pass is only attempted when `qstat` reports status 0.
        run_on_sge = (backticks('qstat')[1] == 0)

        if run_on_sge:
            self.runner.use_sge = True
            self.runner.sge_opts = SgeOptions(100)
            mknewdir(self.out_dir)
            self.runner.run(output_dir=self.out_dir)

            for las_filename in self.runner.las_filenames:
                print("Checking existance of " + las_filename)
                self.assertTrue(op.exists(las_filename))

            for la4ice_filename in self.runner.la4ice_filenames:
                print("Checking existance of " + la4ice_filename)
                self.assertTrue(op.exists(la4ice_filename))

        # Run locally
        self.runner.use_sge = False
        mknewdir(self.out_dir)
        self.runner.run(output_dir=self.out_dir)

        for las_filename in self.runner.las_filenames:
            print("Checking existance of " + las_filename)
            self.assertTrue(op.exists(las_filename))

        for la4ice_filename in self.runner.la4ice_filenames:
            print("Checking existance of " + la4ice_filename)
            self.assertTrue(op.exists(la4ice_filename))

        # clean all output
        self.runner.clean_run()

        for las_filename in self.runner.las_filenames:
            print("Checking %s has been removed.\n" % las_filename)
            self.assertFalse(op.exists(las_filename))

        for la4ice_filename in self.runner.la4ice_filenames:
            print("Checking %s has been removed.\n" % la4ice_filename)
            self.assertFalse(op.exists(la4ice_filename))
Example #7
0
def build_uc_from_partial_daligner(input_fasta,
                                   ref_fasta,
                                   out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': no_hit_read_ids}
    to out_pickle. A path ending in ".pickle" is pickled (nohit stays a
    set); a path ending in ".json" is written as JSON (nohit becomes a
    sorted list, because JSON cannot represent a set). Any other extension
    raises IOError before the output file is created.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn. Ignored when no_qv_or_aln_checking
    is True.

    done_filename - sentinel file touched after the output is written;
              if None, out_pickle + '.DONE' is used.
    use_finer_qv - with ccs_fofn given, load QVs through ProbFromQV
              instead of converting input_fasta to fastq and using
              ProbFromFastq.
    cpus - passed to DalignerRunner.
    no_qv_or_aln_checking - passed to daligner_against_ref; when True a
              dummy ProbFromModel is supplied as the QV source.
    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False,
                            same_strand_only=False,
                            query_converted=False,
                            target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False,
                            sge_opts=None,
                            cpus=cpus)
    runner.run(min_match_len=300,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # not using QVs or alignment checking!
        # this probqv is just a DUMMY to pass to daligner_against_ref, which won't be used
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    elif ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from %s + %s took %s secs", ccs_fofn,
                         input_fasta,
                         time.time() - start_t)
        else:
            # The fastq is written next to input_fasta, same base name.
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting %s + %s --> %s", input_fasta,
                         ccs_fofn, input_fastq)
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from %s took %s secs", input_fastq,
                         time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to the reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False,
            sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1,
            ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            # Only hits that carry an ece_arr are counted.
            if h.ece_arr is not None:
                partial_uc.setdefault(h.cID, set()).add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec", la4ice_filename,
                     str(time.time() - start_t))

    # Sets were only needed for de-duplication; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    # Pick the format before opening the file so that an unsupported
    # extension does not leave an empty output file behind.
    if out_pickle.endswith(".pickle"):
        with open(out_pickle, 'wb') as f:
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    elif out_pickle.endswith(".json"):
        # json cannot serialize a set, so nohit is written as a sorted list.
        with open(out_pickle, 'w') as f:
            f.write(json.dumps({'partial_uc': partial_uc,
                                'nohit': sorted(nohit)}))
    else:
        raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()