Code example #1
    def test_runner(self):
        """Test CombineRunner."""
        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
        d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
        split_dirs = [op.join(d, b, "cluster_out") for b in
                      ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0", "3to4kb_part0", "4to5kb_part0")]
        print(split_dirs)
        out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
        rmpath(out_combined_dir)
        mkdir(out_combined_dir)
        obj = CombineRunner(combined_dir=out_combined_dir,
                            sample_name="mysample",
                            split_dirs=split_dirs,
                            ipq_opts=ipq_opts)
        obj.run()

        expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa, obj.all_lq_fq,
                            obj.all_consensus_isoforms_fa,
                            obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
        self.assertTrue(all([op.exists(f) for f in expected_out_fns]))

        expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826', 'i2_HQ_mysample|c2/f9p14/2470',
                                'i2_HQ_mysample|c5/f7p19/2472', 'i2_HQ_mysample|c10/f8p16/2457',
                                'i2_HQ_mysample|c98/f2p10/2081', 'i2_HQ_mysample|c108/f23p28/2471']
        self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)], expected_hq_isoforms)
        self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)], expected_hq_isoforms)

        expected_lq_isoforms_num = 73
        self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]), expected_lq_isoforms_num)

        expected_consensus_isoforms_num = 79
        self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]), expected_consensus_isoforms_num)
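
For orientation, a minimal sketch of driving CombineRunner outside of a test. Everything here is taken from the example above except the directory paths, which are hypothetical placeholders:

    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
    runner = CombineRunner(combined_dir="/path/to/combined_dir",     # hypothetical output dir
                           sample_name="mysample",
                           split_dirs=["/path/to/bin0/cluster_out",  # hypothetical per-bin cluster dirs
                                       "/path/to/bin1/cluster_out"],
                           ipq_opts=ipq_opts)
    runner.run()
    # Combined outputs are then available as attributes, e.g. runner.all_hq_fa,
    # runner.all_hq_fq, runner.all_lq_fa, runner.all_lq_fq,
    # runner.all_consensus_isoforms_fa, runner.all_cluster_report_fn.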
Code example #2
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = [
        'cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
        'ls *.iit *meta', 'sleep 3',
        'cd %s' % real_upath(cwd)
    ]
    execute(' && '.join(cmd_args))

    cmd_args = [
        'gmap',
        '-D {d}'.format(d=real_upath(gmap_db_dir)),
        '-d {name}'.format(name=gmap_db_name),
        '-t {nproc}'.format(nproc=gmap_nproc),
        '-n 0',
        '-z sense_force',
        '--cross-species',
        '-f samse',
        '--max-intronlength-ends 200000',  # for long genes
        real_upath(gmap_input_filename),
        '>',
        real_upath(unsorted_sam_filename),
        '2>{log}'.format(log=real_upath(log_filename))
    ]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
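
A hedged call sketch for the function above. The file names and database location are hypothetical; the code only requires that the input is FASTA/FASTQ (or a dataset XML to be consolidated) and that the GMAP database directory exists:

    map_isoforms_and_sort(input_filename="hq_isoforms.fastq",      # hypothetical input isoforms
                          sam_filename="hq_isoforms.sorted.sam",   # sorted SAM to be written
                          gmap_db_dir="/path/to/gmap_db_root",     # hypothetical GMAP db directory
                          gmap_db_name="SIRV",                     # hypothetical db name
                          gmap_nproc=8)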
Code example #3
File: CollapsingUtils.py  Project: lpp1985/lpp_Script
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename+'.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" % gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % op.join(gmap_db_dir, gmap_db_name),
                'ls *.iit *meta', 'sleep 3', 'cd %s' % cwd]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap', '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]
    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
Code example #4
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename+'.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" % gmap_input_filename)

    cmd_args = ['gmap', '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]
    # Call gmap to map isoforms to reference and output sam.
    execute(' '.join(cmd_args))

    # Copy SAM headers
    copy_sam_header(in_sam=unsorted_sam_filename,
                    out_sam=sam_filename)

    # Call sort to sort gmap output sam file
    cmd_args = ['sort', '-k 3,3', '-k 4,4n', unsorted_sam_filename,
                '| grep -v \'^@\'', '>>', sam_filename]

    execute(' '.join(cmd_args))

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
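
The shell pipeline above writes the SAM headers first, then appends header-free records ordered by reference name (field 3) and leftmost position (field 4, numeric). A rough Python equivalent of the "sort -k 3,3 -k 4,4n | grep -v '^@'" step, shown only to illustrate the sort key, not as the project's implementation:

    with open(unsorted_sam_filename) as f:
        body = [line for line in f if not line.startswith('@')]
    # sort by (reference name, position), mirroring "sort -k 3,3 -k 4,4n"
    body.sort(key=lambda line: (line.split('\t')[2], int(line.split('\t')[3])))
    with open(sam_filename, 'a') as f:  # headers were already copied by copy_sam_header
        f.writelines(body)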
Code example #5
def map_isoforms_and_sort(input_filename, sam_filename, gmap_db_dir,
                          gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    cmd_args = [
        'gmap', '-D {d}'.format(d=gmap_db_dir),
        '-d {name}'.format(name=gmap_db_name),
        '-t {nproc}'.format(nproc=gmap_nproc), '-n 0', '-z sense_force',
        '--cross-species', '-f samse', gmap_input_filename, '>',
        unsorted_sam_filename, '2>{log}'.format(log=log_filename)
    ]
    # Call gmap to map isoforms to reference and output sam.
    execute(' '.join(cmd_args))

    # Copy SAM headers
    copy_sam_header(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # Call sort to sort gmap output sam file
    cmd_args = [
        'sort', '-k 3,3', '-k 4,4n', unsorted_sam_filename, '| grep -v \'^@\'',
        '>>', sam_filename
    ]

    execute(' '.join(cmd_args))

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
Code example #6
File: test_SAMReaders.py  Project: wenmm/pbtranscript
    def test_iter_gmap_sam(self):
        """
        test iter_gmap_sam, which takes a sorted gmap sam file as input, and
        iterates over a group of overlapping sam records (which supposed to belong
        to the same isoform family.)
        """
        ignored_ids_txt = op.join(OUT_DIR, 'iter_gmap_sam.ignored.txt')
        rmpath(ignored_ids_txt)
        ignored_ids_writer = open(ignored_ids_txt, 'w')
        groups = _get_sam_groups(ignored_ids_writer)
        ignored_ids_writer.close()

        self.assertTrue(op.exists(ignored_ids_txt))
        ignored_ids = [line.split(' ')[0] for line in open(ignored_ids_txt, 'r')]
        self.assertEqual(len(ignored_ids), 108)

        self.assertEqual(len(groups), 9)
        expected_plus_lens = [10, 2, 129, 31, 0, 0, 348, 141, 0]
        self.assertEqual([len(g["+"]) for g in groups], expected_plus_lens)

        expected_minus_lens = [77, 36, 11, 0, 6, 9, 2, 2, 72]
        self.assertEqual([len(g["-"]) for g in groups], expected_minus_lens)

        self.assertTrue(all([r.sID == 'SIRV1' for r in groups[0]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV2' for r in groups[1]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV3' for r in groups[2]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV4' for r in groups[3]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV4' for r in groups[4]["-"]]))
        self.assertTrue(all([r.sID == 'SIRV4' for r in groups[5]["-"]]))
        self.assertTrue(all([r.sID == 'SIRV5' for r in groups[6]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV6' for r in groups[7]["+"]]))
        self.assertTrue(all([r.sID == 'SIRV7' for r in groups[8]["-"]]))

        expected_g0_plus_sStart = [10710, 10712, 10712, 10712, 10712, 10712, 10712, 10713, 10713, 10715]
        expected_g0_plus_sEnd = [11641, 11641, 11638, 11640, 11641, 11641, 11638, 11641, 11640, 11641]
        self.assertEqual(expected_g0_plus_sStart, [r.sStart for r in groups[0]["+"]])
        self.assertEqual(expected_g0_plus_sEnd, [r.sEnd for r in groups[0]["+"]])

        expected_g4_minus_sStart = [3640, 3640, 3642, 3642, 3642, 3644]
        expected_g4_minus_sEnd = [5157, 5157, 5157, 5157, 3829, 5157]
        self.assertEqual(expected_g4_minus_sStart, [r.sStart for r in groups[4]["-"]])
        self.assertEqual(expected_g4_minus_sEnd, [r.sEnd for r in groups[4]["-"]])
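
The assertions above treat each group as a mapping from strand ('+' or '-') to a list of record objects exposing sID, sStart and sEnd. A small consumption sketch under that same assumption:

    for i, g in enumerate(groups):
        refs = sorted(set(r.sID for r in g["+"] + g["-"]))
        print("group %d: refs=%s, %d plus-strand, %d minus-strand records"
              % (i, refs, len(g["+"]), len(g["-"])))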
Code example #7
File: test_Branch.py  Project: natechols/pbtranscript
    def test_Branch(self):
        """
        Test Branch and Branch.run.
        Note that fuzzy junctions are not merged.
        """
        test_name = "test_branch"
        good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
        bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
        group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")

        rmpath(good_gff_fn)
        rmpath(bad_gff_fn)
        rmpath(group_fn)

        b = Branch(isoform_filename=READS_DS, sam_filename=SORTED_GMAP_SAM,
                   cov_threshold=2, min_aln_coverage=0.99, min_aln_identity=0.95)

        b.run(allow_extra_5exon=True, skip_5_exon_alt=False,
              ignored_ids_fn=None,
              good_gff_fn=good_gff_fn,
              bad_gff_fn=bad_gff_fn,
              group_fn=group_fn)

        self.assertTrue(op.exists(good_gff_fn))
        self.assertTrue(op.exists(bad_gff_fn))
        self.assertTrue(op.exists(group_fn))

        std_good_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".good.gff.unfuzzy")
        std_bad_gff_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".bad.gff.unfuzzy")
        std_group_fn = op.join(SIV_STD_DIR, "test_branch", test_name + ".group.txt.unfuzzy")

        print "Comparing %s and %s"  %  (good_gff_fn, std_good_gff_fn)
        self.assertTrue(filecmp.cmp(good_gff_fn, std_good_gff_fn))
        self.assertTrue(filecmp.cmp(bad_gff_fn, std_bad_gff_fn))
        self.assertTrue(filecmp.cmp(group_fn, std_group_fn))
Code example #8
File: gather_gmap_sam.py  Project: lpp1985/lpp_Script
def run_main(chunk_json, sam_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)

    # Allow looseness
    if not chunk_key.startswith('$chunk.'):
        chunk_key = '$chunk.' + chunk_key
        log.warn("Prepending chunk key with '$chunk.' to '%s'", str(chunk_key))

    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", (', '.join(sam_files)))

    log.info("Concatenate chunked SAM files to %s.", sam_output)

    # concatenate sam files
    unsorted_sam_output = sam_output + ".unsorted.sam"
    concatenate_sam(sam_files, unsorted_sam_output)

    # then sort
    sort_sam(unsorted_sam_output, sam_output)

    # remove intermediate file
    rmpath(unsorted_sam_output)
    return 0
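
A hedged invocation sketch with hypothetical file names; note that run_main prepends '$chunk.' to the chunk key when it is missing:

    run_main(chunk_json="gather.chunks.json",   # hypothetical chunked-task JSON
             sam_output="gathered.sorted.sam",  # final sorted SAM to write
             chunk_key="sam_id")                # treated as '$chunk.sam_id'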
Code example #9
File: test_Branch.py  Project: ylipacbio/pbtranscript
    def test_collapse_sam_records(self):
        """Test collapse_sam_records, which takes in a list of grouped sam records. and
        write collapsed gff records to good_gff_writer|bad_gff_writer. A collapsed
        gff record is 'good' if there are >= cov_threshold supportive sam records
        belonging to its group; otherwise, 'bad'.
        """
        test_name = "test_collapse_sam_records"
        good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
        bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
        group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")

        rmpath(good_gff_fn)
        rmpath(bad_gff_fn)
        rmpath(group_fn)

        records = _get_sam_groups()[0]["+"] # contains 10 sam records
        with CollapseGffWriter(good_gff_fn) as good_gff_writer, \
             CollapseGffWriter(bad_gff_fn) as  bad_gff_writer, \
             GroupWriter(group_fn) as group_writer:
            collapse_sam_records(records=records, cuff_index=0, cov_threshold=2,
                                 allow_extra_5exon=False, skip_5_exon_alt=True,
                                 good_gff_writer=good_gff_writer,
                                 bad_gff_writer=bad_gff_writer,
                                 group_writer=group_writer)

        def str_to_gffrecord(line):
            fields = line.strip().split('\t')
            print(fields)
            attributes = []
            for attr_tuple in fields[8].split(';'):
                if len(attr_tuple.strip()) == 0:
                    continue
                else:
                    fs = attr_tuple.strip().split(' ')
                    if len(fs) == 2:
                        attributes.append((fs[0], fs[1].replace('"', '')))
            return Gff3Record(seqid=fields[0], start=fields[3], end=fields[4],
                              type=fields[2], attributes=attributes)

        bad_gff_records = [str_to_gffrecord(line) for line in open(bad_gff_fn, 'r') if not line.startswith('##')]
        self.assertEqual(len(bad_gff_records), 0)

        good_gff_records = [str_to_gffrecord(line) for line in open(good_gff_fn, 'r') if not line.startswith('##')]

        self.assertEqual(len(good_gff_records), 4)
        self.assertEqual([(int(r.start), int(r.end), r.type, r.attributes['gene_id'], r.attributes['transcript_id']) for r in good_gff_records],
                         [(10711, 11641, 'transcript', "PB.0", "PB.0.1"),
                          (10711, 10791, 'exon', "PB.0", "PB.0.1"),
                          (10883, 11057, 'exon', "PB.0", "PB.0.1"),
                          (11435, 11641, 'exon', "PB.0", "PB.0.1"),
                         ])
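
For reference, a line of the shape str_to_gffrecord expects: nine tab-separated GFF fields, with gene_id and transcript_id in the attributes column. The line below is illustrative, not copied from the test data (str_to_gffrecord is local to the test above):

    line = ('SIRV1\tPacBio\ttranscript\t10711\t11641\t.\t+\t.\t'
            'gene_id "PB.0"; transcript_id "PB.0.1";')
    rec = str_to_gffrecord(line)
    # rec.type == 'transcript', int(rec.start) == 10711, int(rec.end) == 11641,
    # and the attributes carry ('gene_id', 'PB.0') and ('transcript_id', 'PB.0.1').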
Code example #10
    def test_map_isoforms_and_sort(self):
        """Test map_isoforms_and_sort"""
        out_fn = op.join(_OUT_DIR_, 'test map_isoforms_and_sort_fasta.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTA,
                              sam_filename=out_fn,
                              gmap_db_dir=self.gmap_db_dir,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test map_isoforms_and_sort_fastq.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTQ,
                              sam_filename=out_fn,
                              gmap_db_dir=self.gmap_db_dir,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test map_isoforms_and_sort_fasta_ds.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTA_DS,
                              sam_filename=out_fn,
                              gmap_db_dir=self.gmap_db_dir,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test map_isoforms_and_sort_fastq_ds.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTQ_DS,
                              sam_filename=out_fn,
                              gmap_db_dir=self.gmap_db_dir,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))
Code example #11
    def test_map_isoforms_and_sort(self):
        """Test map_isoforms_and_sort"""
        out_fn = op.join(_OUT_DIR_, 'test_map_isoforms_and_sort_fasta.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTA,
                              sam_filename=out_fn,
                              gmap_db_dir=GMAP_DB,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test_map_isoforms_and_sort_fastq.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTQ,
                              sam_filename=out_fn,
                              gmap_db_dir=GMAP_DB,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test_map_isoforms_and_sort_fasta_ds.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTA_DS,
                              sam_filename=out_fn,
                              gmap_db_dir=GMAP_DB,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))

        out_fn = op.join(_OUT_DIR_, 'test_map_isoforms_and_sort_fastq_ds.sam')
        rmpath(out_fn)
        map_isoforms_and_sort(input_filename=GMAP_INPUT_FASTQ_DS,
                              sam_filename=out_fn,
                              gmap_db_dir=GMAP_DB,
                              gmap_db_name=GMAP_NAME,
                              gmap_nproc=10)
        self.assertTrue(op.exists(out_fn))
Code example #12
    def test_Branch(self):
        """
        Test Branch and Branch.run.
        Note that fuzzy junctions are not merged.
        """
        test_name = "test_branch"
        good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
        bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
        group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")

        rmpath(good_gff_fn)
        rmpath(bad_gff_fn)
        rmpath(group_fn)

        b = Branch(isoform_filename=READS_DS,
                   sam_filename=SORTED_GMAP_SAM,
                   cov_threshold=2,
                   min_aln_coverage=0.99,
                   min_aln_identity=0.95)

        b.run(allow_extra_5exon=True,
              skip_5_exon_alt=False,
              ignored_ids_fn=None,
              good_gff_fn=good_gff_fn,
              bad_gff_fn=bad_gff_fn,
              group_fn=group_fn)

        self.assertTrue(op.exists(good_gff_fn))
        self.assertTrue(op.exists(bad_gff_fn))
        self.assertTrue(op.exists(group_fn))

        std_good_gff_fn = op.join(SIV_STD_DIR, "test_branch",
                                  test_name + ".good.gff.unfuzzy")
        std_bad_gff_fn = op.join(SIV_STD_DIR, "test_branch",
                                 test_name + ".bad.gff.unfuzzy")
        std_group_fn = op.join(SIV_STD_DIR, "test_branch",
                               test_name + ".group.txt.unfuzzy")

        print "Comparing %s and %s" % (good_gff_fn, std_good_gff_fn)
        self.assertTrue(filecmp.cmp(good_gff_fn, std_good_gff_fn))
        self.assertTrue(filecmp.cmp(bad_gff_fn, std_bad_gff_fn))
        self.assertTrue(filecmp.cmp(group_fn, std_group_fn))
Code example #13
File: test_Branch.py  Project: natechols/pbtranscript
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
Code example #14
File: tofu_wrap.py  Project: natechols/pbtranscript
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!", ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons, sge_opts=sge_opts,
                      ice_opts=ice_opts, ipq_opts=ipq_opts)

        if args.mem_debug: # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(split_dir,
                                                                            end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq, sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle, out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage, max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon, min_count=args.min_count)

    return 0
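
The driver above chains five stages; a compressed map of the data flow, using only the classes and functions that appear in the code:

    # 1. SeparateFLNCRunner             flnc reads        -> per-bin FLNC files
    # 2. Cluster (per bin, ICE/Quiver)  FLNC bin          -> polished isoform clusters
    # 3. CombineRunner                  all bins          -> combined HQ/LQ isoforms
    # 4. map_isoforms_and_sort          combined HQ fastq -> sorted GMAP SAM
    # 5. post_mapping_to_genome_runner  sorted SAM        -> collapsed, filtered isoforms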
Code example #15
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
        self.gmap_db_dir = op.join(_OUT_DIR_, 'gmap db dir')
        os.symlink(GMAP_DB, self.gmap_db_dir)
Code example #16
File: test_ReadStatIO.py  Project: wenmm/pbtranscript
    def setUp(self):
        """Define input and output file."""
        rmpath(_OUT_DIR_)
        mkdir(_OUT_DIR_)
Code example #17
def args_runner(args):
    """Run given input args, e.g.,
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2
    filter_collapsed_isoforms.py in_rep_fastq out_rep_fastq --min_count 2 --no_filter_subsets
    """
    in_fq, out_fq = args.in_rep_fastq, args.out_rep_fastq

    def _get_prefix_of_rep_fq(fn):
        """Return prefix of *.rep.fq"""
        if fn.endswith(".rep.fastq") or fn.endswith(".rep.fq"):
            return '.'.join(fn.split(".")[0:-2])
        elif fn.endswith(".fastq") or fn.endswith(".fq"):
            return '.'.join(fn.split(".")[0:-1])
        raise ValueError("Invalid collapsed isoforms .rep.fastq file %s" % fn)

    input_prefix = _get_prefix_of_rep_fq(in_fq)
    output_prefix = _get_prefix_of_rep_fq(out_fq)

    # infer group.txt, abundance.txt and gff
    in_group_filename = input_prefix + ".group.txt"
    in_abundance_filename = input_prefix + ".abundance.txt"
    in_gff_filename = input_prefix + ".gff"

    tmp_out_abundance_filename = output_prefix + ".has_subsets.abundance.txt"
    tmp_out_gff_filename = output_prefix + ".has_subsets.gff"
    tmp_out_fq = output_prefix + ".has_subsets.rep.fastq"

    out_abundance_filename = output_prefix + ".abundance.txt"
    out_gff_filename = output_prefix + ".gff"

    # Filter collapsed isoforms by min FL count.
    logging.info("Filtering collapsed isoforms by count %s", args.min_count)
    filter_by_count(in_group_filename=in_group_filename,
                    in_abundance_filename=in_abundance_filename,
                    in_gff_filename=in_gff_filename, in_rep_filename=in_fq,
                    out_abundance_filename=tmp_out_abundance_filename,
                    out_gff_filename=tmp_out_gff_filename, out_rep_filename=tmp_out_fq,
                    min_count=args.min_count)

    # Remove collapsed isoforms which are a subset of another isoform
    logging.info("Filtering out subsets collapsed isoforms = %s", args.filter_out_subsets)
    if args.filter_out_subsets is True:
        filter_out_subsets(in_abundance_filename=tmp_out_abundance_filename,
                           in_gff_filename=tmp_out_gff_filename,
                           in_rep_filename=tmp_out_fq,
                           out_abundance_filename=out_abundance_filename,
                           out_gff_filename=out_gff_filename,
                           out_rep_filename=out_fq,
                           max_fuzzy_junction=args.max_fuzzy_junction)
        rmpath(tmp_out_abundance_filename)
        rmpath(tmp_out_gff_filename)
        rmpath(tmp_out_fq)
    else:
        mv(tmp_out_abundance_filename, out_abundance_filename)
        mv(tmp_out_gff_filename, out_gff_filename)
        mv(tmp_out_fq, out_fq)

    logging.info("Filtered collapsed isoforms sequences written to %s", realpath(out_fq))
    logging.info("Filtered collapsed isoforms abundance written to %s", realpath(out_abundance_filename))
    logging.info("Filtered collapsed isoforms gff written to %s", realpath(out_gff_filename))
    return 0
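
The nested helper _get_prefix_of_rep_fq strips the representative-FASTQ extension so that the sibling group/abundance/GFF file names can be derived from one prefix. Illustrative inputs and results, following the logic above:

    # "sample.rep.fastq" -> "sample"   (".rep.fastq" stripped)
    # "sample.rep.fq"    -> "sample"
    # "sample.fastq"     -> "sample"   (plain ".fastq"/".fq" also accepted)
    # "sample.txt"       -> ValueError("Invalid collapsed isoforms .rep.fastq file ...")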
Code example #18
import unittest
import os.path as op
from pbcore.io import FastqReader
from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader
from pbtranscript.Utils import rmpath, mkdir
from pbtranscript.filtering.FilteringUtils import good_isoform_ids_by_count, \
    good_isoform_ids_by_removing_subsets, filter_by_count, filter_out_subsets

from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR

GROUP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.group.txt")
ABUNDANCE_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.abundance.txt")
GFF_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.gff")
REP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.rep.fastq")

_OUT_DIR_ = op.join(OUT_DIR, "test_filtering")
rmpath(_OUT_DIR_)
mkdir(_OUT_DIR_)


class TEST_FilteringUtils(unittest.TestCase):
    """Test functions of pbtranscript.filtering.FilteringUtils."""
    def setUp(self):
        """Define input and output file."""
        self.expected_good = ['PB.2.5', 'PB.5.1', 'PB.7.1', 'PB.10.2', 'PB.10.42', 'PB.12.1']
        self.expected_diff = ['PB.10.42', 'PB.10.36', 'PB.10.35']

    def test_good_isoform_ids_by_count(self):
        """Test good_isoform_ids_by_count"""
        good = good_isoform_ids_by_count(in_group_filename=GROUP_FN,
                                         in_abundance_filename=ABUNDANCE_FN,
                                         min_count=20)
        self.assertEqual(good, self.expected_good)  # assumed assertion: the original excerpt is truncated here
Code example #19
"""Test pbtranscript.collapsing.Branch."""
import unittest
import os.path as op
import cPickle
import filecmp
import numpy as np
from pbtranscript.Utils import rmpath, mkdir
from pbtranscript.tasks.map_isoforms_to_genome import gmap_db_and_name_from_ds
from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR


READS_DS = op.join(SIV_DATA_DIR, 'test_collapsing', 'gmap-input.fastq.contigset.xml')
GMAP_DS = op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir/SIRV/gmapreferenceset.xml")
_OUT_DIR_ = op.join(OUT_DIR, "test_map_isoforms_to_genome")
rmpath(_OUT_DIR_)
mkdir(_OUT_DIR_)


class TEST_map_isoforms_to_genome(unittest.TestCase):
    """Test functions of pbtranscript.tasks.map_isoforms_to_genome."""
    def setUp(self):
        """Define input and output file."""

    def test_gmap_db_and_name_from_ds(self):
        """Test map_isoforms_to_genome.gmap_db_and_name_from_ds"""
        gmap_db, gmap_name = gmap_db_and_name_from_ds(GMAP_DS)
        self.assertEqual(gmap_db, op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir", "SIRV"))
        self.assertEqual(gmap_name, "gmap_db")

Code example #20
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa,
                           root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir,
                      flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs,
                      ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(in_isoforms=in_isoforms,
                                  in_sam=tofu_f.sorted_gmap_sam,
                                  in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
                                  out_isoforms=args.collapsed_filtered_fn,
                                  out_gff=args.gff_fn,
                                  out_abundance=args.abundance_fn,
                                  out_group=args.group_fn,
                                  out_read_stat=args.read_stat_fn,
                                  min_aln_coverage=args.min_aln_coverage,
                                  min_aln_identity=args.min_aln_identity,
                                  min_flnc_coverage=args.min_flnc_coverage,
                                  max_fuzzy_junction=args.max_fuzzy_junction,
                                  allow_extra_5exon=args.allow_extra_5exon,
                                  min_count=args.min_count)

    return 0
Code example #21
    def test_collapse_sam_records(self):
        """Test collapse_sam_records, which takes in a list of grouped sam records. and
        write collapsed gff records to good_gff_writer|bad_gff_writer. A collapsed
        gff record is 'good' if there are >= cov_threshold supportive sam records
        belonging to its group; otherwise, 'bad'.
        """
        test_name = "test_collapse_sam_records"
        good_gff_fn = op.join(_OUT_DIR_, test_name + ".good.gff.unfuzzy")
        bad_gff_fn = op.join(_OUT_DIR_, test_name + ".bad.gff.unfuzzy")
        group_fn = op.join(_OUT_DIR_, test_name + ".group.txt.unfuzzy")

        rmpath(good_gff_fn)
        rmpath(bad_gff_fn)
        rmpath(group_fn)

        records = _get_sam_groups()[0]["+"]  # contains 10 sam records
        with CollapseGffWriter(good_gff_fn) as good_gff_writer, \
             CollapseGffWriter(bad_gff_fn) as  bad_gff_writer, \
             GroupWriter(group_fn) as group_writer:
            collapse_sam_records(records=records,
                                 cuff_index=0,
                                 cov_threshold=2,
                                 allow_extra_5exon=False,
                                 skip_5_exon_alt=True,
                                 good_gff_writer=good_gff_writer,
                                 bad_gff_writer=bad_gff_writer,
                                 group_writer=group_writer)

        def str_to_gffrecord(line):
            fields = line.strip().split('\t')
            print(fields)
            attributes = []
            for attr_tuple in fields[8].split(';'):
                if len(attr_tuple.strip()) == 0:
                    continue
                else:
                    fs = attr_tuple.strip().split(' ')
                    if len(fs) == 2:
                        attributes.append((fs[0], fs[1].replace('"', '')))
            return Gff3Record(seqid=fields[0],
                              start=fields[3],
                              end=fields[4],
                              type=fields[2],
                              attributes=attributes)

        bad_gff_records = [
            str_to_gffrecord(line) for line in open(bad_gff_fn, 'r')
            if not line.startswith('##')
        ]
        self.assertEqual(len(bad_gff_records), 0)

        good_gff_records = [
            str_to_gffrecord(line) for line in open(good_gff_fn, 'r')
            if not line.startswith('##')
        ]
        self.assertEqual(len(good_gff_records), 4)
        self.assertEqual(
            [(int(r.start), int(r.end), r.type, r.attributes['gene_id'],
              r.attributes['transcript_id']) for r in good_gff_records], [
                  (10711, 11641, 'transcript', "PB.0", "PB.0.1"),
                  (10711, 10791, 'exon', "PB.0", "PB.0.1"),
                  (10883, 11057, 'exon', "PB.0", "PB.0.1"),
                  (11435, 11641, 'exon', "PB.0", "PB.0.1"),
              ])