Exemple #1
0
 def test_ilmn_bcl2fastq_miseq(self):
     """Check the stage outputs for a MiSeq bcl2fastq fixture directory.

     Calls ``main`` with ``input_mode`` set to ``ILMN_BCL2FASTQ`` and a
     single sample definition, then compares the FASTQ paths and flags
     stored on ``outs`` with the expected values.  The test is reported
     as skipped when the fixture directory does not exist.
     """
     read_path = "/mnt/projects/pat/bcl_direct/t1_miseq/p1"
     args = martian.Record({
         'input_mode': "ILMN_BCL2FASTQ",
         'sample_def': [{
             "gem_group": None,
             "lanes": None,
             "read_path": read_path,
             "samples": ["9968"],
         }],
         'barcode_whitelist': "737K-april-2014",
     })
     outs = martian.Record({})

     # Report a skip instead of silently passing when the fixture is absent.
     if not os.path.exists(read_path):
         self.skipTest("fixture directory not available: %s" % read_path)

     main(args, outs)

     # assertEqual (rather than assertTrue(a == b)) shows both values
     # when a comparison fails.
     self.assertEqual(outs.barcodes,
                      [read_path + "/9968_S3_L001_R2_001.fastq.gz"])
     self.assertEqual(outs.barcodes_reverse_complement[0], False)
     self.assertEqual(outs.reads_interleaved, False)
     self.assertEqual(outs.sample_indices, [None])
     self.assertEqual(outs.is_read1[0], True)
     self.assertEqual(outs.is_read1[1], False)
     self.assertEqual(outs.reads, [
         read_path + "/9968_S3_L001_R1_001.fastq.gz",
         read_path + "/9968_S3_L001_R3_001.fastq.gz",
     ])
Exemple #2
0
 def test_ilmn_bcl2fastq_v184(self):
     """Check the stage outputs for the ``v184_t1`` fixture directory.

     Calls ``main`` with ``input_mode`` set to ``ILMN_BCL2FASTQ`` and a
     single sample definition, then spot-checks individual entries of the
     lists stored on ``outs``.  The test is reported as skipped when the
     fixture directory does not exist.
     """
     read_path = "/mnt/projects/pat/bcl_direct/v184_t1/Project_proj1/Sample_a"
     args = martian.Record({
         'input_mode': "ILMN_BCL2FASTQ",
         'sample_def': [{
             "gem_group": None,
             "lanes": None,
             "read_path": read_path,
             "samples": ["a"],
         }],
         'barcode_whitelist': "737K-april-2014",
     })
     outs = martian.Record({})

     # Report a skip instead of silently passing when the fixture is absent.
     if not os.path.exists(read_path):
         self.skipTest("fixture directory not available: %s" % read_path)

     main(args, outs)

     self.assertEqual(outs.barcodes[0],
                      read_path + "/a_CGCTTCAA_L001_R2_001.fastq.gz")
     self.assertEqual(outs.barcodes_reverse_complement[0], False)
     self.assertEqual(outs.reads_interleaved, False)
     # Index 10 is the position the original test inspected.
     self.assertIsNone(outs.sample_indices[10])
     self.assertEqual(outs.is_read1[0], True)
     self.assertEqual(outs.is_read1[1], False)
Exemple #3
0
    def test_align(self):
        """Run the stage on ``IN_FASTQ`` and check that no reads were dropped.

        Calls ``main`` with the bwa / MEM aligner settings and asserts that
        the BAM written to ``OUT_BAM`` contains as many records as
        ``tk_fasta.read_generator_fastq`` yields for the input FASTQ.
        """
        args = martian.Record({
            'chunk_input': IN_FASTQ,
            'aligner': 'bwa',
            'aligner_method': 'MEM',
            'reference_path': 'hg19',
            '__threads': 1,
            'reads_interleaved': True,
        })
        outs = martian.Record({'default': OUT_BAM})

        main(args, outs)

        # Materialize the alignment records, then release the BAM handle.
        out_bam = pysam.Samfile(OUT_BAM)
        try:
            bam_reads = list(out_bam)
        finally:
            out_bam.close()

        # The generator is consumed inside the ``with`` so the FASTQ handle
        # is closed as soon as the reads have been collected.
        with open(IN_FASTQ) as fq_file:
            fq_reads = list(
                tk_fasta.read_generator_fastq(fq_file, paired_end=False))

        # Every FASTQ read must be present in the output BAM.
        self.assertEqual(len(bam_reads), len(fq_reads))
Exemple #4
0
    def main(self):
        """Run the configured phase of the stage module.

        Reads 'args' from the metadata and dispatches on the run type:
        a 'split' run stores its recorded result under 'stage_defs' and
        returns; 'main' and 'join' runs operate on an 'outs' record that is
        written back to the metadata afterwards.  Any other run type is
        reported through ``martian.throw``.
        """
        stage_args = martian.Record(self.metadata.read('args'))
        run_type = self._run_type

        if run_type == 'split':
            def call_split():
                return self._module.split(stage_args)

            def record_split():
                return self._record_result(call_split)

            self._run(record_split)
            self.metadata.write('stage_defs', self._result)
            return

        # Only the non-split phases have an outs record.
        stage_outs = martian.Record(self.metadata.read('outs'))

        if run_type == 'join':
            # Wrap every per-chunk dictionary in a Record before joining.
            defs = list(map(martian.Record,
                            self.metadata.read('chunk_defs')))
            results = list(map(martian.Record,
                               self.metadata.read('chunk_outs')))

            def call_join():
                return self._module.join(stage_args, stage_outs, defs,
                                         results)

            self._run(call_join)
        elif run_type == 'main':
            def call_main():
                return self._module.main(stage_args, stage_outs)

            self._run(call_main)
        else:
            martian.throw('Invalid run type %s' % run_type)

        # Persist the outs record through the metadata writer.
        self.metadata.write('outs', stage_outs.items())
Exemple #5
0
    def test_dedup(self):
        """Run ``main_mark_duplicates`` on ``IN_BAM`` and check the flags.

        The ``is_duplicate`` flag of every record in the output BAM, in file
        order, must match the expected sequence.
        """
        # Override the module-level subsampling coverages before the run.
        tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.00001, 0.0001]

        stage_args = martian.Record({
            'input': IN_BAM,
            'estimated_coverage': 100.0,
            'perfect_read_count': 1000,
            'chunk_start': None,
            'chunk_end': None,
        })
        stage_outs = martian.Record({
            'output': OUT_BAM,
            'duplicate_summary': OUT_JSON,
        })
        main_mark_duplicates(stage_args, stage_outs)

        expected_flags = [False, True, False, False, True, False]
        observed_flags = []
        for record in pysam.Samfile(OUT_BAM):
            observed_flags.append(record.is_duplicate)

        self.assertEqual(observed_flags, expected_flags)
Exemple #6
0
    def test_attach_bcs(self):
        """Run the stage and check the barcode tags on every output read.

        Verifies that each output record carries the raw-barcode and
        sample-index tags, that records whose raw barcode is on the
        whitelist also carry the processed-barcode tag, that the output BAM
        has as many records as the input BAM, and that adjacent records
        sharing a query name share one barcode.
        """
        #  --align_input alignment_output.bam --barcode_input phix_I2.fastq --output test2.out --complete ~/c --stats ~/s
        args = martian.Record({
            'barcode_whitelist': IN_WHITELIST,
            'align_chunk': IN_BAM,
            'barcode_chunk': IN_I2,
            'sample_index_chunk': IN_I1,
            'gem_group': None,
            'paired_end': True,
            'exclude_non_bc_reads': False,
            'max_expected_bc_error': 0.75,
            'subsample_rate': 1.0,
        })
        outs = martian.Record({'output': OUT_BAM})

        main(args, outs)

        # Get the barcodes
        barcode_whitelist = tk_seq.load_barcode_whitelist(IN_WHITELIST)

        # Read the output BAM once and reuse the records for every check.
        out_bam = pysam.Samfile(OUT_BAM)
        try:
            out_reads = list(out_bam)
        finally:
            out_bam.close()

        # Ensure each read has the expected tags
        for r in out_reads:
            tags = dict(r.tags)
            self.assertIn(RAW_BARCODE_TAG, tags)

            # Only whitelisted raw barcodes are required to have a
            # processed-barcode tag.
            if tags[RAW_BARCODE_TAG] in barcode_whitelist:
                self.assertIn(PROCESSED_BARCODE_TAG, tags)

            self.assertIn(SAMPLE_INDEX_TAG, tags)

        # Make sure we put out the full BAM file
        in_bam = pysam.Samfile(IN_BAM)
        try:
            in_len = sum(1 for _ in in_bam)
        finally:
            in_bam.close()
        self.assertEqual(len(out_reads), in_len)

        # Ensure each read pair has the same barcode.
        # NOTE(review): groupby only merges *adjacent* records, so this
        # relies on mates being next to each other in the BAM -- confirm
        # the output ordering guarantees that.
        for (_, mates) in groupby(out_reads, lambda x: x.qname):
            bcs = set(tk_io.get_read_barcode(r) for r in mates)
            self.assertEqual(len(bcs), 1)
    def test_make_unaligned(self):
        """Run the stage on ``IN_FASTQ`` and check the output record count.

        The BAM written to ``OUT_BAM`` must contain exactly 2000 records.
        """
        args = martian.Record({
            'sample_id': 1234,
            'output_format': "bam",
            'read_group': "RG",
            'read_chunk': IN_FASTQ,
        })
        outs = martian.Record({'barcoded_unaligned': OUT_BAM})
        main(args, outs)

        # check_sq=False is kept from the original call; presumably the
        # unaligned BAM has no @SQ header lines -- TODO confirm.
        out_bam = pysam.Samfile(OUT_BAM, check_sq=False)
        try:
            reads = list(out_bam)
        finally:
            out_bam.close()

        # Use the TestCase assertion: a bare ``assert`` is stripped under
        # ``python -O`` and reports no values on failure.
        self.assertEqual(len(reads), 2000)
Exemple #8
0
    def test_setup_chunks(self):
        """Run the stage in ``BCL_PROCESSOR`` mode and check the chunk count.

        Calls ``main`` with one sample definition listing two sample indices
        and asserts that ``outs.chunks`` has three entries.
        """
        args = martian.Record({
            'input_mode': 'BCL_PROCESSOR',
            'sample_def': [{
                'read_path': IN_PREFIX,
                'sample_indices': ["AAAA", "CCCC"],
                'lanes': None,
                'gem_group': None,
            }],
            'barcode_whitelist': "737K-april-2014",
        })
        outs = martian.Record({})

        main(args, outs)

        # Diagnostic dump of the outputs; the parenthesized single-argument
        # form prints the same text under Python 2 and Python 3.
        print(outs)

        # assertEqual reports the actual chunk count when the check fails.
        self.assertEqual(len(outs.chunks), 3)
    def test_attach_phasing(self):
        """Run the stage and check the ``HP`` tag on reads of known barcodes.

        Calls ``main`` on ``IN_BAM`` / ``IN_FRAGS`` and, for a few
        barcodes, asserts that every read positioned inside the given
        interval carries an ``HP`` tag equal to the expected haplotype.
        Also asserts that the output BAM is not empty.
        """
        args = martian.Record({
            'input': IN_BAM,
            'fragment_phasing': IN_FRAGS,
            'chunk_start': 0,
            'chunk_end': 1 << 32,
        })
        outs = martian.Record({
            'phased_possorted_bam': OUT_BAM,
            'phased_possorted_bam_index': OUT_BAM + ".bai",
        })

        main(args, outs)

        # Materialize the output records, then release the BAM handle.
        out_bam = pysam.Samfile(OUT_BAM)
        try:
            bam_reads = list(out_bam)
        finally:
            out_bam.close()

        # Reference rows kept from the original test:
        # chr1    628490  701466  10565419        565419  711255  GTACACAGAGTGTT-1        0.9996837673    0.000316232700235       5.00029071137e-61
        # chr1    628789  678258  10565419        565419  711255  CGAACTCACTCCAA-1        0.999800468729  0.000199531270515       5.00029083957e-61
        # chr1    628958  726129  10565419        565419  711255  AGGCTTCATCAGAA-1        3.01287901923e-08       0.999999969871  2.50113486814e-61
        # chr1    630911  726153  10565419        565419  711255  CTAAGCAGGTTTAG-1        0.998004731897  0.00199526810283        5.00029096742e-61

        def check_reads(bc, start, end, haplotype):
            """Assert HP == haplotype for reads of `bc` with start <= pos < end."""
            for r in bam_reads:
                if tk_io.get_read_barcode(r) != bc:
                    continue
                if not (start <= r.pos < end):
                    continue

                tags = dict(r.tags)
                self.assertIn('HP', tags)
                # The original passed a single boolean argument to
                # assertEqual, which raises TypeError instead of comparing.
                self.assertEqual(tags['HP'], haplotype)

        check_reads("AGGCTTCATCAGAA-1", 565419, 711255, 1)
        # NOTE(review): the next two calls expect different haplotypes for
        # the same barcode and interval, so they cannot both hold if any
        # matching read exists -- confirm which expectation is intended.
        check_reads("CTAAGCAGGTTTAG-1", 565419, 711255, 0)
        check_reads("CTAAGCAGGTTTAG-1", 565419, 711255, 1)

        self.assertGreater(len(bam_reads), 0)
Exemple #10
0
 def test_ilmn_bclfastq_mode(self):
     """Check the stage outputs for two bcl2fastq sample directories.

     Calls ``main`` with ``input_mode`` set to ``ILMN_BCL2FASTQ`` and two
     sample definitions, then compares the barcode (R2) and sample-index
     (I1) FASTQ lists on ``outs`` with the expected lane 1-8 files of both
     samples.  The test is reported as skipped when the first fixture
     directory does not exist.
     """
     path_a = "/mnt/projects/pat/bcl_direct/t2/p1/a"
     path_b = "/mnt/projects/pat/bcl_direct/t2/p2/b"
     args = martian.Record({
         'input_mode': "ILMN_BCL2FASTQ",
         'sample_def': [{
             "gem_group": None,
             "lanes": None,
             "read_path": path_a,
             "samples": ["aa"],
         }, {
             "gem_group": None,
             "lanes": None,
             "read_path": path_b,
             "samples": ["bb"],
         }],
         'barcode_whitelist': "737K-april-2014",
     })
     outs = martian.Record({})

     # Report a skip instead of silently passing when the fixture is absent.
     if not os.path.exists(path_a):
         self.skipTest("fixture directory not available: %s" % path_a)

     def lane_files(read_type):
         """Expected lane 1-8 FASTQ paths of both samples for `read_type`."""
         return [
             "%s/%s_L%03d_%s_001.fastq.gz" % (path, prefix, lane, read_type)
             for (path, prefix) in ((path_a, "aa_S1"), (path_b, "bb_S2"))
             for lane in range(1, 9)
         ]

     main(args, outs)

     # assertEqual (rather than assertTrue(a == b)) shows a diff of the
     # two lists when the comparison fails.
     self.assertEqual(outs.barcodes, lane_files("R2"))
     self.assertEqual(outs.barcodes_reverse_complement[0], True)
     self.assertEqual(outs.reads_interleaved, False)
     self.assertEqual(outs.sample_indices, lane_files("I1"))
     self.assertEqual(outs.is_read1[0], True)
     self.assertEqual(outs.is_read1[1], False)
Exemple #11
0
    def test_big_dedup(self):
        """Cross-check ``main_mark_duplicates`` against a manual re-analysis.

        Runs the stage on ``IN_BAM_BIG`` and then verifies that:

        * no reads were lost between the input and output BAM;
        * the duplicate flags of fully mapped pairs match flags recomputed
          here by grouping reads on (barcode, position, mate position,
          strand, read number);
        * the duplicate rates derived from the 'no_filter_full_use_bcs' and
          'full_use_bcs' histograms in the summary JSON equal the rates
          recomputed here.
        """
        tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.000003, 0.000015]
        args = martian.Record({
            'input': IN_BAM_BIG,
            'estimated_coverage': 100.0,
            'perfect_read_count': 100000,
            'chunk_start': None,
            'chunk_end': None,
        })
        outs = martian.Record({
            'output': OUT_BAM,
            'duplicate_summary': OUT_JSON,
        })
        main_mark_duplicates(args, outs)

        out_bam = pysam.Samfile(OUT_BAM)
        try:
            out_reads = list(out_bam)
        finally:
            out_bam.close()

        in_bam = pysam.Samfile(IN_BAM_BIG)
        try:
            in_reads = list(in_bam)
        finally:
            in_bam.close()

        # Check we haven't lost any reads
        self.assertEqual(len(out_reads), len(in_reads))

        def read_tuple(r):
            """Key under which two reads are treated as duplicates here."""
            bc = crdna_io.get_read_barcode(r)
            return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)

        def mark_duplicates(read_set):
            """Re-run the dup analysis manually, flagging reads in place."""
            keyed = sorted(((read_tuple(r), r) for r in read_set),
                           key=lambda x: x[0])
            for (_, group) in itertools.groupby(keyed, lambda x: x[0]):
                # The first read of each key group is the original; every
                # later one is a duplicate.
                for (idx, (_, read)) in enumerate(group):
                    read.is_duplicate = idx > 0

        def is_mapped_pair(r):
            """True when both the read and its mate are mapped."""
            return not (r.is_unmapped or r.mate_is_unmapped)

        def hist_dup_rate(count_hist):
            """Duplicate rate implied by a {times_observed: count} histogram."""
            dups = sum((int(times_observed) - 1) * n
                       for (times_observed, n) in count_hist.items())
            total_reads = sum(int(times_observed) * n
                              for (times_observed, n) in count_hist.items())
            return float(dups) / total_reads

        mark_duplicates(in_reads)

        # Make sure our 'all-reads' analysis matches the code
        out_dup_marks = np.array(
            [r.is_duplicate for r in out_reads if is_mapped_pair(r)])
        test_dup_marks = np.array(
            [r.is_duplicate for r in in_reads if is_mapped_pair(r)])

        # Arguments are ordered to match the labels: input first, output
        # second (the original printed them swapped).
        print("len(start_bam): %d  -- len(out_bam): %d" % (len(test_dup_marks), len(out_dup_marks)))
        # array_equal also gives a plain False when the lengths differ,
        # where an elementwise ``==`` followed by ``.all()`` can break.
        eq = np.array_equal(out_dup_marks, test_dup_marks)

        print("mean dups code: %f" % out_dup_marks.mean())
        print("mean dups test: %f" % test_dup_marks.mean())

        self.assertTrue(eq)

        # Load the summary once; both histograms below come from it.
        with open(OUT_JSON) as summary_file:
            summary = json.load(summary_file)

        # Read the molecule count histogram and verify
        summary_dup_rate = hist_dup_rate(summary['no_filter_full_use_bcs'])
        mapped_in_reads = np.array([
            r.is_duplicate for r in in_reads
            if is_mapped_pair(r) and crdna_io.get_read_barcode(r) is not None
        ])
        self.assertEqual(summary_dup_rate, mapped_in_reads.mean())

        # Get the perfect reads, mark dups and compare stats.  This
        # overwrites the flags set above, so it must stay after the
        # 'no_filter' comparison.
        perfect_reads = [x for x in in_reads
                         if crdna.read_filter.stringent_read_filter(x, True)]
        mark_duplicates(perfect_reads)

        # Read the molecule count histogram and verify -- perfect reads
        summary_dup_rate = hist_dup_rate(summary['full_use_bcs'])
        mapped_in_reads = np.array([r.is_duplicate for r in perfect_reads])
        self.assertEqual(summary_dup_rate, mapped_in_reads.mean())