Esempio n. 1
0
 def test_integration_0(self):
     """Regression test: a fixed-seed run must reproduce the stored FASTQs.

     Runs the workflow in somatic mode with a fixed seed and compares both
     generated files, line by line, with the reference files `self.fq1`
     and `self.fq2`.
     """
     here = dirname(__file__)
     out1 = path_join(here, "test_integration_0.1.fq")
     out2 = path_join(here, "test_integration_0.2.fq")
     read_length = 150
     contamination = 0.3
     length = str(read_length)
     args = [
         "--seed", "12345678",
         "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
         "--vcf-input", self.vcfgrm,
         "--somatic-mode",
         "--sample-name2", "d44d739c-0143-4350-bba5-72dd068e05fd",
         "--contamination", str(contamination),
         "--vcf-input2", self.vcfsom,
         "--num-pairs", "320",
         "--quals-from", self.qpxml_R1, self.qpxml_R2,
         "--length1", length,
         "--length2", length,
         self.fa1, out1, out2,
     ]
     qasim.workflow(qasim.get_args(args))

     def assert_same_lines(produced_path, reference_path):
         # Nothing clever: the workflow is deterministic for a given seed,
         # so its output must match what was recorded when the test was
         # written.
         with open(produced_path) as produced, \
                 open(reference_path) as reference:
             produced_lines = produced.readlines()
             reference_lines = reference.readlines()
             self.assertEqual(produced_lines, reference_lines)

     assert_same_lines(out1, self.fq1)
     assert_same_lines(out2, self.fq2)
Esempio n. 2
0
 def test_integration_2(self):
     """Germline mode: SNP genotypes from the input VCF appear in the reads.

     The read ids carry the true reference coordinates of each read, so
     the base a read shows at a SNP position can be looked up by simple
     offset arithmetic, without aligning first. Indels are not checked
     here because they shift the coordinates.
     """
     here = dirname(__file__)
     out1 = path_join(here, "test_integration_2.1.fq")
     out2 = path_join(here, "test_integration_2.2.fq")
     read_length = 150
     length = str(read_length)
     args = [
         "--seed", "12345678",
         "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
         "--vcf-input", self.vcfgrm,
         "--num-pairs", "640",
         "--quals-from", self.qpxml_R1, self.qpxml_R2,
         "--length1", length,
         "--length2", length,
         self.fa1, out1, out2,
     ]
     qasim.workflow(qasim.get_args(args))
     # Use "forwardized" reads so every variant can be considered relative
     # to the reference strand only.
     fq1 = Fastq.forwardize(Fastq(out1))
     fq2 = Fastq.forwardize(Fastq(out2))
     # The tolerance is deliberately generous: sequencing errors introduce
     # bases that are neither REF nor ALT, and at the heterozygous sites
     # the A & B alleles may not be covered evenly.
     tolerance = 0.1
     # (position, expected fraction of 'A', expected fraction of 'C');
     # the variants themselves are specified in `self.vcfgrm`.
     expectations = (
         (81, 0.5, 0.5),   # A>C 0|1 SNP
         (161, 0.5, 0.5),  # A>C 1|0 SNP
         (241, 0.0, 1.0),  # A>C 1|1 SNP
     )
     for pos, want_A, want_C in expectations:
         reads = fq1.coverage(pos) + fq2.coverage(pos)
         observed = [r['seq'][pos - r['read_start']] for r in reads]
         frac_A = observed.count('A') / float(len(observed))
         frac_C = observed.count('C') / float(len(observed))
         self.assertAlmostEqual(frac_A, want_A, delta=tolerance)
         self.assertAlmostEqual(frac_C, want_C, delta=tolerance)
Esempio n. 3
0
 def test_integration_4(self):
     """Check the sample-degradation base conversion options.

     For every ordered pair of distinct bases a ``--<from><to>`` option is
     passed with a fixed rate. Reads are generated with ``--error-rate 0``
     and the `self.vcfempty` input from a reference made of a single
     repeated base. The conversion rate recovered from the base counts of
     both output files must then match the requested rate to within 1%.
     """
     out1 = path_join(dirname(__file__), "test_integration_4.1.fq")
     out2 = path_join(dirname(__file__), "test_integration_4.2.fq")
     bases = ['A', 'C', 'T', 'G']
     # these input fastas consist of a single base, repeated; they are
     # listed in the same order as `bases`.
     fastas = [self.faA, self.faC, self.faT, self.faG]
     # conversion rate (probability)
     rate = 0.1
     for idx, (from_base, fasta) in enumerate(zip(bases, fastas)):
         # With the A, C, T, G ordering the complementary base sits two
         # places away (A<->T, C<->G); negative indices wrap around.
         from_comp = bases[idx - 2]
         for to_base in bases:
             if to_base == from_base:
                 # converting a base to itself is not an option
                 continue
             conv = "--{}{}".format(from_base, to_base)
             with self.subTest(msg="{} {}".format(conv, rate)):
                 # generate reads with no mutations or sequencing errors
                 args = [
                     "--seed", "12345", "--vcf-input", self.vcfempty,
                     "--num-pairs", "5000", "--error-rate", "0", conv,
                     str(rate), fasta, out1, out2
                 ]
                 qasim.workflow(qasim.get_args(args))
                 counts1 = Fastq(out1).basecounts()
                 counts2 = Fastq(out2).basecounts()
                 # combined per-base totals over both read files
                 counts = {
                     b: counts1.get(b, 0) + counts2.get(b, 0)
                     for b in bases
                 }
                 rate_calc = counts[to_base] / (counts[to_base] +
                                                counts[from_base])
                 # we expect half the reads to be the reverse complement
                 # base: if that is the same as 'to_base' then
                 # conversion rate calculation is slightly different:
                 if to_base == from_comp:
                     rate_calc = 2 * rate_calc - 1
                 self.assertAlmostEqual(rate / rate_calc,
                                        1.0,
                                        delta=0.01)
Esempio n. 4
0
#!/usr/bin/env python3
"""Command-line interface for qasim"""
import sys
from qasim import qasim

# Parse everything after the program name, then hand the result to the
# simulation workflow.
_cli_args = qasim.get_args(sys.argv[1:])
qasim.workflow(_cli_args)
Esempio n. 5
0
 def test_integration_3(self):
     """Somatic mode driven by a germline VCF plus a somatic VCF.

     Checks the allele fractions at a germline SNP and around a somatic
     insertion, with the expected fractions at the insertion adjusted for
     the configured contamination.
     """
     here = dirname(__file__)
     out1 = path_join(here, "test_integration_3.1.fq")
     out2 = path_join(here, "test_integration_3.2.fq")
     read_length = 150
     contamination = 0.3
     length = str(read_length)
     args = [
         "--seed", "12345678",
         "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
         "--vcf-input", self.vcfgrm,
         "--somatic-mode",
         "--sample-name2", "d44d739c-0143-4350-bba5-72dd068e05fd",
         "--contamination", str(contamination),
         "--vcf-input2", self.vcfsom,
         "--num-pairs", "640",
         "--quals-from", self.qpxml_R1, self.qpxml_R2,
         "--length1", length,
         "--length2", length,
         self.fa1, out1, out2,
     ]
     qasim.workflow(qasim.get_args(args))
     # Work on forwardized reads so bases can be looked up by their offset
     # from the read's start coordinate.
     fq1 = Fastq.forwardize(Fastq(out1))
     fq2 = Fastq.forwardize(Fastq(out2))
     tolerance = 0.1

     def fraction(bases, base):
         # share of `bases` that equal `base`
         return bases.count(base) / float(len(bases))

     # The germline variant must still be visible in the somatic reads:
     # A>C 1|0 SNP at position 161
     pos = 161
     snp_reads = fq1.coverage(pos) + fq2.coverage(pos)
     snp_bases = [r['seq'][pos - r['read_start']] for r in snp_reads]
     frac_A = fraction(snp_bases, 'A')
     frac_C = fraction(snp_bases, 'C')
     self.assertAlmostEqual(frac_A, 0.5, delta=tolerance)
     self.assertAlmostEqual(frac_C, 0.5, delta=tolerance)

     # A>ACG 0|1 insertion at position 881
     pos = 881
     # Only "original" forward reads are used. For the first insertion in
     # the somatic VCF their start coordinate is not shifted relative to
     # the reference, so plain position arithmetic gives the bases at pos,
     # pos+1 and pos+2.
     #
     # Reverse reads are different: the end coordinate taken from the read
     # id lies /after/ the insertion, so every position relative to it is
     # shifted by the insert size.
     #
     # Restricting to forward reads makes this an imperfect test of reads
     # over indels; a better one needs proper alignment to the reference.
     fwd_reads = [
         r for r in fq1.coverage(pos) + fq2.coverage(pos)
         if (r['read'] == 1 and r['frag_start'] < r['frag_end'])
         or (r['read'] == 2 and r['frag_start'] > r['frag_end'])
     ]

     def bases_past_pos(shift):
         # bases `shift` positions after pos, skipping reads that end
         # before reaching that position
         return [
             r['seq'][pos + shift - r['read_start']] for r in fwd_reads
             if pos + shift - r['read_start'] < read_length
         ]

     pos1_bases = [r['seq'][pos - r['read_start']] for r in fwd_reads]
     pos2_bases = bases_past_pos(1)
     pos3_bases = bases_past_pos(2)

     frac_A1 = fraction(pos1_bases, 'A')
     self.assertAlmostEqual(frac_A1, 1.0, delta=tolerance)
     frac_A2 = fraction(pos2_bases, 'A')
     self.assertAlmostEqual(frac_A2, 0.5 * (1 + contamination),
                            delta=tolerance)
     frac_C2 = fraction(pos2_bases, 'C')
     self.assertAlmostEqual(frac_C2, 0.5 * (1 - contamination),
                            delta=tolerance)
     frac_A3 = fraction(pos3_bases, 'A')
     self.assertAlmostEqual(frac_A3, 0.5 * (1 + contamination),
                            delta=tolerance)
     frac_G3 = fraction(pos3_bases, 'G')
     self.assertAlmostEqual(frac_G3, 0.5 * (1 - contamination),
                            delta=tolerance)
Esempio n. 6
0
    def test_integration_1(self):
        """Verify read sequences that span an insertion and a deletion.

        Reads are generated with an error rate of 0, so every base can be
        compared directly with the reference once the coordinate shift
        caused by the indel is accounted for.
        """
        here = dirname(__file__)
        out1 = path_join(here, "test_integration_1.1.fq")
        out2 = path_join(here, "test_integration_1.2.fq")
        read_length = 150
        length = str(read_length)
        # no sequencing errors, which keeps the comparison back to the
        # reference simple
        args = [
            "--seed", "12345678",
            "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
            "--vcf-input", self.vcfindel,
            "--num-pairs", "320",
            "--error-rate", "0",
            "--length1", length,
            "--length2", length,
            self.fa2, out1, out2,
        ]
        qasim.workflow(qasim.get_args(args))
        ref = next(qasim.read_fasta(self.fa2))
        # A leading ' ' turns the string into a 1-based reference sequence.
        ref_seq = ' ' + ''.join(base(b) for b in ref.seqA)
        self.assertEqual(ref_seq[1:33], "AAAAAAAACCCCCCCCGGGGGGGGTTTTTTTT")
        fq1 = Fastq(out1)
        fq2 = Fastq(out2)

        def forward_of(reads):
            # read 1 of a fragment with start < end, or read 2 of a
            # fragment with start > end
            return [
                r for r in reads
                if (r['read'] == 1 and r['frag_start'] < r['frag_end'])
                or (r['read'] == 2 and r['frag_start'] > r['frag_end'])
            ]

        def reverse_of(reads):
            # read 1 of a fragment with start > end, or read 2 of a
            # fragment with start < end
            return [
                r for r in reads
                if (r['read'] == 1 and r['frag_start'] > r['frag_end'])
                or (r['read'] == 2 and r['frag_start'] < r['frag_end'])
            ]

        # Homozygous insertion A>ACG at POS=401 (from the VCF).
        pos = 401
        ins_reads = fq1.coverage(pos) + fq2.coverage(pos)

        # Forward reads over the insertion: walk left to right.
        for read in forward_of(ins_reads):
            first = min(read['frag_start'], read['frag_end'])
            for offset, seen in enumerate(read['seq']):
                where = first + offset
                if where <= pos:
                    expected = ref_seq[where]
                elif where == pos + 1:
                    expected = 'C'
                elif where == pos + 2:
                    expected = 'G'
                else:
                    # past the two inserted bases
                    expected = ref_seq[where - 2]
                self.assertEqual(seen, expected)

        # Reverse reads over the insertion: walk right to left.
        for read in reverse_of(ins_reads):
            far_end = max(read['frag_start'], read['frag_end'])
            if far_end == pos:
                # a reverse read whose start coord (far end) is /exactly/
                # pos does not actually contain the indel
                continue
            for offset, raw in enumerate(read['seq']):
                # positions decrease as the read is traversed backwards
                where = far_end - offset
                seen = Fastq.complement[raw]
                if where > pos:
                    expected = ref_seq[where]
                elif where == pos:
                    expected = 'G'
                elif where == pos - 1:
                    expected = 'C'
                else:
                    expected = ref_seq[where + 2]
                self.assertEqual(seen, expected)

        # Homozygous deletion AAA>A at POS=1201 (from the VCF).
        pos = 1201
        del_reads = fq1.coverage(pos) + fq2.coverage(pos)

        # Forward reads over the deletion.
        for read in forward_of(del_reads):
            first = min(read['frag_start'], read['frag_end'])
            for offset, seen in enumerate(read['seq']):
                where = first + offset
                if where <= pos:
                    expected = ref_seq[where]
                else:
                    # skip over the two deleted reference bases
                    expected = ref_seq[where + 2]
                self.assertEqual(seen, expected)

        # Reverse reads over the deletion.
        for read in reverse_of(del_reads):
            far_end = max(read['frag_start'], read['frag_end'])
            if far_end == pos:
                # a reverse read whose start coord (far end) is /exactly/
                # pos does not actually contain the indel
                continue
            for offset, raw in enumerate(read['seq']):
                # positions decrease as the read is traversed backwards
                where = far_end - offset
                seen = Fastq.complement[raw]
                if where >= pos + 2:
                    expected = ref_seq[where]
                else:
                    expected = ref_seq[where - 2]
                self.assertEqual(seen, expected)