def test_integration_0(self):
    """Regression check: a fixed-seed run must reproduce the stored output.

    Runs a somatic-mode workflow with a fixed seed and asserts that both
    generated FASTQ files are line-for-line identical to the files referenced
    by ``self.fq1`` and ``self.fq2``.
    """
    here = dirname(__file__)
    out1 = path_join(here, "test_integration_0.1.fq")
    out2 = path_join(here, "test_integration_0.2.fq")
    read_length = 150
    contamination = 0.3
    args = [
        "--seed", "12345678",
        "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
        "--vcf-input", self.vcfgrm,
        "--somatic-mode",
        "--sample-name2", "d44d739c-0143-4350-bba5-72dd068e05fd",
        "--contamination", str(contamination),
        "--vcf-input2", self.vcfsom,
        "--num-pairs", "320",
        "--quals-from", self.qpxml_R1, self.qpxml_R2,
        "--length1", str(read_length),
        "--length2", str(read_length),
        self.fa1, out1, out2,
    ]
    qasim.workflow(qasim.get_args(args))

    def assert_same_lines(generated_path, expected_path):
        # Compare the two files as lists of lines.
        with open(generated_path) as test, open(expected_path) as original:
            generated_lines = test.readlines()
            expected_lines = original.readlines()
            self.assertEqual(generated_lines, expected_lines)

    # There's no complicated logic here, just run a deterministic workflow
    # and compare the output to what it was when we wrote the tests.
    assert_same_lines(out1, self.fq1)
    assert_same_lines(out2, self.fq2)
def test_integration_2(self):
    """germline mode with mutations specified by input VCF"""
    # We take advantage of the fact that we know the true location
    # of the generated reads on the reference (from the coord1_coord2
    # embedded in read ids) to check SNP genotypes at positions without
    # having to align the reads first. This wouldn't be straightforward
    # for indels because the insertion/deletion shifts the coordinates.
    here = dirname(__file__)
    out1 = path_join(here, "test_integration_2.1.fq")
    out2 = path_join(here, "test_integration_2.2.fq")
    read_length = 150
    args = [
        "--seed", "12345678",
        "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
        "--vcf-input", self.vcfgrm,
        "--num-pairs", "640",
        "--quals-from", self.qpxml_R1, self.qpxml_R2,
        "--length1", str(read_length),
        "--length2", str(read_length),
        self.fa1, out1, out2,
    ]
    qasim.workflow(qasim.get_args(args))

    # work with "forwardized" reads: it's more convenient to only deal
    # with variants relative to the reference strand.
    fq1 = Fastq.forwardize(Fastq(out1))
    fq2 = Fastq.forwardize(Fastq(out2))

    # In the assertions below we're quite lenient to account for both
    # sequencing errors (introducing non-REF/ALT bases) and in the case
    # of the first two het positions, imbalanced read coverage of the
    # A & B alleles.
    # The variants here are specified in `self.vcfgrm`
    delta = 0.1

    # (position, expected fraction of 'A', expected fraction of 'C')
    snp_expectations = (
        (81, 0.5, 0.5),    # A>C 0|1 SNP at position 81
        (161, 0.5, 0.5),   # A>C 1|0 SNP at position 161
        (241, 0.0, 1.0),   # A>C 1|1 SNP at position 241
    )
    for pos, expected_A, expected_C in snp_expectations:
        covering_reads = fq1.coverage(pos) + fq2.coverage(pos)
        # base observed at `pos` in each covering read
        pos_bases = [
            read['seq'][pos - read['read_start']] for read in covering_reads
        ]
        depth = float(len(pos_bases))
        frac_A = pos_bases.count('A') / depth
        frac_C = pos_bases.count('C') / depth
        self.assertAlmostEqual(frac_A, expected_A, delta=delta)
        self.assertAlmostEqual(frac_C, expected_C, delta=delta)
def test_integration_4(self):
    """check sample degradation conversion options

    For every ordered pair of distinct bases (from_base, to_base) a workflow
    is run with the matching ``--<from><to>`` option at a fixed rate on a
    fasta consisting solely of ``from_base``. The conversion rate recovered
    from the base counts of the generated reads must agree with the
    requested rate to within 1%.
    """
    out1 = path_join(dirname(__file__), "test_integration_4.1.fq")
    out2 = path_join(dirname(__file__), "test_integration_4.2.fq")
    # NOTE: the order matters; it is chosen so that the complement of
    # bases[i] is bases[i - 2] (A<->T, C<->G).
    bases = ['A', 'C', 'T', 'G']
    # these input fastas consist of a single base, repeated.
    # (fixed: the last entry previously read `self.f*G`, which is not a
    # valid attribute reference and raised before any sub-test could run)
    fastas = [self.faA, self.faC, self.faT, self.faG]
    # conversion rate (probability)
    rate = 0.1
    for idx, (from_base, fasta) in enumerate(zip(bases, fastas)):
        from_comp = bases[idx - 2]
        for to_base in bases:
            if to_base == from_base:
                # converting a base to itself is not a conversion
                continue
            conv = "--{}{}".format(from_base, to_base)
            with self.subTest(msg="{} {}".format(conv, rate)):
                # generate reads with no mutations or sequencing errors
                args = [
                    "--seed", "12345",
                    "--vcf-input", self.vcfempty,
                    "--num-pairs", "5000",
                    "--error-rate", "0",
                    conv, str(rate),
                    fasta, out1, out2,
                ]
                qasim.workflow(qasim.get_args(args))
                counts1 = Fastq(out1).basecounts()
                counts2 = Fastq(out2).basecounts()
                # combined per-base totals over both read files
                counts = {
                    b: counts1.get(b, 0) + counts2.get(b, 0) for b in bases
                }
                rate_calc = counts[to_base] / (
                    counts[to_base] + counts[from_base])
                # we expect half the reads to be the reverse complement
                # base: if that is the same as 'to_base' then
                # conversion rate calculation is slightly different:
                if to_base == from_comp:
                    rate_calc = 2 * rate_calc - 1
                self.assertAlmostEqual(rate / rate_calc, 1.0, delta=0.01)
#!/usr/bin/env python3 """Command-line interface for qasim""" import sys from qasim import qasim qasim.workflow(qasim.get_args(sys.argv[1:]))
def test_integration_3(self):
    """somatic mode with mutations specified by input VCFs"""
    here = dirname(__file__)
    out1 = path_join(here, "test_integration_3.1.fq")
    out2 = path_join(here, "test_integration_3.2.fq")
    read_length = 150
    contamination = 0.3
    args = [
        "--seed", "12345678",
        "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
        "--vcf-input", self.vcfgrm,
        "--somatic-mode",
        "--sample-name2", "d44d739c-0143-4350-bba5-72dd068e05fd",
        "--contamination", str(contamination),
        "--vcf-input2", self.vcfsom,
        "--num-pairs", "640",
        "--quals-from", self.qpxml_R1, self.qpxml_R2,
        "--length1", str(read_length),
        "--length2", str(read_length),
        self.fa1, out1, out2,
    ]
    qasim.workflow(qasim.get_args(args))

    # see comments in test_integration_2
    fq1 = Fastq.forwardize(Fastq(out1))
    fq2 = Fastq.forwardize(Fastq(out2))
    delta = 0.1

    def fraction(observed, letter):
        # share of `observed` bases equal to `letter`
        return observed.count(letter) / float(len(observed))

    # Verify that a germline variant is still present in the somatic reads
    # A>C 1|0 SNP at position 161
    pos = 161
    covering_reads = fq1.coverage(pos) + fq2.coverage(pos)
    pos_bases = [
        read['seq'][pos - read['read_start']] for read in covering_reads
    ]
    frac_A = fraction(pos_bases, 'A')
    frac_C = fraction(pos_bases, 'C')
    self.assertAlmostEqual(frac_A, 0.5, delta=delta)
    self.assertAlmostEqual(frac_C, 0.5, delta=delta)

    # A>ACG 0|1 insertion at position 881
    pos = 881
    # We look at only "original" forward reads because in the case of the
    # first insertion specified by the somatic vcf their start coordinate
    # is unshifted from the reference, and we can perform naive position
    # arithmetic to obtain the values of the bases at pos, pos+1 & pos+2.
    #
    # Contrast this to reverse reads where the end coordinate we get
    # from the read id is /after/ the insertion and all read positions
    # relative to it are shifted by len(insert_size).
    #
    # Considering only fwd reads makes this an imperfect test of reads
    # generated over indels but a better one will require proper
    # alignment to the reference.
    fwd_covering_reads = [
        read for read in fq1.coverage(pos) + fq2.coverage(pos)
        if (read['read'] == 1 and read['frag_start'] < read['frag_end'])
        or (read['read'] == 2 and read['frag_start'] > read['frag_end'])
    ]
    pos1_bases = [
        read['seq'][pos - read['read_start']]
        for read in fwd_covering_reads
    ]
    # for pos+1 and pos+2 skip reads that end before reaching that offset
    pos2_bases = [
        read['seq'][pos + 1 - read['read_start']]
        for read in fwd_covering_reads
        if pos + 1 - read['read_start'] < read_length
    ]
    pos3_bases = [
        read['seq'][pos + 2 - read['read_start']]
        for read in fwd_covering_reads
        if pos + 2 - read['read_start'] < read_length
    ]

    # expected allele fractions given the contamination level
    ref_share = 0.5 * (1 + contamination)
    alt_share = 0.5 * (1 - contamination)

    frac_A1 = fraction(pos1_bases, 'A')
    self.assertAlmostEqual(frac_A1, 1.0, delta=delta)
    frac_A2 = fraction(pos2_bases, 'A')
    self.assertAlmostEqual(frac_A2, ref_share, delta=delta)
    frac_C2 = fraction(pos2_bases, 'C')
    self.assertAlmostEqual(frac_C2, alt_share, delta=delta)
    frac_A3 = fraction(pos3_bases, 'A')
    self.assertAlmostEqual(frac_A3, ref_share, delta=delta)
    frac_G3 = fraction(pos3_bases, 'G')
    self.assertAlmostEqual(frac_G3, alt_share, delta=delta)
def test_integration_1(self):
    """check reads are generated correctly over indels"""
    here = dirname(__file__)
    out1 = path_join(here, "test_integration_1.1.fq")
    out2 = path_join(here, "test_integration_1.2.fq")
    read_length = 150
    # generate reads with no sequencing errors to make comparison back
    # to reference easy:
    args = [
        "--seed", "12345678",
        "--sample-name", "c9a6be94-bdb7-4c0d-a89d-4addbf76e486",
        "--vcf-input", self.vcfindel,
        "--num-pairs", "320",
        "--error-rate", "0",
        "--length1", str(read_length),
        "--length2", str(read_length),
        self.fa2, out1, out2,
    ]
    qasim.workflow(qasim.get_args(args))

    ref = next(qasim.read_fasta(self.fa2))
    # pad with ' ' to make 1-based reference sequence as a string
    ref_seq = ' ' + ''.join(base(b) for b in ref.seqA)
    self.assertEqual(ref_seq[1:33], "AAAAAAAACCCCCCCCGGGGGGGGTTTTTTTT")

    fq1 = Fastq(out1)
    fq2 = Fastq(out2)

    def is_forward(read):
        # read 1 of a left-to-right fragment, or read 2 of a right-to-left one
        return (
            (read['read'] == 1 and read['frag_start'] < read['frag_end'])
            or (read['read'] == 2 and read['frag_start'] > read['frag_end'])
        )

    def is_reverse(read):
        # read 1 of a right-to-left fragment, or read 2 of a left-to-right one
        return (
            (read['read'] == 1 and read['frag_start'] > read['frag_end'])
            or (read['read'] == 2 and read['frag_start'] < read['frag_end'])
        )

    # The VCF specifies a homozygous insertion A>ACG at POS=401:
    pos = 401
    ins_reads = fq1.coverage(pos) + fq2.coverage(pos)

    # Check the forward reads over the insertion
    fwd_reads = [read for read in ins_reads if is_forward(read)]
    for read in fwd_reads:
        read_start = min(read['frag_start'], read['frag_end'])
        for offset, b in enumerate(read['seq']):
            b_pos = read_start + offset
            if b_pos <= pos:
                expected = ref_seq[b_pos]
            elif b_pos == pos + 1:
                expected = 'C'
            elif b_pos == pos + 2:
                expected = 'G'
            else:
                # past the insert: shifted by the 2 inserted bases
                expected = ref_seq[b_pos - 2]
            self.assertEqual(b, expected)

    # Check the reverse reads over the insertion
    rev_reads = [read for read in ins_reads if is_reverse(read)]
    for read in rev_reads:
        read_start = max(read['frag_start'], read['frag_end'])
        if read_start == pos:
            # skip reverse reads whose start coord (far end) is /exactly/
            # pos since the indel isn't actually contained in these reads
            continue
        for offset, raw in enumerate(read['seq']):
            # b_pos is decreasing as we read backwards
            b_pos = read_start - offset
            b = Fastq.complement[raw]
            if b_pos > pos:
                expected = ref_seq[b_pos]
            elif b_pos == pos:
                expected = 'G'
            elif b_pos == pos - 1:
                expected = 'C'
            else:
                expected = ref_seq[b_pos + 2]
            self.assertEqual(b, expected)

    # The VCF specifies a homozygous deletion AAA>A at POS=1201.
    pos = 1201
    del_reads = fq1.coverage(pos) + fq2.coverage(pos)

    # Check the forward reads over the deletion
    fwd_reads = [read for read in del_reads if is_forward(read)]
    for read in fwd_reads:
        read_start = min(read['frag_start'], read['frag_end'])
        for offset, b in enumerate(read['seq']):
            b_pos = read_start + offset
            if b_pos <= pos:
                expected = ref_seq[b_pos]
            else:
                # past the deletion: skip the 2 deleted reference bases
                expected = ref_seq[b_pos + 2]
            self.assertEqual(b, expected)

    # Check the reverse reads over the deletion
    rev_reads = [read for read in del_reads if is_reverse(read)]
    for read in rev_reads:
        read_start = max(read['frag_start'], read['frag_end'])
        if read_start == pos:
            # skip reverse reads whose start coord (far end) is /exactly/
            # pos since the indel isn't actually contained in these reads
            continue
        for offset, raw in enumerate(read['seq']):
            # b_pos is decreasing as we read backwards
            b_pos = read_start - offset
            b = Fastq.complement[raw]
            if b_pos >= pos + 2:
                expected = ref_seq[b_pos]
            else:
                expected = ref_seq[b_pos - 2]
            self.assertEqual(b, expected)