def test_should_fail_at_seq_with_different_length_to_reference(self):
    # Given: a bank built on a four-base reference and a two-base sequence.
    bank = SequenceBank(ReferenceChromosome("AAAA"))
    mismatched_seq = "CC"
    # Then: adding the mismatched sequence is rejected.
    self.assertRaises(weCallException, bank.add_sequence, mismatched_seq)
def test_find_adjacent_insertion_and_snp(self):
    # An insertion immediately followed by a SNP is expected to be reported
    # as two separate variants.
    reference = ReferenceChromosome("T*ATAAAAAAAT")
    sequence = Sequence(reference, ".CG.........")
    found = sequence.variants
    expected = {
        Variant(reference.chrom, 0, "T", "TC"),
        Variant(reference.chrom, 1, "A", "G"),
    }
    self.assertEqual(found, expected)
def test_should_find_multiple_snps(self):
    # Two substitutions at different positions yield two SNP variants.
    reference = ReferenceChromosome("AAAAAAAAAAAAA")
    sequence = Sequence(reference, ".C.........T.")
    found = sequence.variants
    expected = {
        Variant(reference.chrom, 1, "A", "C"),
        Variant(reference.chrom, 11, "A", "T"),
    }
    self.assertEqual(found, expected)
def test_should_interpret_trailing_whitespace_to_override_pos_to_when_seq_has_insertion(
        self):
    # NOTE(review): the DSL string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    reference = ReferenceChromosome("C*GA")
    reads = []
    for read_builder in sequence_builder(reference, ".C "):
        reads.extend(read_builder.build_reads(0, {}))
    self.assertEqual(reads[0].rlen, 2)
def test_find_multiple_single_base_deletion(self):
    # Each "*" in the sequence marks one deleted reference base.
    reference = ReferenceChromosome("TTAAAAAGAAAAT")
    sequence = Sequence(reference, "..*.....*....")
    found = sequence.variants
    expected = {
        Variant(reference.chrom, 1, "TA", "T"),
        Variant(reference.chrom, 7, "GA", "G"),
    }
    self.assertEqual(found, expected)
def test_find_adjacent_snp_and_deletion(self):
    # A SNP directly followed by a deletion is expected to be reported as two
    # variants anchored at the same position.
    reference = ReferenceChromosome("TTAAAAAAAAAT")
    sequence = Sequence(reference, ".G*.........")
    found = sequence.variants
    expected = {
        Variant(reference.chrom, 1, "T", "G"),
        Variant(reference.chrom, 1, "TA", "T"),
    }
    self.assertEqual(found, expected)
def test_header_for_multisample_multicontig(self):
    # Registering two contigs for two samples should give one SQ entry per
    # contig and one RG entry per sample in the builder's header.
    bank = SequenceBank(ReferenceChromosome(""))
    bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
    builder = BAMBuilder(bam_path)
    builder.with_bam_contig_data("1", 10, "SAMPLE_ONE", bank)
    builder.with_bam_contig_data("2", 20, "SAMPLE_TWO", bank)

    contigs = [
        {'LN': 10, 'SN': "1"},
        {'LN': 20, 'SN': "2"},
    ]
    read_groups = [
        {"ID": RG_ID + "_SAMPLE_ONE", "SM": "SAMPLE_ONE"},
        {"ID": RG_ID + "_SAMPLE_TWO", "SM": "SAMPLE_TWO"},
    ]
    expected_header = {
        'HD': {'VN': '1.0'},
        'SQ': contigs,
        'RG': read_groups,
    }
    self.assertDictEqual(expected_header, builder.header)
def test_can_build_with_one_seq(self):
    # Builds a BAM from a single annotated sequence with 2 forward and
    # 1 reverse copies, then reads it back and checks every read.
    ref = ReferenceChromosome("TCATAAAAAAAT")
    sequence_bank = SequenceBank(ref)
    # NOTE(review): the quality string below is shorter than the sequence
    # string; confirm that no padding whitespace was lost.
    sequence_bank.add_sequence(".*G.........", " ", n_fwd=2, n_rev=1)

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        self.chrom, self.chrom_length, self.sample_name, sequence_bank)
    builder.build()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads = list(bam_file.fetch())
        self.assertEqual(len(reads), 3)

        for read in reads:
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "TGTAAAAAAAT")
            self.assertEqual(read.cigarstring, "1M1D10M")

        # Both the BAM and its index must exist on disk.
        self.assertTrue(os.path.isfile(bam_file.filename))
        self.assertTrue(os.path.isfile(bam_file.filename.decode() + ".bai"))
    finally:
        # Release the file handle even when an assertion fails.
        bam_file.close()
def test_should_be_able_to_access_ref_char(self):
    # Given: a reference string containing "*" placeholders.
    seq_ref = ReferenceChromosome("AC*T*G")
    # Then: indexing yields the bases with the placeholders skipped.
    for index, expected_base in enumerate("ACTG"):
        self.assertEqual(seq_ref[index], expected_base)
def test_should_interpret_trailing_whitespace_to_override_pos_to(self):
    # NOTE(review): the DSL string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    reference = ReferenceChromosome("CATG")
    reads = []
    for read_builder in sequence_builder(reference, ".C "):
        reads.extend(read_builder.build_reads(0, {}))

    self.assertEqual(len(reads), 1)
    only_read = reads[0]
    self.assertEqual(only_read.pos, 0)
    self.assertEqual(only_read.seq, "CC")
def test_should_build_correct_sequence_with_insertion_at_the_end(self):
    # The two trailing "*" reference slots are filled by inserted bases.
    reference = ReferenceChromosome("CCC**")
    reads = []
    for read_builder in sequence_builder(reference, "...TT"):
        reads.extend(read_builder.build_reads(0, {}))

    first_read = reads[0]
    self.assertEqual(first_read.pos, 0)
    self.assertEqual(first_read.rlen, 5)
    self.assertEqual(first_read.seq, "CCCTT")
def test_should_build_with_custom_quality_with_ins(self):
    # Every base of the read, inserted ones included, takes the quality
    # digit at the same index in the quality string.
    reference = ReferenceChromosome("AA**A")
    read_builders = sequence_builder(
        reference, "..CC.", quality_string="31220")
    reads = []
    for read_builder in read_builders:
        reads.extend(read_builder.build_reads(0, {}))

    actual_qual = reads[0].qual
    codes = self.ascii_codes
    expected_qual = codes["3"] + codes["1"] + codes["2"] * 2 + codes["0"]
    self.assertEqual(actual_qual, expected_qual)
def test_should_allocate_pos_from_and_pos_to_based_on_reference_size(self):
    # Given
    bases = "ACCCT"
    # When
    chromosome = ReferenceChromosome(bases)
    # Then: the span starts at zero and ends at the number of bases.
    self.assertEqual(chromosome.pos_from, 0)
    self.assertEqual(chromosome.pos_to, len(bases))
def test_should_interpret_trailing_whitespace_to_override_positions_for_complex_ref_and_seq(
        self):
    # NOTE(review): the DSL string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    reference = ReferenceChromosome("ACCC*G*A")
    reads = []
    for read_builder in sequence_builder(reference, ".**.C "):
        reads.extend(read_builder.build_reads(0, {}))

    first_read = reads[0]
    self.assertEqual(first_read.pos, 0)
    self.assertEqual(first_read.rlen, 3)
def test_should_build_correct_sequence_without_any_whitespace(self):
    # A DSL string with no padding produces a single read starting at 0.
    reference = ReferenceChromosome("C*CC")
    reads = []
    for read_builder in sequence_builder(reference, ".*.T"):
        reads.extend(read_builder.build_reads(0, {}))

    self.assertEqual(len(reads), 1)
    only_read = reads[0]
    self.assertEqual(only_read.pos, 0)
    self.assertEqual(only_read.rlen, 3)
    self.assertEqual(only_read.seq, "CCT")
def test_should_interpret_leading_whitespace_to_override_pos_from_when_ref_has_deletion(
        self):
    # NOTE(review): the DSL string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    reference = ReferenceChromosome("C*TG")
    reads = []
    for read_builder in sequence_builder(reference, " C."):
        reads.extend(read_builder.build_reads(0, {}))

    self.assertEqual(len(reads), 1)
    only_read = reads[0]
    self.assertEqual(only_read.pos, 1)
    self.assertEqual(only_read.seq, "CG")
def test_should_build_with_custom_quality_with_del(self):
    # The deleted base contributes no quality character to the read.
    def read_order(read):
        return (read.pos, read.seq, read.qual, read.cigarstring, read.mapq)

    reference = ReferenceChromosome("AAAAA")
    read_builders = sequence_builder(
        reference, "..*..", quality_string="31 00")
    reads = []
    for read_builder in read_builders:
        reads.extend(read_builder.build_reads(0, {}))
    reads.sort(key=read_order)

    actual_qual = reads[0].qual
    codes = self.ascii_codes
    expected_qual = codes["3"] + codes["1"] + codes["0"] * 2
    self.assertEqual(actual_qual, expected_qual)
def test_find_multiple_variants(self):
    # One sequence carrying two SNPs, an insertion and a two-base deletion.
    reference = ReferenceChromosome("TA*AAAGCTAACT")
    sequence = Sequence(reference, ".GC...T...**.")
    found = sequence.variants
    expected = {
        Variant(reference.chrom, 1, "A", "G"),
        Variant(reference.chrom, 1, "A", "AC"),
        Variant(reference.chrom, 5, "G", "T"),
        Variant(reference.chrom, 8, "AAC", "A"),
    }
    self.assertEqual(found, expected)
def test_should_build_with_custom_quality_and_sequence_shorter_than_reference(
        self):
    # NOTE(review): the sequence and quality strings below are shorter than
    # the reference string and differ in length from each other; confirm
    # that no padding whitespace was lost.
    reference = ReferenceChromosome("AAAAAAAAAAAA")
    read_builders = sequence_builder(
        reference, " ..*.. ", quality_string=" 31 0 ")
    reads = []
    for read_builder in read_builders:
        reads.extend(read_builder.build_reads(0, {}))

    actual_qual = reads[0].qual
    codes = self.ascii_codes
    expected_qual = codes["3"] + codes["1"] + codes["0"] + self.default_qual
    self.assertEqual(actual_qual, expected_qual)
def test_should_build_two_complex_seqs_defined_on_single_line(self):
    # NOTE(review): the DSL string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    def read_order(read):
        return (read.pos, read.seq, read.qual, read.cigarstring, read.mapq)

    reference = ReferenceChromosome("AA*CC*TGTAAGG")
    reads = []
    for read_builder in sequence_builder(reference, " .G. ,c,*, "):
        reads.extend(read_builder.build_reads(0, {}))
    reads.sort(key=read_order)

    self.assertEqual(len(reads), 2)
    expected_reads = [(1, "AGC"), (4, "TCTA")]
    for read, (expected_pos, expected_seq) in zip(reads, expected_reads):
        self.assertEqual(read.pos, expected_pos)
        self.assertEqual(read.seq, expected_seq)
def test_should_be_able_to_add_snp_using_whitespace_dsl_syntax(self):
    # Given
    # NOTE(review): the SNP string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    reference_text = "CC*AAGG"
    snp_text = " .T. "
    # When
    bank = SequenceBank(ReferenceChromosome(reference_text))
    bank.add_sequence(snp_text)
    reads = []
    for read_builder in bank:
        reads.extend(read_builder.build_reads(0, {}))
    # Then
    first_read = reads[0]
    self.assertEqual(first_read.pos, 2)
    self.assertEqual(first_read.seq, 'ATG')
def build_annotated_pair(fwd, rev, n_fwd, n_rev, mapping_quality, insert_size,
                         read_id, read_flags, cigar_string, read_start,
                         read_mate_start):
    """Build a one-element list holding the read pair for ``fwd`` and ``rev``.

    Each raw sequence contributes a reference (from its ``reference_string``
    and ``pos_from``), a sequence (its ``sequence_string`` with "," replaced
    by "." and upper-cased) and a quality (from its ``quality_string``).
    Both mates share the same mapping quality, insert size, read id, flags,
    cigar string and start positions.

    Returns:
        A list containing a single ``ReadPairWithCoverage`` built from the
        forward and reverse read sequences and the ``n_fwd`` / ``n_rev``
        counts.
    """
    mates = (fwd, rev)

    # Objects are built stage by stage (forward first, then reverse) so the
    # construction order matches the step-by-step form.
    references = [
        ReferenceChromosome(raw.reference_string, raw.pos_from)
        for raw in mates
    ]
    sequences = [
        Sequence(reference,
                 raw.sequence_string.replace(",", ".").upper(),
                 cigar_string)
        for reference, raw in zip(references, mates)
    ]
    qualities = [SequenceQuality(raw.quality_string) for raw in mates]

    fwd_read_sequence, rev_read_sequence = [
        ReadSequence(sequence, quality, mapping_quality, insert_size, read_id,
                     read_flags, read_start, read_mate_start)
        for sequence, quality in zip(sequences, qualities)
    ]

    pair = ReadPairWithCoverage(
        fwd_read_sequence, rev_read_sequence, n_fwd, n_rev)
    return [pair]
def test_can_build_two_chroms(self):
    # Builds one BAM holding a read on contig "1" and a read on contig "X",
    # then checks per-region and whole-file fetches.
    ref1 = ReferenceChromosome("TCATAAAAAAAT")
    sequence_bank1 = SequenceBank(ref1)
    sequence_bank1.add_sequence(".*G.........")

    ref2 = ReferenceChromosome("GGGG")
    sequence_bank2 = SequenceBank(ref2)
    sequence_bank2.add_sequence("..*.")

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        "1", 100, "SAMPLE", sequence_bank1
    ).with_bam_contig_data(
        "X", 50, "SAMPLE", sequence_bank2)
    builder.build()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads_chrom1 = list(bam_file.fetch(region="1:1-20"))
        self.assertEqual(len(reads_chrom1), 1)
        self.assertEqual(reads_chrom1[0].seq, "TGTAAAAAAAT")
    finally:
        # Close the first handle before the file is opened again below.
        bam_file.close()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads_chrom2 = list(bam_file.fetch(region="X:1-5"))
        self.assertEqual(len(reads_chrom2), 1)
        self.assertEqual(reads_chrom2[0].seq, "GGG")

        reads = list(bam_file.fetch())
        self.assertEqual(len(reads), 2)
        self.assertEqual(reads[0].seq, "TGTAAAAAAAT")
        self.assertEqual(reads[1].seq, "GGG")

        # Contig "2" was never registered, so fetching from it must fail.
        self.assertRaises(ValueError, bam_file.fetch, region="2:1-20")

        # Both the BAM and its index must exist on disk.
        self.assertTrue(os.path.isfile(bam_file.filename))
        self.assertTrue(os.path.isfile(bam_file.filename.decode() + ".bai"))
    finally:
        # Release the file handle even when an assertion fails.
        bam_file.close()
def test_should_generate_variant_from_ascii_text(self):
    # Variants from several alt strings are pooled into a single set.
    reference_text = "ATAAAAAAAAAT"
    alt_texts = [".A........*.", ".C.........."]

    generator = AsciiVariantGenerator(ReferenceChromosome(reference_text))
    found = generator.get_variants(alt_texts)

    chrom = generator.reference.chrom
    expected = {
        Variant(chrom, 1, "T", "A"),
        Variant(chrom, 1, "T", "C"),
        Variant(chrom, 9, "AA", "A"),
    }
    self.assertEqual(found, expected)
def test_should_use_sample_name_if_available(self):
    # The output VCF is expected to hold one record and list the sample under
    # the name used as the BAM data key.
    chrom = '14'
    reference_text = 'CGGCGGTCGAACGGAGCCCCAAGCGAAGCTCAAAACATGG'

    bank = SequenceBank(ReferenceChromosome(reference_text, 0, chrom))
    # NOTE(review): the sequence string below is shorter than the reference
    # string; confirm that no padding whitespace was lost.
    bank.add_sequence(' ...........A............. ', n_fwd=10, n_rev=10)

    driver = SVCDriver(self)
    driver = driver.with_ref_sequence(reference_text, chrom=chrom)
    driver = driver.with_bam_data('pi.bam', {'sample': bank}, True)

    expect = driver.call()
    vcf_expectation = expect.with_output_vcf()
    vcf_expectation.record_count(1).with_samples(['sample'])
def test_can_build_with_defined_quality(self):
    # Builds a BAM from one forward read with an explicit quality string and
    # checks the quality characters read back from the file.
    ref = ReferenceChromosome("TCATAAAT")
    sequence_bank = SequenceBank(ref)
    # NOTE(review): the quality string below is shorter than the sequence
    # string; confirm that no padding whitespace was lost.
    sequence_bank.add_sequence(".*G.....", "9 87 00", n_fwd=1, n_rev=0)

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        self.chrom, self.chrom_length, self.sample_name, sequence_bank)
    builder.build()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads = list(bam_file.fetch())
    finally:
        # Release the file handle even if fetching fails.
        bam_file.close()

    self.assertEqual(len(reads), 1)
    self.assertEqual(reads[0].seq, "TGTAAAT")

    # ascii: "0": "!", "1": "+", "2": "5", "3": "?", "4": "H", "5": "S",
    # "6": "]", "7": "g", "8": "q", "9": "{"
    expected_qual = "{qgHH!!"
    self.assertEqual(reads[0].qual, expected_qual)
def build_annotated_seq(self, n_fwd, n_rev, mapping_quality, insert_size,
                        read_id, read_flags, cigar_string, read_start,
                        read_mate_start):
    """Build a one-element list holding this raw sequence's read with coverage.

    The read is assembled from ``self.reference_string`` / ``self.pos_from``,
    ``self.sequence_string`` (with "," replaced by "." and upper-cased) and
    ``self.quality_string``.

    Coverage is ``(n_fwd, n_rev)`` when ``n_fwd`` is not None. Otherwise it is
    one reverse read if ``self.is_reverse_seq()`` holds, or one forward read
    if ``self.is_forward_seq()`` holds.

    Raises:
        weCallException: if ``n_fwd`` is None and the sequence is neither
            reverse nor forward.
    """
    read_sequence = ReadSequence(
        Sequence(
            ReferenceChromosome(self.reference_string, self.pos_from),
            self.sequence_string.replace(",", ".").upper(),
            cigar_string,
        ),
        SequenceQuality(self.quality_string),
        mapping_quality,
        insert_size,
        read_id,
        read_flags,
        read_start,
        read_mate_start,
    )

    # Explicit counts take precedence over the direction of the sequence.
    if n_fwd is not None:
        coverage = (n_fwd, n_rev)
    elif self.is_reverse_seq():
        coverage = (0, 1)
    elif self.is_forward_seq():
        coverage = (1, 0)
    else:
        raise weCallException(
            "Raw sequence: {} is neither forward or reverse".format(self))

    return [ReadSequenceWithCoverage(read_sequence, *coverage)]
def test_should_get_correct_fasta_string_for_offset_reference(self):
    # Given: a reference that starts at position 5.
    chromosome = ReferenceChromosome("AC*T*G", 5)
    # Then: the FASTA text has five leading "N" and no "*" placeholders.
    fasta_text = chromosome.fasta_string()
    self.assertEqual(fasta_text, "NNNNNACTG")
def test_should_correctly_getitem_for_offset_reference(self):
    # Given: a reference that starts at position 10.
    chromosome = ReferenceChromosome("A*T", 10)
    # Then: indexing uses chromosome positions, not string offsets.
    for position, expected_base in ((10, "A"), (11, "T")):
        self.assertEqual(chromosome[position], expected_base)
def test_should_ignore_asterixes_in_reference_sequence_in_computing_pos_to(
        self):
    # Given: three bases separated by "*" placeholders.
    chromosome = ReferenceChromosome("C*C*C")
    # Then: only the bases count towards the end position.
    end_position = chromosome.pos_to
    self.assertEqual(end_position, 3)