def test_can_build_with_one_seq(self):
    """Build a BAM from a single DSL sequence and read it back.

    One sequence is added with ``n_fwd=2`` and ``n_rev=1``; the test expects
    three reads, each at position 0 with sequence ``TGTAAAAAAAT`` and CIGAR
    ``1M1D10M``, and expects both the BAM file and its ``.bai`` index to
    exist on disk.
    """
    ref = ReferenceChromosome("TCATAAAAAAAT")
    sequence_bank = SequenceBank(ref)
    # NOTE(review): the second positional argument is a single space while
    # the sequence string has 12 characters -- confirm the padding is as
    # intended.
    sequence_bank.add_sequence(".*G.........", " ", n_fwd=2, n_rev=1)

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        self.chrom, self.chrom_length, self.sample_name, sequence_bank)
    builder.build()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads = list(bam_file.fetch())
        self.assertEqual(len(reads), 3)

        for read in reads:
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "TGTAAAAAAAT")
            self.assertEqual(read.cigarstring, "1M1D10M")

        self.assertTrue(os.path.isfile(bam_file.filename))
        self.assertTrue(os.path.isfile(bam_file.filename.decode() + ".bai"))
    finally:
        # Close the handle even when an assertion fails so the test does
        # not leak an open file descriptor.
        bam_file.close()
def test_should_be_able_to_add_snp_using_whitespace_dsl_syntax(self):
    """Add a SNP written with the whitespace DSL and check the first read.

    The first read produced by the sequence bank is expected to start at
    position 2 and to carry the sequence ``ATG``.
    """
    # Given
    input_ref = "CC*AAGG"
    # NOTE(review): this DSL string has 5 characters against a 7-character
    # reference string -- confirm the whitespace padding is as intended.
    snp_input = " .T. "

    # When
    sequence_bank = SequenceBank(ReferenceChromosome(input_ref))
    sequence_bank.add_sequence(snp_input)

    reads = []
    for read_builder in sequence_bank:
        # Each builder yields a list of reads; flatten them in order.
        reads.extend(read_builder.build_reads(0, {}))

    # Then
    first_read = reads[0]
    self.assertEqual(first_read.pos, 2)
    self.assertEqual(first_read.seq, 'ATG')
def test_header_for_multisample_multicontig(self):
    """Check the builder header after registering two contigs and samples.

    Two contig/sample pairs are registered against the same empty sequence
    bank; the header is expected to list both contigs under ``SQ`` and one
    read group per sample under ``RG``.
    """
    empty_bank = SequenceBank(ReferenceChromosome(""))
    bam_path = os.path.join(self.work_dir, self.filestub + ".bam")

    builder = BAMBuilder(bam_path)
    builder.with_bam_contig_data("1", 10, "SAMPLE_ONE", empty_bank)
    builder.with_bam_contig_data("2", 20, "SAMPLE_TWO", empty_bank)

    contig_entries = [
        {'LN': 10, 'SN': "1"},
        {'LN': 20, 'SN': "2"},
    ]
    read_group_entries = [
        {"ID": RG_ID + "_SAMPLE_ONE", "SM": "SAMPLE_ONE"},
        {"ID": RG_ID + "_SAMPLE_TWO", "SM": "SAMPLE_TWO"},
    ]
    expected_header = {
        'HD': {'VN': '1.0'},
        'SQ': contig_entries,
        'RG': read_group_entries,
    }

    self.assertDictEqual(expected_header, builder.header)
def test_should_fail_at_seq_with_different_length_to_reference(self):
    """Adding a 2-character sequence to a 4-character reference must raise.

    The expected exception type is ``weCallException``.
    """
    # Given
    reference = ReferenceChromosome("AAAA")
    mismatched_seq = "CC"
    sequence_bank = SequenceBank(reference)

    # Then
    with self.assertRaises(weCallException):
        sequence_bank.add_sequence(mismatched_seq)
def add_sample_name(self, sample_name):
    """Register a new sample and return its freshly created sequence bank.

    Args:
        sample_name: Key under which the new ``SequenceBank`` is stored.

    Returns:
        The ``SequenceBank`` created for ``sample_name``, built from
        ``self.reference``.

    Raises:
        weCallException: If ``sample_name`` has already been registered.
    """
    already_registered = sample_name in self.__samples
    if already_registered:
        message = "Sample {} already exists in the SampleBank.".format(
            sample_name)
        raise weCallException(message)

    new_bank = SequenceBank(self.reference)
    self.__samples[sample_name] = new_bank
    return new_bank
def test_should_use_sample_name_if_available(self):
    """Run the driver on BAM data keyed by 'sample' and check the VCF samples.

    The output VCF is expected to contain one record and to list
    ``['sample']`` as its samples.
    """
    chrom = '14'
    ref_seq = 'CGGCGGTCGAACGGAGCCCCAAGCGAAGCTCAAAACATGG'

    sequence_bank = SequenceBank(ReferenceChromosome(ref_seq, 0, chrom))
    # NOTE(review): this DSL string is shorter than the 40-character
    # reference -- confirm the whitespace padding is as intended.
    sequence_bank.add_sequence(
        ' ...........A............. ', n_fwd=10, n_rev=10)

    driver = SVCDriver(self).with_ref_sequence(ref_seq, chrom=chrom)
    driver = driver.with_bam_data('pi.bam', {'sample': sequence_bank}, True)

    expect = driver.call()
    vcf_expectation = expect.with_output_vcf()
    vcf_expectation.record_count(1).with_samples(['sample'])
def test_can_build_with_defined_quality(self):
    """Build a BAM from one sequence with an explicit second argument and
    check the read's sequence and quality string.

    A single read (``n_fwd=1``, ``n_rev=0``) is expected, with sequence
    ``TGTAAAT`` and quality string ``{qgHH!!``.
    """
    ref = ReferenceChromosome("TCATAAAT")
    sequence_bank = SequenceBank(ref)
    # NOTE(review): presumably the second argument is the per-base quality
    # string; it has 7 characters while the sequence string has 8 --
    # confirm the whitespace padding is as intended.
    sequence_bank.add_sequence(".*G.....", "9 87 00", n_fwd=1, n_rev=0)

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        self.chrom, self.chrom_length, self.sample_name, sequence_bank)
    builder.build()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads = list(bam_file.fetch())
        self.assertEqual(len(reads), 1)
        self.assertEqual(reads[0].seq, "TGTAAAT")

        # ascii: "0": "!", "1": "+", "2": "5", "3": "?", "4": "H", "5": "S",
        # "6": "]", "7": "g", "8": "q", "9": "{"
        expected_qual = "{qgHH!!"
        self.assertEqual(reads[0].qual, expected_qual)
    finally:
        # Close the handle even when an assertion fails so the test does
        # not leak an open file descriptor.
        bam_file.close()
def setParallelAndSerialVariantCallers(self, copies1, copies2):
    """Prepare the variant caller data for the test to run.

    Builds and indexes a two-chromosome FASTA, builds a BAM holding one
    sequence bank per chromosome, builds a caller configuration, and stores
    two ``VariantCallerWrapper`` objects sharing that configuration on
    ``self.vc_wrapper_parallel`` and ``self.vc_wrapper_serial``. Also sets
    ``self.repeat_length1`` and ``self.repeat_length2``.

    Args:
        copies1: Repeat count applied to ``self.ref_string1`` and
            ``self.seq_string1``.
        copies2: Repeat count applied to ``self.ref_string2``,
            ``self.seq_string2`` and ``self.seq_string3``.
    """
    # All generated files share this path stem inside the work directory.
    base_path = os.path.join(self.work_dir, "vc_input")

    # Reference FASTA with both chromosomes.
    fasta_builder = FastaFileBuilder(base_path + ".fa")
    ref1 = fasta_builder.with_chrom(self.chrom1, self.ref_string1 * copies1)
    ref2 = fasta_builder.with_chrom(self.chrom2, self.ref_string2 * copies2)

    # NOTE(review): "/" is true division on Python 3, so these are floats
    # there -- confirm that is what the callers expect.
    self.repeat_length1 = ref1.length_minus_deletions() / copies1
    self.repeat_length2 = ref2.length_minus_deletions() / copies2

    fasta_builder.build()
    fasta_builder.index()

    # One sequence bank per chromosome.
    bank_for_ref1 = SequenceBank(ref1)
    bank_for_ref1.add_sequence(
        self.seq_string1 * copies1, n_fwd=10, n_rev=10)

    bank_for_ref2 = SequenceBank(ref2)
    bank_for_ref2.add_sequence(
        self.seq_string2 * copies2, n_fwd=10, n_rev=10)
    bank_for_ref2.add_sequence(
        self.seq_string3 * copies2, n_fwd=10, n_rev=10)

    # BAM file containing both contigs.
    bam_builder = BAMBuilder(base_path + ".bam")
    bam_builder.with_bam_contig_data(
        ref1.chrom, ref1.length_minus_deletions(),
        self.sample_name1, bank_for_ref1)
    bam_builder.with_bam_contig_data(
        ref2.chrom, ref2.length_minus_deletions(),
        self.sample_name2, bank_for_ref2)
    bam_builder.build()

    # Caller configuration shared by both wrappers.
    input_data = WecallInputData(
        [bam_builder.filename], fasta_builder.filename)
    config_builder = WecallConfigBuilder(input_data, base_path)
    settings = (
        ("maxBlockSize", self.block_size),
        ("noSimilarReadsFilter", False),
        ("maxClusterDist", 20),
    )
    for option_name, option_value in settings:
        config_builder.with_configuration(option_name, option_value)
    wecall_config = config_builder.build()

    self.vc_wrapper_parallel = VariantCallerWrapper(
        base_path + "_parallel", wecall_config)
    self.vc_wrapper_serial = VariantCallerWrapper(
        base_path + "_serial", wecall_config)
def test_can_build_two_chroms(self):
    """Build a BAM with two contigs ("1" and "X") and fetch reads from each.

    One sequence is added per contig. The test expects one read per region
    fetch, two reads from an unrestricted fetch, a ``ValueError`` when
    fetching from the unregistered contig "2", and both the BAM file and
    its ``.bai`` index to exist on disk.
    """
    ref1 = ReferenceChromosome("TCATAAAAAAAT")
    sequence_bank1 = SequenceBank(ref1)
    sequence_bank1.add_sequence(".*G.........")

    ref2 = ReferenceChromosome("GGGG")
    sequence_bank2 = SequenceBank(ref2)
    sequence_bank2.add_sequence("..*.")

    builder = BAMBuilder(
        os.path.join(self.work_dir, self.filestub + ".bam")
    ).with_bam_contig_data(
        "1", 100, "SAMPLE", sequence_bank1
    ).with_bam_contig_data(
        "X", 50, "SAMPLE", sequence_bank2)
    builder.build()

    # The file is opened twice, as before, but the first handle is now
    # closed before the second is opened instead of being rebound and
    # leaked.
    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads_chrom1 = list(bam_file.fetch(region="1:1-20"))
        self.assertEqual(len(reads_chrom1), 1)
        self.assertEqual(reads_chrom1[0].seq, "TGTAAAAAAAT")
    finally:
        bam_file.close()

    bam_file = pysam.Samfile(builder.filename, "rb")
    try:
        reads_chrom2 = list(bam_file.fetch(region="X:1-5"))
        self.assertEqual(len(reads_chrom2), 1)
        self.assertEqual(reads_chrom2[0].seq, "GGG")

        reads = list(bam_file.fetch())
        self.assertEqual(len(reads), 2)
        self.assertEqual(reads[0].seq, "TGTAAAAAAAT")
        self.assertEqual(reads[1].seq, "GGG")

        # Contig "2" was never registered with the builder.
        self.assertRaises(ValueError, bam_file.fetch, region="2:1-20")

        self.assertTrue(os.path.isfile(bam_file.filename))
        self.assertTrue(os.path.isfile(bam_file.filename.decode() + ".bai"))
    finally:
        # Close the handle even when an assertion fails.
        bam_file.close()