def __run_small_variant_caller(self, refcalls, format):
    """Run the variant caller on a tiny sample bank and return both schemas.

    A bank with reference ``"T"`` and one sample ``"TEST"`` (sequence ``"."``)
    is run through the caller with an emptied configuration plus the
    ``outputRefCalls`` and ``outputFormat`` options. The header of the
    produced VCF is then read back.

    :param refcalls: value for the ``outputRefCalls`` option; also passed to
        ``wecall_schema`` as ``add_ref_calls``.
    :param format: appended to ``"VCF"`` for the ``outputFormat`` option and
        forwarded to ``wecall_schema`` as ``format``.
    :return: tuple ``(expected_schema, actual_schema)``.
    """
    sample_bank = SampleBank("T")
    sample_bank.add_sample_name("TEST").add_sequence(".")

    variant_caller_builder = VariantCallerBuilderFromSampleBank(
        sample_bank, self.work_dir)
    variant_caller_builder.configuration = {}  # clear config.
    variant_caller = variant_caller_builder.build()
    variant_caller.add_additional_command('outputRefCalls', refcalls)
    variant_caller.add_additional_command(
        'outputFormat', "VCF{}".format(format))
    variant_caller.run()

    with VCFReaderContextManager(variant_caller.output_vcf) as vcf_file:
        actual_schema = vcf_file.read_header()

    # The schema's reference name is the reference file name without its
    # directory and extension.
    reference_filename = (
        variant_caller_builder.wecall_input_data.reference_filename)
    reference = os.path.splitext(os.path.basename(reference_filename))[0]

    # '%Y-%m-%d' is the portable spelling of the libc-specific '%F'
    # directive, which is not available on every platform.
    file_date = datetime.datetime.today().strftime('%Y-%m-%d')

    expected_schema = wecall_schema(
        file_date=file_date,
        reference=reference,
        contigs={
            sample_bank.reference.chrom: {
                "length": sample_bank.reference.length_minus_deletions()
            }
        },
        add_ref_calls=refcalls,
        format=format)
    return expected_schema, actual_schema
def test_should_add_sequences_with_same_reference(self):
    """Two samples added to one bank report the same reference sequence."""
    bank = SampleBank("AAATTTTGGGGG")
    for name in ("SAMPLE1", "SAMPLE2"):
        bank.add_sample_name(name)

    first_ref_seq = bank["SAMPLE1"].reference.ref_seq
    second_ref_seq = bank["SAMPLE2"].reference.ref_seq
    self.assertEqual(first_ref_seq, second_ref_seq)
def test_should_have_near_zero_RR_genotype_likelihood_for_hom_alt_call(
        self):
    """Check genotype and RR likelihood of sample_1 at the A>C record.

    sample_1 is given 20 forward and 20 reverse copies of an all-``.``
    sequence, sample_2 the same depth of a sequence with ``C`` at one
    position. The record for ``Variant(chr1, 12, "A", "C")`` must exist and,
    for sample_1, carry genotype ``0|0`` with an RR genotype likelihood of
    ``0.0``.
    """
    chr1 = 'chr1'
    sample_bank = SampleBank("TTTTTAAAAAAAAAAAAAAAAAAAA", chrom=chr1)

    # (sample name, ascii sequence) pairs, added in this order.
    sequences_by_sample = (
        ('sample_1', "........................."),
        ('sample_2', "............C............"),
    )
    for name, ascii_sequence in sequences_by_sample:
        sequence_bank = sample_bank.add_sample_name(name)
        sequence_bank.add_sequence(ascii_sequence, n_fwd=20, n_rev=20)

    builder = VariantCallerBuilderFromSampleBank(sample_bank, self.work_dir)
    output_vcf = builder.build().run().output_vcf

    expected_variant = Variant(chr1, 12, "A", "C")
    record = VCFExpectation(self, output_vcf).has_record_for_variant(
        expected_variant)
    record.with_sample("sample_1") \
        .has_genotype("0|0") \
        .has_RR_genotype_likelihood(0.0)
def test_should_raise_when_adding_existing_sample(self):
    """Adding sequences under an already-registered sample name raises."""
    bank = SampleBank("AAA")
    duplicate_name = "SAMPLE1"
    bank.add_sample_name(duplicate_name)

    with self.assertRaisesRegex(
            weCallException,
            "Sample SAMPLE1 already exists in the SampleBank."):
        bank.add_sample_with_seqs_and_quals(duplicate_name, [])
def test_should_add_seq_and_quals_list_with_fwd_and_rev_reads(self):
    """n_fwd / n_rev given to the bank end up on the single stored entry."""
    bank = SampleBank("AAA")
    bank.add_sample_with_seqs_and_quals(
        "SAMPLE1", ["...", "007"], n_fwd=1, n_rev=2)

    sample = bank["SAMPLE1"]
    self.assertEqual(len(sample), 1)

    only_entry = sample[0]
    self.assertEqual(only_entry.n_fwd, 1)
    self.assertEqual(only_entry.n_rev, 2)
def test_should_add_short_sequence_and_quality_list(self):
    """A sequence followed by quality row "007" builds a read with "!!g"."""
    bank = SampleBank("AAA")
    bank.add_sample_with_seqs_and_quals("SAMPLE1", ["...", "007"])

    # Flatten the reads produced by every builder held for the sample.
    reads = [
        read
        for builder in bank["SAMPLE1"]
        for read in builder.build_reads(0, {})
    ]

    self.assertEqual(reads[0].qual, "!!g")
def test_should_add_seq_and_quals_list_with_deletion(self):
    """Sequence ".*C" with quality row "1 3" builds a read with "+?"."""
    bank = SampleBank("AAA")
    bank.add_sample_with_seqs_and_quals("SAMPLE1", [".*C", "1 3"])

    # Flatten the reads produced by every builder held for the sample.
    reads = [
        read
        for builder in bank["SAMPLE1"]
        for read in builder.build_reads(0, {})
    ]

    self.assertEqual(reads[0].qual, "+?")
def __run_wecall_variant_caller(self, chrom, reference_string,
                                sequence_list, vcf_stem=None):
    """Run the caller for one chromosome and return the output VCF path.

    :param chrom: chromosome name for the sample bank.
    :param reference_string: reference sequence for the sample bank.
    :param sequence_list: sequences/qualities added for the default sample
        with ``n_fwd=10`` and ``n_rev=10``.
    :param vcf_stem: stem of the output file name; falls back to ``chrom``
        when ``None``.
    :return: the ``output_vcf`` path set on the caller wrapper.
    """
    stem = chrom if vcf_stem is None else vcf_stem

    bank = SampleBank(reference_string, chrom=chrom)
    bank.add_sample_with_seqs_and_quals(
        DEFAULT_SAMPLE_NAME, sequence_list, n_fwd=10, n_rev=10)

    wrapper = VariantCallerBuilderFromSampleBank(bank, self.work_dir).build()
    wrapper.add_additional_command("allowMNPCalls", False)
    wrapper.output_vcf = path.join(
        self.intermediate_vcfs_dir, "{}.vcf".format(stem))
    wrapper.run()
    return wrapper.output_vcf
def __build_default_sample_bank(self, ref, sequence_list,
                                n_fwd=None, n_rev=None):
    """Return a SampleBank on ``ref`` holding only the default sample.

    :param ref: reference sequence passed to ``SampleBank``.
    :param sequence_list: sequences/qualities for the default sample.
    :param n_fwd: forwarded positionally to
        ``add_sample_with_seqs_and_quals``.
    :param n_rev: forwarded positionally to
        ``add_sample_with_seqs_and_quals``.
    """
    bank = SampleBank(ref)
    sample_args = (DEFAULT_SAMPLE_NAME, sequence_list, n_fwd, n_rev)
    bank.add_sample_with_seqs_and_quals(*sample_args)
    return bank
def test_should_add_two_sequence_list(self):
    """Two sequences without quality rows build reads "HHH" and "HH"."""
    bank = SampleBank("AAA")
    bank.add_sample_with_seqs_and_quals("SAMPLE1", ["...", " .."])

    # Flatten the reads of every builder, then order them deterministically.
    reads = sorted(
        (
            read
            for builder in bank["SAMPLE1"]
            for read in builder.build_reads(0, {})
        ),
        key=lambda r: (r.pos, r.seq, r.qual, r.cigarstring, r.mapq),
    )

    self.assertEqual(len(reads), 2)
    self.assertEqual(reads[0].qual, "HHH")
    self.assertEqual(reads[1].qual, "HH")
def test_should_place_variants_at_custom_position(self):
    """Variants of a bank created with start position 100 are offset by it."""
    bank = SampleBank("AAATTTTGGGAG", 100)
    for name in ("SAMPLE1", "SAMPLE2"):
        bank.add_sample_name(name)
    bank["SAMPLE1"].add_sequence(".....G......")
    bank["SAMPLE2"].add_sequence("..........*.")

    snp = Variant(bank.reference.chrom, 105, "T", "G")
    deletion = Variant(bank.reference.chrom, 109, "GA", "G")

    self.assertEqual(bank["SAMPLE1"].variants, {snp})
    self.assertEqual(bank["SAMPLE2"].variants, {deletion})
    self.assertEqual(bank.variants, {snp, deletion})
def test_should_return_all_variants(self):
    """The bank's variants are the union of the per-sample variants."""
    bank = SampleBank("AAATTTTGGGAG")
    for name in ("SAMPLE1", "SAMPLE2"):
        bank.add_sample_name(name)
    bank["SAMPLE1"].add_sequence(".....G......")
    bank["SAMPLE2"].add_sequence("..........*.")

    snp = Variant(bank.reference.chrom, 5, "T", "G")
    deletion = Variant(bank.reference.chrom, 9, "GA", "G")

    self.assertEqual(bank["SAMPLE1"].variants, {snp})
    self.assertEqual(bank["SAMPLE2"].variants, {deletion})
    self.assertEqual(bank.variants, {snp, deletion})
def test_should_add_sequence_with_quality(self):
    """add_sequence with quality_string "007" builds a read with "!!g"."""
    name = "SAMPLE1"
    bank = SampleBank("AAA")
    bank.add_sample_name(name)
    bank[name].add_sequence("...", quality_string="007")

    # Flatten the reads produced by every builder held for the sample.
    reads = [
        read
        for builder in bank["SAMPLE1"]
        for read in builder.build_reads(0, {})
    ]

    # Quality digit -> ascii character used by the test data:
    # "0": "!", "1": "+", "2": "5", "3": "?", "4": "I", "5": "S",
    # "6": "]", "7": "g", "8": "q", "9": "{"
    self.assertEqual(reads[0].qual, "!!g")
def test_should_add_complex_seq_and_quals_list(self):
    """A mixed list of sequences and quality rows builds three reads.

    After sorting, the qualities are ``"!!g"``, ``"+?"`` and two default
    quality characters.
    """
    bank = SampleBank("AAA")
    seqs_and_quals = ["...", "007", " ..", ".*C", "1 3"]
    bank.add_sample_with_seqs_and_quals("SAMPLE1", seqs_and_quals)

    # Flatten the reads of every builder, then order them deterministically.
    reads = sorted(
        (
            read
            for builder in bank["SAMPLE1"]
            for read in builder.build_reads(0, {})
        ),
        key=lambda r: (r.pos, r.seq, r.qual, r.cigarstring, r.mapq),
    )

    self.assertEqual(len(reads), 3)
    self.assertEqual(reads[0].qual, "!!g")
    self.assertEqual(reads[1].qual, "+?")
    self.assertEqual(reads[2].qual, self.default_char * 2)
def test_should_raise_when_multiple_quality_strings_specified_per_sequence(
        self):
    """A second quality row after one sequence is rejected as a sequence."""
    bank = SampleBank("AAA")
    seqs_and_quals = ["...", "007", "008"]

    with self.assertRaisesRegex(
            weCallException, "Illegal character in sequence \'008\'"):
        bank.add_sample_with_seqs_and_quals("SAMPLE1", seqs_and_quals)
def calls_variants(self, ref, sequence_list, candidate_ascii_haplotypes,
                   prior, expected_ascii_haplotypes):
    """Run the caller with a candidate-variants VCF and check the call set.

    :param ref: reference sequence for the sample bank.
    :param sequence_list: sequences/qualities for sample ``"TEST"`` (added
        with positional arguments ``1, 0``).
    :param candidate_ascii_haplotypes: ascii haplotypes converted to the
        candidate variants written to the candidate VCF.
    :param prior: value stored in the ``AF`` info field of every candidate.
    :param expected_ascii_haplotypes: ascii haplotypes converted to the
        variants the caller is expected to report.
    """
    bank = SampleBank(ref)
    bank.add_sample_with_seqs_and_quals("TEST", sequence_list, 1, 0)

    generator = AsciiVariantGenerator(bank.reference)
    candidates = generator.get_variants(candidate_ascii_haplotypes)
    expected = generator.get_variants(expected_ascii_haplotypes)

    # Write the candidates to an indexed VCF, each tagged with AF=prior.
    candidates_path = path.join(self.work_dir, "candiate_variants.vcf")
    candidates_vcf = VCFBuilder(candidates_path)
    candidates_vcf.schema.set_info_data(
        'AF', 'A', 'Float', 'Allele Frequency')
    for candidate in candidates:
        af_info = InfoData(candidates_vcf.schema, {"AF": prior})
        candidates_vcf.with_record_from_variant(candidate, info=af_info)
    candidates_vcf.build().index()

    caller_builder = VariantCallerBuilderFromSampleBank(bank, self.work_dir)
    caller_builder.configuration[CANDIDATE_VARIANTS_FILE_KEY] = \
        candidates_vcf.compressed_filename
    callset = caller_builder.build().run().get_variant_callset(self)

    self.assertEqual(callset.get_variants(), set(expected))
def test_can_build_correct_ref_and_bam_file(self):
    """Built BAM and FASTA files match the sample bank they came from.

    Every read fetched from the first BAM file must start at position 0
    with the expected sequence and cigar string, and the FASTA file must
    hold the 21-base reference for the bank's chromosome.
    """
    bank = SampleBank("ATCCT*ATAATAAATAAATAAT")
    sample_name = "TEST_SAMPLE"
    bank.add_sample_name(sample_name)
    bank[sample_name].add_sequence("....CT.........T......")

    builder = WecallInputDataBuilder(self.work_dir).with_sample_bank(bank)
    input_files = builder.build()

    bam_file = pysam.Samfile(input_files.bam_filenames[0], "rb")
    try:
        for read in bam_file.fetch():
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "ATCCCTATAATAAATTAATAAT")
            self.assertEqual(read.cigarstring, "5M1I16M")
    finally:
        # Release the handle even when an assertion above fails.
        bam_file.close()

    fasta_file = pysam.Fastafile(input_files.reference_filename)
    try:
        self.assertEqual(
            fasta_file.get_reference_length(bank.reference.chrom), 21)
        self.assertEqual(
            fasta_file.fetch(bank.reference.chrom, 0, 21),
            "ATCCTATAATAAATAAATAAT")
    finally:
        # Release the handle even when an assertion above fails.
        fasta_file.close()
def calls_variants_from_samples(self, ref, sample_seqs,
                                expected_haplotypes=None,
                                expected_call_stubs=None,
                                config_dict=None):
    """Run the caller on several samples and compare genotype calls.

    The expectation is built from ``expected_call_stubs`` when it is given,
    and from ``expected_haplotypes`` otherwise.

    :param ref: reference sequence for the sample bank.
    :param sample_seqs: dictionary: {sample_name: sequence list}
    :param expected_haplotypes: dictionary: {sample_name : list of two ascii
        sequences expressing the genotype}
    :param expected_call_stubs: dictionary: {variant_stub: dictionary
        {sample_name: str(genotype)} }
    :param config_dict: passed through to the caller run.
    """
    self.__validate_expected_calls(expected_haplotypes, expected_call_stubs)

    bank = SampleBank(ref)
    for name, sequences in sample_seqs.items():
        bank.add_sample_with_seqs_and_quals(name, sequences)

    callset = self.__run_wecall(bank, config_dict)
    observed_calls = callset.get_variants_with_genotypes()

    if expected_call_stubs is not None:
        expected_calls = {}
        for stub, genotypes in expected_call_stubs.items():
            variant = self._variant_from_stub(bank.reference.chrom, stub)
            expected_calls[variant] = OrderedDict(
                (name, GenotypeCall(genotype))
                for name, genotype in genotypes.items()
            )
    else:
        self.__filter_none_genotypes(observed_calls)
        expected_calls = \
            self.__get_expected_calls_from_sample_ascii_haplotypes(
                expected_haplotypes, bank.reference)

    # Show the complete diff if the dictionaries differ.
    self.maxDiff = None
    self.assertDictEqual(expected_calls, observed_calls)
def assert_quality_recalibrated_in_output_bam(self, ref_string,
                                              input_bam_seqs,
                                              output_bam_seqs):
    """Run recalibration and compare the output SAM with expected reads.

    The caller is run on ``input_bam_seqs`` with ``recalibrateBaseQs``
    enabled. The reads written to ``self.output_sam`` are then compared
    field by field with reads built from ``output_bam_seqs``.

    :param ref_string: reference sequence for both sample banks.
    :param input_bam_seqs: sequences/qualities given to the caller.
    :param output_bam_seqs: sequences/qualities expected after
        recalibration.
    """
    input_sample_bank = SampleBank(ref_string)
    input_sample_bank.add_sample_with_seqs_and_quals(
        self.sample_name, input_bam_seqs)

    output_sample_bank = SampleBank(ref_string)
    output_sample_bank.add_sample_with_seqs_and_quals(
        self.sample_name, output_bam_seqs)

    vc_builder = VariantCallerBuilderFromSampleBank(
        input_sample_bank, self.work_dir)
    vc_builder.configuration["recalibrateBaseQs"] = "true"
    vc_builder.configuration[
        "intermediateRecalibFileStem"] = self.output_stem
    vc_builder.build().run()

    def sort_key(read):
        # Shared ordering so actual and expected reads pair up in zip().
        return (read.pos, read.seq, read.qual, read.cigarstring, read.mapq)

    self.assertTrue(os.path.exists(self.output_sam))
    sam_file = pysam.Samfile(self.output_sam, "r")
    try:
        reads = list(sam_file.fetch())
        self.assertEqual(
            len(reads), len(output_sample_bank[self.sample_name]))

        output_reads = sorted(
            output_sample_bank[self.sample_name].build_reads(0, {}),
            key=sort_key)
        reads.sort(key=sort_key)

        for read, expected_sequence in zip(reads, output_reads):
            self.assertEqual(read.pos, expected_sequence.pos)
            self.assertEqual(read.seq, expected_sequence.seq)
            self.assert_matching_ascii_qualities(
                read.qual, expected_sequence.qual)
            self.assertEqual(
                read.cigarstring, expected_sequence.cigarstring)
            self.assertEqual(read.mapq, expected_sequence.mapq)
    finally:
        # Close the handle even when one of the assertions above fails.
        sam_file.close()
def test_should_be_able_to_build_bam_and_ref_data_with_multiple_chromosomes(
        self):
    """Sample banks on two chromosomes end up in one BAM file.

    Reads fetched for chromosome ``'20'`` and chromosome ``'10'`` from the
    first BAM file must each match the sequence added for that chromosome.
    """
    bank_1 = SampleBank("A" * 10, 0, chrom='10')
    bank_1.add_sample_name("sample").add_sequence("." * 10)

    bank_2 = SampleBank("T" * 9, 0, chrom='20')
    bank_2.add_sample_name("sample").add_sequence("." * 9)

    builder = WecallInputDataBuilder(
        self.work_dir).with_sample_bank(bank_1).with_sample_bank(bank_2)
    input_files = builder.build()

    bam_file = pysam.Samfile(input_files.bam_filenames[0], "rb")
    try:
        for read in bam_file.fetch(reference='20'):
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "T" * 9)
            self.assertEqual(read.cigarstring, "9M")

        for read in bam_file.fetch(reference='10'):
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "A" * 10)
            self.assertEqual(read.cigarstring, "10M")
    finally:
        # Release the handle even when an assertion above fails.
        bam_file.close()
def test_can_build_multiple_bam_files(self):
    """A bank with two samples produces two BAM files.

    Reads from the first and second BAM file must match the sequence and
    cigar string expected for TEST_SAMPLE1 and TEST_SAMPLE2 respectively.
    """
    bank = SampleBank("ATCCT*ATAATAAATAAATAAT")

    sample_name1 = "TEST_SAMPLE1"
    bank.add_sample_name(sample_name1)
    bank[sample_name1].add_sequence("....CT.........T......")

    sample_name2 = "TEST_SAMPLE2"
    bank.add_sample_name(sample_name2)
    bank[sample_name2].add_sequence(".....*.G..........*...")

    builder = WecallInputDataBuilder(self.work_dir).with_sample_bank(bank)
    input_bams = builder.build().bam_filenames

    bam_file1 = pysam.Samfile(input_bams[0], "rb")
    try:
        for read in bam_file1.fetch():
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "ATCCCTATAATAAATTAATAAT")
            self.assertEqual(read.cigarstring, "5M1I16M")
    finally:
        # Release the handle even when an assertion above fails.
        bam_file1.close()

    bam_file2 = pysam.Samfile(input_bams[1], "rb")
    try:
        for read in bam_file2.fetch():
            self.assertEqual(read.pos, 0)
            self.assertEqual(read.seq, "ATCCTAGAATAAATAAAAAT")
            self.assertEqual(read.cigarstring, "17M1D3M")
    finally:
        # Release the handle even when an assertion above fails.
        bam_file2.close()
def with_ref_sequence(self, ref_sequence, pos_from=0, chrom=DEFAULT_CHROM):
    """Store a new SampleBank under ``chrom`` and return ``self``.

    :param ref_sequence: reference sequence passed to ``SampleBank``.
    :param pos_from: second positional argument passed to ``SampleBank``.
    :param chrom: key in ``self._sample_bank`` and third positional argument
        passed to ``SampleBank``.
    :return: ``self``, so calls can be chained.
    """
    bank = SampleBank(ref_sequence, pos_from, chrom)
    self._sample_bank[chrom] = bank
    return self