Beispiel #1
0
    def __run_small_variant_caller(self, refcalls, format):
        """Run the caller on a one-base sample and collect both VCF schemas.

        A bank with reference ``"T"`` and a single sample ``"TEST"`` is run
        through the variant caller with an emptied configuration.

        :param refcalls: value given to the caller's ``outputRefCalls``
            option and to ``wecall_schema`` as ``add_ref_calls``.
        :param format: appended to ``"VCF"`` for the ``outputFormat`` option
            and forwarded to ``wecall_schema`` as ``format``.
        :return: tuple ``(expected_schema, actual_schema)`` where the actual
            schema is the header read back from the caller's output VCF.
        """
        sample_bank = SampleBank("T")
        sample_bank.add_sample_name("TEST").add_sequence(".")

        variant_caller_builder = VariantCallerBuilderFromSampleBank(
            sample_bank, self.work_dir)
        variant_caller_builder.configuration = {}  # clear config.
        variant_caller = variant_caller_builder.build()
        variant_caller.add_additional_command('outputRefCalls', refcalls)
        variant_caller.add_additional_command('outputFormat',
                                              "VCF{}".format(format))
        variant_caller.run()

        with VCFReaderContextManager(variant_caller.output_vcf) as vcf_file:
            actual_schema = vcf_file.read_header()

        # The schema's reference name is the reference file name without
        # its directory and extension.
        reference_filename = (
            variant_caller_builder.wecall_input_data.reference_filename)
        reference = os.path.splitext(os.path.basename(reference_filename))[0]

        # '%F' is a C-library shorthand that is not available on every
        # platform; '%Y-%m-%d' is the portable spelling of the same format.
        file_date = datetime.datetime.today().strftime('%Y-%m-%d')

        expected_schema = wecall_schema(
            file_date=file_date,
            reference=reference,
            contigs={
                sample_bank.reference.chrom: {
                    "length": sample_bank.reference.length_minus_deletions()
                }
            },
            add_ref_calls=refcalls,
            format=format)

        return expected_schema, actual_schema
Beispiel #2
0
    def test_should_add_sequences_with_same_reference(self):
        # Two samples registered on one bank must expose the same
        # reference sequence.
        bank = SampleBank("AAATTTTGGGGG")
        for name in ("SAMPLE1", "SAMPLE2"):
            bank.add_sample_name(name)

        first_ref_seq = bank["SAMPLE1"].reference.ref_seq
        second_ref_seq = bank["SAMPLE2"].reference.ref_seq
        self.assertEqual(first_ref_seq, second_ref_seq)
Beispiel #3
0
    def test_should_have_near_zero_RR_genotype_likelihood_for_hom_alt_call(
            self):
        # sample_1 carries reference-only reads, sample_2 carries a C at
        # offset 12; the sample_1 entry of the A->C record is then checked.
        chrom = 'chr1'
        bank = SampleBank("TTTTTAAAAAAAAAAAAAAAAAAAA", chrom=chrom)

        reference_only = bank.add_sample_name('sample_1')
        reference_only.add_sequence(
            ".........................", n_fwd=20, n_rev=20)

        with_snp = bank.add_sample_name('sample_2')
        with_snp.add_sequence(
            "............C............", n_fwd=20, n_rev=20)

        caller = VariantCallerBuilderFromSampleBank(
            bank, self.work_dir).build()
        output_vcf = caller.run().output_vcf

        expectation = VCFExpectation(self, output_vcf)
        snp_record = expectation.has_record_for_variant(
            Variant(chrom, 12, "A", "C"))
        sample_1_call = snp_record.with_sample("sample_1")

        genotype_checked = sample_1_call.has_genotype("0|0")
        genotype_checked.has_RR_genotype_likelihood(0.0)
Beispiel #4
0
    def test_should_raise_when_adding_existing_sample(self):
        # Registering a sample name twice must be rejected by the bank.
        existing_name = "SAMPLE1"
        bank = SampleBank("AAA")
        bank.add_sample_name(existing_name)

        with self.assertRaisesRegex(
                weCallException,
                "Sample SAMPLE1 already exists in the SampleBank."):
            bank.add_sample_with_seqs_and_quals(existing_name, [])
Beispiel #5
0
    def test_should_add_seq_and_quals_list_with_fwd_and_rev_reads(self):
        # One sequence/quality pair yields one entry that keeps the
        # requested forward and reverse read counts.
        bank = SampleBank("AAA")
        bank.add_sample_with_seqs_and_quals(
            "SAMPLE1", ["...", "007"], n_fwd=1, n_rev=2)

        self.assertEqual(len(bank["SAMPLE1"]), 1)

        expected_counts = (("n_fwd", 1), ("n_rev", 2))
        for attribute, expected in expected_counts:
            self.assertEqual(getattr(bank["SAMPLE1"][0], attribute), expected)
Beispiel #6
0
    def test_should_add_short_sequence_and_quality_list(self):
        # The quality digits "007" are expected to come back as "!!g".
        bank = SampleBank("AAA")
        bank.add_sample_with_seqs_and_quals("SAMPLE1", ["...", "007"])

        per_builder_reads = []
        for read_builder in bank["SAMPLE1"]:
            per_builder_reads.append(read_builder.build_reads(0, {}))

        reads = []
        for built in per_builder_reads:
            reads.extend(built)

        self.assertEqual(reads[0].qual, "!!g")
Beispiel #7
0
    def test_should_add_seq_and_quals_list_with_deletion(self):
        # Sequence ".*C" with quality row "1 3" is expected to produce a
        # read whose quality string is "+?".
        bank = SampleBank("AAA")
        bank.add_sample_with_seqs_and_quals("SAMPLE1", [".*C", "1 3"])

        per_builder_reads = [
            read_builder.build_reads(0, {}) for read_builder in bank["SAMPLE1"]
        ]
        reads = []
        for built in per_builder_reads:
            for read in built:
                reads.append(read)

        first_read = reads[0]
        self.assertEqual(first_read.qual, "+?")
Beispiel #8
0
 def __run_wecall_variant_caller(self, chrom, reference_string, sequence_list, vcf_stem=None):
     """Run the caller for one chromosome and return the output VCF path.

     The default sample is added with 10 forward and 10 reverse reads, MNP
     calls are switched off, and the VCF is written to
     ``self.intermediate_vcfs_dir`` as ``<vcf_stem>.vcf``.

     :param chrom: chromosome name for the sample bank; also used as the
         file stem when ``vcf_stem`` is None.
     :param reference_string: reference passed to ``SampleBank``.
     :param sequence_list: sequences/qualities for the default sample.
     :param vcf_stem: optional stem of the output VCF file name.
     :return: the wrapper's ``output_vcf`` after the run.
     """
     stem = chrom if vcf_stem is None else vcf_stem

     bank = SampleBank(reference_string, chrom=chrom)
     bank.add_sample_with_seqs_and_quals(
         DEFAULT_SAMPLE_NAME, sequence_list, n_fwd=10, n_rev=10)

     caller = VariantCallerBuilderFromSampleBank(bank, self.work_dir).build()
     caller.add_additional_command("allowMNPCalls", False)

     vcf_name = "{}.vcf".format(stem)
     caller.output_vcf = path.join(self.intermediate_vcfs_dir, vcf_name)
     caller.run()
     return caller.output_vcf
Beispiel #9
0
    def __build_default_sample_bank(
            self,
            ref,
            sequence_list,
            n_fwd=None,
            n_rev=None):
        """Return a SampleBank over ``ref`` holding only the default sample.

        :param ref: reference passed to ``SampleBank``.
        :param sequence_list: sequences/qualities for the default sample.
        :param n_fwd: forwarded positionally to
            ``add_sample_with_seqs_and_quals``.
        :param n_rev: forwarded positionally to
            ``add_sample_with_seqs_and_quals``.
        """
        bank = SampleBank(ref)
        sample_arguments = (DEFAULT_SAMPLE_NAME, sequence_list, n_fwd, n_rev)
        bank.add_sample_with_seqs_and_quals(*sample_arguments)
        return bank
Beispiel #10
0
    def test_should_add_two_sequence_list(self):
        # Two sequences without quality rows give two reads with "H"
        # qualities of matching length.
        def sort_key(read):
            return (read.pos, read.seq, read.qual, read.cigarstring,
                    read.mapq)

        bank = SampleBank("AAA")
        bank.add_sample_with_seqs_and_quals("SAMPLE1", ["...", " .."])

        unsorted_reads = []
        for read_builder in bank["SAMPLE1"]:
            unsorted_reads.extend(read_builder.build_reads(0, {}))
        reads = sorted(unsorted_reads, key=sort_key)

        self.assertEqual(len(reads), 2)
        self.assertEqual(reads[0].qual, "HHH")
        self.assertEqual(reads[1].qual, "HH")
Beispiel #11
0
    def test_should_place_variants_at_custom_position(self):
        # The bank is created with 100 as its second argument, and the
        # expected variant positions are offset accordingly (105 and 109).
        bank = SampleBank("AAATTTTGGGAG", 100)
        sequences_by_sample = (
            ("SAMPLE1", ".....G......"),
            ("SAMPLE2", "..........*."),
        )
        for name, _ in sequences_by_sample:
            bank.add_sample_name(name)
        for name, sequence in sequences_by_sample:
            bank[name].add_sequence(sequence)

        snp = Variant(bank.reference.chrom, 105, "T", "G")
        deletion = Variant(bank.reference.chrom, 109, "GA", "G")

        self.assertEqual(bank["SAMPLE1"].variants, {snp})
        self.assertEqual(bank["SAMPLE2"].variants, {deletion})
        self.assertEqual(bank.variants, {snp, deletion})
Beispiel #12
0
    def test_should_return_all_variants(self):
        # Each sample reports its own variant; the bank reports both.
        bank = SampleBank("AAATTTTGGGAG")
        bank.add_sample_name("SAMPLE1")
        bank.add_sample_name("SAMPLE2")

        bank["SAMPLE1"].add_sequence(".....G......")
        bank["SAMPLE2"].add_sequence("..........*.")

        snp = Variant(bank.reference.chrom, 5, "T", "G")
        deletion = Variant(bank.reference.chrom, 9, "GA", "G")

        expected_by_sample = (("SAMPLE1", {snp}), ("SAMPLE2", {deletion}))
        for name, expected_variants in expected_by_sample:
            self.assertEqual(bank[name].variants, expected_variants)
        self.assertEqual(bank.variants, {snp, deletion})
Beispiel #13
0
    def test_should_add_sequence_with_quality(self):
        # Quality digits map to ascii characters as follows:
        # "0": "!", "1": "+", "2": "5", "3": "?", "4": "I", "5": "S",
        # "6": "]", "7": "g", "8": "q", "9": "{"
        name = "SAMPLE1"
        bank = SampleBank("AAA")
        bank.add_sample_name(name)
        bank[name].add_sequence("...", quality_string="007")

        per_builder_reads = [
            read_builder.build_reads(0, {}) for read_builder in bank["SAMPLE1"]
        ]
        reads = []
        for built in per_builder_reads:
            reads += list(built)

        self.assertEqual(reads[0].qual, "!!g")
Beispiel #14
0
    def test_should_add_complex_seq_and_quals_list(self):
        # Mixed list: two sequences followed by a quality row and one
        # sequence (" ..") without; three reads are expected in total.
        def sort_key(read):
            return (read.pos, read.seq, read.qual, read.cigarstring,
                    read.mapq)

        bank = SampleBank("AAA")
        seqs_and_quals = ["...", "007", " ..", ".*C", "1 3"]
        bank.add_sample_with_seqs_and_quals("SAMPLE1", seqs_and_quals)

        per_builder_reads = [
            read_builder.build_reads(0, {}) for read_builder in bank["SAMPLE1"]
        ]
        reads = sorted(
            (read for built in per_builder_reads for read in built),
            key=sort_key)

        self.assertEqual(len(reads), 3)
        self.assertEqual(reads[0].qual, "!!g")
        self.assertEqual(reads[1].qual, "+?")
        self.assertEqual(reads[2].qual, self.default_char * 2)
Beispiel #15
0
    def test_should_raise_when_multiple_quality_strings_specified_per_sequence(
            self):
        # A second quality row ("008") after "007" must be rejected.
        bank = SampleBank("AAA")
        seqs_and_quals = ["...", "007", "008"]

        with self.assertRaisesRegex(weCallException,
                                    "Illegal character in sequence \'008\'"):
            bank.add_sample_with_seqs_and_quals("SAMPLE1", seqs_and_quals)
Beispiel #16
0
    def calls_variants(self, ref, sequence_list, candidate_ascii_haplotypes, prior, expected_ascii_haplotypes):
        """Run the caller with a candidate-variants VCF and check its calls.

        :param ref: reference passed to ``SampleBank``.
        :param sequence_list: sequences/qualities for the sample ``"TEST"``.
        :param candidate_ascii_haplotypes: ascii haplotypes turned into the
            candidate variants written to the candidate VCF.
        :param prior: value stored as the ``AF`` info field of every
            candidate record.
        :param expected_ascii_haplotypes: ascii haplotypes turned into the
            variants the call set must equal.
        """
        bank = SampleBank(ref)
        bank.add_sample_with_seqs_and_quals("TEST", sequence_list, 1, 0)

        generator = AsciiVariantGenerator(bank.reference)
        candidates = generator.get_variants(candidate_ascii_haplotypes)
        expected = generator.get_variants(expected_ascii_haplotypes)

        # Write the candidates, each annotated with AF=prior, to an indexed
        # VCF that is handed to the caller through its configuration.
        candidates_path = path.join(self.work_dir, "candiate_variants.vcf")
        candidates_vcf = VCFBuilder(candidates_path)
        candidates_vcf.schema.set_info_data('AF', 'A', 'Float', 'Allele Frequency')
        for candidate in candidates:
            info = InfoData(candidates_vcf.schema, {"AF": prior})
            candidates_vcf.with_record_from_variant(candidate, info=info)
        candidates_vcf.build().index()

        caller_builder = VariantCallerBuilderFromSampleBank(bank, self.work_dir)
        caller_builder.configuration[CANDIDATE_VARIANTS_FILE_KEY] = candidates_vcf.compressed_filename
        caller = caller_builder.build()
        callset = caller.run().get_variant_callset(self)

        self.assertEqual(callset.get_variants(), set(expected))
0
    def test_can_build_correct_ref_and_bam_file(self):
        # Builds input data from a bank whose reference contains a "*"
        # and checks the BAM reads and the FASTA reference that result.
        bank = SampleBank("ATCCT*ATAATAAATAAATAAT")
        sample_name = "TEST_SAMPLE"
        bank.add_sample_name(sample_name)
        bank[sample_name].add_sequence("....CT.........T......")

        builder = WecallInputDataBuilder(self.work_dir).with_sample_bank(bank)

        input_files = builder.build()

        bam_file = pysam.Samfile(input_files.bam_filenames[0], "rb")
        try:
            for read in bam_file.fetch():
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "ATCCCTATAATAAATTAATAAT")
                self.assertEqual(read.cigarstring, "5M1I16M")
        finally:
            # Release the handle even when an assertion above fails.
            bam_file.close()

        fasta_file = pysam.Fastafile(input_files.reference_filename)
        try:
            self.assertEqual(
                fasta_file.get_reference_length(bank.reference.chrom), 21)
            self.assertEqual(fasta_file.fetch(bank.reference.chrom, 0, 21),
                             "ATCCTATAATAAATAAATAAT")
        finally:
            fasta_file.close()
Beispiel #18
0
    def calls_variants_from_samples(self,
                                    ref,
                                    sample_seqs,
                                    expected_haplotypes=None,
                                    expected_call_stubs=None,
                                    config_dict=None):
        """
        Run the caller on the given samples and compare its genotype calls
        with the expectation, given either as haplotypes or as call stubs.

        :param ref: reference passed to ``SampleBank``
        :param sample_seqs: dictionary: {sample_name : sequence list for that sample}
        :param expected_haplotypes: dictionary: {sample_name : list of two ascii sequences expressing the genotype}
        :param expected_call_stubs: dictionary: {variant_stub: dictionary {sample_name: str(genotype)} }
        :param config_dict: forwarded to the caller run together with the sample bank
        """
        self.__validate_expected_calls(
            expected_haplotypes, expected_call_stubs)

        bank = SampleBank(ref)
        for name, seqs in sample_seqs.items():
            bank.add_sample_with_seqs_and_quals(name, seqs)

        callset = self.__run_wecall(bank, config_dict)
        actual_calls = callset.get_variants_with_genotypes()

        if expected_call_stubs is not None:
            # Expectation given as stubs: resolve each stub to a variant on
            # the bank's chromosome and wrap every genotype string.
            expected_calls = {}
            for stub, genotypes in expected_call_stubs.items():
                variant = self._variant_from_stub(bank.reference.chrom, stub)
                expected_calls[variant] = OrderedDict(
                    (name, GenotypeCall(genotype))
                    for name, genotype in genotypes.items())
        else:
            # Expectation given as haplotypes: filter the actual calls
            # first, then derive the expected calls from the ascii
            # sequences.
            self.__filter_none_genotypes(actual_calls)
            expected_calls = self.__get_expected_calls_from_sample_ascii_haplotypes(
                expected_haplotypes, bank.reference)

        self.maxDiff = None  # print the whole message if the following assertion fails
        self.assertDictEqual(expected_calls, actual_calls)
    def assert_quality_recalibrated_in_output_bam(self, ref_string,
                                                  input_bam_seqs,
                                                  output_bam_seqs):
        """Run the caller with base-quality recalibration and check its SAM.

        The caller is run on ``input_bam_seqs`` with ``recalibrateBaseQs``
        enabled and ``intermediateRecalibFileStem`` set to
        ``self.output_stem``. The reads found in ``self.output_sam`` are then
        compared, after sorting both sides by the same key, with the reads
        built from ``output_bam_seqs``.

        :param ref_string: reference used for both sample banks.
        :param input_bam_seqs: sequences/qualities given to the caller.
        :param output_bam_seqs: sequences/qualities expected in the output.
        """
        def read_sort_key(read):
            return (read.pos, read.seq, read.qual, read.cigarstring,
                    read.mapq)

        input_sample_bank = SampleBank(ref_string)
        input_sample_bank.add_sample_with_seqs_and_quals(
            self.sample_name, input_bam_seqs)

        output_sample_bank = SampleBank(ref_string)
        output_sample_bank.add_sample_with_seqs_and_quals(
            self.sample_name, output_bam_seqs)

        vc_builder = VariantCallerBuilderFromSampleBank(
            input_sample_bank, self.work_dir)
        vc_builder.configuration["recalibrateBaseQs"] = "true"
        vc_builder.configuration[
            "intermediateRecalibFileStem"] = self.output_stem
        vc_builder.build().run()

        self.assertTrue(os.path.exists(self.output_sam))

        sam_file = pysam.Samfile(self.output_sam, "r")
        try:
            reads = list(sam_file.fetch())
            self.assertEqual(len(reads),
                             len(output_sample_bank[self.sample_name]))

            # Sort the sam as in sequence bank.
            output_reads = sorted(
                output_sample_bank[self.sample_name].build_reads(0, {}),
                key=read_sort_key)
            reads.sort(key=read_sort_key)

            for read, expected_sequence in zip(reads, output_reads):
                self.assertEqual(read.pos, expected_sequence.pos)
                self.assertEqual(read.seq, expected_sequence.seq)
                self.assert_matching_ascii_qualities(read.qual,
                                                     expected_sequence.qual)
                self.assertEqual(read.cigarstring,
                                 expected_sequence.cigarstring)
                self.assertEqual(read.mapq, expected_sequence.mapq)
        finally:
            # Close the handle even when one of the assertions fails.
            sam_file.close()
Beispiel #20
0
    def test_should_be_able_to_build_bam_and_ref_data_with_multiple_chromosomes(
            self):
        # Two banks on different chromosomes share the sample name
        # "sample"; the first BAM file is queried once per chromosome.
        bank_1 = SampleBank("A" * 10, 0, chrom='10')
        bank_1.add_sample_name("sample").add_sequence("." * 10)

        bank_2 = SampleBank("T" * 9, 0, chrom='20')
        bank_2.add_sample_name("sample").add_sequence("." * 9)

        builder = WecallInputDataBuilder(
            self.work_dir).with_sample_bank(bank_1).with_sample_bank(bank_2)

        input_files = builder.build()
        bam_file = pysam.Samfile(input_files.bam_filenames[0], "rb")
        try:
            for read in bam_file.fetch(reference='20'):
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "T" * 9)
                self.assertEqual(read.cigarstring, "9M")

            for read in bam_file.fetch(reference='10'):
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "A" * 10)
                self.assertEqual(read.cigarstring, "10M")
        finally:
            # Release the handle even when an assertion above fails.
            bam_file.close()
Beispiel #21
0
    def test_can_build_multiple_bam_files(self):
        # Two samples on one bank give two BAM files; the reads of each
        # file are checked against that sample's sequence.
        bank = SampleBank("ATCCT*ATAATAAATAAATAAT")
        sample_name1 = "TEST_SAMPLE1"
        bank.add_sample_name(sample_name1)
        bank[sample_name1].add_sequence("....CT.........T......")

        sample_name2 = "TEST_SAMPLE2"
        bank.add_sample_name(sample_name2)
        bank[sample_name2].add_sequence(".....*.G..........*...")

        builder = WecallInputDataBuilder(self.work_dir).with_sample_bank(bank)
        input_bams = builder.build().bam_filenames

        bam_file1 = pysam.Samfile(input_bams[0], "rb")
        try:
            for read in bam_file1.fetch():
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "ATCCCTATAATAAATTAATAAT")
                self.assertEqual(read.cigarstring, "5M1I16M")
        finally:
            # Release the handle even when an assertion above fails.
            bam_file1.close()

        bam_file2 = pysam.Samfile(input_bams[1], "rb")
        try:
            for read in bam_file2.fetch():
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "ATCCTAGAATAAATAAAAAT")
                self.assertEqual(read.cigarstring, "17M1D3M")
        finally:
            bam_file2.close()
Beispiel #22
0
 def with_ref_sequence(self, ref_sequence, pos_from=0, chrom=DEFAULT_CHROM):
     """Store a new SampleBank under ``chrom`` and return ``self``.

     :param ref_sequence: first argument passed to ``SampleBank``.
     :param pos_from: second argument passed to ``SampleBank``.
     :param chrom: third argument passed to ``SampleBank`` and the key
         under which the bank is stored in ``self._sample_bank``.
     :return: this builder, so calls can be chained.
     """
     bank = SampleBank(ref_sequence, pos_from, chrom)
     self._sample_bank[chrom] = bank
     return self