Example #1
0
 def test_should_fail_at_seq_with_different_length_to_reference(self):
     """Reject a sequence whose length differs from the reference.

     A 2-character sequence is added to a bank built on a 4-character
     reference; ``add_sequence`` is expected to raise ``weCallException``.
     """
     # Given
     bank = SequenceBank(ReferenceChromosome("AAAA"))
     mismatched_seq = "CC"
     # Then
     with self.assertRaises(weCallException):
         bank.add_sequence(mismatched_seq)
Example #2
0
 def test_find_adjacent_insertion_and_snp(self):
     """Check the variants reported for the row ``.CG.........``.

     Against reference ``T*ATAAAAAAAT`` exactly two variants are expected:
     ``T -> TC`` at position 0 and ``A -> G`` at position 1.
     """
     reference = ReferenceChromosome("T*ATAAAAAAAT")
     sequence = Sequence(reference, ".CG.........")
     expected_variants = {
         Variant(reference.chrom, 0, "T", "TC"),
         Variant(reference.chrom, 1, "A", "G"),
     }
     self.assertEqual(sequence.variants, expected_variants)
Example #3
0
 def test_should_find_multiple_snps(self):
     """Check that two separate substitutions are both reported.

     Against an all-``A`` reference of 13 bases, the row ``.C.........T.``
     is expected to yield ``A -> C`` at position 1 and ``A -> T`` at
     position 11, and nothing else.
     """
     reference = ReferenceChromosome("AAAAAAAAAAAAA")
     sequence = Sequence(reference, ".C.........T.")
     expected_variants = {
         Variant(reference.chrom, 1, "A", "C"),
         Variant(reference.chrom, 11, "A", "T"),
     }
     self.assertEqual(sequence.variants, expected_variants)
Example #4
0
 def test_should_interpret_trailing_whitespace_to_override_pos_to_when_seq_has_insertion(
         self):
     """Check the read length for a row that ends in whitespace.

     With reference ``C*GA`` and the row ``.C`` followed by two spaces,
     the first read built is expected to have ``rlen == 2``.
     """
     reference = ReferenceChromosome("C*GA")
     seq_builders = sequence_builder(reference, ".C  ")
     # Collect the reads produced by every builder into one flat list.
     reads = []
     for seq_builder in seq_builders:
         reads.extend(seq_builder.build_reads(0, {}))
     self.assertEqual(reads[0].rlen, 2)
Example #5
0
 def test_find_multiple_single_base_deletion(self):
     """Check that two ``*`` marks in a row yield two deletions.

     Against reference ``TTAAAAAGAAAAT`` the row ``..*.....*....`` is
     expected to produce ``TA -> T`` at position 1 and ``GA -> G`` at
     position 7.
     """
     reference = ReferenceChromosome("TTAAAAAGAAAAT")
     sequence = Sequence(reference, "..*.....*....")
     expected_variants = {
         Variant(reference.chrom, 1, "TA", "T"),
         Variant(reference.chrom, 7, "GA", "G"),
     }
     self.assertEqual(sequence.variants, expected_variants)
Example #6
0
 def test_find_adjacent_snp_and_deletion(self):
     """Check a substitution immediately followed by a deletion mark.

     Against reference ``TTAAAAAAAAAT`` the row ``.G*.........`` is
     expected to produce ``T -> G`` and ``TA -> T``, both at position 1.
     """
     reference = ReferenceChromosome("TTAAAAAAAAAT")
     sequence = Sequence(reference, ".G*.........")
     expected_variants = {
         Variant(reference.chrom, 1, "T", "G"),
         Variant(reference.chrom, 1, "TA", "T"),
     }
     self.assertEqual(sequence.variants, expected_variants)
Example #7
0
    def test_header_for_multisample_multicontig(self):
        """Check the builder header after registering two contigs/samples.

        Contig ``"1"`` (length 10, sample ``SAMPLE_ONE``) and contig ``"2"``
        (length 20, sample ``SAMPLE_TWO``) are registered on one builder;
        ``builder.header`` must then equal the expected ``HD``/``SQ``/``RG``
        dictionary.
        """
        bank = SequenceBank(ReferenceChromosome(""))
        bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
        builder = BAMBuilder(bam_path)
        builder.with_bam_contig_data("1", 10, "SAMPLE_ONE", bank)
        builder.with_bam_contig_data("2", 20, "SAMPLE_TWO", bank)

        contigs = [
            {'LN': 10, 'SN': "1"},
            {'LN': 20, 'SN': "2"},
        ]
        read_groups = [
            {"ID": RG_ID + "_SAMPLE_ONE", "SM": "SAMPLE_ONE"},
            {"ID": RG_ID + "_SAMPLE_TWO", "SM": "SAMPLE_TWO"},
        ]
        expected_header = {
            'HD': {'VN': '1.0'},
            'SQ': contigs,
            'RG': read_groups,
        }

        self.assertDictEqual(expected_header, builder.header)
Example #8
0
    def test_can_build_with_one_seq(self):
        """Build a BAM from a single sequence row and read it back.

        One row is added with ``n_fwd=2`` and ``n_rev=1``; the resulting BAM
        is expected to hold three reads, each at position 0 with sequence
        ``TGTAAAAAAAT`` and CIGAR ``1M1D10M``, and a ``.bai`` index must
        exist next to the BAM file.

        The BAM handle is closed in a ``finally`` clause so that the test
        does not leak an open file, even when an assertion fails.
        """
        ref = ReferenceChromosome("TCATAAAAAAAT")
        sequence_bank = SequenceBank(ref)
        sequence_bank.add_sequence(".*G.........",
                                   "            ",
                                   n_fwd=2,
                                   n_rev=1)

        bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
        builder = BAMBuilder(bam_path).with_bam_contig_data(
            self.chrom, self.chrom_length, self.sample_name, sequence_bank)
        builder.build()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 3)

            for read in reads:
                self.assertEqual(read.pos, 0)
                self.assertEqual(read.seq, "TGTAAAAAAAT")
                self.assertEqual(read.cigarstring, "1M1D10M")

            self.assertTrue(os.path.isfile(bam_file.filename))
            self.assertTrue(
                os.path.isfile(bam_file.filename.decode() + ".bai"))
        finally:
            # Release the file handle regardless of the test outcome.
            bam_file.close()
 def test_should_be_able_to_access_ref_char(self):
     """Check indexing into a reference built from ``AC*T*G``.

     Indices 0 to 3 are expected to return ``A``, ``C``, ``T`` and ``G``
     respectively.
     """
     # Given
     seq_ref = ReferenceChromosome("AC*T*G")
     # Then
     for index, expected_base in enumerate("ACTG"):
         self.assertEqual(seq_ref[index], expected_base)
Example #10
0
 def test_should_interpret_trailing_whitespace_to_override_pos_to(self):
     """Check the single read built from a row ending in whitespace.

     With reference ``CATG`` and the row ``.C`` followed by two spaces,
     exactly one read is expected, at position 0 with sequence ``CC``.
     """
     reference = ReferenceChromosome("CATG")
     seq_builders = sequence_builder(reference, ".C  ")
     reads = [
         read
         for seq_builder in seq_builders
         for read in seq_builder.build_reads(0, {})
     ]
     self.assertEqual(len(reads), 1)
     only_read = reads[0]
     self.assertEqual(only_read.pos, 0)
     self.assertEqual(only_read.seq, "CC")
Example #11
0
 def test_should_build_correct_sequence_with_insertion_at_the_end(self):
     """Check the read built when the row ends with inserted bases.

     With reference ``CCC**`` and row ``...TT`` the first read is expected
     at position 0 with ``rlen == 5`` and sequence ``CCCTT``.
     """
     reference = ReferenceChromosome("CCC**")
     reads = []
     for seq_builder in sequence_builder(reference, "...TT"):
         reads.extend(seq_builder.build_reads(0, {}))
     first_read = reads[0]
     self.assertEqual(first_read.pos, 0)
     self.assertEqual(first_read.rlen, 5)
     self.assertEqual(first_read.seq, "CCCTT")
Example #12
0
 def test_should_build_with_custom_quality_with_ins(self):
     """Check read qualities for a row with an insertion.

     The row ``..CC.`` on reference ``AA**A`` is given the quality string
     ``31220``; the first read's ``qual`` is expected to be the
     ``self.ascii_codes`` entries for 3, 1, 2, 2, 0 in that order.
     """
     reference = ReferenceChromosome("AA**A")
     seq_builders = sequence_builder(reference, "..CC.",
                                     quality_string="31220")
     reads = []
     for seq_builder in seq_builders:
         reads.extend(seq_builder.build_reads(0, {}))
     actual_qual = reads[0].qual
     codes = self.ascii_codes
     expected_qual = codes["3"] + codes["1"] + codes["2"] * 2 + codes["0"]
     self.assertEqual(actual_qual, expected_qual)
Example #13
0
 def test_should_allocate_pos_from_and_pos_to_based_on_reference_size(self):
     """Check the default span of a reference built from ``ACCCT``.

     ``pos_from`` is expected to be 0 and ``pos_to`` the length of the
     input string.
     """
     # Given
     bases = "ACCCT"
     # When
     reference = ReferenceChromosome(bases)
     # Then
     self.assertEqual(reference.pos_from, 0)
     self.assertEqual(reference.pos_to, len(bases))
Example #14
0
 def test_should_interpret_trailing_whitespace_to_override_positions_for_complex_ref_and_seq(
         self):
     """Check position and length for a row mixing ``*`` and whitespace.

     With reference ``ACCC*G*A`` and the row ``.**.C`` followed by three
     spaces, the first read is expected at position 0 with ``rlen == 3``.
     """
     reference = ReferenceChromosome("ACCC*G*A")
     seq_builders = sequence_builder(reference, ".**.C   ")
     reads = [
         read
         for seq_builder in seq_builders
         for read in seq_builder.build_reads(0, {})
     ]
     first_read = reads[0]
     self.assertEqual(first_read.pos, 0)
     self.assertEqual(first_read.rlen, 3)
Example #15
0
 def test_should_build_correct_sequence_without_any_whitespace(self):
     """Check the read built from a row that contains no whitespace.

     With reference ``C*CC`` and row ``.*.T`` exactly one read is expected,
     at position 0, with ``rlen == 3`` and sequence ``CCT``.
     """
     reference = ReferenceChromosome("C*CC")
     reads = []
     for seq_builder in sequence_builder(reference, ".*.T"):
         reads.extend(seq_builder.build_reads(0, {}))
     self.assertEqual(len(reads), 1)
     only_read = reads[0]
     self.assertEqual(only_read.pos, 0)
     self.assertEqual(only_read.rlen, 3)
     self.assertEqual(only_read.seq, "CCT")
Example #16
0
 def test_should_interpret_leading_whitespace_to_override_pos_from_when_ref_has_deletion(
         self):
     """Check the read built from a row that starts with whitespace.

     With reference ``C*TG`` and the row of two spaces followed by ``C.``,
     exactly one read is expected, at position 1 with sequence ``CG``.
     """
     reference = ReferenceChromosome("C*TG")
     seq_builders = sequence_builder(reference, "  C.")
     reads = [
         read
         for seq_builder in seq_builders
         for read in seq_builder.build_reads(0, {})
     ]
     self.assertEqual(len(reads), 1)
     only_read = reads[0]
     self.assertEqual(only_read.pos, 1)
     self.assertEqual(only_read.seq, "CG")
Example #17
0
 def test_should_build_with_custom_quality_with_del(self):
     """Check read qualities for a row with a deletion.

     The row ``..*..`` on reference ``AAAAA`` is given the quality string
     ``31 00``; after sorting, the first read's ``qual`` is expected to be
     the ``self.ascii_codes`` entries for 3, 1, 0, 0 in that order.
     """
     def read_sort_key(read):
         return (read.pos, read.seq, read.qual, read.cigarstring, read.mapq)

     reference = ReferenceChromosome("AAAAA")
     seq_builders = sequence_builder(reference, "..*..",
                                     quality_string="31 00")
     reads = []
     for seq_builder in seq_builders:
         reads.extend(seq_builder.build_reads(0, {}))
     reads.sort(key=read_sort_key)
     actual_qual = reads[0].qual
     codes = self.ascii_codes
     expected_qual = codes["3"] + codes["1"] + codes["0"] * 2
     self.assertEqual(actual_qual, expected_qual)
Example #18
0
 def test_find_multiple_variants(self):
     """Check a row that combines several kinds of variant.

     Against reference ``TA*AAAGCTAACT`` the row ``.GC...T...**.`` is
     expected to produce exactly four variants: ``A -> G`` and ``A -> AC``
     at position 1, ``G -> T`` at position 5 and ``AAC -> A`` at
     position 8.
     """
     reference = ReferenceChromosome("TA*AAAGCTAACT")
     sequence = Sequence(reference, ".GC...T...**.")
     chrom = reference.chrom
     expected_variants = {
         Variant(chrom, 1, "A", "G"),
         Variant(chrom, 1, "A", "AC"),
         Variant(chrom, 5, "G", "T"),
         Variant(chrom, 8, "AAC", "A"),
     }
     self.assertEqual(sequence.variants, expected_variants)
Example #19
0
 def test_should_build_with_custom_quality_and_sequence_shorter_than_reference(
         self):
     """Check read qualities when row and quality string are padded.

     The first read's ``qual`` is expected to be the ``self.ascii_codes``
     entries for 3, 1 and 0 followed by ``self.default_qual``.
     """
     reference = ReferenceChromosome("AAAAAAAAAAAA")
     seq_builders = sequence_builder(
         reference,
         "  ..*..     ",
         quality_string="  31 0      ",
     )
     reads = [
         read
         for seq_builder in seq_builders
         for read in seq_builder.build_reads(0, {})
     ]
     actual_qual = reads[0].qual
     codes = self.ascii_codes
     expected_qual = (codes["3"] + codes["1"] + codes["0"] +
                      self.default_qual)
     self.assertEqual(actual_qual, expected_qual)
Example #20
0
 def test_should_build_two_complex_seqs_defined_on_single_line(self):
     """Check that one row can describe two separate reads.

     With reference ``AA*CC*TGTAAGG`` and row `` .G.  ,c,*,  `` two reads
     are expected after sorting: one at position 1 with sequence ``AGC``
     and one at position 4 with sequence ``TCTA``.
     """
     def read_sort_key(read):
         return (read.pos, read.seq, read.qual, read.cigarstring, read.mapq)

     reference = ReferenceChromosome("AA*CC*TGTAAGG")
     seq_builders = sequence_builder(reference, " .G.  ,c,*,  ")
     reads = []
     for seq_builder in seq_builders:
         reads.extend(seq_builder.build_reads(0, {}))
     reads.sort(key=read_sort_key)
     self.assertEqual(len(reads), 2)
     first_read, second_read = reads[0], reads[1]
     self.assertEqual(first_read.pos, 1)
     self.assertEqual(first_read.seq, "AGC")
     self.assertEqual(second_read.pos, 4)
     self.assertEqual(second_read.seq, "TCTA")
Example #21
0
 def test_should_be_able_to_add_snp_using_whitespace_dsl_syntax(self):
     """Check a whitespace-padded row added through a ``SequenceBank``.

     The row ``.T.`` padded with three leading and one trailing space is
     added to a bank on reference ``CC*AAGG``; the first read built from
     the bank is expected at position 2 with sequence ``ATG``.
     """
     # Given
     reference_text = "CC*AAGG"
     padded_snp_row = "   .T. "
     # When
     bank = SequenceBank(ReferenceChromosome(reference_text))
     bank.add_sequence(padded_snp_row)
     reads = []
     for seq_builder in bank:
         reads.extend(seq_builder.build_reads(0, {}))
     # Then
     first_read = reads[0]
     self.assertEqual(first_read.pos, 2)
     self.assertEqual(first_read.seq, 'ATG')
Example #22
0
def build_annotated_pair(fwd, rev, n_fwd, n_rev, mapping_quality, insert_size,
                         read_id, read_flags, cigar_string, read_start,
                         read_mate_start):
    """Build a single-element list holding one ``ReadPairWithCoverage``.

    For each of ``fwd`` and ``rev`` a ``ReferenceChromosome``, a ``Sequence``
    (with ``,`` replaced by ``.`` and the text upper-cased), a
    ``SequenceQuality`` and a ``ReadSequence`` are constructed. Both
    ``ReadSequence`` objects receive the same mapping quality, insert size,
    read id, flags, CIGAR string, read start and mate start.

    Returns:
        ``[ReadPairWithCoverage(fwd_read, rev_read, n_fwd, n_rev)]``.
    """
    raw_pair = (fwd, rev)

    # Objects are built stage by stage, forward before reverse in each stage.
    references = [
        ReferenceChromosome(raw.reference_string, raw.pos_from)
        for raw in raw_pair
    ]
    sequences = [
        Sequence(reference,
                 raw.sequence_string.replace(",", ".").upper(),
                 cigar_string)
        for reference, raw in zip(references, raw_pair)
    ]
    qualities = [SequenceQuality(raw.quality_string) for raw in raw_pair]

    fwd_read, rev_read = [
        ReadSequence(sequence, quality, mapping_quality, insert_size,
                     read_id, read_flags, read_start, read_mate_start)
        for sequence, quality in zip(sequences, qualities)
    ]
    return [ReadPairWithCoverage(fwd_read, rev_read, n_fwd, n_rev)]
Example #23
0
    def test_can_build_two_chroms(self):
        """Build a BAM with two contigs and query each of them.

        Contig ``"1"`` is expected to hold one read with sequence
        ``TGTAAAAAAAT`` and contig ``"X"`` one read with sequence ``GGG``.
        An unrestricted fetch must return both reads in that order, and
        fetching from the unknown contig ``"2"`` must raise ``ValueError``.
        A ``.bai`` index must exist next to the BAM file.

        The file is opened twice, as in the original test; each handle is
        closed in a ``finally`` clause so that neither is leaked, even when
        an assertion fails.
        """
        ref1 = ReferenceChromosome("TCATAAAAAAAT")
        sequence_bank1 = SequenceBank(ref1)
        sequence_bank1.add_sequence(".*G.........")

        ref2 = ReferenceChromosome("GGGG")
        sequence_bank2 = SequenceBank(ref2)
        sequence_bank2.add_sequence("..*.")

        bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
        builder = BAMBuilder(bam_path).with_bam_contig_data(
            "1", 100, "SAMPLE", sequence_bank1).with_bam_contig_data(
                "X", 50, "SAMPLE", sequence_bank2)
        builder.build()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads_chrom1 = list(bam_file.fetch(region="1:1-20"))
            self.assertEqual(len(reads_chrom1), 1)
            self.assertEqual(reads_chrom1[0].seq, "TGTAAAAAAAT")
        finally:
            # Close the first handle before the file is reopened below.
            bam_file.close()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads_chrom2 = list(bam_file.fetch(region="X:1-5"))
            self.assertEqual(len(reads_chrom2), 1)
            self.assertEqual(reads_chrom2[0].seq, "GGG")

            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 2)
            self.assertEqual(reads[0].seq, "TGTAAAAAAAT")
            self.assertEqual(reads[1].seq, "GGG")

            self.assertRaises(ValueError, bam_file.fetch, region="2:1-20")

            self.assertTrue(os.path.isfile(bam_file.filename))
            self.assertTrue(
                os.path.isfile(bam_file.filename.decode() + ".bai"))
        finally:
            bam_file.close()
Example #24
0
    def test_should_generate_variant_from_ascii_text(self):
        """Check variants generated from two rows over one reference.

        With reference ``ATAAAAAAAAAT`` and the rows ``.A........*.`` and
        ``.C..........``, the generator is expected to return ``T -> A`` and
        ``T -> C`` at position 1 and ``AA -> A`` at position 9.
        """
        reference_text = "ATAAAAAAAAAT"
        alt_rows = [".A........*.", ".C.........."]
        variant_generator = AsciiVariantGenerator(
            ReferenceChromosome(reference_text))

        generated = variant_generator.get_variants(alt_rows)

        chrom = variant_generator.reference.chrom
        expected = {
            Variant(chrom, 1, "T", "A"),
            Variant(chrom, 1, "T", "C"),
            Variant(chrom, 9, "AA", "A"),
        }
        self.assertEqual(generated, expected)
Example #25
0
    def test_should_use_sample_name_if_available(self):
        """Check that the output VCF carries the supplied sample name.

        A bank with one row (10 forward and 10 reverse copies) on
        chromosome ``'14'`` is passed to the driver under the key
        ``'sample'``; the output VCF is expected to contain one record and
        the sample list ``['sample']``.
        """
        chrom = '14'
        reference_text = 'CGGCGGTCGAACGGAGCCCCAAGCGAAGCTCAAAACATGG'

        reference = ReferenceChromosome(reference_text, 0, chrom)
        sequence_bank = SequenceBank(reference)
        sequence_bank.add_sequence(
            '      ...........A.............         ',
            n_fwd=10,
            n_rev=10,
        )

        driver = (
            SVCDriver(self)
            .with_ref_sequence(reference_text, chrom=chrom)
            .with_bam_data('pi.bam', {'sample': sequence_bank}, True)
        )

        expect = driver.call()

        output_vcf = expect.with_output_vcf()
        output_vcf.record_count(1).with_samples(['sample'])
0
    def test_can_build_with_defined_quality(self):
        """Build a BAM from a row with an explicit quality string.

        The row ``.*G.....`` with quality string ``9 87  00`` is added as a
        single forward read; the read read back from the BAM is expected to
        have sequence ``TGTAAAT`` and quality ``{qgHH!!``.

        The BAM handle is closed in a ``finally`` clause so that the test
        does not leak an open file, even when an assertion fails.
        """
        ref = ReferenceChromosome("TCATAAAT")
        sequence_bank = SequenceBank(ref)
        sequence_bank.add_sequence(".*G.....", "9 87  00", n_fwd=1, n_rev=0)

        bam_path = os.path.join(self.work_dir, self.filestub + ".bam")
        builder = BAMBuilder(bam_path).with_bam_contig_data(
            self.chrom, self.chrom_length, self.sample_name, sequence_bank)
        builder.build()

        bam_file = pysam.Samfile(builder.filename, "rb")
        try:
            reads = list(bam_file.fetch())
            self.assertEqual(len(reads), 1)
            self.assertEqual(reads[0].seq, "TGTAAAT")

            # ascii: "0": "!", "1": "+", "2": "5", "3": "?", "4": "H",
            # "5": "S", "6": "]", "7": "g", "8": "q", "9": "{"
            expected_qual = "{qgHH!!"
            self.assertEqual(reads[0].qual, expected_qual)
        finally:
            # Release the file handle regardless of the test outcome.
            bam_file.close()
Example #27
0
    def build_annotated_seq(self, n_fwd, n_rev, mapping_quality, insert_size,
                            read_id, read_flags, cigar_string, read_start,
                            read_mate_start):
        """Build a single-element list holding one ``ReadSequenceWithCoverage``.

        A ``ReadSequence`` is assembled from this object's reference string,
        sequence string (with ``,`` replaced by ``.`` and upper-cased) and
        quality string. The coverage counts are chosen as follows:

        * ``n_fwd`` is not ``None``: ``(n_fwd, n_rev)`` as given;
        * otherwise, ``is_reverse_seq()`` is true: ``(0, 1)``;
        * otherwise, ``is_forward_seq()`` is true: ``(1, 0)``.

        Raises:
            weCallException: if ``n_fwd`` is ``None`` and the sequence is
                neither reverse nor forward.
        """
        normalised_text = self.sequence_string.replace(",", ".").upper()
        read_sequence = ReadSequence(
            Sequence(
                ReferenceChromosome(self.reference_string, self.pos_from),
                normalised_text,
                cigar_string,
            ),
            SequenceQuality(self.quality_string),
            mapping_quality,
            insert_size,
            read_id,
            read_flags,
            read_start,
            read_mate_start,
        )

        # Explicit counts win; the reverse check runs before the forward one.
        if n_fwd is not None:
            coverage = (n_fwd, n_rev)
        elif self.is_reverse_seq():
            coverage = (0, 1)
        elif self.is_forward_seq():
            coverage = (1, 0)
        else:
            raise weCallException(
                "Raw sequence: {} is neither forward or reverse".format(self))

        return [ReadSequenceWithCoverage(read_sequence, *coverage)]
Example #28
0
 def test_should_get_correct_fasta_string_for_offset_reference(self):
     """Check ``fasta_string()`` for a reference created with offset 5.

     A reference built from ``AC*T*G`` with second argument 5 is expected
     to give ``NNNNNACTG``.
     """
     # Given
     offset_reference = ReferenceChromosome("AC*T*G", 5)
     # Then
     fasta_text = offset_reference.fasta_string()
     self.assertEqual(fasta_text, "NNNNNACTG")
Example #29
0
 def test_should_correctly_getitem_for_offset_reference(self):
     """Check indexing into a reference created with offset 10.

     For a reference built from ``A*T`` with second argument 10, index 10
     is expected to return ``A`` and index 11 ``T``.
     """
     # Given
     offset_reference = ReferenceChromosome("A*T", 10)
     # Then
     for position, expected_base in ((10, "A"), (11, "T")):
         self.assertEqual(offset_reference[position], expected_base)
Example #30
0
 def test_should_ignore_asterixes_in_reference_sequence_in_computing_pos_to(
         self):
     """Check ``pos_to`` for a reference string that contains ``*``.

     A reference built from ``C*C*C`` is expected to report
     ``pos_to == 3``.
     """
     # Given
     reference = ReferenceChromosome("C*C*C")
     # Then
     end_position = reference.pos_to
     self.assertEqual(end_position, 3)