def test_variant_sequence_contains():
    """
    Exercise VariantSequence.contains on sequences that share an allele but
    differ in flanking length, and on a sequence with a different allele.
    """
    # AA|C|T
    vs_longer_prefix = VariantSequence(
        prefix="AA",
        alt="C",
        suffix="T",
        reads=[
            AlleleRead(
                prefix="AA", allele="C", suffix="T", name="longer_prefix"),
        ])
    # A|C|TT
    vs_longer_suffix = VariantSequence(
        prefix="A",
        alt="C",
        suffix="TT",
        reads=[
            AlleleRead(
                prefix="A", allele="C", suffix="TT", name="longer_suffix"),
        ])
    # A|C|T
    vs_short = VariantSequence(
        prefix="A",
        alt="C",
        suffix="T",
        reads=[AlleleRead(prefix="A", allele="C", suffix="T", name="short")])

    # each of the two longer sequences is expected to contain the short one
    for container in (vs_longer_prefix, vs_longer_suffix):
        assert container.contains(vs_short), \
            "Expected %s to contain %s" % (container, vs_short)

    # no other ordered pair is expected to satisfy containment
    non_containing_pairs = (
        (vs_longer_prefix, vs_longer_suffix),
        (vs_longer_suffix, vs_longer_prefix),
        (vs_short, vs_longer_prefix),
        (vs_short, vs_longer_suffix),
    )
    for outer, inner in non_containing_pairs:
        assert not outer.contains(inner), \
            "Expected %s to not contain %s" % (outer, inner)

    # The sequences above have a 'C' allele whereas this one has 'G'
    # A|G|T
    vs_different_allele = VariantSequence(
        prefix="A",
        alt="G",
        suffix="T",
        reads=[AlleleRead(prefix="A", allele="G", suffix="T", name="short")])

    # containment must fail in both directions when the alleles differ
    for vs in (vs_longer_suffix, vs_longer_prefix, vs_short):
        assert not vs.contains(vs_different_allele), \
            "Expected %s to not contain %s" % (vs, vs_different_allele)
        assert not vs_different_allele.contains(vs), \
            "Expected %s to not contain %s" % (vs_different_allele, vs)
def test_variant_sequence_min_coverage():
    """
    min_coverage() of AA|C|TT supported by the three reads below is
    expected to be 2.
    """
    # 1: AA|C|TT
    # 2: AA|C|T
    # 3:  A|C|TT
    read_parts = (
        ("AA", "TT", "1"),
        ("AA", "T", "2"),
        ("A", "TT", "3"),
    )
    supporting_reads = [
        AlleleRead(prefix=left, allele="C", suffix=right, name=read_name)
        for (left, right, read_name) in read_parts
    ]
    variant_sequence = VariantSequence(
        prefix="AA", alt="C", suffix="TT", reads=supporting_reads)
    eq_(variant_sequence.min_coverage(), 2)
def test_allele_count_dataframe():
    """
    allele_counts_dataframe should yield one row for a single variant, with
    the counts asserted below for the given reads.
    """
    variant = Variant("test_contig", 50, "C", "G")
    # (prefix, allele, suffix, name) for each read
    read_specs = (
        ("AAA", "C", "TTT", "C1"),
        ("AAC", "C", "TTA", "C2"),
        ("AAA", "G", "TTT", "G1"),
    )
    reads = [
        AlleleRead(prefix=left, allele=allele, suffix=right, name=read_name)
        for (left, allele, right, read_name) in read_specs
    ]
    df = allele_counts_dataframe([(variant, reads)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )

    first_row = df.iloc[0]
    expected_counts = (("n_ref", 2), ("n_alt", 1), ("n_other", 0))
    for column_name, expected_count in expected_counts:
        eq_(getattr(first_row, column_name), expected_count)
def test_assembly_of_simple_sequence_from_mock_reads():
    """
    Assemble four mutually consistent mock reads with both assembly
    functions and check that each yields the single expected sequence.
    """
    # Read sequences:
    #    AAAAA|CC|TTTTT
    #    AAAAA|CC|TTTTT
    #   GAAAAA|CC|TTTTTG
    #     AAAA|CC|TTTT
    five_a = "A" * 5
    five_t = "T" * 5

    # two identical reads with sequence AAAAA|CC|TTTTT
    duplicate_1 = AlleleRead(
        prefix=five_a, allele="CC", suffix=five_t, name="dup1")
    duplicate_2 = AlleleRead(
        prefix=five_a, allele="CC", suffix=five_t, name="dup2")
    # longer sequence GAAAAA|CC|TTTTTG
    longer_read = AlleleRead(
        prefix="G" + five_a, allele="CC", suffix=five_t + "G", name="longer")
    # shorter sequence AAAA|CC|TTTT
    shorter_read = AlleleRead(
        prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter")

    reads = [duplicate_1, duplicate_2, longer_read, shorter_read]

    expected_variant_sequence = VariantSequence(
        prefix="G" + five_a,
        alt="CC",
        suffix=five_t + "G",
        reads=reads)

    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
    # expecting one fewer sequence than reads since two of the reads are
    # duplicates
    eq_(len(initial_variant_sequences), len(reads) - 1)

    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with
    # all 4/4 reads
    expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14

    # calling into either iterative_overlap_assembly or greedy_merge should
    # give same results
    for assemble in (greedy_merge, iterative_overlap_assembly):
        assembled = assemble(initial_variant_sequences, min_overlap_size=1)

        # since no reads contradict each other then we should get back a
        # single assembled sequence
        eq_(
            len(assembled),
            1,
            "Unexpected number of variant sequences: %s" % (assembled, ))

        result = assembled[0]
        eq_(result, expected_variant_sequence)
        eq_(len(result.reads), len(reads))
        eq_(result.min_coverage(), 1)
        eq_(result.mean_coverage(), expected_mean_coverage)
def test_variant_sequence_mean_coverage():
    """
    mean_coverage() of AA|C|TT should equal the average number of reads
    covering each of its five nucleotides.
    """
    # 1: AA|C|TT
    # 2: AA|C|T
    # 3:  A|C|TT
    read_1 = AlleleRead(prefix="AA", allele="C", suffix="TT", name="1")
    read_2 = AlleleRead(prefix="AA", allele="C", suffix="T", name="2")
    read_3 = AlleleRead(prefix="A", allele="C", suffix="TT", name="3")
    variant_sequence = VariantSequence(
        prefix="AA",
        alt="C",
        suffix="TT",
        reads=[read_1, read_2, read_3])

    # number of reads containing each nucleotide of the sequence above,
    # listed left to right
    per_base_coverage = [2, 3, 3, 3, 2]
    expected_mean_coverage = sum(per_base_coverage) / len(per_base_coverage)
    eq_(variant_sequence.mean_coverage(), expected_mean_coverage)
def test_variant_sequence_trim_by_coverage():
    """
    trim_by_coverage(1) should leave AA|C|T unchanged, while
    trim_by_coverage(2) should shorten it to A|C|T.
    """
    longer_read = AlleleRead(prefix="AA", allele="C", suffix="T", name="1")
    shorter_read = AlleleRead(prefix="A", allele="C", suffix="T", name="2")
    reads = [longer_read, shorter_read]

    untrimmed = VariantSequence(
        prefix="AA", alt="C", suffix="T", reads=reads)

    # every nucleotide is spanned by at least one read, so nothing is removed
    eq_(untrimmed.trim_by_coverage(1), untrimmed)

    # the expected result for a threshold of two has the shorter prefix
    expected_after_trim_by_2 = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=reads)
    eq_(untrimmed.trim_by_coverage(2), expected_after_trim_by_2)
def test_variant_sequence_read_names():
    """
    The read_names attribute should equal the set of names of the
    supporting reads.
    """
    supporting_reads = [
        AlleleRead(prefix="A", allele="C", suffix="T", name=read_name)
        for read_name in ("1", "2")
    ]
    variant_sequence = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=supporting_reads)
    eq_(variant_sequence.read_names, {"1", "2"})
def test_partitioned_read_sequences_deletion():
    """
    test_partitioned_read_sequences_deletion : Check that a read supporting
    the deletion TT>T at position 4 is split into the expected prefix, allele
    and suffix, where the reference sequence is assumed to be "ACCTTG".
    """
    # reference sequence = "ACCTTG"
    contig = "chromosome"
    variant = Variant(
        contig,
        4,
        "TT",
        "T",
        normalize_contig_name=False)

    # the read is missing one of the two reference T nucleotides
    read = make_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    variant_reads = reads_supporting_variant(
        samfile=DummySamFile(reads=[read]),
        chromosome=contig,
        variant=variant)
    print(variant_reads)
    assert len(variant_reads) == 1

    # the allele is expected to be empty for this deletion
    expected = AlleleRead(
        name=read.qname,
        prefix="ACCT",
        allele="",
        suffix="G")
    eq_(variant_reads[0], expected)
def test_variant_sequence_overlaps():
    """
    Check left_overlaps between AAA|GG|TT and AA|GG|TT in both directions
    for min_overlap_size 1 through 6, and that a min_overlap_size of 7
    rules out the overlap.
    """
    # AAA|GG|TT
    vs_3A = VariantSequence(
        prefix="AAA",
        alt="GG",
        suffix="TT",
        reads=[AlleleRead(prefix="AAA", allele="GG", suffix="TT", name="1")])
    # AA|GG|TT
    vs_2A = VariantSequence(
        prefix="AA",
        alt="GG",
        suffix="TT",
        reads=[AlleleRead(prefix="AA", allele="GG", suffix="TT", name="1")])

    for min_overlap_size in range(1, 7):
        overlaps_3A_onto_2A = vs_3A.left_overlaps(
            vs_2A, min_overlap_size=min_overlap_size)
        assert overlaps_3A_onto_2A, \
            "Expected %s to overlap %s from left (min overlap size=%d)" % (
                vs_3A, vs_2A, min_overlap_size)

        overlaps_2A_onto_3A = vs_2A.left_overlaps(
            vs_3A, min_overlap_size=min_overlap_size)
        assert not overlaps_2A_onto_3A, \
            "Expected %s to not overlap %s from left (min overlap size=%d)" % (
                vs_2A, vs_3A, min_overlap_size)

    # a required overlap of 7 is expected to be too large for these sequences
    assert not vs_3A.left_overlaps(vs_2A, min_overlap_size=7), \
        "Unexpected overlap between %s and %s for min_overlap_size=7" % (
            vs_3A, vs_2A)
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        cdna_suffix="AGGAGCCGCAGTCAGAT",
        n_bad_nucleotides_at_start=0,
        mismatches_before_variant=0,
        mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
        reference_context_size=3):
    """
    Build the inputs and the expected result for matching a variant sequence
    against a reference context, using a chr17:7,676,591 C>T variant on the
    TP53-201 transcript.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    cdna_suffix : str
        Transcript nucleotides after the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference
        context.

    mismatches_before_variant : int
        Expected number of nucleotide mismatches in the result before
        the variant locus.

    mismatches_after_variant : int
        Expected number of nucleotide mismatches in the result after
        the variant locus.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame), where the last element is the expected
    value built from the arguments above.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17:7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # sanity check that the variant has the effect described above
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")

    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")

    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    def _last_prefix_nucleotides(sequence):
        # A plain `sequence[-prefix_length:]` would return the WHOLE sequence
        # when prefix_length is 0 (because -0 == 0), so that case is handled
        # explicitly to yield an empty string instead.
        return sequence[-prefix_length:] if prefix_length else ""

    reference_coding_sequence_key = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=reference_context_size)
    assert isinstance(
        reference_coding_sequence_key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantSequenceInReadingFrame(
        cdna_sequence=(
            _last_prefix_nucleotides(cdna_prefix) + cdna_alt + cdna_suffix),
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=_last_prefix_nucleotides("ATG"),
        reference_cdna_sequence_after_variant=(
            "AGGAGCCGCAGTCAGAT"[:reference_context_size]),
        number_mismatches_before_variant=mismatches_before_variant,
        number_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def test_allele_read_from_single_read_at_locus_trim_N_nucleotides():
    """
    AlleleRead.from_locus_read on a read with prefix "NCCN" and suffix
    "TNNA" should produce an empty prefix and the suffix "T".
    """
    locus_read = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA")
    result = AlleleRead.from_locus_read(locus_read, n_ref=1)
    print(result)
    eq_(
        result,
        AlleleRead(prefix="", allele="A", suffix="T", name="dummy"))