def test_somatic_variant_with_2_supporting_rna_reads():
    # Counts the reads supporting the SNV 14:105849746 G>A in three BAMs and
    # expects none in the normal sample, 8 in the tumor sample and exactly 2
    # (with known read names) in the RNA sample.
    variant = Variant("14", 105849746, "G", "A", grch38)
    base_dir = "data/somatic-variant-with-2-supporting-rna-reads/"

    def open_fixture(filename):
        # all three BAM fixtures live under the same directory
        return load_bam(base_dir + filename)

    normal_bam = open_fixture("normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = open_fixture("tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = open_fixture("rna.14.105849746.G.A.2-alt.sorted.bam")
    collector = ReadCollector()

    normal_alt_reads = collector.allele_reads_supporting_variant(
        alignment_file=normal_bam,
        variant=variant)
    eq_(len(normal_alt_reads), 0)
    print(normal_alt_reads)

    tumor_alt_reads = collector.allele_reads_supporting_variant(
        alignment_file=tumor_bam,
        variant=variant)
    print(tumor_alt_reads)
    eq_(len(tumor_alt_reads), 8)

    rna_alt_reads = collector.allele_reads_supporting_variant(
        alignment_file=rna_bam,
        variant=variant)
    print(rna_alt_reads)
    eq_(len(rna_alt_reads), 2)

    # The two supporting RNA read names were pulled out by hand (by Arun)
    # from IGV.
    known_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for rna_read in rna_alt_reads:
        assert rna_read.name in known_rna_read_names
def test_somatic_variant_with_2_supporting_rna_reads():
    # The SNV 14:105849746 G>A is expected to have no supporting reads in the
    # normal BAM, 8 in the tumor BAM and 2 in the RNA BAM; the two RNA reads
    # must be among the hand-verified read names listed below.
    variant = Variant("14", 105849746, "G", "A")
    base_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_bam = load_bam(
        base_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = load_bam(
        base_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = load_bam(
        base_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")

    # normal sample: no read should carry the alt allele
    reads_in_normal = reads_supporting_variant(
        samfile=normal_bam, variant=variant)
    eq_(len(reads_in_normal), 0)
    print(reads_in_normal)

    # tumor sample
    reads_in_tumor = reads_supporting_variant(
        samfile=tumor_bam, variant=variant)
    print(reads_in_tumor)
    eq_(len(reads_in_tumor), 8)

    # tumor RNA
    reads_in_rna = reads_supporting_variant(
        samfile=rna_bam, variant=variant)
    print(reads_in_rna)
    eq_(len(reads_in_rna), 2)

    # Exact read names, extracted manually in IGV by Arun.
    names_seen_in_igv = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for supporting_read in reads_in_rna:
        assert supporting_read.name in names_seen_in_igv
def test_most_common_nucleotides_for_chr12_deletion():
    # Builds a consensus from reads supporting a 19bp deletion on chr12 and
    # checks that (a) the count vectors line up with the consensus length,
    # (b) the chosen nucleotides outweigh the alternatives, and (c) more than
    # half of the reads are substrings of the consensus.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    variant = Variant(
        chromosome,
        70091490,
        "TTGTAGATGCTGCCTCTCC",
        "",
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        chromosome=chromosome,
        samfile=samfile)

    consensus_sequence, chosen_counts, other_counts = \
        most_common_nucleotides(variant_reads)
    print(chosen_counts)
    print(other_counts)

    eq_(len(chosen_counts), len(consensus_sequence))
    eq_(len(other_counts), len(consensus_sequence))
    assert other_counts.sum() < chosen_counts.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    # a read "matches" when its full sequence (prefix + allele + suffix)
    # occurs inside the consensus
    n_matching = sum(
        1
        for read in variant_reads
        if (read.prefix + read.allele + read.suffix) in consensus_sequence)
    n_total = len(variant_reads)
    fraction_matching = n_matching / float(n_total)
    print("Fraction matching reads is %d/%d = %f" % (
        n_matching, n_total, fraction_matching))
    assert fraction_matching > 0.5, \
        "Expected majority of reads to match consensus sequence"
def test_sequence_counts_snv():
    # For the SNV chr12:65857041 G>C with a preferred sequence length of 61,
    # expects exactly one variant sequence whose prefix and suffix are 30
    # bases each and whose parts concatenate to its full sequence.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    variant = Variant(chromosome, 65857041, "G", alt)
    supporting_reads = reads_supporting_variant(
        variant=variant,
        chromosome=chromosome,
        samfile=samfile)
    variant_sequences = reads_to_variant_sequences(
        reads=supporting_reads,
        variant=variant,
        preferred_sequence_length=61)

    assert len(variant_sequences) == 1
    for vs in variant_sequences:
        print(vs)
        eq_(vs.alt, alt)
        eq_(len(vs.prefix), 30)
        eq_(len(vs.suffix), 30)
        reassembled = vs.prefix + vs.alt + vs.suffix
        eq_(reassembled, vs.sequence)
def test_assemble_transcript_fragments_snv():
    # Assembles the reads supporting chr12:65857041 G>C with a minimum overlap
    # of 30 and expects at least one assembled sequence, each carrying the alt
    # allele and each longer than the longest input read.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=variant)

    initial_sequences = initial_variant_sequences_from_reads(variant_reads)
    assembled = iterative_overlap_assembly(
        initial_sequences,
        min_overlap_size=30)
    assert len(assembled) > 0

    longest_read = max(len(read) for read in variant_reads)
    for seq in assembled:
        summary = "%s%s%s weight=%d length=%d" % (
            seq.prefix, seq.alt, seq.suffix, len(seq.reads), len(seq.sequence))
        print(summary)
        eq_(seq.alt, alt)
        assert len(seq) > longest_read, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                longest_read,)
def test_group_unique_sequences():
    # Groups the reads supporting chr12:65857041 G>C by sequence (prefix and
    # suffix both capped at 30) and expects fewer groups than reads.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        chromosome=chromosome,
        samfile=samfile)
    print("%d variant reads: %s" % (len(variant_reads), variant_reads))

    unique_groups = group_unique_sequences(
        variant_reads,
        max_prefix_size=30,
        max_suffix_size=30)
    print("%d unique sequences: %s" % (len(unique_groups), unique_groups))

    # Some of the reads are redundant, so de-duplication must shrink the
    # collection.
    assert len(variant_reads) > len(unique_groups)
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Test helper: turn two VCFs plus a tumor RNA BAM into a DataFrame of
    protein sequences.

    The variants of both VCFs are merged (expressed ones first) into a single
    VariantCollection, read evidence for them is collected from the BAM using
    `min_mapping_quality`, and the resulting protein sequences are converted
    to a DataFrame.

    Returns a tuple of (DataFrame, variants loaded from `expressed_vcf`,
    combined VariantCollection).
    """
    expressed_variants = load_vcf(expressed_vcf)
    silent_variants = load_vcf(not_expressed_vcf)

    merged = list(expressed_variants)
    merged.extend(silent_variants)
    combined_variants = VariantCollection(merged)

    rna_alignments = load_bam(tumor_rna_bam)
    collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    evidence = collector.read_evidence_generator(
        alignment_file=rna_alignments,
        variants=combined_variants)

    sequence_creator = ProteinSequenceCreator(
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant)
    df = protein_sequences_generator_to_dataframe(
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence))
    return df, expressed_variants, combined_variants
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Test helper: load a pair of VCFs and a tumor RNA BAM and produce a
    DataFrame of protein sequences for the union of their variants.

    Reads overlapping the combined variants are gathered with
    `min_mapping_quality`, then translated into protein sequences using the
    two remaining options.

    Returns a tuple of (DataFrame, variants loaded from `expressed_vcf`,
    combined VariantCollection).
    """
    expressed_variants = load_vcf(expressed_vcf)
    silent_variants = load_vcf(not_expressed_vcf)

    # expressed variants come first in the merged collection
    merged = list(expressed_variants)
    merged.extend(silent_variants)
    combined_variants = VariantCollection(merged)

    rna_samfile = load_bam(tumor_rna_bam)
    overlapping_reads = reads_overlapping_variants(
        samfile=rna_samfile,
        variants=combined_variants,
        min_mapping_quality=min_mapping_quality)
    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    return df, expressed_variants, combined_variants
def test_assemble_transcript_fragments_snv():
    # Assembles the reads supporting chr12:65857041 G>C with a minimum overlap
    # of 30. Every assembled sequence must carry the alt allele, and those
    # backed by more than one read must be longer than the longest read.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=variant)

    seed_sequences = initial_variant_sequences_from_reads(variant_reads)
    assembled = iterative_overlap_assembly(seed_sequences, min_overlap_size=30)
    assert len(assembled) > 0

    longest_read = max(len(read) for read in variant_reads)
    for seq in assembled:
        print("%s%s%s weight=%d length=%d" % (
            seq.prefix, seq.alt, seq.suffix, len(seq.reads), len(seq.sequence)))
        eq_(seq.alt, alt)
        if len(seq.read_names) <= 1:
            # a sequence supported by a single read cannot have been
            # extended, so the length check does not apply to it
            continue
        assert len(seq) > longest_read, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                longest_read,)
def test_locus_reads_dataframe():
    # The number of rows returned by locus_reads_dataframe for the locus
    # around chr4:45802539 must equal the number of reads in the
    # single-variant SAM file, minus one known unusable read.
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")
    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")

    # read records in the SAM file are the lines starting with "HWI"
    with open(sam_path_single_variant) as sam_file:
        n_read_lines = sum(
            1 for sam_line in sam_file if sam_line.startswith("HWI"))

    # Inspecting the file shows that *one* read overlapping this variant has
    # a CIGAR string of N at the location before the variant, so that read
    # will be missing from the result.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start
    # or end of an exon, since that won't have mapping positions on both its
    # left and right
    n_reads_expected = n_read_lines - 1
    print("Found %d sequences in %s" % (
        n_reads_expected, sam_path_single_variant))

    df = locus_reads_dataframe(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    print(df)
    eq_(len(df), n_reads_expected)
def test_locus_reads_dataframe():
    # Compares the row count of locus_reads_dataframe (queried on the combined
    # BAM between chr4 positions 45802538 and 45802540) against a read count
    # taken directly from a SAM file holding only this variant's reads.
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")
    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")

    with open(sam_path_single_variant) as handle:
        hwi_flags = [text.startswith("HWI") for text in handle]
    # Lines starting with "HWI" are the read records. Subtract one because
    # inspection of the file shows *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before it, and thus
    # that read will be missing.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start
    # or end of an exon, since that won't have mapping positions on both its
    # left and right
    n_reads_expected = hwi_flags.count(True) - 1

    print("Found %d sequences in %s" % (
        n_reads_expected, sam_path_single_variant))
    locus_query = dict(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    df = locus_reads_dataframe(**locus_query)
    print(df)
    eq_(len(df), n_reads_expected)
def test_partition_variant_reads_deletion():
    # For a 19bp deletion at chr12:70091490, expects more than one supporting
    # read, each reporting the (empty) alt allele.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = ""
    variant = Variant(
        contig=chromosome,
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=alt,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        variant=variant,
        chromosome=chromosome,
        samfile=samfile)
    assert len(supporting_reads) > 1
    for read in supporting_reads:
        eq_(read.allele, alt)
def test_somatic_variant_with_0_supporting_rna_reads():
    # The SNV 6:90411765 G>A is expected to have no supporting reads in the
    # normal BAM, 5 in the tumor BAM and none in the RNA BAM.
    variant = Variant("6", 90411765, "G", "A")
    base_dir = "data/somatic-variant-with-0-supporting-rna-reads/"
    normal_bam = load_bam(base_dir + "normal.6.90411765.G.A.sorted.bam")
    tumor_bam = load_bam(base_dir + "tumor.6.90411765.G.A.sorted.bam")
    rna_bam = load_bam(base_dir + "rna.6.90411765.G.A.sorted.bam")

    # normal sample
    reads_in_normal = reads_supporting_variant(
        samfile=normal_bam, variant=variant)
    eq_(len(reads_in_normal), 0)
    print(reads_in_normal)

    # tumor sample
    reads_in_tumor = reads_supporting_variant(
        samfile=tumor_bam, variant=variant)
    print(reads_in_tumor)
    eq_(len(reads_in_tumor), 5)

    # tumor RNA: the variant is present in tumor DNA but has no RNA support
    reads_in_rna = reads_supporting_variant(
        samfile=rna_bam, variant=variant)
    print(reads_in_rna)
    eq_(len(reads_in_rna), 0)
def test_translate_variant_collection():
    # Translates every variant of the B16 VCF using the reads supporting it in
    # the combined BAM and expects exactly 4 translated variants.
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
    result = list(
        translate_variants(reads_supporting_variants(variants, samfile)))
    # The failure message must quote the count that is actually asserted;
    # previously it printed len(variants), which need not equal 4.
    n_expected = 4
    eq_(
        len(result),
        n_expected,
        "Expected %d translated variants but got %d: %s" % (
            n_expected,
            len(result),
            result))
def test_translate_variant_collection():
    # Collects read evidence for every variant of the B16 VCF from the
    # combined BAM, translates it, and expects exactly 4 translations.
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
    read_evidence_gen = ReadCollector().read_evidence_generator(
        variants,
        samfile)
    translation_gen = ProteinSequenceCreator().translate_variants(
        read_evidence_gen)
    translations = list(translation_gen)
    # The failure message must quote the count that is actually asserted;
    # previously it printed len(variants), which need not equal 4.
    n_expected = 4
    eq_(
        len(translations),
        n_expected,
        "Expected %d translated variants but got %d: %s" % (
            n_expected,
            len(translations),
            translations))
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    # Every read in the B16 BAM has MAPQ=255, so requiring a minimum mapping
    # quality of 256 should drop all reads and leave an empty DataFrame.
    variants = load_vcf("data/b16.f10/b16.vcf")
    alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam")

    strict_collector = ReadCollector(min_mapping_quality=256)
    evidence = strict_collector.read_evidence_generator(
        alignment_file=alignment_file,
        variants=variants)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=1)
    df = protein_sequences_generator_to_dataframe(
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence))
    print(df)

    n_rows = len(df)
    eq_(n_rows, 0, "Expected 0 entries, got %d: %s" % (n_rows, df))
def test_partition_variant_reads_deletion():
    # For a 19bp deletion at chr12:70091490, the collected read evidence
    # should contain more than one alt read, each reporting the (empty) alt
    # allele.
    alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = ""
    variant = Variant(
        contig="chr12",
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=alt,
        ensembl=ensembl_grch38)
    collector = ReadCollector()
    evidence = collector.read_evidence_for_variant(
        variant=variant,
        alignment_file=alignment_file)
    assert len(evidence.alt_reads) > 1
    for alt_read in evidence.alt_reads:
        eq_(alt_read.allele, alt)
def test_partition_variant_reads_snv():
    # For the SNV chr12:65857041 G>C, the collected read evidence should
    # contain more than one alt read, each reporting allele "C".
    alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    variant = Variant(
        contig="chr12",
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    collector = ReadCollector()
    evidence = collector.read_evidence_for_variant(
        variant=variant,
        alignment_file=alignment_file)
    supporting = evidence.alt_reads
    assert len(supporting) > 1
    for alt_read in supporting:
        eq_(alt_read.allele, alt)
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    # All MAPQ values in the B16 BAM are 255, so a minimum mapping quality of
    # 256 should filter out every read and yield an empty DataFrame.
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")

    overlapping_reads = reads_overlapping_variants(
        samfile=samfile,
        variants=variants,
        min_mapping_quality=256)
    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        max_protein_sequences_per_variant=1)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)

    n_rows = len(df)
    eq_(n_rows, 0, "Expected 0 entries, got %d: %s" % (n_rows, df))
def test_partition_variant_reads_deletion():
    # Reads supporting the 19bp deletion at chr12:70091490 should number more
    # than one, and each should report an empty allele.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    contig_name = "chr12"
    deleted_bases = "TTGTAGATGCTGCCTCTCC"
    alt = ""
    variant = Variant(
        contig=contig_name,
        start=70091490,
        ref=deleted_bases,
        alt=alt,
        ensembl=ensembl_grch38)
    deletion_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=contig_name)
    assert len(deletion_reads) > 1
    for deletion_read in deletion_reads:
        eq_(deletion_read.allele, alt)
def test_protein_sequence_creator_protein_length():
    # For each requested protein_sequence_length (21, 15, 10), every entry of
    # the resulting "amino_acids" column must have exactly that length.
    variants = load_vcf("data/b16.f10/b16.vcf")
    alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam")
    collector = ReadCollector()

    for target_length in (21, 15, 10):
        sequence_creator = ProteinSequenceCreator(
            protein_sequence_length=target_length,
            max_protein_sequences_per_variant=1)
        # read evidence is regenerated for each length
        evidence = collector.read_evidence_generator(
            alignment_file=alignment_file,
            variants=variants)
        df = protein_sequences_generator_to_dataframe(
            sequence_creator.protein_sequences_from_read_evidence_generator(
                evidence))
        print(df)

        amino_acid_column = df["amino_acids"]
        print(amino_acid_column)
        observed_lengths = amino_acid_column.str.len()
        # the lengths themselves are shown if the assertion fails
        assert (observed_lengths == target_length).all(), (observed_lengths, )
def test_sequence_counts_snv():
    # For the SNV chr12:65857041 G>C with a preferred sequence length of 61,
    # expects exactly one variant sequence whose prefix and suffix are 30
    # bases each and whose parts concatenate to its full sequence.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    variant = Variant("chr12", 65857041, "G", alt, grch38)

    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=samfile)

    sequence_builder = VariantSequenceCreator(preferred_sequence_length=61)
    variant_sequences = sequence_builder.reads_to_variant_sequences(
        reads=supporting_reads,
        variant=variant)

    assert len(variant_sequences) == 1
    for vs in variant_sequences:
        print(vs)
        eq_(vs.alt, alt)
        eq_(len(vs.prefix), 30)
        eq_(len(vs.suffix), 30)
        reassembled = vs.prefix + vs.alt + vs.suffix
        eq_(reassembled, vs.sequence)
def test_sequence_counts_snv():
    # Reads supporting chr12:65857041 G>C, converted with a preferred sequence
    # length of 61, should give a single variant sequence with 30 bases on
    # either side of the alt allele.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    contig_name = "chr12"
    alt = "C"
    variant = Variant(contig_name, 65857041, "G", alt)
    snv_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=contig_name)
    variant_sequences = reads_to_variant_sequences(
        preferred_sequence_length=61,
        variant=variant,
        reads=snv_reads)

    assert len(variant_sequences) == 1
    for candidate in variant_sequences:
        print(candidate)
        eq_(candidate.alt, alt)
        for flank in (candidate.prefix, candidate.suffix):
            eq_(len(flank), 30)
        # the three parts must concatenate back to the full sequence
        eq_(
            candidate.prefix + candidate.alt + candidate.suffix,
            candidate.sequence)