def test_partitioned_read_sequences_deletion():
    """
    Check that a read supporting the deletion chr1:4 TT>T is partitioned
    into the expected prefix / allele / suffix, where the sequence for chr1
    is assumed to be "ACCTTG".
    """
    # assumed reference sequence of the mock contig: "ACCTTG"
    contig = "chromosome"
    deletion = Variant(
        contig,
        4,
        "TT",
        "T",
        grch38,
        normalize_contig_name=False)
    pysam_read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    mock_bam = MockAlignmentFile(references=(contig,), reads=[pysam_read])
    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        alignment_file=mock_bam,
        variant=deletion)
    print(supporting_reads)
    assert len(supporting_reads) == 1
    actual_read = supporting_reads[0]
    # the expected allele is the empty string, flanked by "ACCT" and "G"
    expected_read = AlleleRead(
        name=pysam_read.qname,
        prefix="ACCT",
        allele="",
        suffix="G")
    eq_(actual_read, expected_read)
def test_locus_reads_insertion():
    """
    Check the LocusRead produced for a read carrying the insertion
    chr1:4 T>TG, where the sequence for chr1 is assumed to be "ACCTTG" and
    the variant sequence is "ACCTGTG".
    """
    insertion = Variant("1", 4, ref="T", alt="TG")
    aligned_read = make_pysam_read(
        seq="ACCTGTG",
        cigar="4M1I2M",
        mdtag="6")
    mock_bam = MockAlignmentFile(
        references={"chromosome"},
        reads=[aligned_read])
    collector = ReadCollector()
    locus_reads = collector.get_locus_reads(
        mock_bam,
        "chromosome",
        insertion.start,
        insertion.start)
    print(locus_reads)
    n_reads = len(locus_reads)
    assert n_reads == 1, (
        "Expected to get back one read but instead got %d" % (n_reads,))
    actual = locus_reads[0]
    # the inserted nucleotide has no corresponding reference position,
    # hence the None in the expected reference positions
    expected_positions = [0, 1, 2, 3, None, 4, 5]
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=expected_positions,
        quality_scores=aligned_read.query_qualities,
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=5,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=4)
    print("Actual: %s" % (actual, ))
    print("Expected: %s" % (expected, ))
    assert_equal_fields(actual, expected)
def test_locus_reads_deletion():
    """
    Check the LocusRead produced for a read carrying the deletion
    chr1:4 TT>T, where the sequence for chr1 is assumed to be "ACCTTG".
    """
    # normalization of this variant will turn it into the deletion of
    # "T" at base-1 position 5
    deletion = Variant("1", 4, ref="TT", alt="T")
    aligned_read = make_pysam_read(
        seq="ACCTG",
        cigar="4M1D1M",
        mdtag="4^T1")
    mock_bam = MockAlignmentFile(
        references={"chromosome"},
        reads=[aligned_read])
    collector = ReadCollector()
    locus_reads = collector.get_locus_reads(
        mock_bam,
        "chromosome",
        deletion.start - 1,
        deletion.start)
    print(locus_reads)
    n_reads = len(locus_reads)
    assert n_reads == 1, (
        "Expected to get back one read but instead got %d" % (n_reads,))
    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 5],
        quality_scores=aligned_read.query_qualities,
        # the missing base would have gone after the 4th nucleotide in the
        # read, so the expected read interval is empty
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=4,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=5)
    assert_equal_fields(actual, expected)
def test_locus_reads_snv():
    """
    Check the LocusRead produced for a read carrying the SNV chr1:4 T>G,
    where the sequence for chr1 is assumed to be "ACCTTG".
    """
    # assumed reference sequence: "ACCTTG"
    snv = Variant("1", 4, ref="T", alt="G")
    aligned_read = make_pysam_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    mock_bam = MockAlignmentFile(
        references=("chromosome", ),
        reads=[aligned_read])
    collector = ReadCollector()
    locus_reads = collector.get_locus_reads(
        mock_bam,
        "chromosome",
        snv.start - 1,
        snv.start)
    print(locus_reads)
    n_reads = len(locus_reads)
    assert n_reads == 1, (
        "Expected to get back one read but instead got %d" % (n_reads,))
    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        reference_base0_start_inclusive=3,
        reference_base0_end_exclusive=4,
        read_base0_start_inclusive=3,
        read_base0_end_exclusive=4)
    assert_equal_fields(actual, expected)
def test_locus_reads_substitution_shorter():
    """
    Check the LocusRead for a CC>G substitution at the 2nd and 3rd
    nucleotides of the reference sequence "ACCTTG".
    """
    # the alignment is interpreted as a C>G variant followed by the
    # deletion of a C
    substitution = Variant("1", 2, ref="CC", alt="G")
    print(substitution)
    aligned_read = make_pysam_read(
        seq="AGTTG",
        cigar="2M1D3M",
        mdtag="1C^C4")
    mock_bam = MockAlignmentFile(
        references={"chromosome"},
        reads=[aligned_read])
    collector = ReadCollector()
    locus_reads = collector.get_locus_reads(mock_bam, "chromosome", 1, 3)
    n_reads = len(locus_reads)
    assert n_reads == 1, (
        "Expected to get back one read but instead got %d" % (n_reads,))
    print(locus_reads)
    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=2,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=3)
    assert_equal_fields(actual, expected)
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Helper which loads a pair of VCFs and a tumor RNA BAM, then uses them to
    build a DataFrame of variant protein sequences.

    Returns a tuple of (DataFrame, variants loaded from `expressed_vcf`,
    combined collection of variants from both VCFs).
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    # expressed variants come first, followed by the not-expressed ones
    all_variants = list(expressed_variants)
    all_variants.extend(not_expressed_variants)
    combined_variants = VariantCollection(all_variants)

    rna_alignments = load_bam(tumor_rna_bam)
    collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    evidence_generator = collector.read_evidence_generator(
        variants=combined_variants,
        alignment_file=rna_alignments)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    sequences_generator = (
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence_generator))
    df = protein_sequences_generator_to_dataframe(sequences_generator)
    return df, expressed_variants, combined_variants
def test_locus_reads_substitution_longer():
    """
    Check the LocusRead for a C>GG substitution at the second nucleotide of
    the reference sequence "ACCTTG".
    """
    # the alignment is interpreted as a C>G variant followed by an
    # insertion of another G
    substitution = Variant("1", 2, ref="C", alt="GG")
    print(substitution)
    aligned_read = make_pysam_read(
        seq="AGGCTTG",
        cigar="2M1I4M",
        mdtag="1C4")
    mock_bam = MockAlignmentFile(
        references={"chromosome"},
        reads=[aligned_read])
    collector = ReadCollector()
    locus_reads = collector.get_locus_reads(mock_bam, "chromosome", 1, 2)
    print(locus_reads)
    n_reads = len(locus_reads)
    assert n_reads == 1, (
        "Expected to get back one read but instead got %d" % (n_reads,))
    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=3,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=2)
    assert_equal_fields(actual, expected)
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting the SNV chr12:65857041 G>C and check that
    every assembled sequence carries the alt allele, and that sequences
    backed by more than one read are longer than the longest read.
    """
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    snv = Variant(
        contig="chr12",
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    collector = ReadCollector()
    variant_reads = collector.allele_reads_supporting_variant(
        variant=snv,
        alignment_file=bam)
    initial_sequences = initial_variant_sequences_from_reads(variant_reads)
    sequences = iterative_overlap_assembly(
        initial_sequences,
        min_overlap_size=30)
    assert len(sequences) > 0
    max_read_length = max(map(len, variant_reads))
    for assembled in sequences:
        summary = "%s%s%s weight=%d length=%d" % (
            assembled.prefix,
            assembled.alt,
            assembled.suffix,
            len(assembled.reads),
            len(assembled.sequence))
        print(summary)
        eq_(assembled.alt, alt)
        if len(assembled.read_names) <= 1:
            continue
        # expect sequences supported by more than one read to be greater
        # than the read length
        assert len(assembled) > max_read_length, (
            "Expected assembled sequences to be longer than read length (%d)" % (
                max_read_length,))
def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Check the number of reads supporting 14:105849746 G>A in the normal,
    tumor and RNA BAMs (0, 8 and 2 respectively), and that the RNA reads
    are the expected ones.
    """
    variant = Variant("14", 105849746, "G", "A", grch38)
    data_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_bam = load_bam(
        data_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = load_bam(
        data_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = load_bam(
        data_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")
    collector = ReadCollector()

    normal_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=normal_bam)
    eq_(len(normal_supporting), 0)
    print(normal_supporting)

    tumor_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=tumor_bam)
    print(tumor_supporting)
    eq_(len(tumor_supporting), 8)

    rna_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=rna_bam)
    print(rna_supporting)
    eq_(len(rna_supporting), 2)

    # Arun went through the hassle of pulling out the exact read names
    # in IGV
    expected_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for rna_read in rna_supporting:
        assert rna_read.name in expected_rna_read_names
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    """
    Check that a minimum mapping quality of 256 yields an empty DataFrame of
    protein sequences for the B16 data.
    """
    # since the B16 BAM has all MAPQ=255 values then all the reads should
    # get dropped if we set the minimum quality to 256
    b16_variants = load_vcf("data/b16.f10/b16.vcf")
    b16_bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    collector = ReadCollector(min_mapping_quality=256)
    evidence_generator = collector.read_evidence_generator(
        variants=b16_variants,
        alignment_file=b16_bam)
    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=1)
    sequences_generator = (
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence_generator))
    df = protein_sequences_generator_to_dataframe(sequences_generator)
    print(df)
    failure_message = "Expected 0 entries, got %d: %s" % (len(df), df)
    eq_(len(df), 0, failure_message)
def test_protein_sequence_creator_protein_length():
    """
    Check that every amino acid sequence in the resulting DataFrame has
    exactly the requested `protein_sequence_length`, for lengths 21, 15
    and 10.
    """
    b16_variants = load_vcf("data/b16.f10/b16.vcf")
    b16_bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    collector = ReadCollector()
    for desired_length in (21, 15, 10):
        sequence_creator = ProteinSequenceCreator(
            max_protein_sequences_per_variant=1,
            protein_sequence_length=desired_length)
        # a fresh evidence generator is requested for each length
        evidence_generator = collector.read_evidence_generator(
            variants=b16_variants,
            alignment_file=b16_bam)
        sequences_generator = (
            sequence_creator.protein_sequences_from_read_evidence_generator(
                evidence_generator))
        df = protein_sequences_generator_to_dataframe(sequences_generator)
        print(df)
        amino_acid_sequences = df["amino_acids"]
        print(amino_acid_sequences)
        observed_lengths = amino_acid_sequences.str.len()
        lengths_match = observed_lengths == desired_length
        assert lengths_match.all(), (observed_lengths, )
def test_sequence_counts_snv():
    """
    Check that the reads supporting chr12:65857041 G>C give exactly one
    variant sequence when the preferred sequence length is 61, with the alt
    allele flanked by a 30-character prefix and a 30-character suffix.
    """
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    snv = Variant("chr12", 65857041, "G", alt, grch38)
    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        alignment_file=bam,
        variant=snv)
    sequence_creator = VariantSequenceCreator(preferred_sequence_length=61)
    variant_sequences = sequence_creator.reads_to_variant_sequences(
        variant=snv,
        reads=supporting_reads)
    assert len(variant_sequences) == 1
    for candidate in variant_sequences:
        print(candidate)
        eq_(candidate.alt, alt)
        eq_(len(candidate.prefix), 30)
        eq_(len(candidate.suffix), 30)
        # the three parts must concatenate to the full sequence
        reassembled = candidate.prefix + candidate.alt + candidate.suffix
        eq_(reassembled, candidate.sequence)
def test_somatic_variant_with_0_supporting_rna_reads():
    """
    Check the number of reads supporting 6:90411765 G>A in the normal,
    tumor and RNA BAMs (0, 5 and 0 respectively).
    """
    variant = Variant("6", 90411765, "G", "A", grch38)
    data_dir = "data/somatic-variant-with-0-supporting-rna-reads/"
    normal_bam = load_bam(data_dir + "normal.6.90411765.G.A.sorted.bam")
    tumor_bam = load_bam(data_dir + "tumor.6.90411765.G.A.sorted.bam")
    rna_bam = load_bam(data_dir + "rna.6.90411765.G.A.sorted.bam")
    collector = ReadCollector()

    normal_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=normal_bam)
    eq_(len(normal_supporting), 0)
    print(normal_supporting)

    tumor_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=tumor_bam)
    print(tumor_supporting)
    eq_(len(tumor_supporting), 5)

    rna_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=rna_bam)
    print(rna_supporting)
    eq_(len(rna_supporting), 0)