def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
    """
    For several requested protein sequence lengths, check that the
    resulting dataframe has one row per variant in the expressed VCF and
    that every amino acid string has exactly the requested length.
    """
    expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
    parser = make_protein_sequences_arg_parser()
    parser.print_help()
    for desired_length in range(9, 20, 3):
        cli_args = [
            "--vcf", data_path("data/b16.f10/b16.vcf"),
            "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
            "--max-protein-sequences-per-variant", "1",
            "--protein-sequence-length", str(desired_length),
        ]
        df = protein_sequences_dataframe_from_args(parser.parse_args(cli_args))

        # one protein sequence is requested per variant, so the row count
        # is compared against the number of expressed variants
        n_rows = len(df)
        n_expected = len(expressed_variants)
        failure_message = (
            "Expected %d entries for protein_sequence_length=%d, got %d results: %s" % (
                n_expected,
                desired_length,
                n_rows,
                df))
        eq_(n_rows, n_expected, failure_message)

        protein_sequences = df["amino_acids"]
        print(protein_sequences)
        protein_sequence_lengths = protein_sequences.str.len()
        has_desired_length = protein_sequence_lengths == desired_length
        assert has_desired_length.all(), (protein_sequence_lengths,)
def test_isovar_result_str():
    """
    The string form of every result should be non-empty and look like
    "IsovarResult(...)".
    """
    results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for result in results:
        text = str(result)
        assert len(text) > 0
        assert text.startswith("IsovarResult(")
        assert text.endswith(")")
def test_isovar_main_to_dataframe():
    """
    Convert the results for the B16 test data into a dataframe and check
    both the row count and the number of rows passing all filters.
    """
    vcf_path = data_path("data/b16.f10/b16.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    df = isovar_results_to_dataframe(
        run_isovar(variants=vcf_path, alignment_file=bam_path))
    print(df)
    eq_(len(df), 4)

    # B16 test data has 2/4 variants with enough coverage
    # to translate protein sequences
    n_passing = df["passes_all_filters"].sum()
    eq_(n_passing, 2)
def test_isovar_result_property_types():
    """
    Check the exact Python type of each property read from the results
    yielded by run_isovar on the B16 test data.

    Properties sharing an expected type are listed in tuples and checked
    in a loop; the groups are visited in a fixed order for every result.
    """
    # counts of genes and transcripts from variant
    variant_count_names = (
        "num_overlapping_genes",
        "num_overlapping_coding_genes",
        "num_overlapping_transcripts",
        "num_overlapping_coding_transcripts",
    )
    # counts of genes and transcripts from protein sequences
    protein_sequence_count_names = (
        "num_genes_from_protein_sequences",
        "num_genes_from_top_protein_sequence",
        "num_transcripts_from_protein_sequences",
        "num_transcripts_from_top_protein_sequence",
    )
    # read and fragment counts
    read_and_fragment_count_names = (
        "num_ref_reads",
        "num_alt_reads",
        "num_other_reads",
        "num_ref_fragments",
        "num_alt_fragments",
        "num_other_fragments",
    )
    # read and fragment fractions
    fraction_names = (
        "fraction_ref_reads",
        "fraction_alt_reads",
        "fraction_other_reads",
        "fraction_ref_fragments",
        "fraction_alt_fragments",
        "fraction_other_fragments",
    )
    # read and fragment count ratios
    ratio_names = (
        "ratio_alt_to_other_reads",
        "ratio_alt_to_other_fragments",
        "ratio_other_to_alt_fragments",
        "ratio_other_to_alt_reads",
        "ratio_ref_to_other_fragments",
        "ratio_other_to_ref_fragments",
        "ratio_other_to_ref_reads",
    )

    results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for result in results:
        # variant
        assert type(result.variant) is Variant

        for name in variant_count_names:
            assert type(getattr(result, name)) is int

        # protein sequence
        assert type(result.top_protein_sequence) in (type(None), ProteinSequence)

        for name in protein_sequence_count_names:
            assert type(getattr(result, name)) is int

        for name in read_and_fragment_count_names:
            assert type(getattr(result, name)) is int

        for name in fraction_names:
            assert type(getattr(result, name)) is float

        for name in ratio_names:
            assert type(getattr(result, name)) is float

        # this property aggregates all filters
        assert result.passes_all_filters in {True, False}

        assert type(result.protein_sequence_mutation_start) in (int, type(None))
        assert type(result.protein_sequence_mutation_end) in (int, type(None))
def test_locus_reads_dataframe():
    """
    Compare the number of rows returned by locus_reads_dataframe for the
    chr4:45802539 locus against the number of read lines in the SAM file
    for that single variant (minus one read known to be skipped).
    """
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")

    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    with open(sam_path_single_variant) as f:
        # lines starting with "HWI" are counted as reads
        n_read_lines = sum(1 for line in f if line.startswith("HWI"))

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_reads_expected = n_read_lines - 1

    print("Found %d sequences in %s" % (n_reads_expected, sam_path_single_variant))

    locus_kwargs = dict(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    df = locus_reads_dataframe(**locus_kwargs)
    print(df)
    eq_(len(df), n_reads_expected)
def test_locus_reads_dataframe():
    """
    The dataframe of reads at chr4:45802539 should contain one row per
    "HWI" line of the single-variant SAM file, except for one read which
    is known to be dropped.
    """
    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")

    with open(sam_path_single_variant) as f:
        read_lines = [line for line in f if line.startswith("HWI")]

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_reads_expected = len(read_lines) - 1

    print("Found %d sequences in %s" % (n_reads_expected, sam_path_single_variant))

    df = locus_reads_dataframe(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    print(df)
    n_rows = len(df)
    eq_(n_rows, n_reads_expected)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Check the mutant amino acids of the top protein sequence for each
    variant in the Wdr13 VCF.
    """
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    cli_args = [
        "--vcf", data_path("data/b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ]
    args = make_protein_sequences_arg_parser().parse_args(cli_args)
    for variant, protein_sequences in protein_sequences_generator_from_args(args):
        top_protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(variant, top_protein_sequence)
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    """
    Check that the top protein sequence for each variant in the Phip VCF
    has "H" as its mutant amino acids.
    """
    # the variant chr9:82927102 G>T occurs right next to T>G so the varcode
    # prediction for the protein sequence (Asparagine) will be wrong since
    # the correct translation is Histidine
    cli_args = [
        "--vcf", data_path("data/b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ]
    args = make_protein_sequences_arg_parser().parse_args(cli_args)
    for variant, protein_sequences in protein_sequences_generator_from_args(args):
        top_protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(
            variant,
            top_protein_sequence,
            expected_amino_acids="H")
def test_isovar_result_nonsyn_variants():
    """
    For each result on the B16 test data, check that the amino acid
    mismatch properties are populated when a mutant protein sequence was
    found in the RNA and are None otherwise.
    """
    results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for result in results:
        print(result.variant)
        print(result.predicted_effect)

        if not result.has_mutant_protein_sequence_from_rna:
            # without a mutant protein sequence every derived property
            # is expected to be None
            assert result.num_amino_acid_mismatches_from_predicted_effect is None
            assert result.num_amino_acid_mismatches_from_reference is None
            assert result.protein_sequence_matches_predicted_mutation_effect is None
            assert result.protein_sequence_matches_reference is None
            assert result.protein_sequence_contains_mutation is None
            continue

        assert result.num_amino_acid_mismatches_from_predicted_effect is not None
        assert result.num_amino_acid_mismatches_from_reference is not None
        assert result.num_amino_acid_mismatches_from_reference > 0
        eq_(result.protein_sequence_matches_reference, False)
        eq_(result.protein_sequence_contains_mutation, True)
import tempfile
from os import remove
from os.path import getsize, exists
from testing_helpers import data_path
from isovar.cli.isovar_translations import run as isovar_translations
from isovar.cli.isovar_allele_counts import run as isovar_allele_counts
from isovar.cli.isovar_allele_reads import run as isovar_allele_reads
from isovar.cli.isovar_protein_sequences import run as isovar_protein_sequences
from isovar.cli.isovar_reference_contexts import run as isovar_reference_contexts
from isovar.cli.isovar_variant_reads import run as isovar_variant_reads
from isovar.cli.isovar_variant_sequences import run as isovar_variant_sequences
from isovar.cli.isovar_main import run as isovar_main

# Command-line arguments naming only the B16 test VCF
vcf_args = ["--vcf", data_path("data/b16.f10/b16.vcf")]

# The same arguments extended with the B16 test BAM
args_with_bam = vcf_args + [
    "--bam", data_path("data/b16.f10/b16.combined.sorted.bam")
]


def run_cli_fn(fn, include_bam_in_args=True):
    """
    Build a command-line argument list whose --output points at a newly
    created temporary file.

    Parameters
    ----------
    fn
        Presumably one of the CLI `run` functions imported above; it is
        not used in the portion of the body visible here -- TODO confirm.

    include_bam_in_args : bool
        If True the arguments start from `args_with_bam`, otherwise from
        `vcf_args`.

    NOTE(review): the visible body ends after `args` is assembled, without
    calling `fn` or using `getsize`/`remove`; the remainder of this
    function may be missing from this view.
    """
    # delete=False keeps the file on disk after the handle is closed, so
    # only its path is retained here
    with tempfile.NamedTemporaryFile(delete=False) as f:
        output_path = f.name
    # comparisons bind tighter than `not`, so this parses as
    # `not (exists(output_path) == 0)`, i.e. it asserts the file exists
    assert not exists(output_path) == 0
    output_args = ["--output", output_path]
    if include_bam_in_args:
        args = args_with_bam + output_args
    else:
        args = vcf_args + output_args
def test_isovar_result_clone_with_updates():
    """
    A clone whose variant field was replaced with None should not compare
    equal to the result it was cloned from.
    """
    results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for original in results:
        updated = original.clone_with_updates(variant=None)
        assert original != updated
def test_isovar_result_clone():
    """
    A plain clone of a result should compare equal to the original.
    """
    results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for original in results:
        duplicate = original.clone()
        eq_(original, duplicate)