def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
    # For several values of --protein-sequence-length, build the protein
    # sequences DataFrame and check that (1) it has as many rows as the
    # expressed-variants VCF has entries and (2) every amino acid sequence
    # has exactly the requested length.
    expressed_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
    parser = make_protein_sequences_arg_parser()
    parser.print_help()
    # same values as range(9, 20, 3)
    for desired_length in (9, 12, 15, 18):
        cli_args = [
            "--vcf",
            data_path("data/b16.f10/b16.vcf"),
            "--bam",
            data_path("data/b16.f10/b16.combined.sorted.bam"),
            "--max-protein-sequences-per-variant",
            "1",
            "--protein-sequence-length",
            str(desired_length),
        ]
        df = protein_sequences_dataframe_from_args(
            parser.parse_args(cli_args))

        # with at most one protein sequence per variant, the row count
        # is compared against the number of expressed variants
        failure_message = (
            "Expected %d entries for protein_sequence_length=%d, got %d results: %s" % (
                len(expressed_variants),
                desired_length,
                len(df),
                df))
        eq_(len(df), len(expressed_variants), failure_message)

        amino_acid_column = df["amino_acids"]
        print(amino_acid_column)
        observed_lengths = amino_acid_column.str.len()
        has_desired_length = (observed_lengths == desired_length)
        assert has_desired_length.all(), (observed_lengths,)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # Two variants co-occur in the RNAseq data, but they do not fall in
    # the same codon, so the Varcode annotation is treated as correct here.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parser = make_protein_sequences_arg_parser()
    cli_args = [
        "--vcf",
        data_path("data/b16.f10/b16.f10.Wdr13.vcf"),
        "--bam",
        data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant",
        "1",
        "--protein-sequence-length",
        "15",
    ]
    args = parser.parse_args(cli_args)
    results = protein_sequences_generator_from_args(args)
    for variant, sequences_for_variant in results:
        # only the first protein sequence of each variant is checked
        check_mutant_amino_acids(variant, sequences_for_variant[0])
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    # chr9:82927102 G>T sits directly beside a T>G change, so the Varcode
    # prediction for the protein sequence (Asparagine) will be wrong;
    # the correct translation is Histidine.
    parser = make_protein_sequences_arg_parser()
    cli_args = [
        "--vcf",
        data_path("data/b16.f10/b16.f10.Phip.vcf"),
        "--bam",
        data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant",
        "1",
        "--protein-sequence-length",
        "15",
    ]
    args = parser.parse_args(cli_args)
    results = protein_sequences_generator_from_args(args)
    for variant, sequences_for_variant in results:
        # only the first protein sequence of each variant is checked,
        # and it must carry Histidine ("H") as the mutant amino acid
        first_sequence = sequences_for_variant[0]
        check_mutant_amino_acids(
            variant,
            first_sequence,
            expected_amino_acids="H")
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    # The G>T variant at chr9:82927102 is immediately adjacent to a T>G
    # change. The Varcode prediction for the protein sequence (Asparagine)
    # will therefore be wrong, since the correct translation is Histidine.
    parser = make_protein_sequences_arg_parser()
    vcf_path = data_path("data/b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    args = parser.parse_args([
        "--vcf", vcf_path,
        "--bam", bam_path,
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15",
    ])
    for variant, candidates in protein_sequences_generator_from_args(args):
        # check the first protein sequence against the expected "H"
        top_candidate = candidates[0]
        check_mutant_amino_acids(
            variant, top_candidate, expected_amino_acids="H")
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # The RNAseq data contains two co-occurring variants; because they are
    # not in the same codon, the Varcode annotation is considered correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parser = make_protein_sequences_arg_parser()
    vcf_path = data_path("data/b16.f10/b16.f10.Wdr13.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    args = parser.parse_args([
        "--vcf", vcf_path,
        "--bam", bam_path,
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15",
    ])
    for variant, candidates in protein_sequences_generator_from_args(args):
        # only the first protein sequence of each variant is checked
        top_candidate = candidates[0]
        check_mutant_amino_acids(variant, top_candidate)
RNAseq BAM from the same sample. Combine synonymous translations and assign a read count to each protein sequence. """ from __future__ import print_function, division, absolute_import import logging import logging.config import pkg_resources from isovar.cli.protein_sequences import ( make_protein_sequences_arg_parser, protein_sequences_dataframe_from_args ) logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_protein_sequences_arg_parser() parser.add_argument( "--output", default="isovar-translate-variants-results.csv", help="Name of CSV file which contains predicted sequences") if __name__ == "__main__": args = parser.parse_args() logger.info(args) df = protein_sequences_dataframe_from_args(args) logger.info(df) df.to_csv(args.output)
""" Translate non-synonymous coding variants into mutant protein sequences using an RNAseq BAM from the same sample. Combine synonymous translations and assign a read count to each protein sequence. """ from __future__ import print_function, division, absolute_import import logging import logging.config import pkg_resources from isovar.cli.protein_sequences import (make_protein_sequences_arg_parser, protein_sequences_dataframe_from_args ) logging.config.fileConfig( pkg_resources.resource_filename('isovar.cli', 'logging.conf')) logger = logging.getLogger(__name__) parser = make_protein_sequences_arg_parser() parser.add_argument("--output", default="isovar-translate-variants-results.csv", help="Name of CSV file which contains predicted sequences") if __name__ == "__main__": args = parser.parse_args() logger.info(args) df = protein_sequences_dataframe_from_args(args) logger.info(df) df.to_csv(args.output)