Example #1
0
def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
    """
    Check that --protein-sequence-length controls the length of every
    translated sequence.

    For each length in range(9, 20, 3) a dataframe is built from the b16 VCF
    and the combined BAM with at most one protein sequence per variant. The
    dataframe must have as many rows as there are variants loaded from
    b16.expressed.vcf, and every value in its "amino_acids" column must have
    exactly the requested length.
    """
    expected_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
    arg_parser = make_protein_sequences_arg_parser()
    arg_parser.print_help()
    for target_length in range(9, 20, 3):
        cli_tokens = [
            "--vcf", data_path("data/b16.f10/b16.vcf"),
            "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
            "--max-protein-sequences-per-variant", "1",
            "--protein-sequence-length", str(target_length),
        ]
        result_df = protein_sequences_dataframe_from_args(
            arg_parser.parse_args(cli_tokens))

        # row count must match the number of expressed variants
        n_rows = len(result_df)
        n_expected = len(expected_variants)
        failure_message = (
            "Expected %d entries for protein_sequence_length=%d, got %d results: %s" % (
                n_expected,
                target_length,
                n_rows,
                result_df))
        eq_(n_rows, n_expected, failure_message)

        # every translated sequence must have the requested length
        amino_acid_column = result_df["amino_acids"]
        print(amino_acid_column)
        observed_lengths = amino_acid_column.str.len()
        lengths_match = observed_lengths == target_length
        assert lengths_match.all(), (observed_lengths,)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Check the mutant amino acids translated for the variant(s) in the
    Wdr13 VCF.

    Requests at most one protein sequence of length 15 per variant and passes
    the first sequence of each variant to check_mutant_amino_acids. The test
    fails explicitly if no variant is yielded or if a variant has no protein
    sequences.
    """
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parser = make_protein_sequences_arg_parser()
    args = parser.parse_args([
        "--vcf", data_path("data/b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ])
    n_variants_checked = 0
    for variant, protein_sequences in protein_sequences_generator_from_args(args):
        # fail with a readable message instead of an IndexError on [0]
        assert len(protein_sequences) > 0, (
            "No protein sequences for variant %s" % (variant,))
        protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(variant, protein_sequence)
        n_variants_checked += 1
    # without this an empty generator would let the test pass vacuously
    assert n_variants_checked > 0, "Expected at least one variant to be checked"
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    """
    Check that the variant(s) in the Phip VCF translate to the mutant amino
    acid "H".

    Requests at most one protein sequence of length 15 per variant and passes
    the first sequence of each variant to check_mutant_amino_acids with
    expected_amino_acids="H". The test fails explicitly if no variant is
    yielded or if a variant has no protein sequences.
    """
    # the variant chr9:82927102 G>T occurs right next to T>G so the varcode
    # prediction for the protein sequence (Asparagine) will be wrong since
    # the correct translation is Histidine
    parser = make_protein_sequences_arg_parser()
    args = parser.parse_args([
        "--vcf", data_path("data/b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ])
    n_variants_checked = 0
    for variant, protein_sequences in protein_sequences_generator_from_args(args):
        # fail with a readable message instead of an IndexError on [0]
        assert len(protein_sequences) > 0, (
            "No protein sequences for variant %s" % (variant,))
        protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(
            variant,
            protein_sequence,
            expected_amino_acids="H")
        n_variants_checked += 1
    # without this an empty generator would let the test pass vacuously
    assert n_variants_checked > 0, "Expected at least one variant to be checked"
Example #4
0
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    """
    Check that the variant(s) in the Phip VCF translate to the mutant amino
    acid "H".

    Requests at most one protein sequence of length 15 per variant and passes
    the first sequence of each variant to check_mutant_amino_acids with
    expected_amino_acids="H". The test fails explicitly if no variant is
    yielded or if a variant has no protein sequences.
    """
    # the variant chr9:82927102 G>T occurs right next to T>G so the varcode
    # prediction for the protein sequence (Asparagine) will be wrong since
    # the correct translation is Histidine
    parser = make_protein_sequences_arg_parser()
    args = parser.parse_args([
        "--vcf",
        data_path("data/b16.f10/b16.f10.Phip.vcf"), "--bam",
        data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ])
    n_variants_checked = 0
    for variant, protein_sequences in protein_sequences_generator_from_args(
            args):
        # fail with a readable message instead of an IndexError on [0]
        assert len(protein_sequences) > 0, (
            "No protein sequences for variant %s" % (variant,))
        protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(variant,
                                 protein_sequence,
                                 expected_amino_acids="H")
        n_variants_checked += 1
    # without this an empty generator would let the test pass vacuously
    assert n_variants_checked > 0, "Expected at least one variant to be checked"
Example #5
0
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Check the mutant amino acids translated for the variant(s) in the
    Wdr13 VCF.

    Requests at most one protein sequence of length 15 per variant and passes
    the first sequence of each variant to check_mutant_amino_acids. The test
    fails explicitly if no variant is yielded or if a variant has no protein
    sequences.
    """
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parser = make_protein_sequences_arg_parser()
    args = parser.parse_args([
        "--vcf",
        data_path("data/b16.f10/b16.f10.Wdr13.vcf"), "--bam",
        data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ])
    n_variants_checked = 0
    for variant, protein_sequences in protein_sequences_generator_from_args(
            args):
        # fail with a readable message instead of an IndexError on [0]
        assert len(protein_sequences) > 0, (
            "No protein sequences for variant %s" % (variant,))
        protein_sequence = protein_sequences[0]
        check_mutant_amino_acids(variant, protein_sequence)
        n_variants_checked += 1
    # without this an empty generator would let the test pass vacuously
    assert n_variants_checked > 0, "Expected at least one variant to be checked"
RNAseq BAM from the same sample. Combine synonymous translations and assign
a read count to each protein sequence.
"""

from __future__ import print_function, division, absolute_import
import logging
import logging.config
import pkg_resources

from isovar.cli.protein_sequences import (
    make_protein_sequences_arg_parser,
    protein_sequences_dataframe_from_args
)


# Load the logging configuration file shipped inside the isovar.cli package
# before creating this module's logger.
_logging_conf_path = pkg_resources.resource_filename(
    'isovar.cli', 'logging.conf')
logging.config.fileConfig(_logging_conf_path)
logger = logging.getLogger(__name__)

# Start from the shared protein-sequences argument parser and add an option
# for where the resulting CSV should be written.
parser = make_protein_sequences_arg_parser()
parser.add_argument(
    "--output",
    help="Name of CSV file which contains predicted sequences",
    default="isovar-translate-variants-results.csv")

if __name__ == "__main__":
    parsed_args = parser.parse_args()
    logger.info(parsed_args)
    # build the dataframe of protein sequences, log it, then save it as CSV
    protein_sequences_df = protein_sequences_dataframe_from_args(parsed_args)
    logger.info(protein_sequences_df)
    protein_sequences_df.to_csv(parsed_args.output)
Example #7
0
"""
Translate non-synonymous coding variants into mutant protein sequences using an
RNAseq BAM from the same sample. Combine synonymous translations and assign
a read count to each protein sequence.
"""

from __future__ import print_function, division, absolute_import
import logging
import logging.config
import pkg_resources

from isovar.cli.protein_sequences import (make_protein_sequences_arg_parser,
                                          protein_sequences_dataframe_from_args
                                          )

# Logging is configured from the logging.conf resource of the isovar.cli
# package; the module logger is created afterwards.
_logging_conf_path = pkg_resources.resource_filename(
    'isovar.cli', 'logging.conf')
logging.config.fileConfig(_logging_conf_path)
logger = logging.getLogger(__name__)

# Extend the shared protein-sequences parser with an --output option naming
# the CSV file to write.
parser = make_protein_sequences_arg_parser()
parser.add_argument(
    "--output",
    help="Name of CSV file which contains predicted sequences",
    default="isovar-translate-variants-results.csv",
)

if __name__ == "__main__":
    cli_args = parser.parse_args()
    logger.info(cli_args)
    # compute the protein sequence dataframe, log it and write it out
    result_df = protein_sequences_dataframe_from_args(cli_args)
    logger.info(result_df)
    result_df.to_csv(cli_args.output)