コード例 #1
0
def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
    """
    For each requested protein sequence length, build the protein sequences
    dataframe from command-line style arguments and check that:

    * it has as many rows as there are variants in b16.expressed.vcf
      (at most one protein sequence is requested per variant), and
    * every entry of the "amino_acids" column has the requested length.
    """
    reference_variants = load_vcf("data/b16.f10/b16.expressed.vcf")
    arg_parser = make_protein_sequences_arg_parser()
    arg_parser.print_help()
    # range(9, 20, 3) exercises the lengths 9, 12, 15 and 18
    for target_length in range(9, 20, 3):
        cli_args = [
            "--vcf", data_path("data/b16.f10/b16.vcf"),
            "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
            "--max-protein-sequences-per-variant", "1",
            "--protein-sequence-length", str(target_length),
        ]
        parsed_args = arg_parser.parse_args(cli_args)
        result_df = protein_sequences_dataframe_from_args(parsed_args)

        n_expected = len(reference_variants)
        n_found = len(result_df)
        failure_message = (
            "Expected %d entries for protein_sequence_length=%d, got %d results: %s" % (
                n_expected,
                target_length,
                n_found,
                result_df))
        eq_(n_found, n_expected, failure_message)

        amino_acid_column = result_df["amino_acids"]
        print(amino_acid_column)
        observed_lengths = amino_acid_column.str.len()
        all_lengths_match = (observed_lengths == target_length).all()
        # on failure, show the observed lengths to help diagnose the mismatch
        assert all_lengths_match, (observed_lengths,)
コード例 #2
0
def test_isovar_result_str():
    """
    The string form of every result returned by run_isovar on the B16 data
    must be non-empty and look like "IsovarResult(...)".
    """
    isovar_results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for isovar_result in isovar_results:
        text = str(isovar_result)
        assert len(text) > 0
        assert text.startswith("IsovarResult(")
        assert text.endswith(")")
コード例 #3
0
ファイル: test_main.py プロジェクト: tavinathanson/isovar
def test_isovar_main_to_dataframe():
    """
    Convert the results of run_isovar on the B16 data into a dataframe and
    check both the number of rows and the number of rows passing all filters.
    """
    vcf_path = data_path("data/b16.f10/b16.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    isovar_results = run_isovar(variants=vcf_path, alignment_file=bam_path)
    results_df = isovar_results_to_dataframe(isovar_results)
    print(results_df)
    eq_(len(results_df), 4)
    # B16 test data has 2/4 variants with enough coverage
    # to translate protein sequences
    n_passing = results_df["passes_all_filters"].sum()
    eq_(n_passing, 2)
コード例 #4
0
def test_isovar_result_property_types():
    """
    Check the exact type of the properties exposed by every result that
    run_isovar yields for the B16 test data.

    Property names are grouped by expected type and checked in a loop;
    the checks use `type(...) is T` so subclasses are not accepted.
    """
    none_type = type(None)

    # counts of genes and transcripts from variant
    variant_count_properties = (
        "num_overlapping_genes",
        "num_overlapping_coding_genes",
        "num_overlapping_transcripts",
        "num_overlapping_coding_transcripts",
    )
    # counts of genes and transcripts from protein sequences
    protein_sequence_count_properties = (
        "num_genes_from_protein_sequences",
        "num_genes_from_top_protein_sequence",
        "num_transcripts_from_protein_sequences",
        "num_transcripts_from_top_protein_sequence",
    )
    # read and fragment counts
    read_count_properties = (
        "num_ref_reads",
        "num_alt_reads",
        "num_other_reads",
        "num_ref_fragments",
        "num_alt_fragments",
        "num_other_fragments",
    )
    # read and fragment fractions
    fraction_properties = (
        "fraction_ref_reads",
        "fraction_alt_reads",
        "fraction_other_reads",
        "fraction_ref_fragments",
        "fraction_alt_fragments",
        "fraction_other_fragments",
    )
    # read and fragment count ratios
    ratio_properties = (
        "ratio_alt_to_other_reads",
        "ratio_alt_to_other_fragments",
        "ratio_other_to_alt_fragments",
        "ratio_other_to_alt_reads",
        "ratio_ref_to_other_fragments",
        "ratio_other_to_ref_fragments",
        "ratio_other_to_ref_reads",
    )
    # positions which may be absent (None) or an int
    optional_int_properties = (
        "protein_sequence_mutation_start",
        "protein_sequence_mutation_end",
    )

    isovar_results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))

    for result in isovar_results:
        # variant
        assert type(result.variant) is Variant

        for name in variant_count_properties:
            assert type(getattr(result, name)) is int

        # protein sequence, which may be None
        assert type(result.top_protein_sequence) in (none_type,
                                                     ProteinSequence)

        for name in protein_sequence_count_properties:
            assert type(getattr(result, name)) is int

        for name in read_count_properties:
            assert type(getattr(result, name)) is int

        for name in fraction_properties:
            assert type(getattr(result, name)) is float

        for name in ratio_properties:
            assert type(getattr(result, name)) is float

        # this property aggregates all filters
        assert result.passes_all_filters in {True, False}

        for name in optional_int_properties:
            assert type(getattr(result, name)) in (int, none_type)
コード例 #5
0
ファイル: test_locus_reads.py プロジェクト: Al3n70rn/isovar
def test_locus_reads_dataframe():
    """
    The dataframe of reads at chr4:45802539 built from the combined BAM
    should have one row fewer than the number of "HWI"-prefixed lines in
    the single-variant SAM file for the same locus.
    """
    combined_alignments = load_bam("data/b16.f10/b16.combined.bam")

    single_variant_sam_path = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    # count the lines starting with "HWI" (presumably one per read)
    with open(single_variant_sam_path) as sam_file:
        n_hwi_lines = sum(1 for line in sam_file if line.startswith("HWI"))

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_reads_expected = n_hwi_lines - 1

    summary = "Found %d sequences in %s" % (
        n_reads_expected, single_variant_sam_path)
    print(summary)

    reads_df = locus_reads_dataframe(
        samfile=combined_alignments,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    print(reads_df)
    eq_(len(reads_df), n_reads_expected)
コード例 #6
0
ファイル: test_locus_reads.py プロジェクト: gnetsanet/isovar
def test_locus_reads_dataframe():
    """
    Compare the number of rows returned by locus_reads_dataframe for the
    locus around chr4:45802539 against a count derived from a SAM file
    holding only the reads for that variant.
    """
    bam_all_variants = load_bam("data/b16.f10/b16.combined.bam")

    sam_path = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    with open(sam_path) as handle:
        # keep only lines starting with "HWI"; other lines are ignored
        hwi_lines = [line for line in handle if line.startswith("HWI")]

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_missing_reads = 1
    n_reads_expected = len(hwi_lines) - n_missing_reads

    print("Found %d sequences in %s" % (n_reads_expected, sam_path))

    locus_df = locus_reads_dataframe(
        samfile=bam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540,
    )
    print(locus_df)
    eq_(len(locus_df), n_reads_expected)
コード例 #7
0
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Check the mutant amino acids of the first protein sequence produced
    for each variant in the Wdr13 VCF.
    """
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    cli_args = [
        "--vcf", data_path("data/b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ]
    parsed_args = make_protein_sequences_arg_parser().parse_args(cli_args)
    sequences_by_variant = protein_sequences_generator_from_args(parsed_args)
    for variant, protein_sequences in sequences_by_variant:
        check_mutant_amino_acids(variant, protein_sequences[0])
コード例 #8
0
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Run the protein sequence generator on the Wdr13 VCF with 15-residue
    sequences and validate the mutant amino acids of the first sequence
    returned for every variant.
    """
    # there are two co-occurring variants in the RNAseq data but since
    # they don't happen in the same codon then we're considering the Varcode
    # annotation to be correct
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    vcf_path = data_path("data/b16.f10/b16.f10.Wdr13.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    arg_parser = make_protein_sequences_arg_parser()
    parsed = arg_parser.parse_args([
        "--vcf", vcf_path,
        "--bam", bam_path,
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15",
    ])
    for variant, sequences in protein_sequences_generator_from_args(parsed):
        first_sequence = sequences[0]
        check_mutant_amino_acids(variant, first_sequence)
コード例 #9
0
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    """
    Check that the first protein sequence produced for each variant in the
    Phip VCF has "H" as its mutant amino acids.
    """
    # the variant chr9:82927102 G>T occurs right next to T>G so the varcode
    # prediction for the protein sequence (Asparagine) will be wrong since
    # the correct translation is Histidine
    cli_args = [
        "--vcf", data_path("data/b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("data/b16.f10/b16.combined.sorted.bam"),
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15"
    ]
    parsed_args = make_protein_sequences_arg_parser().parse_args(cli_args)
    sequences_by_variant = protein_sequences_generator_from_args(parsed_args)
    for variant, protein_sequences in sequences_by_variant:
        check_mutant_amino_acids(
            variant,
            protein_sequences[0],
            expected_amino_acids="H")
コード例 #10
0
def test_mutant_amino_acids_in_mm10_chr9_82927102_refG_altT_pT441H():
    """
    Run the protein sequence generator on the Phip VCF with 15-residue
    sequences and validate that the first sequence for every variant
    carries the expected mutant amino acid "H".
    """
    # the variant chr9:82927102 G>T occurs right next to T>G so the varcode
    # prediction for the protein sequence (Asparagine) will be wrong since
    # the correct translation is Histidine
    vcf_path = data_path("data/b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("data/b16.f10/b16.combined.sorted.bam")
    arg_parser = make_protein_sequences_arg_parser()
    parsed = arg_parser.parse_args([
        "--vcf", vcf_path,
        "--bam", bam_path,
        "--max-protein-sequences-per-variant", "1",
        "--protein-sequence-length", "15",
    ])
    for variant, sequences in protein_sequences_generator_from_args(parsed):
        first_sequence = sequences[0]
        check_mutant_amino_acids(
            variant, first_sequence, expected_amino_acids="H")
コード例 #11
0
def test_isovar_result_nonsyn_variants():
    """
    For every result of run_isovar on the B16 data, check the properties
    comparing the protein sequence against the reference and the predicted
    effect: they must be None when there is no mutant protein sequence
    from RNA, and must indicate a mutation otherwise.
    """
    # properties expected to be None when no mutant protein sequence exists
    properties_none_without_sequence = (
        "num_amino_acid_mismatches_from_predicted_effect",
        "num_amino_acid_mismatches_from_reference",
        "protein_sequence_matches_predicted_mutation_effect",
        "protein_sequence_matches_reference",
        "protein_sequence_contains_mutation",
    )
    isovar_results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for result in isovar_results:
        print(result.variant)
        print(result.predicted_effect)

        if not result.has_mutant_protein_sequence_from_rna:
            for name in properties_none_without_sequence:
                assert getattr(result, name) is None
            continue

        assert result.num_amino_acid_mismatches_from_predicted_effect is not None
        n_mismatches_from_reference = (
            result.num_amino_acid_mismatches_from_reference)
        assert n_mismatches_from_reference is not None
        assert result.num_amino_acid_mismatches_from_reference > 0
        eq_(result.protein_sequence_matches_reference, False)
        eq_(result.protein_sequence_contains_mutation, True)
コード例 #12
0
ファイル: test_cli.py プロジェクト: tavinathanson/isovar
import tempfile
from os import remove
from os.path import getsize, exists

from testing_helpers import data_path

from isovar.cli.isovar_translations import run as isovar_translations
from isovar.cli.isovar_allele_counts import run as isovar_allele_counts
from isovar.cli.isovar_allele_reads import run as isovar_allele_reads
from isovar.cli.isovar_protein_sequences import run as isovar_protein_sequences
from isovar.cli.isovar_reference_contexts import run as isovar_reference_contexts
from isovar.cli.isovar_variant_reads import run as isovar_variant_reads
from isovar.cli.isovar_variant_sequences import run as isovar_variant_sequences
from isovar.cli.isovar_main import run as isovar_main

# Command-line arguments naming the B16 VCF, for commands that need only
# the variants.
vcf_args = [
    "--vcf",
    data_path("data/b16.f10/b16.vcf"),
]

# The same arguments followed by the combined, sorted B16 BAM, for commands
# that also read alignments.
args_with_bam = vcf_args + [
    "--bam",
    data_path("data/b16.f10/b16.combined.sorted.bam"),
]


def run_cli_fn(fn, include_bam_in_args=True):
    """
    Reserve a temporary output file and assemble the command-line argument
    list for a CLI entry point.

    Parameters
    ----------
    fn : callable
        CLI entry point under test. It is not invoked in the portion of
        this function that is visible here.

    include_bam_in_args : bool
        If True, build the arguments from `args_with_bam` (VCF and BAM),
        otherwise from `vcf_args` (VCF only).
    """
    with tempfile.NamedTemporaryFile(delete=False) as f:
        output_path = f.name
    # delete=False keeps the file on disk once the handle is closed, so the
    # reserved output path must exist at this point. This is the same check
    # as the former `not exists(output_path) == 0`, which Python parses as
    # `not (exists(output_path) == 0)`.
    assert exists(output_path)
    output_args = ["--output", output_path]
    if include_bam_in_args:
        args = args_with_bam + output_args
    else:
        args = vcf_args + output_args
    # NOTE(review): this listing ends here; presumably `fn` is called with
    # `args` and the output file is checked and removed afterwards -- confirm
    # against the full source.
コード例 #13
0
def test_isovar_result_clone_with_updates():
    """
    A clone created with an updated field (variant=None) must compare
    unequal to the result it was cloned from.
    """
    isovar_results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for original in isovar_results:
        modified = original.clone_with_updates(variant=None)
        assert original != modified
コード例 #14
0
def test_isovar_result_clone():
    """
    A plain clone of a result must compare equal to the original.
    """
    isovar_results = run_isovar(
        variants=data_path("data/b16.f10/b16.vcf"),
        alignment_file=data_path("data/b16.f10/b16.combined.sorted.bam"))
    for original in isovar_results:
        duplicate = original.clone()
        eq_(original, duplicate)