Example 1
def variant_collection_from_args(args):
    variant_collections = []

    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None

    for vcf_path in args.vcf:
        vcf_variants = load_vcf(vcf_path, genome=genome)
        variant_collections.append(vcf_variants)
    for maf_path in args.maf:
        maf_variants = load_maf(maf_path)
        variant_collections.append(maf_variants)

    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")

        variants = [
            Variant(chromosome,
                    start=position,
                    ref=ref,
                    alt=alt,
                    ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collection = VariantCollection(variants)
        variant_collections.append(variant_collection)

    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
            variant_collections.append(
                VariantCollection.from_json(json_string))
    if len(variant_collections) == 0:
        raise ValueError(
            "No variants loaded (use --maf, --vcf, --variant, or "
            "--json-variants options)")
    elif len(variant_collections) == 1:
        return variant_collections[0]
    else:
        combined_variants = []
        for variant_collection in variant_collections:
            combined_variants.extend(list(variant_collection))
        return VariantCollection(combined_variants)
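For context, a minimal sketch of the argparse wiring this helper appears to assume. The option names below are inferred from the attributes accessed above, not taken from the original CLI module, and the POSITION field is left as a string (the real CLI presumably converts it to an int before constructing Variant objects).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--vcf", action="append", default=[],
                    help="Path to a VCF file (repeatable)")
parser.add_argument("--maf", action="append", default=[],
                    help="Path to a MAF file (repeatable)")
parser.add_argument("--variant", action="append", default=[], nargs=4,
                    metavar=("CHROMOSOME", "POSITION", "REF", "ALT"),
                    help="Variant given directly on the command line (repeatable)")
parser.add_argument("--json-variant-files", action="append", default=[],
                    help="Path to a JSON file of serialized variants (repeatable)")
parser.add_argument("--reference-name", default=None,
                    help="Name of the reference genome, e.g. GRCh37")

# Hypothetical invocation; assumes somatic.vcf exists on disk.
args = parser.parse_args(["--vcf", "somatic.vcf", "--reference-name", "GRCh37"])
variants = variant_collection_from_args(args)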
Example 2
def test_serialization():
    original = VariantCollection([
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ])
    original.metadata[original[0]] = {"a": "b"}
    original.metadata[original[2]] = {"bar": 2}

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    original.effects()

    # Test pickling.
    serialized = pickle.dumps(original)
    reconstituted = pickle.loads(serialized)
    eq_(original, reconstituted)
    eq_(reconstituted[0], original[0])
    eq_(reconstituted.metadata[original[0]], original.metadata[original[0]])

    # Test json.
    serialized = original.to_json()
    reconstituted = VariantCollection.from_json(serialized)
    eq_(original, reconstituted)
    eq_(reconstituted[0], original[0])
    eq_(reconstituted.metadata[original[0]], original.metadata[original[0]])
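The same to_json/from_json pair makes it easy to persist a collection to disk, which is presumably how the --json-variant-files inputs in Example 1 are produced; a minimal sketch (the file path is illustrative):

with open("variants.json", "w") as f:
    f.write(original.to_json())
with open("variants.json", "r") as f:
    restored = VariantCollection.from_json(f.read())
eq_(original, restored)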
Example 3
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Helper function to load pair of VCFs and tumor RNA BAM
    and use them to generate a DataFrame of expressed variant protein
    sequences.
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    combined_variants = VariantCollection(
        list(expressed_variants) + list(not_expressed_variants))
    alignment_file = load_bam(tumor_rna_bam)
    read_collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    read_evidence_gen = read_collector.read_evidence_generator(
        variants=combined_variants, alignment_file=alignment_file)

    creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    protein_sequences_generator = \
        creator.protein_sequences_from_read_evidence_generator(read_evidence_gen)
    df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
    return df, expressed_variants, combined_variants
Example 4
def test_multiple_variant_forms():
    """
    Load VCF, MAF and VariantCollection together.
    """
    vcf_dir, cohort = None, None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]
        patient.variants.append(data_path(MAF_FILE))
        # Make sure listing the file twice has no effect.
        patient.variants.append(data_path(MAF_FILE))
        variant = Variant(start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        # Make sure the VariantCollection was included.
        eq_(len(cohort_variants[patient.id].filter(lambda v: v.start == 1000000)), 1)

        # Make sure the VCF was included.
        eq_(len(cohort_variants[patient.id].filter(lambda v: v.start == 53513530)), 1)

        # Make sure the MAF was included.
        eq_(len(cohort_variants[patient.id].filter(lambda v: v.start == 1650797)), 1)

        # Make sure a non-existent variant is not included.
        eq_(len(cohort_variants[patient.id].filter(lambda v: v.start == 1650798)), 0)
    finally:
        if vcf_dir is not None and path.exists(vcf_dir):
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
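The test above implies that patient.variants is a plain list that accepts a mix of VCF paths, MAF paths, and in-memory VariantCollection objects. A hedged sketch of that setup (paths hypothetical, and direct assignment assumed to behave the same as repeated append calls):

patient.variants = [
    "/path/to/somatic.vcf",  # loaded as a VCF file
    "/path/to/somatic.maf",  # loaded as a MAF file
    VariantCollection([
        Variant(start=1000000, ref="A", alt="T", contig=1, ensembl=75)]),
]
merged = cohort.load_variants(patients=[patient])[patient.id]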
Example 5
def generate_random_missense_variants(num_variants=10,
                                      max_search=100000,
                                      reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random variants repeatedly.
    """
    variants = []
    for i in range(max_search):
        bases = ["A", "C", "T", "G"]
        random_ref = choice(bases)
        bases.remove(random_ref)
        random_alt = choice(bases)
        random_contig = choice(["1", "2", "3", "4", "5"])
        random_variant = Variant(contig=random_contig,
                                 start=randint(1, 1000000),
                                 ref=random_ref,
                                 alt=random_alt,
                                 ensembl=reference)
        try:
            effects = random_variant.effects()
            for effect in effects:
                if isinstance(effect, Substitution):
                    variants.append(random_variant)
                    break
        except Exception:
            # Effect annotation can fail (e.g. coordinates outside annotated
            # contigs); skip such variants and try another random one.
            continue
        if len(variants) == num_variants:
            break
    return VariantCollection(variants)
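A usage sketch; it assumes the GRCh37 annotation data used by effect prediction is installed locally, since every candidate variant triggers an annotation query:

missense_variants = generate_random_missense_variants(num_variants=5)
# Every variant in the result should have at least one Substitution effect.
assert all(
    any(isinstance(effect, Substitution) for effect in variant.effects())
    for variant in missense_variants)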
Example 6
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Helper function to load pair of VCFs and tumor RNA BAM
    and use them to generate a DataFrame of expressed variant protein
    sequences.
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    combined_variants = VariantCollection(
        list(expressed_variants) + list(not_expressed_variants))
    samfile = load_bam(tumor_rna_bam)

    allele_reads_generator = reads_overlapping_variants(
        variants=combined_variants,
        samfile=samfile,
        min_mapping_quality=min_mapping_quality)

    protein_sequences_generator = reads_generator_to_protein_sequences_generator(
        allele_reads_generator,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    df = protein_sequences_generator_to_dataframe(protein_sequences_generator)
    return df, expressed_variants, combined_variants
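Despite wrapping two different generations of the isovar API, Examples 3 and 6 expose the same call shape; a usage sketch (the default data paths above are assumed to exist):

df, expressed_variants, combined_variants = \
    variants_to_protein_sequences_dataframe(
        min_mapping_quality=10,
        variant_sequence_assembly=True)
print(df.head())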
Example 7
def test_drop_duplicates():
    ensembl = EnsemblRelease(78)
    v1 = Variant("1", 3000, "A", "G", ensembl=ensembl)
    v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl)
    v2 = Variant("2", 10, "G", "T", ensembl=ensembl)
    collection_without_duplicates = VariantCollection(
        variants=[v1, v1, v1_copy, v2])
    assert len(collection_without_duplicates) == 2
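The deduplication rests on Variant having value-based equality and hashing, which is consistent with variants serving as metadata keys in Example 2; a couple of assertions one might append to the test body to make that explicit:

    assert v1 == v1_copy
    assert hash(v1) == hash(v1_copy)
    assert len(VariantCollection([v1, v1_copy])) == 1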
Example 8
    def _load_single_sample_variants(self, sample_idx, file_format_funcs,
                                     variant_type, merge_type):
        sample_id = self.sample_ids[sample_idx]
        normal_bam_id = (
            None if self.normal_bam_ids is None
            else self.normal_bam_ids[sample_idx])
        tumor_bam_id = (
            None if self.tumor_bam_ids is None
            else self.tumor_bam_ids[sample_idx])
        cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
        cached = self.load_from_cache(self.cache_names["variant"], sample_id,
                                      cached_file_name)
        if cached is not None:
            return cached

        combined_variants = []
        for file_format_func in file_format_funcs:
            file_name = file_format_func(sample_id, normal_bam_id,
                                         tumor_bam_id)
            variants = varcode.load_vcf_fast(
                path.join(self.data_dir, file_name))
            combined_variants.append(set(variants.elements))

        if len(combined_variants) == 1:
            # There is nothing to merge
            merged_variants = VariantCollection(combined_variants[0])
        else:
        assert merge_type in ("union", "intersection"), \
            "Unknown merge type: %s" % merge_type
            if merge_type == "union":
                merged_variants = VariantCollection(
                    set.union(*combined_variants))
            elif merge_type == "intersection":
                merged_variants = VariantCollection(
                    set.intersection(*combined_variants))

        self.save_to_cache(merged_variants, self.cache_names["variant"],
                           sample_id, cached_file_name)

        return merged_variants
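Stripped of the caching and file-name plumbing, the merge logic above reduces to set operations over variants; a standalone sketch (the function name is hypothetical):

from varcode import VariantCollection

def merge_variant_collections(collections, merge_type="union"):
    # Each VariantCollection is iterable, so it converts cleanly to a set,
    # and VariantCollection accepts a set of variants (as in the code above).
    variant_sets = [set(collection) for collection in collections]
    if merge_type == "union":
        merged = set.union(*variant_sets)
    elif merge_type == "intersection":
        merged = set.intersection(*variant_sets)
    else:
        raise ValueError("Unknown merge type: %s" % merge_type)
    return VariantCollection(merged)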
Example 9
def test_variant_collection_serialization():
    variant_list = [
        Variant(1, start=10, ref="AA", alt="AAT"),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    original = VariantCollection(
        variant_list,
        source_to_metadata_dict={
            "test_data": {
                variant: {"a": "b", "bar": 2}
                for variant in variant_list}})

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    original.effects()

    original_first_variant = original[0]
    original_metadata = original.metadata

    # Test pickling
    reconstructed = pickle.loads(pickle.dumps(original))
    eq_(original, reconstructed)
    eq_(reconstructed[0], original_first_variant)
    eq_(reconstructed.metadata[original_first_variant],
        original_metadata[original_first_variant])

    merged = original.intersection(original)
    merged_reconstructed = pickle.loads(pickle.dumps(merged))
    eq_(merged, merged_reconstructed)

    # Test JSON serialization
    variants_from_json = VariantCollection.from_json(original.to_json())
    eq_(original, variants_from_json)

    eq_(variants_from_json[0], original_first_variant)

    # pylint: disable=no-member
    eq_(variants_from_json.metadata[original_first_variant],
        original_metadata[original_first_variant])
Example 10

def data_path(name):
    """
    Return the absolute path to a file in the varcode/test/data directory.
    The name specified should be relative to varcode/test/data.
    """
    return os.path.join(os.path.dirname(__file__), "data", name)


# BRAF variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476
braf_V600E_variant = Variant(7, 140753336, "A", "T", ensembl_grch38)

# TP53 variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656
tp53_R248W_variant = Variant(17, 7674221, "G", "A", ensembl_grch38)

cancer_test_variants = VariantCollection(
    [braf_V600E_variant, tp53_R248W_variant])

cancer_test_variant_gene_ids = {
    gene_id
    for v in cancer_test_variants for gene_id in v.gene_ids
}

cancer_test_variant_transcript_ids = {
    transcript_id
    for v in cancer_test_variants for transcript_id in v.transcript_ids
}
Example 11
def test_effects_priority_caching():
    """
    Make sure that effects are cached such that they are not filtered
    prematurely. See https://github.com/hammerlab/cohorts/issues/252.
    """
    cohort = None
    try:
        # This variant has IntronicSpliceSite, Substitution effects, and more.
        variant = Variant(contig=3, start=20212211, ref="C", alt="T", ensembl=75)
        patient = Patient(
            id="patient", os=3, pfs=2, deceased=False, progressed=False,
            variants=VariantCollection([variant]))
        cohort_cache_path = generated_data_path("cache")
        cohort = Cohort(
            patients=[patient],
            cache_dir=cohort_cache_path)

        # All of the effects.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects(all_effects=True)[patient.id]
            eq_(len(effects), 15)

        # Top priority effect.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects()[patient.id]
            eq_(len(effects), 1)
            eq_(type(effects[0]), IntronicSpliceSite)

        def missense_snv_filter(filterable_effect):
            return (type(filterable_effect.effect) == Substitution and
                    filterable_effect.variant.is_snv)

        # All missense SNV effects, from the large cache.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects(all_effects=True, filter_fn=missense_snv_filter)[patient.id]
            eq_(len(effects), 6)

        # Top missense SNV effect, from the large cache.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects(filter_fn=missense_snv_filter)[patient.id]
            eq_(len(effects), 1)
            eq_(type(effects[0]), Substitution)

        # Top missense SNV effects, from the small nonsynonymous cache.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects(only_nonsynonymous=True, filter_fn=missense_snv_filter)[patient.id]
            eq_(len(effects), 1)
            eq_(type(effects[0]), Substitution)

        # All nonsynonymous effects, from the small nonsynonymous cache.
        cohort.clear_caches()
        for i in range(2):
            effects = cohort.load_effects(all_effects=True, only_nonsynonymous=True)[patient.id]
            eq_(len(effects), 6)
    finally:
        if cohort is not None:
            cohort.clear_caches()
Example 12
from nose.tools import eq_
from varcode import VariantCollection

from .data import (
    snp_rs4244285,
    snp_rs1537415
)

variants = VariantCollection([
    # gene ids: ['ENSG00000165841', 'ENSG00000276490']
    # transcript_ids : ['ENST00000371321', 'ENST00000464755']
    snp_rs4244285,
    # gene ids: ['ENSG00000204007']
    # transcript ids:  ['ENST00000371763', 'ENST00000613244']
    snp_rs1537415,
])

gene_fpkm_dict = {
    "ENSG00000165841": 10.0,
    "ENSG00000204007": 20.0,
    "ENSG00000276490": 30.0,
}

transcript_fpkm_dict = {
    "ENST00000371321": 10.0,
    "ENST00000464755": 20.0,
    "ENST00000371763": 30.0,
    "ENST00000613244": 40.0,
Example 13
"""
Helper functions and shared datasets for tests
"""

import os
from varcode import Variant, VariantCollection, load_maf
import pandas as pd


def data_path(name):
    """
    Return the absolute path to a file in the varcode/test/data directory.
    The name specified should be relative to varcode/test/data.
    """
    return os.path.join(os.path.dirname(__file__), "data", name)


dbnsp_validation_df = pd.read_csv(data_path('dbnsfp_validation_set.csv'))
tcga_ov_variants = load_maf(data_path("tcga_ov.head.maf"))
ov_wustle_variants = load_maf(data_path("ov.wustle.subset5.maf"))

snp_rs4244285 = Variant(contig=10, start=94781859, ref="G", alt="A")
snp_rs1537415 = Variant(contig=9, start=135637876, ref="C", alt="G")
snp_rs3892097 = Variant(contig=22, start=42524947, ref="G", alt="A")

db_snp_variants = VariantCollection([
    snp_rs4244285,
    snp_rs1537415,
    snp_rs3892097,
])
Example 14
from __future__ import print_function, division, absolute_import

from mhctools import NetMHCIIpan
from nose.tools import eq_
from pyensembl import ensembl_grch37
from topiary import predict_epitopes_from_variants
from varcode import Variant, VariantCollection

# TODO: find out about these variants,
# what do we expect from them? Are they SNVs?
variants = VariantCollection([
    Variant(contig=10,
            start=100018900,
            ref='C',
            alt='T',
            ensembl=ensembl_grch37),
    Variant(contig=11,
            start=32861682,
            ref='G',
            alt='A',
            ensembl=ensembl_grch37)
])

alleles = ["HLA-DPA1*01:05/DPB1*100:01", "DRB10102"]

epitope_lengths = [15, 16]

mhc_model = NetMHCIIpan(alleles=alleles, epitope_lengths=epitope_lengths)


def test_netmhcii_pan_epitopes():
    epitope_predictions = predict_epitopes_from_variants(
Example 15
def test_splice_filtering_frameshift():
    """
    Make sure that ExonicSpliceSite mutations with FrameShift alternates
    are kept even when we filter to only FrameShift effects.
    """
    cohort = None
    try:
        variant = Variant(contig=8, start=145617535, ref="GGGGGTGCAAGGTGA", alt="", ensembl=75)
        patient = Patient(
            id="patient", os=3, pfs=2, deceased=False, progressed=False,
            variants=VariantCollection([variant]))
        cohort_cache_path = generated_data_path("cache")
        cohort = Cohort(
            patients=[patient],
            cache_dir=cohort_cache_path)

        def frameshift_filter(filterable_effect):
            return (type(filterable_effect.effect) == FrameShift)

        effects = cohort.load_effects(filter_fn=frameshift_filter)[patient.id]
        eq_(len(effects), 1)
        all_effects = cohort.load_effects(filter_fn=frameshift_filter,
                                          all_effects=True)[patient.id]
        eq_(len(all_effects), 1)
    finally:
        if cohort is not None:
            cohort.clear_caches()
Example 16
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    # replace second codon of TP53-001 with 'CCC'
    tp53_substitution = Variant("17", 7676589, "CTC", "GGG", ensembl_grch38)
    variant_collection = VariantCollection([tp53_substitution])

    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around second codon with 10 context nucleotides:
    # In [51]: t.sequence[193-10:193+13]
    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # Which can be split into the following parts:
    #  last 7 nt of 5' UTR: CACTGCC
    #  start codon: ATG (translates to M)
    #  2nd codon: GAG    <---- variant occurs here
    #  3rd codon: GAG
    #  4th codon: CCG
    #  5th codon:  CAG
    #  first nt of 6th codon: T

    # First call without a transcript ID whitelist to see if we get back
    # multiple contexts.
    reference_context_dict_many_transcripts = \
        reference_contexts_for_variants(
            variants=variant_collection,
            context_size=10,
            transcript_id_whitelist=None)

    assert len(reference_context_dict_many_transcripts) == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            len(reference_context_dict_many_transcripts),)

    reference_contexts = reference_context_dict_many_transcripts[
        tp53_substitution]

    assert len(reference_contexts) > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            tp53_substitution,
            len(reference_contexts),
            reference_contexts)

    reference_context_dict_single_transcript = \
        reference_contexts_for_variants(
            variants=variant_collection,
            context_size=10,
            transcript_id_whitelist={tp53_001.id})

    # still only expect one variant key
    eq_(len(reference_context_dict_single_transcript), 1)

    result_list = reference_context_dict_single_transcript[tp53_substitution]

    # since we limited the transcript ID whitelist, we only expect a single
    # reference context in the result
    eq_(len(result_list), 1)

    result = result_list[0]

    expected = ReferenceContext(strand="-",
                                sequence_before_variant_locus="CACTGCCATG",
                                sequence_at_variant_locus="GAG",
                                sequence_after_variant_locus="GAGCCGCAGT",
                                offset_to_first_complete_codon=7,
                                contains_start_codon=True,
                                overlaps_start_codon=True,
                                contains_five_prime_utr=True,
                                amino_acids_before_variant="M",
                                variant=tp53_substitution,
                                transcripts=[tp53_001])
    eq_(result, expected)
Example 17
def test_splice_filtering_substitution():
    """
    Make sure that ExonicSpliceSite mutations with Substitution alternates
    are kept even when we filter to only Substitution effects.
    """
    cohort = None
    try:
        variant = Variant(contig=10, start=124340409, ref="C", alt="A", ensembl=75)
        patient = Patient(
            id="patient", os=3, pfs=2, deceased=False, progressed=False,
            variants=VariantCollection([variant]))
        cohort_cache_path = generated_data_path("cache")
        cohort = Cohort(
            patients=[patient],
            cache_dir=cohort_cache_path)

        def missense_snv_filter(filterable_effect):
            return (type(filterable_effect.effect) == Substitution and
                    filterable_effect.variant.is_snv)

        effects = cohort.load_effects(filter_fn=missense_snv_filter)[patient.id]
        eq_(len(effects), 1)
        all_effects = cohort.load_effects(filter_fn=missense_snv_filter,
                                          all_effects=True)[patient.id]
        eq_(len(all_effects), 7)
    finally:
        if cohort is not None:
            cohort.clear_caches()