def variant_collection_from_args(args):
    """
    Build one VariantCollection from parsed command-line arguments.

    Sources are read in this order: every path in ``args.vcf`` (via
    load_vcf), every path in ``args.maf`` (via load_maf), the
    (chromosome, position, ref, alt) tuples in ``args.variant``, and every
    path in ``args.json_variant_files`` (via VariantCollection.from_json).

    Parameters
    ----------
    args : namespace-like object
        Must expose ``reference_name``, ``vcf``, ``maf``, ``variant`` and
        ``json_variant_files``.

    Returns
    -------
    The single loaded collection when exactly one was loaded, otherwise a
    new VariantCollection holding the variants of all loaded collections.

    Raises
    ------
    ValueError
        If ``args.variant`` is given without ``args.reference_name``, or if
        no collection was loaded from any source.
    """
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None

    variant_collections = [
        load_vcf(vcf_path, genome=genome) for vcf_path in args.vcf
    ]
    variant_collections.extend(load_maf(maf_path) for maf_path in args.maf)

    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")
        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collections.append(VariantCollection(variants))

    # The emptiness check must come only after the JSON files are read;
    # checking earlier rejected invocations that supplied nothing but JSON
    # variant files.
    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(VariantCollection.from_json(json_string))

    if not variant_collections:
        raise ValueError(
            "No variants loaded "
            "(use --maf, --vcf, --variant, or --json-variants options)")
    if len(variant_collections) == 1:
        return variant_collections[0]

    combined_variants = []
    for variant_collection in variant_collections:
        combined_variants.extend(variant_collection)
    return VariantCollection(combined_variants)
def test_serialization():
    """
    Round-trip a VariantCollection carrying metadata through pickle and
    through JSON, checking equality of the collection, of its first
    variant, and of that variant's metadata each time.
    """
    collection = VariantCollection([
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ])
    first_variant = collection[0]
    collection.metadata[first_variant] = {"a": "b"}
    collection.metadata[collection[2]] = {"bar": 2}

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    collection.effects()

    round_trips = [
        # Pickling.
        lambda obj: pickle.loads(pickle.dumps(obj)),
        # JSON.
        lambda obj: VariantCollection.from_json(obj.to_json()),
    ]
    for round_trip in round_trips:
        restored = round_trip(collection)
        eq_(collection, restored)
        eq_(restored[0], first_variant)
        eq_(restored.metadata[first_variant],
            collection.metadata[first_variant])
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Test helper: load two VCFs plus a tumor RNA BAM and turn them into a
    DataFrame of variant protein sequences.

    Returns a tuple ``(df, expressed_variants, combined_variants)`` where
    ``combined_variants`` holds the variants of both VCFs.
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    every_variant = list(expressed_variants)
    every_variant.extend(not_expressed_variants)
    combined_variants = VariantCollection(every_variant)

    alignment_file = load_bam(tumor_rna_bam)
    read_evidence_gen = ReadCollector(
        min_mapping_quality=min_mapping_quality,
    ).read_evidence_generator(
        variants=combined_variants,
        alignment_file=alignment_file)

    creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    df = protein_sequences_generator_to_dataframe(
        creator.protein_sequences_from_read_evidence_generator(
            read_evidence_gen))
    return df, expressed_variants, combined_variants
def test_multiple_variant_forms():
    """
    Load VCF, MAF and VariantCollection together.
    """
    vcf_dir = cohort = None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]

        # The MAF is listed twice on purpose: a repeated file must have no
        # effect on the loaded variants.
        for _ in range(2):
            patient.variants.append(data_path(MAF_FILE))

        extra_variant = Variant(
            start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([extra_variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        def count_variants_starting_at(start):
            patient_variants = cohort_variants[patient.id]
            return len(patient_variants.filter(lambda v: v.start == start))

        expected_counts = [
            (1000000, 1),   # the VariantCollection was included
            (53513530, 1),  # the VCF was included
            (1650797, 1),   # the MAF was included
            (1650798, 0),   # a non-existent variant is not included
        ]
        for start, expected_count in expected_counts:
            eq_(count_variants_starting_at(start), expected_count)
    finally:
        if vcf_dir is not None and path.exists(vcf_dir):
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
def generate_random_missense_variants(num_variants=10, max_search=100000,
                                      reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random
    variants repeatedly.

    Each attempt draws a random single-base ref/alt pair on one of contigs
    "1" through "5" at a start position between 1 and 1000000 and keeps the
    variant if any of its effects is a Substitution.

    Parameters
    ----------
    num_variants : int
        Number of variants to collect before stopping.
    max_search : int
        Maximum number of random variants to try.
    reference : str
        Passed as the ``ensembl`` argument of every Variant.

    Returns
    -------
    VariantCollection of the variants kept; it holds fewer than
    ``num_variants`` entries when ``max_search`` attempts were not enough.
    """
    all_bases = ("A", "C", "T", "G")
    contigs = ["1", "2", "3", "4", "5"]
    variants = []
    for _ in range(max_search):
        # Checked before each attempt (with >=) so that num_variants <= 0
        # yields an empty collection instead of overshooting the target.
        if len(variants) >= num_variants:
            break
        random_ref = choice(all_bases)
        # The alt base is drawn from the three bases that differ from ref.
        random_alt = choice([base for base in all_bases if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(
            contig=random_contig,
            start=randint(1, 1000000),
            ref=random_ref,
            alt=random_alt,
            ensembl=reference)
        try:
            is_missense = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best effort: a variant whose effects cannot be computed is
            # skipped. Only Exception is caught so that KeyboardInterrupt
            # and SystemExit still stop the search.
            continue
        if is_missense:
            variants.append(random_variant)
    return VariantCollection(variants)
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Test helper: load two VCFs plus a tumor RNA BAM and turn them into a
    DataFrame of variant protein sequences.

    Returns a tuple ``(df, expressed_variants, combined_variants)`` where
    ``combined_variants`` holds the variants of both VCFs.
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    every_variant = list(expressed_variants)
    every_variant.extend(not_expressed_variants)
    combined_variants = VariantCollection(every_variant)

    allele_reads_generator = reads_overlapping_variants(
        variants=combined_variants,
        samfile=load_bam(tumor_rna_bam),
        min_mapping_quality=min_mapping_quality)

    df = protein_sequences_generator_to_dataframe(
        reads_generator_to_protein_sequences_generator(
            allele_reads_generator,
            max_protein_sequences_per_variant=(
                max_protein_sequences_per_variant),
            variant_sequence_assembly=variant_sequence_assembly))
    return df, expressed_variants, combined_variants
def test_drop_duplicates():
    """
    A collection built from a repeated variant object, an equal copy of it
    and one distinct variant must end up with two elements.
    """
    release = EnsemblRelease(78)

    def make_variant(contig, start, ref, alt):
        return Variant(contig, start, ref, alt, ensembl=release)

    repeated = make_variant("1", 3000, "A", "G")
    equal_copy = make_variant("1", 3000, "A", "G")
    distinct = make_variant("2", 10, "G", "T")

    collection_without_duplicates = VariantCollection(
        variants=[repeated, repeated, equal_copy, distinct])
    assert len(collection_without_duplicates) == 2
def variant_collection_from_args(args):
    """
    Build one VariantCollection from parsed command-line arguments.

    Sources are read in this order: every path in ``args.vcf`` (via
    load_vcf), every path in ``args.maf`` (via load_maf), the
    (chromosome, position, ref, alt) tuples in ``args.variant``, and every
    path in ``args.json_variant_files`` (via VariantCollection.from_json).

    Parameters
    ----------
    args : namespace-like object
        Must expose ``reference_name``, ``vcf``, ``maf``, ``variant`` and
        ``json_variant_files``.

    Returns
    -------
    The single loaded collection when exactly one was loaded, otherwise a
    new VariantCollection holding the variants of all loaded collections.

    Raises
    ------
    ValueError
        If ``args.variant`` is given without ``args.reference_name``, or if
        no collection was loaded from any source.
    """
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None

    variant_collections = [
        load_vcf(vcf_path, genome=genome) for vcf_path in args.vcf
    ]
    variant_collections.extend(load_maf(maf_path) for maf_path in args.maf)

    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")
        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collections.append(VariantCollection(variants))

    # The emptiness check must come only after the JSON files are read;
    # checking earlier rejected invocations that supplied nothing but JSON
    # variant files.
    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(VariantCollection.from_json(json_string))

    if not variant_collections:
        raise ValueError(
            "No variants loaded "
            "(use --maf, --vcf, --variant, or --json-variants options)")
    if len(variant_collections) == 1:
        return variant_collections[0]

    combined_variants = []
    for variant_collection in variant_collections:
        combined_variants.extend(variant_collection)
    return VariantCollection(combined_variants)
def _load_single_sample_variants(self, sample_idx, file_format_funcs,
                                 variant_type, merge_type):
    """
    Load, merge and cache the variants of one sample.

    A cached result is returned directly when ``load_from_cache`` finds
    one. Otherwise every function in ``file_format_funcs`` is called with
    (sample_id, normal_bam_id, tumor_bam_id) to get a file name under
    ``self.data_dir``; each file is loaded with ``varcode.load_vcf_fast``
    and the resulting variant sets are merged according to ``merge_type``
    ("union" or "intersection"). With a single file nothing is merged and
    ``merge_type`` is not checked. The merged collection is saved to the
    cache before it is returned.
    """
    sample_id = self.sample_ids[sample_idx]

    normal_bam_id = None
    if self.normal_bam_ids is not None:
        normal_bam_id = self.normal_bam_ids[sample_idx]

    tumor_bam_id = None
    if self.tumor_bam_ids is not None:
        tumor_bam_id = self.tumor_bam_ids[sample_idx]

    cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
    previously_saved = self.load_from_cache(
        self.cache_names["variant"], sample_id, cached_file_name)
    if previously_saved is not None:
        return previously_saved

    # One set of variants per input file, in the order of file_format_funcs.
    variant_sets = [
        set(
            varcode.load_vcf_fast(
                path.join(
                    self.data_dir,
                    name_func(sample_id, normal_bam_id, tumor_bam_id))
            ).elements)
        for name_func in file_format_funcs
    ]

    if len(variant_sets) == 1:
        # There is nothing to merge
        merged_variants = VariantCollection(variant_sets[0])
    else:
        assert merge_type in ["union", "intersection"], \
            "Unknown merge type: %s" % merge_type
        if merge_type == "union":
            merged_variants = VariantCollection(set.union(*variant_sets))
        elif merge_type == "intersection":
            merged_variants = VariantCollection(
                set.intersection(*variant_sets))

    self.save_to_cache(
        merged_variants,
        self.cache_names["variant"],
        sample_id,
        cached_file_name)
    return merged_variants
def test_variant_collection_serialization():
    """
    Round-trip a VariantCollection (with per-source metadata) through
    pickle and JSON, and pickle the result of intersecting it with itself.
    """
    variant_list = [
        Variant(1, start=10, ref="AA", alt="AAT"),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    metadata_by_variant = {
        variant: {"a": "b", "bar": 2} for variant in variant_list
    }
    original = VariantCollection(
        variant_list,
        source_to_metadata_dict={"test_data": metadata_by_variant})

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    original.effects()

    original_first_variant = original[0]
    original_metadata = original.metadata

    def pickle_round_trip(obj):
        return pickle.loads(pickle.dumps(obj))

    def check_restored(restored):
        eq_(original, restored)
        eq_(restored[0], original_first_variant)
        # pylint: disable=no-member
        eq_(restored.metadata[original_first_variant],
            original_metadata[original_first_variant])

    # Test pickling
    check_restored(pickle_round_trip(original))

    merged = original.intersection(original)
    eq_(merged, pickle_round_trip(merged))

    # Test JSON serialization
    check_restored(VariantCollection.from_json(original.to_json()))
def test_variant_collection_serialization():
    """
    Round-trip a VariantCollection (with per-source metadata) through
    pickle and JSON, and pickle the result of intersecting it with itself.
    """
    variant_list = [
        Variant(1, start=10, ref="AA", alt="AAT"),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    metadata_by_variant = {
        variant: {"a": "b", "bar": 2} for variant in variant_list
    }
    original = VariantCollection(
        variant_list,
        source_to_metadata_dict={"test_data": metadata_by_variant})

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    original.effects()

    original_first_variant = original[0]
    original_metadata = original.metadata

    def pickle_round_trip(obj):
        return pickle.loads(pickle.dumps(obj))

    def check_restored(restored):
        eq_(original, restored)
        eq_(restored[0], original_first_variant)
        # pylint: disable=no-member
        eq_(restored.metadata[original_first_variant],
            original_metadata[original_first_variant])

    # Test pickling
    check_restored(pickle_round_trip(original))

    merged = original.intersection(original)
    eq_(merged, pickle_round_trip(merged))

    # Test JSON serialization
    check_restored(VariantCollection.from_json(original.to_json()))
def data_path(name):
    """
    Build the path of a test data file.

    ``name`` is joined onto the ``data`` directory that sits next to this
    module, so it should be given relative to that directory.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "data", name)


# BRAF variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476
braf_V600E_variant = Variant(7, 140753336, "A", "T", ensembl_grch38)

# TP53 variant coordinates from COSMIC entry:
# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656
tp53_R248W_variant = Variant(17, 7674221, "G", "A", ensembl_grch38)

cancer_test_variants = VariantCollection([
    braf_V600E_variant,
    tp53_R248W_variant,
])

# Every gene ID / transcript ID touched by at least one of the variants.
cancer_test_variant_gene_ids = set().union(
    *(v.gene_ids for v in cancer_test_variants))

cancer_test_variant_transcript_ids = set().union(
    *(v.transcript_ids for v in cancer_test_variants))
def test_effects_priority_caching():
    """
    Make sure that effects are cached such that they are not filtered
    prematurely.

    Every query below is issued twice after clearing the caches and must
    give the same answer both times.

    See https://github.com/hammerlab/cohorts/issues/252.
    """
    cohort = None
    try:
        # This variant has IntronicSpliceSite, Substitution effects, and more.
        variant = Variant(
            contig=3, start=20212211, ref="C", alt="T", ensembl=75)
        patient = Patient(
            id="patient",
            os=3,
            pfs=2,
            deceased=False,
            progressed=False,
            variants=VariantCollection([variant]))
        cohort = Cohort(
            patients=[patient],
            cache_dir=generated_data_path("cache"))

        def missense_snv_filter(filterable_effect):
            return (type(filterable_effect.effect) == Substitution and
                    filterable_effect.variant.is_snv)

        def check_repeated_load(expected_count, expected_top_type=None,
                                **load_kwargs):
            # Start from empty caches, then load twice with the same
            # arguments and apply the same checks to both results.
            cohort.clear_caches()
            for _ in range(2):
                effects = cohort.load_effects(**load_kwargs)[patient.id]
                eq_(len(effects), expected_count)
                if expected_top_type is not None:
                    eq_(type(effects[0]), expected_top_type)

        # All of the effects.
        check_repeated_load(15, all_effects=True)

        # Top priority effect.
        check_repeated_load(1, IntronicSpliceSite)

        # All missense SNV effects, from the large cache.
        check_repeated_load(
            6, all_effects=True, filter_fn=missense_snv_filter)

        # Top missense SNV effect, from the large cache.
        check_repeated_load(1, Substitution, filter_fn=missense_snv_filter)

        # Top missense SNV effects, from the small nonsynonymous cache.
        check_repeated_load(
            1, Substitution,
            only_nonsynonymous=True, filter_fn=missense_snv_filter)

        # All nonsynonymous effects, from the small nonsynonymous cache.
        check_repeated_load(6, all_effects=True, only_nonsynonymous=True)
    finally:
        if cohort is not None:
            cohort.clear_caches()
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nose.tools import eq_ from varcode import VariantCollection from .data import ( snp_rs4244285, snp_rs1537415 ) variants = VariantCollection([ # gene ids: ['ENSG00000165841', 'ENSG00000276490'] # transcript_ids : ['ENST00000371321', 'ENST00000464755'] snp_rs4244285, # gene ids: ['ENSG00000204007'] # transcript ids: ['ENST00000371763', 'ENST00000613244'] snp_rs1537415, ]) gene_fpkm_dict = { "ENSG00000165841": 10.0, "ENSG00000204007": 20.0, "ENSG00000276490": 30.0, } transcript_fpkm_dict = { "ENST00000371321": 10.0, "ENST00000464755": 20.0, "ENST00000371763": 30.0, "ENST00000613244": 40.0,
# limitations under the License. """ Helper functions and shared datasets for tests """ import os from varcode import Variant, VariantCollection, load_maf import pandas as pd def data_path(name): """ Return the absolute path to a file in the varcode/test/data directory. The name specified should be relative to varcode/test/data. """ return os.path.join(os.path.dirname(__file__), "data", name) dbnsp_validation_df = pd.read_csv(data_path('dbnsfp_validation_set.csv')) tcga_ov_variants = load_maf(data_path("tcga_ov.head.maf")) ov_wustle_variants = load_maf(data_path("ov.wustle.subset5.maf")) snp_rs4244285 = Variant(contig=10, start=94781859, ref="G", alt="A") snp_rs1537415 = Variant(contig=9, start=135637876, ref="C", alt="G") snp_rs3892097 = Variant(contig=22, start=42524947, ref="G", alt="A") db_snp_variants = VariantCollection([ snp_rs4244285, snp_rs1537415, snp_rs3892097, ])
from __future__ import print_function, division, absolute_import from mhctools import NetMHCIIpan from nose.tools import eq_ from pyensembl import ensembl_grch37 from topiary import predict_epitopes_from_variants from varcode import Variant, VariantCollection # TODO: find out about these variants, # what do we expect from them? Are they SNVs? variants = VariantCollection([ Variant(contig=10, start=100018900, ref='C', alt='T', ensembl=ensembl_grch37), Variant(contig=11, start=32861682, ref='G', alt='A', ensembl=ensembl_grch37) ]) alleles = ["HLA-DPA1*01:05/DPB1*100:01", "DRB10102"] epitope_lengths = [15, 16] mhc_model = NetMHCIIpan(alleles=alleles, epitope_lengths=epitope_lengths) def test_netmhcii_pan_epitopes(): epitope_predictions = predict_epitopes_from_variants(
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from nose.tools import eq_ from varcode import VariantCollection from .data import (snp_rs4244285, snp_rs1537415) variants = VariantCollection([ # gene ids: ['ENSG00000165841', 'ENSG00000276490'] # transcript_ids : ['ENST00000371321', 'ENST00000464755'] snp_rs4244285, # gene ids: ['ENSG00000204007'] # transcript ids: ['ENST00000371763', 'ENST00000613244'] snp_rs1537415, ]) gene_fpkm_dict = { "ENSG00000165841": 10.0, "ENSG00000204007": 20.0, "ENSG00000276490": 30.0, } transcript_fpkm_dict = { "ENST00000371321": 10.0, "ENST00000464755": 20.0, "ENST00000371763": 30.0, "ENST00000613244": 40.0,
def test_splice_filtering_frameshift():
    """
    Make sure that ExonicSpliceSite mutations with FrameShift alternates
    are kept even when we filter to only FrameShift effects.
    """
    cohort = None
    try:
        deletion = Variant(
            contig=8,
            start=145617535,
            ref="GGGGGTGCAAGGTGA",
            alt="",
            ensembl=75)
        patient = Patient(
            id="patient",
            os=3,
            pfs=2,
            deceased=False,
            progressed=False,
            variants=VariantCollection([deletion]))
        cohort = Cohort(
            patients=[patient],
            cache_dir=generated_data_path("cache"))

        def frameshift_filter(filterable_effect):
            return (type(filterable_effect.effect) == FrameShift)

        # First the top-priority effect only, then all effects; exactly one
        # effect is expected to pass the filter in both cases.
        for extra_kwargs in ({}, {"all_effects": True}):
            loaded = cohort.load_effects(
                filter_fn=frameshift_filter, **extra_kwargs)
            eq_(len(loaded[patient.id]), 1)
    finally:
        if cohort is not None:
            cohort.clear_caches()
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    """
    Check the reference context computed for a three-nucleotide
    substitution in the second codon of TP53-001, first without and then
    with a transcript ID whitelist.
    """
    # replace second codon of TP53-001 with 'CCC'
    tp53_substitution = Variant("17", 7676589, "CTC", "GGG", ensembl_grch38)
    variant_collection = VariantCollection([tp53_substitution])

    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around second codon with 10 context nucleotides:
    #   In [51]: t.sequence[193-10:193+13]
    #   Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # Which can be split into the following parts:
    #   last 7 nt of 5' UTR:     CACTGCC
    #   start codon:             ATG (translates to M)
    #   2nd codon:               GAG  <---- variant occurs here
    #   3rd codon:               GAG
    #   4th codon:               CCG
    #   5th codon:               CAG
    #   first nt of 6th codon:   T
    shared_kwargs = dict(variants=variant_collection, context_size=10)

    # first calling without a transcript ID whitelist to see if we get back
    # multiple contexts
    reference_context_dict_many_transcripts = \
        reference_contexts_for_variants(
            transcript_id_whitelist=None, **shared_kwargs)

    assert len(reference_context_dict_many_transcripts) == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            len(reference_context_dict_many_transcripts),)

    reference_contexts = \
        reference_context_dict_many_transcripts[tp53_substitution]

    assert len(reference_contexts) > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            tp53_substitution,
            len(reference_contexts),
            reference_contexts)

    reference_context_dict_single_transcript = \
        reference_contexts_for_variants(
            transcript_id_whitelist={tp53_001.id}, **shared_kwargs)

    # still only expect one variant key
    eq_(len(reference_context_dict_single_transcript), 1)

    result_list = reference_context_dict_single_transcript[tp53_substitution]

    # since we limited the transcript ID whitelist, we only expect a single
    # reference context in the result
    eq_(len(result_list), 1)

    expected = ReferenceContext(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
        variant=tp53_substitution,
        transcripts=[tp53_001])
    eq_(result_list[0], expected)
def test_splice_filtering_substitution():
    """
    Make sure that ExonicSpliceSite mutations with Substitution alternates
    are kept even when we filter to only Substitution effects.
    """
    cohort = None
    try:
        snv = Variant(
            contig=10, start=124340409, ref="C", alt="A", ensembl=75)
        patient = Patient(
            id="patient",
            os=3,
            pfs=2,
            deceased=False,
            progressed=False,
            variants=VariantCollection([snv]))
        cohort = Cohort(
            patients=[patient],
            cache_dir=generated_data_path("cache"))

        def missense_snv_filter(filterable_effect):
            return (type(filterable_effect.effect) == Substitution and
                    filterable_effect.variant.is_snv)

        # (extra load_effects arguments, expected number of effects):
        # top-priority effect only first, then all effects.
        expectations = [
            ({}, 1),
            ({"all_effects": True}, 7),
        ]
        for extra_kwargs, expected_count in expectations:
            loaded = cohort.load_effects(
                filter_fn=missense_snv_filter, **extra_kwargs)
            eq_(len(loaded[patient.id]), expected_count)
    finally:
        if cohort is not None:
            cohort.clear_caches()