def predicted_effects_for_variant(
        variant,
        transcript_id_whitelist=None,
        only_coding_changes=True):
    """
    For a given variant, return its predicted effects. Optionally filter to
    transcripts where this variant results in a non-synonymous change to the
    protein sequence.

    Parameters
    ----------
    variant : varcode.Variant

    transcript_id_whitelist : set
        Filter effect predictions to only include these transcripts.
        A falsy value (None or an empty set) disables this filter.

    only_coding_changes : bool
        If True, skip transcripts which are not complete, drop silent and
        non-coding effects, and drop effects whose mutant protein sequence
        is None.

    Returns a varcode.EffectCollection containing every predicted effect
    when `only_coding_changes` is False; otherwise returns a plain list of
    the effects that survived the coding filters.
    """
    effects = []
    for transcript in variant.transcripts:
        if only_coding_changes and not transcript.complete:
            logger.info(
                "Skipping transcript %s for variant %s because it's incomplete",
                transcript.name,
                variant)
            continue

        if transcript_id_whitelist and transcript.id not in transcript_id_whitelist:
            logger.info(
                "Skipping transcript %s for variant %s because it's not one of %d allowed",
                transcript.name,
                variant,
                len(transcript_id_whitelist))
            continue
        effects.append(variant.effect_on_transcript(transcript))

    effects = EffectCollection(effects)

    n_total_effects = len(effects)
    # Pass the values as logger arguments (like every other call in this
    # function) so the message is only formatted when INFO is enabled.
    logger.info(
        "Predicted total %d effects for variant %s",
        n_total_effects,
        variant)

    if not only_coding_changes:
        return effects

    nonsynonymous_coding_effects = effects.drop_silent_and_noncoding()
    logger.info(
        "Keeping %d/%d effects which affect protein coding sequence for %s: %s",
        len(nonsynonymous_coding_effects),
        n_total_effects,
        variant,
        nonsynonymous_coding_effects)

    # Only effects with a known mutant protein sequence are usable downstream.
    usable_effects = [
        effect
        for effect in nonsynonymous_coding_effects
        if effect.mutant_protein_sequence is not None
    ]
    logger.info(
        "Keeping %d effects with predictable AA sequences for %s: %s",
        len(usable_effects),
        variant,
        usable_effects)
    return usable_effects
def _load_single_sample_effects(self, sample_idx, file_format_funcs,
                                only_nonsynonymous, variant_type, merge_type):
    """
    Load the effects for a single sample, using the cache when possible.

    Parameters
    ----------
    sample_idx : int
        Index into `self.sample_ids` of the sample to load.

    file_format_funcs
        Not used by this method; the format functions are looked up from
        `self.variant_type_to_format_funcs[variant_type]` instead.

    only_nonsynonymous : bool
        If True, return (and read from the cache of) the top priority
        non-silent coding effect per variant; otherwise return all effects.

    variant_type, merge_type
        Forwarded to `self._load_single_sample_variants` and used to build
        the cache file name "<variant_type>-<merge_type>-effects.pkl".

    Returns the cached object when one exists; otherwise the freshly
    computed effects (both flavours are written to the cache first).
    """
    sample_id = self.sample_ids[sample_idx]
    cached_file_name = "%s-%s-effects.pkl" % (variant_type, merge_type)

    cache_key = "nonsynonymous_effect" if only_nonsynonymous else "effect"
    cached = self.load_from_cache(
        self.cache_names[cache_key], sample_id, cached_file_name)
    if cached is not None:
        return cached

    # NOTE(review): the `file_format_funcs` argument is ignored here in
    # favour of the per-variant-type lookup; confirm that this is intended.
    variants = self._load_single_sample_variants(
        sample_idx,
        self.variant_type_to_format_funcs[variant_type],
        variant_type,
        merge_type)
    effects = variants.effects()

    # dict.values() is a view object on Python 3. Materialise it into a list
    # so the collection holds a concrete sequence before it is handed to
    # save_to_cache (the ".pkl" name suggests pickling, which views do not
    # support - TODO confirm).
    top_effect_per_variant = (
        effects.drop_silent_and_noncoding().top_priority_effect_per_variant())
    nonsynonymous_effects = EffectCollection(
        list(top_effect_per_variant.values()))

    # Always cache both flavours so a later call with the other value of
    # `only_nonsynonymous` gets a cache hit.
    self.save_to_cache(
        effects, self.cache_names["effect"], sample_id, cached_file_name)
    self.save_to_cache(
        nonsynonymous_effects,
        self.cache_names["nonsynonymous_effect"],
        sample_id,
        cached_file_name)

    if only_nonsynonymous:
        return nonsynonymous_effects
    return effects
def top_priority_maybe(effect_collection):
    """
    Reduce the collection to the single top priority effect per variant,
    unless `all_effects` is true, in which case the collection is returned
    untouched.

    `all_effects` is not a parameter: it is read from the surrounding scope.
    """
    if not all_effects:
        top_effect_by_variant = effect_collection.top_priority_effect_per_variant()
        return EffectCollection(list(top_effect_by_variant.values()))
    return effect_collection
def filter_effects(effect_collection, variant_collection, patient, filter_fn,
                   all_effects, **kwargs):
    """Filter variants from the Effect Collection

    Parameters
    ----------
    effect_collection : varcode.EffectCollection
    variant_collection : varcode.VariantCollection
    patient : cohorts.Patient
    filter_fn : function
        Called with a FilterableEffect (plus **kwargs) and returns a boolean.
        Only effects for which it returns True, either for the effect itself
        or for its `alternate_effect`, are kept. A falsy `filter_fn` disables
        filtering.
    all_effects : boolean
        If False, keep only the single top priority effect per variant.
        If True, keep every effect that passed the filter.

    Returns
    -------
    varcode.EffectCollection
        Filtered effect collection, with only the variants passing the filter
    """
    def passes_filter(effect):
        """
        True if filter_fn accepts the effect or, when the effect has one,
        its alternate_effect.
        """
        primary_result = filter_fn(
            FilterableEffect(
                effect=effect,
                variant_collection=variant_collection,
                patient=patient),
            **kwargs)
        if not hasattr(effect, "alternate_effect"):
            return primary_result
        # The alternate is always evaluated, even when the primary passed.
        alternate_result = filter_fn(
            FilterableEffect(
                effect=effect.alternate_effect,
                variant_collection=variant_collection,
                patient=patient),
            **kwargs)
        return primary_result or alternate_result

    if filter_fn:
        kept = EffectCollection(
            [effect for effect in effect_collection if passes_filter(effect)])
    else:
        kept = effect_collection

    if all_effects:
        return kept

    # Collapse to one (top priority) effect per variant.
    top_effect_by_variant = kept.top_priority_effect_per_variant()
    return EffectCollection(list(top_effect_by_variant.values()))
def protein_change_effects_from_args(args):
    """
    Build Substitution effects from the (gene name, protein change) pairs in
    `args.protein_change`.

    Each protein change string is expected to start with reference amino
    acids, a base-1 position and alternate amino acids (e.g. "T790M").
    Entries which cannot be parsed or which name an unknown gene are logged
    and skipped. Entries for which no protein coding transcript carries the
    reference amino acids at that position are skipped silently.

    Parameters
    ----------
    args : parsed command-line arguments
        Must provide `protein_change` (an iterable of 2-element pairs) and
        whatever `genome_from_args` needs to pick a genome.

    Returns an EffectCollection with one Substitution (variant=None) per
    usable entry, placed on the transcript chosen by `best_transcript`.
    """
    genome = genome_from_args(args)
    valid_gene_names = set(genome.gene_names())
    substitution_regex = re.compile(r"([A-Z]+)([0-9]+)([A-Z]+)")
    effects = []
    for gene_name, protein_change_string in args.protein_change:
        match_obj = substitution_regex.match(protein_change_string)
        if match_obj is None:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(
                "Unable to parse protein modification: '%s'",
                protein_change_string)
            continue

        ref, base1_pos, alt = match_obj.groups()
        base1_pos = int(base1_pos)

        if gene_name not in valid_gene_names:
            logging.warning(
                "Invalid gene name '%s' in protein modification: '%s'",
                gene_name,
                protein_change_string)
            continue

        # Half-open, base-0 interval covered by the reference amino acids.
        ref_start = base1_pos - 1
        ref_end = ref_start + len(ref)

        candidate_transcripts = []
        for candidate_gene in genome.genes_by_name(gene_name):
            for candidate_transcript in candidate_gene.transcripts:
                if not candidate_transcript.is_protein_coding:
                    continue
                protein_sequence = candidate_transcript.protein_sequence
                if protein_sequence is None:
                    continue
                if len(protein_sequence) < ref_end:
                    # protein sequence too short for this modification
                    # e.g. EGFR T790M can't happen in an EGFR transcript
                    # with only 789 amino acids
                    continue

                if protein_sequence[ref_start:ref_end] != ref:
                    # if this transcript doesn't have the same reference amino
                    # acids as the change then skip it and use a different
                    # transcript
                    continue
                candidate_transcripts.append(candidate_transcript)

        if candidate_transcripts:
            transcript = best_transcript(candidate_transcripts)
            effects.append(
                Substitution(
                    variant=None,
                    transcript=transcript,
                    aa_ref=ref,
                    aa_alt=alt,
                    aa_mutation_start_offset=ref_start))
    return EffectCollection(effects)
def missense_snv_count(cohort, **kwargs):
    """
    Count, per sample, the nonsynonymous effects whose type is exactly
    Substitution, and hand that counting function to `count` under the
    column name "missense_snv_count".

    Extra keyword arguments are forwarded to `cohort.load_effects`.
    Samples without loaded effects are counted as NaN.
    """
    effects_by_sample = cohort.load_effects(only_nonsynonymous=True, **kwargs)

    missense_by_sample = {}
    for sample, sample_effects in effects_by_sample.items():
        substitutions = [
            effect for effect in sample_effects
            if type(effect) == Substitution
        ]
        missense_by_sample[sample] = EffectCollection(substitutions)

    def count_func(sample):
        if sample not in missense_by_sample:
            return np.nan
        return len(missense_by_sample[sample])

    return count(cohort, count_func, count_col="missense_snv_count")
def predicted_coding_effects_with_mutant_sequence(
        variant,
        transcript_id_whitelist=None):
    """
    For a given variant, return the predicted mutation effects on
    transcripts where this variant results in a predictable non-synonymous
    change to the protein sequence.

    Parameters
    ----------
    variant : varcode.Variant

    transcript_id_whitelist : set
        Filter effect predictions to only include these transcripts.
        A falsy value (None or an empty set) disables this filter.

    Returns a list of the effects which are neither silent nor non-coding
    and whose mutant protein sequence is not None.
    """
    effects = []
    for transcript in variant.transcripts:
        if not transcript.complete:
            logger.info(
                "Skipping transcript %s for variant %s because it's incomplete",
                transcript.name,
                variant)
            continue

        if transcript_id_whitelist and transcript.id not in transcript_id_whitelist:
            logger.info(
                "Skipping transcript %s for variant %s because it's not one of %d allowed",
                transcript.name,
                variant,
                len(transcript_id_whitelist))
            continue
        effects.append(variant.effect_on_transcript(transcript))

    effects = EffectCollection(effects)

    n_total_effects = len(effects)
    # Pass the values as logger arguments (like every other call in this
    # function) so the message is only formatted when INFO is enabled.
    logger.info(
        "Predicted total %d effects for variant %s",
        n_total_effects,
        variant)

    nonsynonymous_coding_effects = effects.drop_silent_and_noncoding()
    logger.info(
        "Keeping %d/%d effects which affect protein coding sequence for %s: %s",
        len(nonsynonymous_coding_effects),
        n_total_effects,
        variant,
        nonsynonymous_coding_effects)

    # Only effects with a known mutant protein sequence are usable downstream.
    usable_effects = [
        effect
        for effect in nonsynonymous_coding_effects
        if effect.mutant_protein_sequence is not None
    ]
    logger.info(
        "Keeping %d effects with predictable AA sequences for %s: %s",
        len(usable_effects),
        variant,
        usable_effects)
    return usable_effects
def test_tcga_effect_collection_to_dict():
    """The TCGA ovarian effects survive a to_dict / from_dict round trip."""
    as_dict = tcga_ov_effects.to_dict()
    restored = EffectCollection.from_dict(as_dict)
    eq_(tcga_ov_effects, restored)
def test_wustle_effect_collection_to_json():
    """The `ov_wustle_effects` fixture survives a to_json / from_json round trip."""
    as_json = ov_wustle_effects.to_json()
    restored = EffectCollection.from_json(as_json)
    eq_(ov_wustle_effects, restored)
def test_tcga_effect_collection_to_json():
    """The TCGA ovarian effects survive a to_json / from_json round trip."""
    as_json = tcga_ov_effects.to_json()
    restored = EffectCollection.from_json(as_json)
    eq_(tcga_ov_effects, restored)
def test_wustle_effect_collection_to_dict():
    """The `ov_wustle_effects` fixture survives a to_dict / from_dict round trip."""
    as_dict = ov_wustle_effects.to_dict()
    restored = EffectCollection.from_dict(as_dict)
    eq_(ov_wustle_effects, restored)
def test_tcga_effect_collection_to_dict():
    """Serializing the TCGA ovarian effects to a dict and back is lossless."""
    round_tripped = EffectCollection.from_dict(tcga_ov_effects.to_dict())
    eq_(tcga_ov_effects, round_tripped)
def test_wustle_effect_collection_to_json():
    """Serializing `ov_wustle_effects` to JSON and back is lossless."""
    round_tripped = EffectCollection.from_json(ov_wustle_effects.to_json())
    eq_(ov_wustle_effects, round_tripped)
def test_wustle_effect_collection_to_dict():
    """Serializing `ov_wustle_effects` to a dict and back is lossless."""
    round_tripped = EffectCollection.from_dict(ov_wustle_effects.to_dict())
    eq_(ov_wustle_effects, round_tripped)