def glycosylations_without_subtype_ratio(self):
    """Return the fraction of glycosylation sites that have no subtype.

    The denominator is the number of sites matched by
    ``SiteType.fuzzy_filter`` for the 'glycosylation' type; the number of
    sites with a subtype comes from ``self.glycosylations_with_subtype()``.

    Raises:
        ZeroDivisionError: when no site matches the glycosylation filter.
    """
    # imported lazily, as in the original (presumably to avoid an import
    # cycle with the models module - confirm)
    from models import Site, SiteType

    generic_type = SiteType.query.filter_by(name='glycosylation').one()
    any_glycosylation = SiteType.fuzzy_filter(generic_type)
    total = Site.query.filter(any_glycosylation).count()

    with_subtype = self.glycosylations_with_subtype()
    return (total - with_subtype) / total
def ptm_muts_of_gene(
    path_template='exported/{site_type}_muts_of_{gene}_-_{protein}.tsv',
    gene='EGFR', site_type='glycosylation', mutation_source='mc3',
    to_csv=True, show_progress=False, **kwargs
):
    """Export mutations affecting sites of a given type in one gene.

    Only mutations in the preferred isoform of the gene are exported.

    Args:
        path_template: format string for the output path; receives the
            `site_type`, `gene` and `protein` (refseq) fields
        gene: name of the gene to look up
        site_type: name of the site type to look up
        mutation_source: key of the importer in
            `MutationImportManager().importers`
        to_csv: whether to write the result to a tab-separated file
        show_progress: passed through to the importer's `export_to_df`
        **kwargs: passed to the importer constructor

    Returns:
        The object returned by the importer's `export_to_df`.
    """
    importer = MutationImportManager().importers[mutation_source](**kwargs)

    type_model = SiteType.query.filter_by(name=site_type).one()
    gene_model = Gene.query.filter_by(name=gene).one()
    isoform = gene_model.preferred_isoform

    affects_site_of_type = Mutation.affected_sites.any(
        SiteType.fuzzy_filter(type_model)
    )
    in_isoform = Mutation.protein_id == isoform.id

    exported = importer.export_to_df(
        mutation_filter=and_(affects_site_of_type, in_isoform),
        protein_filter=Protein.id == isoform.id,
        show_progress=show_progress
    )

    # the path is formatted even if nothing is written, so that a broken
    # template fails in the same way regardless of `to_csv`
    target = path_template.format(
        protein=isoform.refseq,
        gene=gene_model.name,
        site_type=type_model.name
    )
    if to_csv:
        exported.to_csv(target, sep='\t', index=False)

    return exported
def most_mutated_sites(sources: List[MutationSource],
                       site_type: SiteType = None,
                       limit=25,
                       intersection=True,
                       exclusive=None,
                       mutation_filter=None):
    """Sources must have the same value_type (counts/frequencies)

    Builds a query for sites of preferred isoforms, ordered by the summed
    per-source mutation counts (descending).

    Args:
        sources: mutation sources to sum the counts over
        site_type: if given, restrict to sites matching its fuzzy filter
        limit: maximal number of rows in the result
        intersection: inner-join the sources if true, outer-join otherwise
        exclusive: sources in which the mutations must NOT be present;
            cannot be combined with `intersection`
        mutation_filter: optional additional filter on mutations

    Returns:
        A query yielding (Site, mutations_count) rows.
    """
    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}') for i, count in enumerate(counts)
    ]
    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    for source in sources:
        if intersection:
            per_site = per_site.join(source)
        else:
            per_site = per_site.outerjoin(source)

    if exclusive:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts))
    per_site = per_site.subquery()

    # sum the per-source count columns of the subquery into a single total
    count_columns = [
        getattr(per_site.c, f'count_{i}') for i in range(len(counts))
    ]
    total = reduce(operator.add, count_columns).label('mutations_count')

    ranked = db.session.query(aliased(Site, per_site), total)
    ranked = ranked.order_by(desc(total))
    return ranked.limit(limit)
def site_type_filter_from_str(query, site=Site):
    """Translate a textual site type query into a site filter.

    Args:
        query: 'any', a site type name, or a site type name prefixed
            with 'not' and one separator character (the first four
            characters are dropped)
        site: passed as `site` to `SiteType.fuzzy_filter`

    Returns:
        None for 'any'; otherwise the fuzzy filter of the named site type,
        negated when the query starts with 'not'.
    """
    if query == 'any':
        return None

    # NOTE(review): the prefix test is for 'not' but four characters are
    # stripped, so a type name which itself starts with 'not' would be
    # misinterpreted - confirm that no such site type exists.
    negate = query.startswith('not')
    type_name = query[4:] if negate else query

    site_type = SiteType.query.filter_by(name=type_name).one()
    type_filter = SiteType.fuzzy_filter(site_type, join=True, site=site)

    return ~type_filter if negate else type_filter
def are_glycosylation_sites_mutated_more_often(source_name: str,
                                               disordered=None,
                                               alternative='greater'):
    """Fisher's exact test: glycosylation vs other sites, mutated or not.

    Args:
        source_name: key of the mutation source in
            `source_manager.source_by_name`
        disordered: passed through to `count_mutated_sites`
        alternative: passed through to `fisher_exact`

    Returns:
        (oddsratio, pvalue) as returned by `fisher_exact`.
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    # fetched only to be reported in the message below
    other_types = SiteType.query.filter(
        ~SiteType.name.contains('glycosylation')
    ).all()
    print(f'Comparing {glycosylation} against {other_types}')

    source = source_manager.source_by_name[source_name]
    count_mutated = partial(
        count_mutated_sites,
        model=source, only_primary=True, disordered=disordered
    )

    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    glyco_type_ids = [
        glyco_type.id
        for glyco_type in SiteType.query.filter(
            SiteType.name.contains('glycosylation')
        ).all()
    ]
    # sites having at least one type which is not a glycosylation (sub)type
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glyco_type_ids))

    mutated_glyco = count_mutated(custom_filter=glyco_filter)
    mutated_other = count_mutated(custom_filter=non_glyco_filter)

    # NOTE(review): the totals below are not restricted by `only_primary`
    # nor by `disordered`, unlike the mutated counts - confirm intended.
    total_glyco = Site.query.filter(glyco_filter).count()
    total_other = Site.query.filter(non_glyco_filter).count()

    # rows: glycosylation, other; columns: mutated, not mutated
    contingency_table = [
        [mutated_glyco, total_glyco - mutated_glyco],
        [mutated_other, total_other - mutated_other]
    ]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, oddsratio, pvalue)
    return oddsratio, pvalue
def enrichment_of_ptm_genes(reference_set,
                            site_type_name: str,
                            only_mutated_sites=False):
    """Run `genes_enrichment` for genes having sites of the given type.

    Genes are selected if their preferred isoform has a site matching
    the fuzzy filter of the site type.

    Args:
        reference_set: passed as the second argument to `genes_enrichment`
        site_type_name: name of the site type to look up
        only_mutated_sites:
            whether only genes with mutated sites should be considered,
            True, False or an SQLAlchemy filter,
            e.g. Mutation.in_sources(MC3Mutation)

    Returns:
        The result of `genes_enrichment`.
    """
    site_type = SiteType.query.filter_by(name=site_type_name).one()

    genes = Gene.query.join(Gene.preferred_isoform).join(Protein.sites)
    genes = genes.filter(SiteType.fuzzy_filter(site_type))

    if only_mutated_sites is not False:
        # True (or a filter expression) additionally requires a mutation
        genes = genes.join(Site.mutations).filter(only_mutated_sites)

    return genes_enrichment(set(genes), reference_set)
def are_glycosylation_sites_enriched(source_name: str,
                                     population_name: str,
                                     disordered=None,
                                     alternative='greater'):
    """Fisher's exact test: mutated glycosylation sites in two sources.

    Compares counts of mutated glycosylation and mutated
    non-glycosylation sites between `source_name` and `population_name`.

    Args:
        source_name: key of the first mutation source in
            `source_manager.source_by_name`
        population_name: key of the second mutation source
        disordered: passed through to `count_mutated_sites`
        alternative: passed through to `fisher_exact`

    Returns:
        (oddsratio, pvalue) as returned by `fisher_exact`.
    """
    from stats.table import count_mutated_sites

    glycosylation = SiteType.query.filter_by(name='glycosylation').one()
    glyco_filter = SiteType.fuzzy_filter(glycosylation, join=True)

    glyco_type_ids = [
        glyco_type.id
        for glyco_type in SiteType.query.filter(
            SiteType.name.contains('glycosylation')
        ).all()
    ]
    # sites having at least one type which is not a glycosylation (sub)type
    non_glyco_filter = Site.types.any(~SiteType.id.in_(glyco_type_ids))

    source = source_manager.source_by_name[source_name]
    population = source_manager.source_by_name[population_name]

    def mutated_sites(model, site_filter):
        return count_mutated_sites(
            custom_filter=site_filter,
            only_primary=True,
            disordered=disordered,
            model=model
        )

    # rows: source, population
    # columns: mutated glycosylation, mutated non-glycosylation
    contingency_table = [
        [
            mutated_sites(model, glyco_filter),
            mutated_sites(model, non_glyco_filter)
        ]
        for model in (source, population)
    ]
    print(contingency_table)

    oddsratio, pvalue = fisher_exact(
        contingency_table, alternative=alternative
    )
    print(source_name, population_name, oddsratio, pvalue)
    return oddsratio, pvalue
def get_site_filters(exclude=None, glycosylation='together'):
    """Yields (site type name, type filter) tuples.

    Iterates over all site types (with a progress bar) and yields a label
    together with a filter for each type which is not skipped.

    Args:
        exclude: site types to exclude
            (compared against the unmodified site type name)
        glycosylation: 'together', 'only' or 'separate':
            'together' - glycosylation subtypes are skipped and the
                generic 'glycosylation' type is yielded with a fuzzy
                filter, labelled 'glycosylation (all subtypes)';
            'only' - types without 'glycosylation' in the name are
                skipped; the generic type is labelled
                'glycosylation (unknown subtype)';
            any other value (e.g. 'separate') - every type is yielded
                under its own name.
            In all modes but 'together' the filter is
            `Site.types.contains(site_type)`.
    """
    for site_type in tqdm(SiteType.query, total=SiteType.query.count()):
        # true for the generic type and for every named subtype
        glyco_kind = 'glycosylation' in site_type.name
        type_name = site_type.name

        if glycosylation == 'together':
            if glyco_kind:
                if type_name == 'glycosylation':
                    type_name = 'glycosylation (all subtypes)'
                else:
                    # subtypes are covered by the fuzzy filter
                    # of the generic type
                    continue
        elif glycosylation == 'only':
            if not glyco_kind:
                continue
            # NOTE(review): the nesting of this rename was reconstructed
            # from collapsed source; confirm it belongs to the 'only'
            # branch rather than to every non-'together' mode.
            if type_name == 'glycosylation':
                type_name = 'glycosylation (unknown subtype)'

        # exclusion uses the original name, not the display label
        if exclude and site_type.name in exclude:
            continue

        if glycosylation == 'together':
            site_filter = SiteType.fuzzy_filter(site_type, join=True)
        else:
            site_filter = Site.types.contains(site_type)

        yield type_name, site_filter
def ptm_on_random(source=MC3Mutation,
                  site_type='glycosylation',
                  same_proteins=False,
                  only_preferred=True,
                  mode='occurrences',
                  repeats=10000,
                  ptm_proteins=False,
                  same_ptm_proteins=False,
                  exclude_genes=None,
                  mutation_filter=None,
                  sample_ptm_muts=True):
    """Compare frequencies of PTM mutations of given type with random
    proteome mutations from protein sequence regions of the same size
    as analysed PTM regions.

    PTM regions are windows of +/- 7 positions around each site of the
    given type (clipped to the protein), merged per protein.

    Args:
        source: mutation source model; in 'occurrences' mode its `count`
            attribute is used as the weight of a mutation
        site_type: name of the site type to analyse
        same_proteins: sample random mutations only from the proteins
            which have the analysed PTM regions
            (cannot be combined with `ptm_proteins`)
        only_preferred: restrict to preferred isoforms
        mode: 'occurrences' (weight by count) or 'distinct'
        repeats: number of random samples to draw
        ptm_proteins: sample random mutations only from proteins having
            any site
        same_ptm_proteins: together with `ptm_proteins`, require the
            sites to be of the analysed type
        exclude_genes: names of genes to be skipped
        mutation_filter: optional SQLAlchemy filter for mutations
        sample_ptm_muts: whether the PTM mutations count should be
            estimated by sampling from PTM regions (mean of `repeats`
            samples) rather than computed directly

    Returns:
        (ptm_counts, counts, glyco_sequence_region_size, p_value), where
        `ptm_counts` are the PTM counts (samples, or a single value when
        not sampling), `counts` are the sums of the random samples and
        `p_value` is the fraction of random samples exceeding the PTM
        mutations count.
    """
    # numpy's sum is aliased so that the builtin remains available for
    # generators and lists: calling np.sum on a generator is deprecated.
    from numpy import sum as np_sum
    from numpy import zeros
    from numpy.random import choice

    assert mode in {'distinct', 'occurrences'}
    distinct = mode == 'distinct'

    assert not (same_proteins and ptm_proteins)
    assert not (same_ptm_proteins and not ptm_proteins)

    def measure(x):
        """See https://github.com/taschini/pyinterval/issues/2"""
        return int(fpu.up(lambda: sum((c.sup - c.inf for c in x), 0)))

    site_type = SiteType.query.filter_by(name=site_type).one()
    only_preferred = Protein.is_preferred_isoform if only_preferred else True

    # all muts: protein -> position -> weight
    all_muts = defaultdict(lambda: defaultdict(int))

    q = (
        db.session.query(source, Mutation)
        .select_from(source)
        .join(Mutation)
        .filter(mutation_filter if mutation_filter is not None else True)
        .join(Protein)
        .filter(only_preferred)
    )

    for mutation_details, mutation in tqdm(q, total=q.count()):
        # skipped in both modes: a position beyond the protein would be
        # written outside of the protein's slice of the array built below
        if mutation.position > mutation.protein.length:
            print(f'Faulty mutation: {mutation}')
            continue
        weight = 1 if distinct else mutation_details.count
        all_muts[mutation.protein][mutation.position] += weight

    # region size
    glyco_sequence_region_size = 0
    intervals_by_protein = defaultdict(interval)

    sites = (
        Site.query
        .filter(SiteType.fuzzy_filter(site_type))
        .join(Protein)
        .filter(only_preferred)
    )

    for site in tqdm(sites, total=sites.count()):
        intervals_by_protein[site.protein] |= interval[
            max(site.position - 7, 0),
            min(site.position + 7, site.protein.length)
        ]

    for sequence_interval in intervals_by_protein.values():
        glyco_sequence_region_size += measure(sequence_interval)

    # ptm muts
    ptm_muts = (
        db.session.query(source, Mutation)
        .select_from(source)
        .join(Mutation)
        .join(Mutation.affected_sites)
        .filter(mutation_filter if mutation_filter is not None else True)
        .filter(SiteType.fuzzy_filter(site_type))
        .join(Protein)
        .filter(only_preferred)
    )

    if exclude_genes:
        exclude_proteins = (
            Protein.query
            .select_from(Gene)
            .join(Gene.isoforms)
            .filter(Gene.name.in_(exclude_genes))
            .all()
        )
        ptm_muts = ptm_muts.filter(
            ~Protein.id.in_([p.id for p in exclude_proteins])
        )

    ptm_muts = ptm_muts.group_by(source)

    if sample_ptm_muts:
        ptm_muts_by_protein = defaultdict(list)

        for mutation_details, mutation in ptm_muts:
            ptm_muts_by_protein[mutation.protein].append(
                (mutation_details, mutation)
            )

        # PTM regions of all proteins laid out one after another
        ptm_mutations_array = zeros(glyco_sequence_region_size)
        pos = 0

        for protein, protein_interval in tqdm(
                intervals_by_protein.items(),
                total=len(intervals_by_protein)):
            for mutation_details, mutation in ptm_muts_by_protein[protein]:
                p = 0
                for subinterval in protein_interval.components:
                    if mutation.position in subinterval:
                        # position in interval
                        p += mutation.position - 1 - int(subinterval[0].inf)
                        break
                    p += measure(subinterval)

                if distinct:
                    ptm_mutations_array[pos + p] += 1
                else:
                    ptm_mutations_array[pos + p] += mutation_details.count

            pos += measure(protein_interval)

        ptm_counts = []
        for repeat in tqdm(range(repeats), total=repeats):
            ptm_counts.append(
                np_sum(
                    choice(ptm_mutations_array,
                           size=glyco_sequence_region_size)
                )
            )
        print(Series(ptm_counts).describe())
        ptm_muts_count = mean(ptm_counts)
    else:
        if distinct:
            ptm_muts_count = ptm_muts.count()
        else:
            ptm_muts_count = 0
            # the occurrence count is read from the source row, as in the
            # other branches; the original unpacked both row entities to
            # one name and read `count` from the Mutation instead
            for mutation_details, mutation in ptm_muts:
                assert mutation_details.count != 0
                ptm_muts_count += mutation_details.count
        assert ptm_muts_count >= ptm_muts.count()
        ptm_counts = [ptm_muts_count]

    ptm_ratio = ptm_muts_count / glyco_sequence_region_size

    if same_proteins:
        proteins = list(intervals_by_protein.keys())
    else:
        proteins = Protein.query.filter(only_preferred)
        if ptm_proteins:
            proteins = proteins.join(Protein.sites)
            if same_ptm_proteins:
                proteins = proteins.filter(SiteType.fuzzy_filter(site_type))
        proteins = proteins.all()

    proteins = [
        protein
        for protein in proteins
        if not exclude_genes or protein.gene.name not in exclude_genes
    ]

    # sequences of all chosen proteins laid out one after another
    weights = [p.length for p in proteins]
    mutations_array = zeros(sum(weights))

    pos = 0
    for protein in tqdm(proteins):
        for position, count in all_muts[protein].items():
            try:
                mutations_array[pos + position - 1] = count
            except Exception:
                # report the offending coordinates before re-raising
                print(protein, pos, position)
                raise
        pos += protein.length

    counts = []
    append = counts.append
    for repeat in tqdm(range(repeats), total=repeats):
        append(
            np_sum(choice(mutations_array, size=glyco_sequence_region_size))
        )

    p_value = sum(1 for count in counts if count > ptm_muts_count) / repeats

    count_of_sampled_muts = mean(counts)
    random_ratio = count_of_sampled_muts / glyco_sequence_region_size

    explanation = '(only the same proteins)' if same_proteins else ''
    print(
        f'count of {site_type.name} mutations: {ptm_muts_count},\n'
        f'count of random mutations from protein sequence regions of the same size: {count_of_sampled_muts}'
        f' {explanation}.')
    print(f'region size: {glyco_sequence_region_size}; source: {source}')
    print(f'frequency of {site_type.name} mutations: {ptm_ratio * 100}%,\n'
          f'frequency of random mutations: {random_ratio * 100}%.')
    print(f'p-value = {p_value} (permutation test, {repeats} repeats)')
    print('Permutation test values:')
    print(Series(counts).describe())

    return ptm_counts, counts, glyco_sequence_region_size, p_value
def gather_ptm_muts_impacts(source: MutationSource,
                            site_type: SiteType,
                            limit_to_genes: List[str] = None,
                            occurrences=True,
                            limit_to_muts=False,
                            muts_filter=None):
    """Sum up mutations by impact on PTM sites and by gene.

    Args:
        source: mutation source to gather mutations from
        site_type: PTM site type for which affecting mutations
            will be gathered
        limit_to_genes: list of gene names for which mutations
            of primary isoforms will be gathered
        occurrences: whether to count occurrences or distinct mutations
        limit_to_muts: list of tuples defining mutations and counts,
            like from AD data frame (read with `itertuples`, using the
            `position`, `mut_residue`, `isoform` and `count` fields);
            providing custom mutations lists overrides
            "occurrences" setting
        muts_filter: SQLAlchemy filter for mutations

    Returns:
        A dict mapping impact ('direct', 'motif-changing', 'proximal',
        'distal') to a defaultdict of gene name -> summed value,
        or an empty dict when no motifs are known for the site type.
    """
    try:
        motifs_counter = MotifsCounter(site_type, mode='change_of_motif')
    except NoKnownMotifs as error:
        warn(f'Impacts collection failed, due to: {error}')
        return {}

    typed_sites = Site.query.filter(
        SiteType.fuzzy_filter(site_type, join=True)
    )
    typed_sites = typed_sites.join(Protein).filter(
        Protein.is_preferred_isoform
    )

    def fuzzy_site_filter(candidate_sites):
        matching = []
        for site in candidate_sites:
            # matches 'O-glycosylation' for site_type 'glycosylation'
            if any(site_type.name in name for name in site.types_names):
                matching.append(site)
        return matching

    # order matters
    impacts = ('direct', 'motif-changing', 'proximal', 'distal')
    mutations_by_impact_by_gene = {
        impact: defaultdict(int) for impact in impacts
    }

    mutations = Mutation.query.filter(Mutation.in_sources(source))
    mutations = mutations.join(Protein).join(
        Gene, Gene.preferred_isoform_id == Protein.id
    )

    if muts_filter is not None:
        mutations = mutations.filter(muts_filter)

    # motif-breaking mutations are collected before the mutations query
    # is narrowed down to the site type and the genes
    motifs_data = motifs_counter.gather_muts_and_sites(
        mutations, typed_sites, occurrences_in=[source]
    )
    all_breaking_muts = set()
    for _motif_name, breaking in motifs_data.muts_breaking_sites_motif.items():
        all_breaking_muts.update(breaking)

    mutations = mutations.filter(
        Mutation.affected_sites.any(
            SiteType.fuzzy_filter(site_type, join=True)
        )
    )

    if limit_to_genes is not None:
        proteins_ids = (
            db.session.query(Protein.id)
            .select_from(Gene)
            .join(Gene.preferred_isoform)
            .filter(Gene.name.in_(limit_to_genes))
            .all()
        )
        mutations = mutations.filter(Protein.id.in_(proteins_ids))

    mutations = mutations.with_entities(Gene.name, Mutation)

    use_custom_values = limit_to_muts is not False
    custom_values = {}
    if use_custom_values:
        for row in limit_to_muts.itertuples(index=False):
            protein = Protein.query.filter_by(refseq=row.isoform).one()
            listed_mutation = Mutation.query.filter_by(
                position=row.position,
                alt=row.mut_residue,
                protein=protein
            ).one()
            custom_values[listed_mutation] = int(row.count)

    for gene_name, mutation in tqdm(mutations, total=mutations.count()):
        if use_custom_values:
            if mutation not in custom_values:
                continue
            value = custom_values[mutation]
        elif occurrences:
            value = mutation.sources_map[source.name].get_value()
        else:
            value = 1

        impact = mutation.impact_on_ptm(fuzzy_site_filter)

        # a motif-breaking mutation is reported as 'motif-changing'
        # unless it hits the site directly
        if impact != 'direct' and mutation in all_breaking_muts:
            impact = 'motif-changing'
        else:
            assert impact != 'none'

        mutations_by_impact_by_gene[impact][gene_name] += value

    return mutations_by_impact_by_gene